1
2
3 """This module lets you search through label files
4 to find particular ngrams."""
5
6
7 __version__ = "$Revision: 1.7 $"
8
9
10
11 import re
12 import xwaves_mark
13 import xwaves_lab
14 from xwaves_errs import *
15
16
17 WildAnyOne = re.compile('.')
18
def matches(data, pattern):
    """Tell whether a run of labels matches a pattern, item by item.

    data = list of labels.
    pattern = list of pattern items.  An item that has a ``match``
        attribute (for example a compiled regular expression such as
        WildAnyOne) is applied to the label via ``item.match(label)``;
        any other item must compare equal to the label.

    Returns True when both lists have the same length and every label
    satisfies its pattern item, False otherwise.
    """
    # A pattern can only match a run of exactly the same length.
    if len(data) != len(pattern):
        return False
    for label, expected in zip(data, pattern):
        if hasattr(expected, 'match'):
            # Regex-like item: a falsy result from match() means no match.
            if not expected.match(label):
                return False
        elif label != expected:
            return False
    return True
29
30
32 """ngram = list of labels.
33 fname = file name for xwaves_mark datafile.
34 datatype = xwaves_mark.PHONE or xwaves_mark.WORD
35
36 This function returns all instances
37 (even overlapping instances) of the specified
38 N-gram in the file. The return format
39 is [ (end_time, label), ... ],
40 where the zeroth entry in the list is the
41 symbol before the start of the N-gram.
42 Its end time is the beginning of the N-gram.
43
44 In the argument list, the N-gram is an array of labels; the labels
45 need to match the file's labels exactly.
46 """
47
48 header, data = xwaves_mark.read(fname)
49 data = xwaves_mark.mark_to_lab(data, datatype)
50 return find(ngram, data)
51
52
54 """ngram = list of labels.
55 fname = file name for xwaves_lab datafile.
56
57 This function returns all instances
58 (even overlapping instances) of the specified
59 N-gram in the file. The return format
60 is [ (end_time, label), ... ],
61 where the zeroth entry in the list is the
62 symbol before the start of the N-gram.
63 Its end time is the beginning of the N-gram.
64
65 In the argument list, the N-gram is an array of labels; the labels
66 need to match the file's labels exactly.
67 """
68
69 header, data = xwaves_lab.read(fname, loose)
70 return find(ngram, data)
71
72
73
def find(ngram, data):
    """Return every occurrence of an N-gram in a list of labelled entries.

    ngram = list of labels (pattern items), compared against the data
        labels by matches().
    data = list of (time, label, ...) as produced by xwaves_lab.py or similar.
        Only the second field of each entry is inspected, so entries may
        carry any number of additional fields.

    This function returns all instances (even overlapping instances) of
    the specified N-gram in the data.  The return format is
    [ [ entry, ...], ... ] : a list of hits, where each hit is a slice of
    data holding the entry just before the N-gram followed by the N-gram's
    own entries.  Each entry is a tuple which marks when it ends, what it
    is specifically, and perhaps other things.

    Because every hit includes the preceding entry, the search starts at
    index 1; an occurrence that begins at the very first entry of data
    is not reported.
    """
    # Index the label rather than unpacking a 2-tuple, so entries with
    # extra fields (as the docstring allows) do not raise ValueError.
    labels = [entry[1] for entry in data]
    width = len(ngram)

    hits = []
    # range() is empty when the data is too short to hold the N-gram plus
    # its preceding entry, so no explicit lower clamp is needed.
    for start in range(1, len(data) - width + 1):
        if matches(labels[start:start + width], ngram):
            hits.append(data[start - 1:start + width])
    return hits
97