Package gmisclib :: Module find_ngram
[frames | no frames]

Source Code for Module gmisclib.find_ngram

 1  #!/usr/bin/env python 
 2   
 3  """This module lets you search through label files 
 4  to find particular ngrams.""" 
 5   
 6   
 7  __version__ = "$Revision: 1.7 $" 
 8   
 9  # from xwaves_errs import * 
10   
11  import re 
12  import xwaves_mark 
13  import xwaves_lab 
14  from xwaves_errs import * 
15   
16   
# Wildcard usable as an entry of an n-gram pattern: matches() calls .match()
# on any pattern entry that has one.  '.' accepts any label that starts with
# a character other than a newline, so it rejects only empty labels and
# labels that begin with a newline.
WildAnyOne = re.compile('.')
18   
def matches(data, pattern):
    """Test whether a sequence of labels matches a pattern, item by item.

    data = list of labels.
    pattern = list of pattern entries.  Each entry is either a plain
            value, which must compare equal to the corresponding label,
            or an object with a 'match' method (e.g. a compiled regular
            expression), whose match(label) must return a true value.
            Note that a compiled regular expression's match() only
            anchors at the start of the label.

    Returns 1 if the lengths are equal and every position matches,
    0 otherwise.
    """
    if len(data) != len(pattern):
        return 0
    for (d, p) in zip(data, pattern):
        if hasattr(p, 'match'):
            # Pattern objects decide for themselves whether the label fits.
            if not p.match(d):
                return 0
        elif d != p:
            return 0
    return 1
29 30
def find_mark(ngram, fname, datatype=xwaves_mark.PHONE):
    """ngram = list of labels.
    fname = file name for xwaves_mark datafile.
    datatype = xwaves_mark.PHONE or xwaves_mark.WORD

    This function returns all instances
    (even overlapping instances) of the specified
    N-gram in the file.   The return value is a list with one
    item per instance, as produced by find().  Each item is a
    slice of the file's data of the form [ (end_time, label), ... ],
    where the zeroth entry in the list is the
    symbol before the start of the N-gram.
    Its end time is the beginning of the N-gram.
    Because that preceding symbol is always included, an instance
    that begins at the very first entry of the file is not reported.

    In the argument list, the N-gram is an array of labels; each one
    needs to equal the file's label exactly, unless it is an object
    with a 'match' method (see matches()).
    """

    # The header is read but not needed for the search.
    _header, data = xwaves_mark.read(fname)
    data = xwaves_mark.mark_to_lab(data, datatype)
    return find(ngram, data)
51 52
def find_lab(ngram, fname, loose=0):
    """ngram = list of labels.
    fname = file name for xwaves_lab datafile.
    loose = passed through unchanged to xwaves_lab.read().

    This function returns all instances
    (even overlapping instances) of the specified
    N-gram in the file.   The return value is a list with one
    item per instance, as produced by find().  Each item is a
    slice of the file's data of the form [ (end_time, label), ... ],
    where the zeroth entry in the list is the
    symbol before the start of the N-gram.
    Its end time is the beginning of the N-gram.
    Because that preceding symbol is always included, an instance
    that begins at the very first entry of the file is not reported.

    In the argument list, the N-gram is an array of labels; each one
    needs to equal the file's label exactly, unless it is an object
    with a 'match' method (see matches()).
    """

    # The header is read but not needed for the search.
    _header, data = xwaves_lab.read(fname, loose)
    return find(ngram, data)
71 72 73
def find(ngram, data):
    """ngram = list of labels.
    data = list of (time, label, ...) as produced by xwaves_lab.py or similar.

    This function returns all instances
    (even overlapping instances) of the specified
    N-gram in the data.   The return format
    is [ [ entity, ...], ... ] .
    It is a list of instances, and each instance is a slice of data
    holding the entity just before the N-gram followed by the
    entities of the N-gram itself (N+1 entities in all).
    Each entity is a tuple which marks when it ends,
    what it is specifically, and perhaps other things.
    Because the preceding entity is always included, an instance
    that begins at data[0] is not reported.

    In the argument list, the N-gram is an array of labels; each one
    needs to equal the data's label exactly, unless it is an object
    with a 'match' method (see matches()).
    """
    # Index instead of unpacking, so that entities may carry extra
    # fields after (time, label), as the data format allows.
    labels = [entity[1] for entity in data]
    n = len(ngram)

    found = []
    # Start at 1 so that data[i-1], the entity before the N-gram, exists.
    # The range is simply empty when data is too short to hold a match.
    for i in range(1, len(data) - n + 1):
        if matches(labels[i:i+n], ngram):
            found.append(data[i-1:i+n])
    return found
97