1
2 """This reads MLF (Master Label Files) for/from the HTK speech recognition
3 toolkit.
4 """
5
6 import os
7 import glob
8 from gmisclib import die
9 from gmisclib.xwaves_errs import *
10
11 TIME_QUANTUM = 1e-7
12
13
14
18
19
21 """Find a file. Starting with a glob expression C{f}, strip off
22 any .something suffix, then add on C{postfix}.
23 If that glob evaluates to a file, return (d,f)
24 where C{d} is the directory and C{f} is the filename (with postfix removed).
25 Postfix would normally start with '.'.
26 At the end, os.path.join(d, f) + postfix will name a file.
27 """
28 if not f:
29 return (None, None)
30 d0, f0 = os.path.split(f)
31 f0 = os.path.splitext(f0)[0]
32 fx = os.path.join(d0, f0) + postfix
33 if verbose:
34 die.info("HTK_MLF_io._findfile: glob=%s" % fx)
35 gl = glob.glob(fx)
36 if verbose:
37 die.info("HTK_MLF_io._findfile globs=%s" % ','.join(gl))
38 if len(gl) == 0:
39 raise ReferencedFileNotFound, fx
40 assert len(gl) == 1, 'Too many alternatives for %s: %d' % (fx, len(gl))
41 d1, f1 = os.path.split(gl[0])
42
43 f1 = os.path.splitext(f1)[0]
44
45 if verbose:
46 die.info("HTK_MLF_io._findfile d1=%s d1=%s" % (d1, f1))
47 return (d1, f1)
48
49
def parse_label_line(s, tq):
    """This parses a line from a MLF into a tuple.
    The result depends on the number of whitespace-separated fields:
      - one field: the bare label is returned as a C{str} (not a tuple);
      - two fields: C{(time, label)};
      - three fields: C{(start, end, label)};
      - more than three: the three-field tuple followed by the remaining
        fields, which alternate between a C{float} and a C{str}.
    Times are the numbers in the file multiplied by C{tq}.
    @param tq: time quantum (normally 1e-7 seconds).  May be C{None}
        only when every line holds a single label.
    @type tq: L{float} or C{None}
    @param s: line to be parsed
    @type s: C{str}
    @rtype: C{str} or C{tuple}
    @raise ValueError: when C{tq} is C{None} and the line does not consist
        of exactly one field, or when a numeric field cannot be
        converted by C{float}.
    @raise BadFileFormatError: when the line has no fields (and C{tq}
        is not C{None}).
    """
    a = s.strip().split()
    la = len(a)
    if la == 1:
        if a[0] == '///':
            die.die("Sorry! cannot handle /// in MLF.")
        return a[0]
    elif tq is None:
        raise ValueError("File data needs time_quantum to be a number.")
    elif la >= 3:
        fields = [float(a[0])*tq, float(a[1])*tq, a[2]]
        # Extra columns come in (number, string) pairs; a trailing unpaired
        # number is allowed.
        for k, extra in enumerate(a[3:]):
            if k % 2 == 0:
                fields.append(float(extra))
            else:
                fields.append(extra)
        return tuple(fields)
    elif la == 2:
        return (float(a[0])*tq, a[1])
    raise BadFileFormatError
81
82
84 """This reads part of a MLF, grabbing all the labels for one utterance."""
85 sym = []
86 while True:
87 s = fd.readline()
88 if s == '':
89 break
90 s = s.rstrip('\r\n')
91 if s == '.':
92 break
93 elif s != '':
94 sym.append( parse_label_line(s, time_quantum) )
95 return sym
96
97
def readone(mlf_efn, postfix='.wav', datapath='.', strict=True,
            findfile=True, pathedit=None, time_quantum=TIME_QUANTUM,
            verbose=False):
    """Read a single set of labels from a MLF file.
    You specify the labels as part of the extended filename, like this:
    'name_of_MLF_file:name_of_labels'.  The function returns
    only a single value and raises an exception if the extended
    filename is ambiguous.  An utterance is selected when
    C{name_of_labels} occurs anywhere inside its filespec
    (a substring test, not an exact match).
    All other arguments are handed unchanged to L{readiter}.
    @type mlf_efn: string in the form "F:S"
    @rtype: dict or None
    @return: a dictionary that describes the labels as per
        L{readiter}, or C{None} if no utterance matches.
    @raise ValueError: if more than one utterance matches, or if
        C{mlf_efn} does not contain exactly one C{':'}.
    """
    filename, subname = mlf_efn.split(':')
    candidate = None
    for x in readiter(filename, postfix=postfix, datapath=datapath,
                      strict=strict, findfile=findfile,
                      pathedit=pathedit, time_quantum=time_quantum,
                      verbose=verbose):
        if subname in x['filespec']:
            if candidate is not None:
                raise ValueError("Not unique: %s in %s" % (subname, filename))
            candidate = x
    return candidate
122
123
def readiter(mlf_fn, postfix='.wav', datapath='.', strict=True,
             findfile=True, pathedit=None,
             time_quantum=TIME_QUANTUM,
             verbose=False):
    """Read a HTK Master Label (MLF) file.
    Datapath and pathedit are ways to deal with the
    situation where the MLF file has been moved, or (for other reasons)
    the filenames in the MLF file don't point to the actual data.
    @param mlf_fn: filename of the data file.
    @type mlf_fn: str
    @param postfix: suffix of the audio files to look for; only used
        when C{findfile} is true.
    @type postfix: str
    @param datapath: passed as the second argument to C{pathedit}.
    @param pathedit: callable invoked as C{pathedit(dmlf, datapath, f)}
        where C{dmlf} is the directory of the MLF and C{f} is the filespec
        from the MLF; its result is the path that is searched for.
        Defaults to C{os.path.join}.
    @type findfile: bool
    @param findfile: If true, locate the audio file for each utterance.
    @type strict: bool
    @param strict: If true, raise an exception if an audio file cannot be found.
        If false, a warning is issued and that utterance is skipped.
    @type time_quantum: L{float}
    @param time_quantum: A factor to convert from the time information in the MLF
        to real units of time (like seconds).  Ideally, time_quantum=1e-7 seconds
        for MLF files, but that isn't exactly accurate for some sampling rates
        (like 11025 samples/sec) when the sampling interval is not an integral
        multiple of 100 nanoseconds.
    @type verbose: bool
    @param verbose: If true, report progress via C{die.info}.
    @rtype: an iterator producing C{dict(str: various)}
    @return: sequence of C{{'filespec':path, 'd': d, 'f': f, 'symbols': [...] }}, ... .
        This is an iterator of dictionaries.  Each dictionary corresponds to
        one utterance, or one "label file" in the MLF.
        Attributes 'd' and 'f' are only present if findfile==True;
        C{os.path.join(x['d'], x['f'])} is a path to the corresponding audio.
        C{x['filespec']} is the path information in the MLF,
        C{x['i']} is an C{int} counting the utterances that are yielded
        (skipped utterances do not advance it),
        and C{x['symbols']} is the label information for that utterance.
        It is a list of tuples produced by L{parse_label_line}.
    @raise NoSuchFileError: if the MLF cannot be opened.
    @raise AssertionError: if the first line is not the C{#!MLF!#} header.
    @raise ReferencedFileNotFound: if C{findfile} and C{strict} are true
        and an audio file cannot be found.
    """
    if pathedit is None:
        pathedit = os.path.join

    dmlf = os.path.dirname(mlf_fn)

    try:
        fd = open(mlf_fn, 'r')
    except IOError as x:
        raise NoSuchFileError(*(x.args))
    try:
        header = fd.readline()
        # Tolerate CRLF line endings on the header, as the label lines do.
        assert header.rstrip('\r\n') == '#!MLF!#', 'l=%s' % header
        i = 0
        while True:
            f = fd.readline()
            if f == '':
                break
            f = f.strip()
            if f == '':
                continue
            if f.startswith('"') and f.endswith('"'):
                f = f[1:-1]
            fspec = f
            rv = {'filespec': fspec, 'i': i}
            if findfile:
                if verbose:
                    die.info("dmlf=%s; datapath=%s; f=%s" % (dmlf, datapath, f))
                try:
                    d1, f1 = _findfile(pathedit(dmlf, datapath, f), postfix, verbose)
                except ReferencedFileNotFound as x:
                    if strict:
                        raise
                    die.warn('No such file: %s from %s' % (x, fspec))
                    # Consume this utterance's labels so the next read starts
                    # at the following filespec.  The real time_quantum is
                    # passed because parsing timed labels with None raises.
                    _get_symbols(fd, time_quantum)
                    continue
                rv['d'] = d1
                rv['f'] = f1
            rv['symbols'] = _get_symbols(fd, time_quantum)
            yield rv
            i += 1
    finally:
        # Runs when the iterator is exhausted, closed, garbage-collected,
        # or when an exception propagates out of the loop.
        fd.close()
194
195
def read(mlf_fn, **kw):
    """Read a whole HTK Master Label (MLF) file into a list.
    This is L{readiter} with its results collected eagerly;
    every keyword argument is forwarded to L{readiter} untouched.
    @param mlf_fn: filename of the data file.
    @type mlf_fn: str
    @param kw: Key-value parameters from L{readiter}.
    @rtype: list of dict.  See L{readiter} for details.
    @return: see L{readiter} for details.
    """
    utterances = readiter(mlf_fn, **kw)
    return list(utterances)
208
209
210
213 assert time_quantum > 0.0
214 self.fd = mlf_fd
215 self.fd.writelines('#!MLF!#\n')
216 self.nchunks = 0
217 self.scale = 1.0/time_quantum
218
219 - def chunk(self, filespec, data):
220 if self.nchunks > 0:
221 self.fd.writelines('.\n')
222 self.fd.writelines( [ '"%s"\n' % filespec,
223 '\n'.join(data), '\n'
224 ]
225 )
226 self.nchunks += 1
227 self.fd.flush()
228
230 d = [ '%d %d %s' % (int(round(t0*self.scale)),
231 int(round(te*self.scale)),
232 lbl)
233 for (t0, te, lbl) in tcdata
234 ]
235 self.chunk(filespec, d)
236
238 self.fd.writelines('\n')
239 self.fd.flush()
240 os.fsync(self.fd.fileno())
241 self.fd = None
242
244 if self.fd is not None:
245 self.close()
246
247
if __name__ == '__main__':
    # Command-line use: [-datapath DIR] file.mlf
    # Dumps every utterance in the MLF to standard output.
    import sys
    DATAPATH = None
    arglist = sys.argv[1:]
    while arglist and arglist[0].startswith('-'):
        arg = arglist.pop(0)
        if arg == '-datapath':
            if not arglist:
                die.die('-datapath requires a directory argument')
            DATAPATH = arglist.pop(0)
        else:
            die.die('Unrecognized argument: %s' % arg)
    if not arglist:
        die.die('Usage: [-datapath DIR] file.mlf')
    # Audio files are only looked up when a data path was supplied.
    for tmp in readiter(arglist[0], datapath=DATAPATH,
                        findfile=(DATAPATH is not None)
                        ):
        # Each print gets one pre-joined string, so the output is the same
        # under the print statement and the print function.
        print(' '.join(['[', str(tmp['filespec']), str(tmp.get('d', '')),
                        str(tmp.get('f', '')), ']']))
        for tmps in tmp['symbols']:
            if isinstance(tmps, tuple):
                print(' '.join([str(q) for q in tmps]))
            else:
                print(tmps)
267