1
2
3 """Perceptual spectrum for speech."""
4
5 import sys
6 import math as M
7 from gmisclib import erb_scale
8 from gmisclib import gpkmisc
9 from gmisclib import die
10
11 import numpy
12
13 pylab = None
14
15 import gpk_voicing.voice_misc as VM
16 from gpk_voicing import power
17 from gpk_voicing import percep_spec_extras as PS
18 from gpk_voicing import gammatone2
19
# Filterbank limits, expressed on the erb (equivalent rectangular bandwidth) scale:
CBmax = erb_scale.f_to_erb(8000.0)  # highest band center (8 kHz)
CBmin = erb_scale.f_to_erb(50.0)  # lowest band center (50 Hz)
BBSZ = 0.5  # spacing of band centers, in erbs
E = 0.333  # amplitude-compression exponent (roughly a cube root)
Neural_Tick = 1e-4  # finest time resolution (seconds) for simulated neural signals

DISS_BWF = 0.25  # roughness band-pass width, as a fraction of each band's erb
MAX_DISS_FREQ = 250.0  # ceiling (Hz) on the roughness band-pass width
HP_DISS_FREQ = 30.0  # high-pass corner (Hz) used by the roughness computation
BLOCK_EDGE = 0.3  # overlap (seconds) between successive blocks in block_percep_spec
30
31
32
33
34
def tau_lp(fc):
	"""Time constant of the ear's modulation low-pass filter, by band.

	This is the cutoff frequency of the modulation transfer function
	in the ear, as a function of frequency.
	From R. Plomp and M. A. Bouman, JASA 31(6), page 749ff, June 1959
	'Relation of hearing threshold and duration of tone pulses'

	@param fc: band center frequency in Hz (must be positive).
	@return: time constant in seconds; 0.4s at 200 Hz falling
		log-linearly to 0.13s at 10 kHz.
	"""
	# NOTE: the header line for this function was missing from the damaged
	# source; the name is fixed by the calls in perceptual_spec().
	Tau200 = 0.4
	Tau10k = 0.13
	# Linear interpolation in log(frequency) between the two anchors.
	tmp = Tau200 + (M.log(fc/200.0)/M.log(1e4/200.0))*(Tau10k-Tau200)
	assert tmp > 0
	return tmp
46
47
def cochlear_filter(fofc):
	"""Complex transfer function of one cochlear band-pass filter.

	Transfer function taken from "Improved Audio Coding Using a
	Psychoacoustic Model Based on a Cochlear Filter Bank",
	IEEE Transactions of Speech and Audio Processing, 10(7) October 2002,
	Pages 495-503. au=Frank Baumgarte.

	@param fofc: frequency normalized by the filter's center frequency
		(f/fc); may be a float or a numpy array.
	@return: complex gain at that normalized frequency.
	"""
	Q = 4.0
	# Slopes (in the exponent) corresponding to 25 dB and 8 dB per
	# 1.2x frequency ratio, low side and high side respectively.
	Slp = 25.0/(20.0*M.log10(1.2))
	Shp = 8.0/(20.0*M.log10(1.2))
	fofc_Shp = fofc**Shp
	return (fofc_Shp) / ( (1+fofc**Slp) * (1 + (1j/Q)*(fofc**(Shp/2))
						- fofc_Shp)
				)
61
62
def threshold(f):
	"""Approximate hearing threshold, as a pressure amplitude.

	Crudely taken from Handbook of Perception vol 4: hearing,
	E.C.Carterette and M.P.Friedman, editors,
	Academic Press 1978, isbn 0-12-161904-4.
	Curve near 70db used.

	@param f: frequency in Hz (must be positive).
	@return: threshold amplitude; near 1 in midrange, rising steeply
		below ~200 Hz and above ~8 kHz.
	"""
	return M.sqrt((200.0/f)**4 + 1 + (f/8000.0)**16)
70
71
def accum_abs_sub(a, b, o):
	"""Accumulate C{abs(a-b)} element-wise into C{o}.

	This does a little computation in a block-wise fashion
	to keep all the data within the processor's cache.
	It computes C{o += abs(a-b)} (the original docstring said
	"sum(abs(a-b))", but the accumulation is per-element into C{o};
	the scalar sum happens in the caller, across channels).

	@param a: 1-D numpy array.
	@param b: 1-D numpy array, same length as C{a}.
	@param o: 1-D accumulator array, same length; modified in place.
	@return: C{o} (the same array, for convenience).
	"""
	BKS = 6000	# block length chosen to fit comfortably in cache
	n = o.shape[0]
	assert a.shape == (n,)
	assert b.shape == (n,)
	assert o.shape == (n,)
	# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
	tmp = numpy.zeros((BKS,), numpy.float64)
	for i in range(0, n, BKS):
		e = min(i+BKS, n)
		tmpi = tmp[:e-i]
		ai = a[i:e]
		bi = b[i:e]
		oi = o[i:e]
		numpy.subtract(ai, bi, tmpi)
		numpy.absolute(tmpi, tmpi)
		numpy.add(oi, tmpi, oi)
	return o
93
94
def list_accum_abs_sub_dly(tk, dly):
	"""Sum of C{abs(x[t] - x[t-dly])} over all channels in C{tk}.

	For each channel, compares the signal with a copy of itself delayed
	by C{dly} samples, and accumulates the absolute differences across
	channels.  Small values at some C{dly} indicate periodicity.

	NOTE(review): the def line was missing from the damaged source; the
	name is inferred from the parallel C{PS.list_accum_abs_sub_dly}
	called in process_voicing() -- confirm against the package.

	@param tk: list of 1-D numpy arrays, all the same length.
	@param dly: delay in samples; must be > 0 (a zero delay would make
		C{tkd[:-dly]} empty).
	@return: 1-D array of length C{len(tk[0])-dly}.
	"""
	lng = tk[0][dly:].shape[0]
	# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
	o = numpy.zeros((lng,), numpy.float64)
	for tkd in tk:
		a = tkd[:-dly]
		b = tkd[dly:]
		accum_abs_sub(a, b, o)
	return o
103
104
105
def process_voicing(tk, tick, Dt):
	"""Estimate voicing by autocorrelation-like delay comparisons.

	@param Dt: the sampling rate of the output vectors.
	@type Dt: C{float}
	@param tick: the sampling rate of the vectors in C{tk}.
	@type tick: C{float}
	@param tk: a list of vectors, each of which represents the neural firing
		of a particular point in the cochlea.
	@type tk: [ numpy.ndarray, ...]
	@return: C{(sum, n, min)} -- the per-delay smoothed difference signals,
		summed over delays, the number of delays, and the element-wise
		minimum over delays.  The caller forms the voicing estimator as
		C{sum/n - min}.  (The original docstring claimed "(avg, min)";
		the average is left to the caller.)
	"""
	LOWF = 50.0		# lowest pitch considered, Hz
	HIGHF = 400.0		# highest pitch considered, Hz
	lowf = int(round(1.0/(tick*LOWF)))	# longest delay, in samples
	highf = int(round(1.0/(tick*HIGHF)))	# shortest delay, in samples
	tmps = None
	tmpmin = None
	tmpn = 0
	for dly in range(highf, lowf):
		dlytmp = PS.list_accum_abs_sub_dly(tk, dly)
		assert len(dlytmp.shape) == 1
		# Smooth down to the output rate; 'extra' widens the window a bit.
		tmp2, t0 = power.smooth(dlytmp, tick, Dt, extra=1.0/(LOWF*M.sqrt(12.0)))
		if tmps is None:
			tmpn = 1
			tmps = tmp2
			tmpmin = numpy.array(tmp2, copy=True)
		else:
			tmpn += 1
			numpy.add(tmps, tmp2, tmps)
			numpy.minimum(tmpmin, tmp2, tmpmin)
	return (tmps, tmpn, tmpmin)
145
146
class roughness_c(object):
	"""Accumulates a roughness (dissonance) estimate across cochlear bands.

	This should be based on Hutchinson and Knopoff 1978;
	Kameoka and Kuriyagawa 1969a; Viemeister 1988;
	Aures 1985; Plomp and Levelt 1965.
	But it isn't yet.

	Hutchinson W. and Knopoff, L. 1978 The acoustic component of
	Western consonance. Interface 7, 1-29.

	Kameoka and Kuriyagawa 1969. Consonance Theory: Part 1.
	J. Acoustical Soc. America. 45 1451-1458
	(and 1459-1469 for part II).

	Viemeister, N. F. 1977 Temporal factors in audition: A system
	analysis approach. In E. F. Evans and J. P. Wilson (eds)
	Psychophysics and physiology of hearing (pp. 419-429)
	London: Academic Press

	Aures, W. 1985 Ein Berechnungsverfahren der Rauhigkeit
	[A roughness calculation method] Acustica 58 268-281

	Plomp R. and Levelt W. 1965 Tonal consonance and critical
	bandwidth. JASA 38, 548-560.
	"""

	def __init__(self):
		self.y = None	# running sum of per-band envelope signals
		self.n = 0	# number of bands accumulated
		self.dt = None	# common sampling interval of the added bands

	def add(self, y, dt, diss_bw):
		"""Add one band's signal.  All bands must share the same dt.

		@param y: the band's (smoothed) neural signal.
		@param dt: sampling interval of y, seconds.
		@param diss_bw: roughness bandwidth for this band, Hz.
		"""
		if self.dt is None:
			assert dt > 0
			self.dt = dt
		else:
			assert abs(dt-self.dt) < 0.0001*self.dt
		# Band-pass the envelope: keep fluctuations between
		# HP_DISS_FREQ and diss_bw, then rectify.
		tmp = VM.lowpass_sym_butterworth(y, diss_bw*dt)
		tmp = VM.hipass_first_order(tmp, HP_DISS_FREQ*dt)
		tmp = numpy.absolute(tmp)
		if self.y is None:
			self.y = tmp
		else:
			numpy.add(self.y, tmp, self.y)
		self.n += 1

	def get(self, Dt):
		"""Return the accumulated roughness, smoothed to sampling interval Dt.

		Must be called after at least one add() -- otherwise self.y is
		None and self.n is zero.
		"""
		yfilt, t0 = power.smooth(self.y, self.dt, Dt)
		return yfilt*(0.1/self.n)
202
203
class peakalign_c(object):
	"""Accumulates per-band signals after aligning them in time.

	Each band is shifted by an amount proportional to 1/fc (constant FF
	below) before summing, roughly compensating for the cochlear travelling
	wave delay, so that peaks from a common glottal pulse line up.
	"""

	EDGE = 700	# padding (samples) so shifted bands fit in the buffer

	def __init__(self):
		self.y = None	# shared accumulation buffer, allocated on first add
		self.shifts = []	# debug: shift applied to each recorded band
		self.dbg = []	# debug: recorded band signals (only when pylab is on)
		self.n = 0	# number of bands accumulated
		self.dt = None	# common sampling interval

	def add(self, y, dt, fc):
		"""Add one band's signal, shifted according to its center frequency.

		@param y: the band's (smoothed) neural signal.
		@param dt: sampling interval of y, seconds.
		@param fc: band center frequency, Hz.
		"""
		if self.dt is None:
			assert dt > 0
			self.dt = dt
		else:
			assert abs(dt-self.dt) < 0.0001*self.dt
		FF = -3.2	# shift, in periods of fc (negative: advance high bands)
		sh = int(round(FF/(dt*fc))) + self.EDGE
		if self.y is None:
			# NOTE(review): sized from the first y; later calls assume
			# all bands have the same length -- confirm against callers.
			self.y = numpy.zeros((y.shape[0]+2*self.EDGE,))
		numpy.add(self.y[sh:sh+y.shape[0]], y, self.y[sh:sh+y.shape[0]])
		if pylab and fc>400:
			self.dbg.append(y)
			self.shifts.append(sh)
		self.n += 1

	def get(self, Dt):
		"""Return the squared aligned sum, smoothed to sampling interval Dt.

		When pylab is loaded (-plot), also plots cross-correlations of the
		recorded bands against the accumulated sum, for debugging.
		"""
		if pylab:
			for (i, (y, sh)) in enumerate(zip(self.dbg, self.shifts)):
				xpl = []
				ypl = []
				for dly in range(-100, 100):
					ypl.append(numpy.sum(y*self.y[sh+dly:sh+y.shape[0]+dly]))
					xpl.append(dly)
				pylab.plot(xpl, [q/max(ypl) for q in ypl], linestyle='-',
						color=(float(i)/float(self.n), 0.5, float(self.n-i)/float(self.n)))
		if pylab:
			pylab.show()
		cfilt, t0 = power.smooth(self.y*self.y, self.dt, Dt)
		return cfilt/self.n**2
245
246
def block_percep_spec(data, dt, Dt, **kwargs):
	"""This computes the perceptual spectrum in blocks and glues them
	together. It's useful when the data is too large to fit in memory.

	Accepts the same keyword arguments as perceptual_spec(), plus
	C{blocksize} (seconds); when C{blocksize} is None or absent the whole
	signal is processed in one call.  Blocks are computed with
	C{BLOCK_EDGE} seconds of padding on each side, and only the unpadded
	center of each block is copied to the output, so filter edge effects
	are confined to the discarded margins.

	@param data: the acoustic waveform (1-D numpy array).
	@param dt: sampling interval of C{data}, seconds.
	@param Dt: sampling interval of the output, seconds.
	@return: C{(channel_info, data, time_offset)} as from perceptual_spec().
	"""
	if kwargs.get('blocksize', None) is None:
		return perceptual_spec(data, dt, Dt, **kwargs)
	assert BLOCK_EDGE >= Dt
	blocksize = Dt * int(round(kwargs['blocksize']/Dt))
	del kwargs['blocksize']
	assert blocksize > BLOCK_EDGE
	m = data.shape[0]
	ntau = int(M.ceil(m*dt/blocksize))	# number of blocks
	nout = int(M.ceil(m*dt/Dt))		# total output frames
	pad = int(round(Dt*round(BLOCK_EDGE/Dt)/dt))	# edge padding, in samples

	neural = None
	tshift = None
	bctrs = None
	S0 = 0.0	# running (fractional) output-frame position
	e = 0
	for i in range(ntau):
		s0 = int(round((m*i)/float(ntau)))
		se = int(round((m*(i+1))/float(ntau)))
		ps0 = max(0, s0-pad)
		pse = min(m, se+pad)
		tbc, x, ttshift = perceptual_spec(data[ps0:pse], dt, Dt, **kwargs)
		if tshift is None:
			tshift = ttshift
		if bctrs is None:
			bctrs = tbc
		else:
			# Every block must produce the identical channel layout.
			assert tbc == bctrs
		if neural is None:
			# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
			neural = numpy.zeros((x.shape[0], nout+1), numpy.float64)

		tau0 = int(M.floor((s0-ps0)*(dt/Dt)))	# skip the leading pad
		nS = min((se-s0)*(dt/Dt), x.shape[1]-tau0)	# frames to keep
		inS = int(M.ceil(nS))
		iS0 = int(M.floor(S0))
		assert iS0+inS <= nout, "S0=%.1f nS=%.1f S0+nS=%.1f nout=%d" % (S0, nS, S0+nS, nout)
		neural[:, iS0:iS0+inS] = x[:, tau0:tau0+inS]
		S0 += nS
		e = iS0+inS

	assert nout-2 <= e <= nout+1, "e=%d nout=%d" % (e, nout)
	return (bctrs, neural[:,:e], tshift)
297
298
class fft_filterbank_c(object):
	"""FFT-domain cochlear filterbank (apparently unused in this file).

	NOTE(review): the class and method headers were missing from the
	damaged source and nothing in this file references this class, so the
	class name, method name, and argument lists here are reconstructions
	inferred from the bodies -- confirm against the rest of the package
	before relying on them.
	"""

	def __init__(self, data, dt, Dt):
		self.d = VM.dataprep_flat_real(data, dt, Dt)
		# numpy.fft.real_fft/inverse_real_fft are pre-1.0 aliases that no
		# longer exist; rfft/irfft are the modern equivalents.
		self.ss = numpy.fft.rfft( self.d )
		self.f = numpy.absolute(VM.real_fft_freq(self.ss.shape[0], d=dt))

	def filter(self, fc):
		"""Apply the cochlear filter centered at fc and return the time signal."""
		xferfcn = cochlear_filter(self.f/fc)
		# DC and Nyquist bins of a real-signal spectrum must stay real.
		xferfcn[0] = xferfcn[0].real
		xferfcn[-1] = xferfcn[-1].real
		return numpy.fft.irfft( self.ss * xferfcn )
310
311
312
def perceptual_spec(data, dt, Dt, bmin=CBmin, bmax=CBmax, db=BBSZ,
			do_mod=0, do_dissonance=False, PlompBouman=True,
			do_peakalign=False, e=None):
	"""This returns something roughly like the neural signals leaving the ear.
	It filters into 1-erb-wide bins, then takes the cube root of
	the amplitude.

	@param data: the acoustic waveform (1-D numpy array).
	@param dt: sampling interval of C{data}, seconds.
	@param Dt: sampling interval of the output, seconds.
	@param bmin: lowest band center, on the erb scale.
	@param bmax: highest band center, on the erb scale.
	@param db: band spacing, on the erb scale.
	@param do_mod: 0 = no voicing channels; 1 = one combined voicing
		channel; 2 = two channels (all bands, and high bands only).
	@param do_dissonance: if true, append a roughness channel.
	@param PlompBouman: if true, low-pass each band with tau_lp(fc),
		after Plomp & Bouman (1959).
	@param do_peakalign: if true, append a peak-alignment channel.
	@param e: compression exponent (defaults to module constant E).
	@return: (channel_info, data, time_offset), where
		- C{time_offset} is the time of the the first output datum
			relative to the time of the first input sample.
		- C{channel_info} is a list of dicts, one per output channel.
		- C{data} is a 2-D array, channels x time.
	"""
	if e is None:
		e = E
	do_mod = int(do_mod)
	Neural_Time_Resolution = max(Neural_Tick, dt)
	m = data.shape[0]
	assert m*dt > Dt, "Tiny amount of data: %d*%g=%.2gs < Dt=%2g" % (m, dt, m*dt, Dt)

	ntau = int(M.floor(m*dt/Dt))	# number of output frames

	nband0 = int(M.ceil((bmax-bmin)/db))	# number of filterbank channels
	nbands = nband0
	assert do_mod in [0,1,2], "Do_mod=%s" % str(do_mod)
	if do_mod:
		nbands += int(do_mod)
		tk = []
	if do_dissonance:
		nbands += 1
		diss = roughness_c()
	if do_peakalign:
		nbands += 1
		palign = peakalign_c()
	# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
	neural = numpy.zeros((nbands, ntau), numpy.float64)
	bctrs = []
	t0x = []
	iband = 0

	fclist = []
	b = bmin
	while b < bmax:
		fc = erb_scale.erb_to_f(b)
		# BUG FIX: was fclist.append(b, fc) -- list.append takes exactly
		# one argument, so this raised TypeError; append the tuple.
		fclist.append((b, fc))
		b += db

	hair_cells = gammatone2.gammatone_ear_bank(data, dt, e, [ (fc, threshold(fc)) for (b, fc) in fclist] )

	for (iband, (b, fc)) in enumerate(fclist):
		pwrfilt, t0 = power.smooth(hair_cells[iband], dt, Dt)
		t0x.append(t0)
		if PlompBouman:
			# Forward masking: low-pass with the per-band time constant.
			pwrfilt = numpy.maximum(VM.lopass_first_order(pwrfilt, Dt/tau_lp(fc)), 0.0)
		else:
			pwrfilt = numpy.maximum(pwrfilt, 0.0)
		neural[iband,:] = pwrfilt[:ntau]
		bctrs.append({'type':'band', 'smooth': tau_lp(fc), 'fc': fc,
				'erb': b, 'E': e, 'id': 'B%.1f' % b
				})

		if do_mod or do_dissonance or do_peakalign:
			# BUG FIX: was power.smooth(hair_cells, ...) -- that passed the
			# whole bank each iteration; downstream consumers (process_voicing,
			# roughness_c.add, peakalign_c.add) expect one band's 1-D signal.
			tkfilt, t0 = power.smooth(hair_cells[iband], dt, Neural_Time_Resolution)
			if do_dissonance:
				diss_bw = min(DISS_BWF*erb_scale.ebw(b), MAX_DISS_FREQ)
				diss.add(tkfilt, Neural_Time_Resolution, diss_bw)
			if do_mod:
				tk.append(tkfilt)
			if do_peakalign:
				palign.add(tkfilt, Neural_Time_Resolution, fc)
	# BUG FIX: after the enumerate loop iband == len(fclist)-1, which made
	# the assertion below fail and would have overwritten the last band.
	# Restore the old while-loop invariant: iband points past the bands.
	iband = len(fclist)
	assert iband == nband0

	if do_mod == 1:
		tksum, tkn, tkmin = process_voicing(tk, Neural_Time_Resolution, Dt)
		neural[iband,:] = (tksum/tkn-tkmin)[:ntau]*0.15
		bctrs.append({'type':'haspitch', 'id':'haspitch1', 'variant': 1,
				'Fentropy': -M.log(len(tk))
				})
		iband += 1
		assert len(bctrs) == iband
	elif do_mod == 2:
		Voicing_divide = 0.5*(CBmax+CBmin)
		tk_low = []
		tk_high = []
		for (bctr, tki) in zip(bctrs, tk):
			assert tki.shape[0] > 0
			if bctr['erb'] > Voicing_divide:
				tk_high.append(tki)
			else:
				tk_low.append(tki)
		# NOTE(review): if every band falls on one side of Voicing_divide,
		# tkn_low or tkn_high is zero and the division below fails.
		tksum_low, tkn_low, tkmin_low = process_voicing(tk_low, Neural_Time_Resolution, Dt)
		tksum_high, tkn_high, tkmin_high = process_voicing(tk_high, Neural_Time_Resolution, Dt)
		avgmmin = (tksum_low+tksum_high)/(tkn_low+tkn_high) - numpy.minimum(tkmin_low, tkmin_high)
		neural[iband,:] = avgmmin[:ntau]*0.15
		bctrs.append({'type':'haspitch', 'id':'haspitch_all',
				'variant': 'all',
				'Fentropy': -M.log(len(tk))
				})
		iband += 1

		neural[iband,:] = (tksum_high/tkn_high-tkmin_high)[:ntau]*0.15
		bctrs.append({'type':'haspitch', 'id':'haspitch_high',
				'variant': 'high',
				'Fentropy': -M.log(len(tk_high))
				})
		iband += 1
		assert len(bctrs) == iband
	if do_dissonance:
		neural[iband,:] = diss.get(Dt)[:ntau]*20.0
		bctrs.append({'type':'roughness', 'id':'roughness1',
				'variant': 1,
				'Fentropy': -M.log(diss.n)
				})
		iband += 1
	if do_peakalign:
		neural[iband,:] = palign.get(Dt)[:ntau]
		bctrs.append({'type':'peakalign', 'id':'peakalign1',
				'variant': 1,
				'Fentropy': -M.log(palign.n)
				})
		iband += 1
	assert iband == neural.shape[0], "iband=%d != shape=%s" % (iband, str(neural.shape))

	return (bctrs, neural, gpkmisc.median(t0x))
442
443
444
def test():
	"""Build a synthetic test signal: a 1-second impulse train at 100 Hz.

	@return: a gpkimgclass.gpk_img with sampling interval 1e-4 s.
		(gpkimgclass is imported in the __main__ block before this
		is called via the -test flag.)
	"""
	dt = 1e-4
	d = numpy.zeros((10000,1))
	for i in range(0, 10000, 100):
		d[i,0] = 1.0
	return gpkimgclass.gpk_img({'CDELT2': dt}, d)
451
452
453
454 __version__ = "$Revision: 1.43 $"
455 __date__ = "$Date: 2007/03/07 23:54:40 $"
456
457
458 if __name__ == '__main__':
459
460
461
462
463
464 import gpkimgclass
465 DT = 0.01
466 arglist = sys.argv[1:]
467 Blocksize = 10.0
468 arglist0 = arglist
469 do_mod, dpa, dd, pb = 1, True, True, True
470 column = None
471 signal = None
472 outfile = "-"
473 extrahdr = {}
474 while arglist and arglist[0].startswith('-'):
475 arg = arglist.pop(0)
476 if arg == '--':
477 break
478 elif arg == '-dt':
479 DT = float(arglist.pop(0))
480 elif arg == '-f':
481 signal = gpkimgclass.read(arglist.pop(0))
482 elif arg == '-c':
483 tmp = arglist.pop(0)
484 try:
485 column = int( tmp )
486 except ValueError:
487 column = tmp
488 elif arg == '-b':
489 Blocksize = float(arglist.pop(0))
490 elif arg == '-plot':
491 import pylab
492 elif arg == '-pb':
493 pb = False
494 elif arg == '-dd':
495 dd = False
496 elif arg == '-dpa':
497 dpa = False
498 elif arg == '-do_mod':
499 do_mod = 0
500 elif arg == '-test':
501 signal = test()
502 column = 0
503 elif arg == '-o':
504 outfile = arglist.pop(0)
505 elif arg == '-write':
506 extrahdr = dict( [q.strip().split('=', 1)
507 for q in arglist.pop(0).split(';') ]
508 )
509 else:
510 die.info("Unrecognized flag: %s" % arg)
511 print __doc__
512 die.exit(1)
513 if arglist:
514 if signal is None:
515 signal = gpkimgclass.read(arglist[0])
516 else:
517 die.die("Usage: cannot specify name both with -f and as last argument.")
518
519 if signal is None:
520 die.info("No signal file specified.")
521 print __doc__
522 die.exit(1)
523 try:
524 signal.column(column)
525 except KeyError:
526 die.info("File has %d columns" % signal.n[1])
527 die.die("Bad column: %s" % str(column))
528
529 bctrs, neural, tshift = block_percep_spec(signal.column(0), signal.dt(), DT,
530 do_mod=do_mod, do_dissonance=dd,
531 do_peakalign=dpa, PlompBouman=pb,
532 blocksize=Blocksize)
533 o = numpy.transpose(neural)
534 if pylab:
535 pylab.matshow(numpy.transpose(o))
536 pylab.show()
537
538 hdr = signal.hdr.copy()
539 hdr['program'] = sys.argv[0]
540 hdr['ARGV'] = arglist0
541 hdr['input_file'] = signal.hdr.get('_NAME', '???')
542 hdr['column'] = column
543 hdr['CDELT2'] = DT
544 hdr['CRPIX2'] = 1
545 hdr['CRVAL2'] = signal.start() + tshift
546 hdr['CRPIX1'] = 1
547 hdr['CRVAL1'] = bctrs[0]['erb']
548 hdr['CDELT1'] = bctrs[1]['erb']-bctrs[0]['erb']
549 hdr['BITPIX'] = -32
550 hdr.update( extrahdr )
551 for (i,b) in enumerate(bctrs):
552 hdr['TTYPE%d' % (i+1)] = b['id']
553 gpkimgclass.gpk_img(hdr, o).write(outfile)
554