1
2
3 """Perceptual spectrum for speech."""
4
5 import sys
6 import math as M
7 from gmisclib import erb_scale
8 from gmisclib import gpkmisc
9 from gmisclib import die
10
11 import numpy
12
13 pylab = None
14
15 import gpk_voicing.voice_misc as VM
16 from gpk_voicing import power
17 from gpk_voicing import percep_spec_extras as PS
18 from gpk_voicing import gammatone
19
20 CBmax = erb_scale.f_to_erb(8000.0)
21 CBmin = erb_scale.f_to_erb(50.0)
22 BBSZ = 0.5
23 E = 0.333
24 Neural_Tick = 1e-4
25
26 DISS_BWF = 0.25
27 MAX_DISS_FREQ = 250.0
28 HP_DISS_FREQ = 30.0
29 BLOCK_EDGE = 0.3
30
31
32
33
34
36 """This is the cutoff frequency of the modulation transfer function
37 in the ear, as a function of frequency.
38 From R. Plomp and M. A. Bouman, JASA 31(6), page 749ff, June 1959
39 'Relation of hearing threshold and duration of tone pulses'
40 """
41 Tau200 = 0.4
42 Tau10k = 0.13
43 tmp = Tau200 + (M.log(fc/200.0)/M.log(1e4/200.0))*(Tau10k-Tau200)
44 assert tmp>0
45 return tmp
46
47
49 """Transfer function taken from "Improved Audio Coding Using a
50 Psychoacoustic Model Based on a Cochlear Filter Bank
51 IEEE Transactions of Speech and Audio Processing, 10(7) October 2002,
52 Pages 495-503. au=Frank Baumgarte.
53 """
54 Q = 4.0
55 Slp = 25.0/(20.0*M.log10(1.2))
56 Shp = 8.0/(20.0*M.log10(1.2))
57 fofc_Shp = fofc**Shp
58 return (fofc_Shp) / ( (1+fofc**Slp) * (1 + (1j/Q)*(fofc**(Shp/2))
59 - fofc_Shp)
60 )
61
62
64 """IN pressure amplitude.
65 Crudely taken from Handbook of Perception vol 4: hearing,
66 E.C.Carterette and M.P.Friedman, editors,
67 Academic Press 1978, isbn 0-12-161904-4.
68 Curve near 70db used."""
69 return M.sqrt((200.0/f)**4 + 1 + (f/8000.0)**16)
70
71
73 """This does a little computation in a block-wise fashion
74 to keep all the data witin the processor's cache.
75 It computes sum( abs(a-b) ).
76 """
77 BKS = 6000
78 n = o.shape[0]
79 assert a.shape == (n,)
80 assert b.shape == (n,)
81 assert o.shape == (n,)
82 tmp = numpy.zeros((BKS,), numpy.float)
83 for i in range(0, n, BKS):
84 e = min(i+BKS, n)
85 tmpi = tmp[:e-i]
86 ai = a[i:e]
87 bi = b[i:e]
88 oi = o[i:e]
89 numpy.subtract(ai, bi, tmpi)
90 numpy.absolute(tmpi, tmpi)
91 numpy.add(oi, tmpi, oi)
92 return o
93
94
96 lng = tk[0][dly:].shape[0]
97 o = numpy.zeros((lng,), numpy.float)
98 for tkd in tk:
99 a = tkd[:-dly]
100 b = tkd[dly:]
101 accum_abs_sub(a, b, o)
102 return o
103
104
105
107 """@param Dt: the sampling rate of the output vectors.
108 @type Dt: C{float}
109 @param tick: the sampling rate of the vectors in C{tk}.
110 @type tick: C{float}
111 @param tk: a list of vectors, each of which represents the neural firing
112 of a particular point in the cochlea.
113 @type tk: [ numpy.ndarray, ...]
114 @return: (avg, min) where avg-min is the voicing estimator.
115 avg: C{float}, min: C{float}
116 """
117 LOWF = 50.0
118 HIGHF = 400.0
119 lowf = int(round(1.0/(tick*LOWF)))
120 highf = int(round(1.0/(tick*HIGHF)))
121 tmps = None
122 tmpmin = None
123
124 tmpn = 0
125 for dly in range(highf, lowf):
126
127
128
129 dlytmp = PS.list_accum_abs_sub_dly(tk, dly)
130 assert len(dlytmp.shape) == 1
131
132
133
134 tmp2, t0 = power.smooth(dlytmp, tick, Dt, extra=1.0/(LOWF*M.sqrt(12.0)))
135
136 if tmps is None:
137 tmpn = 1
138 tmps = tmp2
139 tmpmin = numpy.array(tmp2, copy=True)
140 else:
141 tmpn += 1
142 numpy.add(tmps, tmp2, tmps)
143 numpy.minimum(tmpmin, tmp2, tmpmin)
144 return (tmps, tmpn, tmpmin)
145
146
148 """This should be based on Hutchinson and KNopoff 1978
149 Kameoka an Kuriygawa 1969a; Viemeister 1988;
150 Aures 1985; Plomp and Levelt 1965.
151 But it isn't yet.
152
153 Hutchinson W. and Knopoff, L. 1978 The acoustic componenet of
154 Western consonance. Interface 7, 1-29.
155
156 Kameoka and Kuriygawa 1969. Consonance Theory: Part 1.
157 J. Acoustical Soc. America. 45 1451-1458
158 (and 1459-1469 for part II).
159
160 Viemeister, N. F. 1977 Temporal factors in audition: A system
161 analysis approach. In E. F. Evans and J. P. Wilson (eds)
162 Psychophysics and physiology of hearing (pp. 419-429)
163 London: Academic Press
164
165 Aures, W. 1985 Ein Berechnun gsverfahren der Rauhigkeit
166 [A roughness calculation method] Acustica 58 268-281
167
168 Plomp R. and Levelt W. 1965 Tonal consonance and critical
169 bandwidth. JASA 38, 548-560.
170 """
172 self.y = None
173 self.n = 0
174 self.dt = None
175
176 - def add(self, y, dt, diss_bw):
177 if self.dt is None:
178 assert dt > 0
179 self.dt = dt
180 else:
181 assert abs(dt-self.dt) < 0.0001*self.dt
182 tmp = VM.lowpass_sym_butterworth(y, diss_bw*dt)
183 tmp = VM.hipass_first_order(tmp, HP_DISS_FREQ*dt)
184 tmp = numpy.absolute(tmp)
185
186
187
188 if self.y is None:
189 self.y = tmp
190 else:
191 numpy.add(self.y, tmp, self.y)
192 self.n += 1
193
195
196
197 yfilt, t0 = power.smooth(self.y, self.dt, Dt)
198
199
200
201 return yfilt*(0.1/self.n)
202
203
206 self.y = None
207 self.shifts = []
208 self.dbg = []
209 self.n = 0
210 self.dt = None
211
212 EDGE = 700
213
214 - def add(self, y, dt, fc):
215 if self.dt is None:
216 assert dt > 0
217 self.dt = dt
218 else:
219 assert abs(dt-self.dt) < 0.0001*self.dt
220 FF = -3.2
221 sh = int(round(FF/(dt*fc))) + self.EDGE
222 if self.y is None:
223 self.y = numpy.zeros((y.shape[0]+2*self.EDGE,))
224 numpy.add(self.y[sh:sh+y.shape[0]], y, self.y[sh:sh+y.shape[0]])
225 if pylab and fc>400:
226 self.dbg.append(y)
227 self.shifts.append(sh)
228 self.n += 1
229
230
232 if pylab:
233 for (i, (y,sh)) in enumerate(zip(self.dbg, self.shifts)):
234 xpl = []
235 ypl = []
236 for dly in range(-100,100):
237 ypl.append(numpy.sum(y*self.y[sh+dly:sh+y.shape[0]+dly]))
238 xpl.append(dly)
239 pylab.plot(xpl, [q/max(ypl) for q in ypl], linestyle='-',
240 color=(float(i)/float(self.n), 0.5, float(self.n-i)/float(self.n)))
241 if pylab:
242 pylab.show()
243 cfilt, t0 = power.smooth(self.y*self.y, self.dt, Dt)
244 return cfilt/self.n**2
245
246
248 """This computes the perceptual spectrum in blocks and glues them
249 together. It's useful when the data is too large to fit in memory.
250 """
251 if kwargs.get('blocksize', None) is None:
252 return perceptual_spec(data, dt, Dt, **kwargs)
253 assert BLOCK_EDGE >= Dt
254 blocksize = Dt * int(round(kwargs['blocksize']/Dt))
255 del kwargs['blocksize']
256 assert blocksize > BLOCK_EDGE
257 m = data.shape[0]
258 ntau = int(M.ceil(m*dt/blocksize))
259 nout = int(M.ceil(m*dt/Dt))
260 pad = int(round(Dt*round(BLOCK_EDGE/Dt)/dt))
261
262 neural = None
263 tshift = None
264 bctrs = None
265 S0 = 0.0
266 e = 0
267 for i in range(ntau):
268 s0 = int(round((m*i)/float(ntau)))
269 se = int(round((m*(i+1))/float(ntau)))
270 ps0 = max(0, s0-pad)
271 pse = min(m, se+pad)
272 tbc, x, ttshift = perceptual_spec(data[ps0:pse], dt, Dt, **kwargs)
273 if tshift is None:
274 tshift = ttshift
275 if bctrs is None:
276 bctrs = tbc
277 else:
278 assert tbc == bctrs
279 if neural is None:
280 neural = numpy.zeros((x.shape[0], nout+1), numpy.float)
281
282 tau0 = int(M.floor((s0-ps0)*(dt/Dt)))
283 nS = min((se-s0)*(dt/Dt), x.shape[1]-tau0)
284
285
286 inS = int(M.ceil(nS))
287 iS0 = int(M.floor(S0))
288 assert iS0+inS <= nout, "S0=%.1f nS=%.1f S0+nS=%.1f nout=%d" % (S0, nS, S0+nS, nout)
289
290 neural[:, iS0:iS0+inS] = x[:, tau0:tau0+inS]
291
292 S0 += nS
293 e = iS0+inS
294
295 assert nout-2 <= e <= nout+1, "e=%d nout=%d" % (e, nout)
296 return (bctrs, neural[:,:e], tshift)
297
298
301 self.d = VM.dataprep_flat_real(data, dt, Dt)
302 self.ss = numpy.fft.real_fft( self.d )
303 self.f = numpy.absolute(VM.real_fft_freq(self.ss.shape[0], d=dt))
304
306 xferfcn = cochlear_filter(self.f/fc)
307 xferfcn[0] = xferfcn[0].real
308 xferfcn[-1] = xferfcn[-1].real
309 return numpy.fft.inverse_real_fft( self.ss * xferfcn )
310
311
314 self.dt = dt
315 if not data.flags['C_CONTIGUOUS']:
316
317
318
319
320 data = data.copy()
321 assert data.flags['C_CONTIGUOUS'], "Monotone filter bank needs C-contiguous data."
322 self.data = data
323
325 return gammatone.gammatone(self.data, self.dt, fc, gammatone.M_BASILAR)
326
327
328
329
330
331 -def perceptual_spec(data, dt, Dt, bmin=CBmin, bmax=CBmax, db=BBSZ,
332 do_mod=0, do_dissonance=False, PlompBouman=True,
333 do_peakalign=False, e=None):
334 """This returns something roughly like the neural signals leaving the ear.
335 It filters into 1-erb-wide bins, then takes the cube root of
336 the amplitude.
337 @return: (channel_info, data, time_offset), where
338 - C{time_offset} is the time of the the first output datum
339 relative to the time of the first input sample.
340 - C{channel_info}
341 - C{data}
342 """
343 if e is None:
344 e = E
345 do_mod = int(do_mod)
346 Neural_Time_Resolution = max(Neural_Tick, dt)
347 m = data.shape[0]
348 assert m*dt > Dt, "Tiny amount of data: %d*%g=%.2gs < Dt=%2g" % (m, dt, m*dt, Dt)
349
350 ntau = int(M.floor(m*dt/Dt))
351
352 fbank = gammatone_filterbank(data, dt)
353 nband0 = int(M.ceil((bmax-bmin)/db))
354 nbands = nband0
355 assert do_mod in [0,1,2], "Do_mod=%s" % str(do_mod)
356 if do_mod:
357 nbands += int(do_mod)
358 tk = []
359 if do_dissonance:
360 nbands += 1
361 diss = roughness_c()
362 if do_peakalign:
363 nbands += 1
364 palign = peakalign_c()
365 neural = numpy.zeros((nbands, ntau), numpy.float)
366 bctrs = []
367 t0x = []
368 iband = 0
369 b = bmin
370 while b < bmax:
371
372 fc = erb_scale.erb_to_f(b)
373 filtered_sig = fbank.filter(fc)
374 hair_cells = filtered_sig / threshold(fc)
375 numpy.maximum(hair_cells, 0.0, hair_cells)
376 numpy.power(hair_cells, 2*e, hair_cells)
377
378
379
380 pwrfilt, t0 = power.smooth(hair_cells, dt, Dt)
381
382 t0x.append(t0)
383 bctr = {'type':'band', 'fc': fc, 'erb': b, 'E': e, 'id': 'B%.1f' % b }
384 if PlompBouman:
385 pwrfilt = VM.lopass_first_order(pwrfilt, Dt/tau_lp(fc))
386 bctr['smooth'] = tau_lp(fc)
387 bctrs.append(bctr)
388 numpy.maximum(pwrfilt[:ntau], 0.0, neural[iband,:])
389
390
391
392
393
394 if do_mod or do_dissonance or do_peakalign:
395 tkfilt, t0 = power.smooth(hair_cells, dt, Neural_Time_Resolution)
396 if do_dissonance:
397 diss_bw = min(DISS_BWF*erb_scale.ebw(b), MAX_DISS_FREQ)
398 diss.add(tkfilt, Neural_Time_Resolution, diss_bw)
399 if do_mod:
400 tk.append(tkfilt)
401 if do_peakalign:
402 palign.add(tkfilt, Neural_Time_Resolution, fc)
403 b += db
404 iband += 1
405 assert iband == nband0
406
407 if do_mod == 1:
408 tksum, tkn, tkmin = process_voicing(tk, Neural_Time_Resolution, Dt)
409 neural[iband,:] = (tksum/tkn-tkmin)[:ntau]*0.15
410 bctrs.append({'type':'haspitch', 'id':'haspitch1', 'variant': 1,
411 'Fentropy': -M.log(len(tk))
412 })
413 iband += 1
414 assert len(bctrs) == iband
415 elif do_mod == 2:
416 Voicing_divide = 0.5*(CBmax+CBmin)
417 tk_low = []
418 tk_high = []
419 for (bctr, tki) in zip(bctrs, tk):
420 assert tki.shape[0] > 0
421 if bctr['erb'] > Voicing_divide:
422 tk_high.append(tki)
423 else:
424 tk_low.append(tki)
425 tksum_low, tkn_low, tkmin_low = process_voicing(tk_low, Neural_Time_Resolution, Dt)
426 tksum_high, tkn_high, tkmin_high = process_voicing(tk_high, Neural_Time_Resolution, Dt)
427 avgmmin = (tksum_low+tksum_high)/(tkn_low+tkn_high) - numpy.minimum(tkmin_low, tkmin_high)
428 neural[iband,:] = avgmmin[:ntau]*0.15
429 bctrs.append({'type':'haspitch', 'id':'haspitch_all',
430 'variant': 'all',
431 'Fentropy': -M.log(len(tk))
432 })
433 iband += 1
434
435 neural[iband,:] = (tksum_high/tkn_high-tkmin_high)[:ntau]*0.15
436 bctrs.append({'type':'haspitch', 'id':'haspitch_high',
437 'variant': 'high',
438 'Fentropy': -M.log(len(tk_high))
439 })
440 iband += 1
441 assert len(bctrs) == iband
442 if do_dissonance:
443 neural[iband,:] = diss.get(Dt)[:ntau]*20.0
444
445
446
447 bctrs.append({'type':'roughness', 'id':'roughness1',
448 'variant': 1,
449 'Fentropy': -M.log(diss.n)
450 })
451 iband += 1
452 if do_peakalign:
453 neural[iband,:] = palign.get(Dt)[:ntau]
454 bctrs.append({'type':'peakalign', 'id':'peakalign1',
455 'variant': 1,
456 'Fentropy': -M.log(palign.n)
457 })
458 iband += 1
459 assert iband == neural.shape[0], "iband=%d != shape=%s" % (iband, str(neural.shape))
460
461 return (bctrs, neural, gpkmisc.median(t0x))
462
463
464
466 dt = 1e-4
467 d = numpy.zeros((10000,1))
468 for i in range(0, 10000, 100):
469 d[i,0] = 1.0
470 return gpkimgclass.gpk_img({'CDELT2': dt}, d)
471
472
473
474 __version__ = "$Revision: 1.43 $"
475 __date__ = "$Date: 2007/03/07 23:54:40 $"
476
477
478 if __name__ == '__main__':
479
480
481
482
483
484 import gpkimgclass
485 DT = 0.01
486 arglist = sys.argv[1:]
487 Blocksize = 10.0
488 arglist0 = arglist
489 do_mod, dpa, dd, pb = 1, True, True, True
490 column = None
491 signal = None
492 outfile = "-"
493 extrahdr = {}
494 while arglist and arglist[0].startswith('-'):
495 arg = arglist.pop(0)
496 if arg == '--':
497 break
498 elif arg == '-dt':
499 DT = float(arglist.pop(0))
500 elif arg == '-f':
501 signal = gpkimgclass.read(arglist.pop(0))
502 elif arg == '-c':
503 tmp = arglist.pop(0)
504 try:
505 column = int( tmp )
506 except ValueError:
507 column = tmp
508 elif arg == '-b':
509 Blocksize = float(arglist.pop(0))
510 elif arg == '-plot':
511 import pylab
512 elif arg == '-pb':
513 pb = False
514 elif arg == '-dd':
515 dd = False
516 elif arg == '-dpa':
517 dpa = False
518 elif arg == '-do_mod':
519 do_mod = 0
520 elif arg == '-test':
521 signal = test()
522 column = 0
523 elif arg == '-o':
524 outfile = arglist.pop(0)
525 elif arg == '-write':
526 extrahdr = dict( [q.strip().split('=', 1)
527 for q in arglist.pop(0).split(';') ]
528 )
529 else:
530 die.info("Unrecognized flag: %s" % arg)
531 print __doc__
532 die.exit(1)
533 if arglist:
534 if signal is None:
535 signal = gpkimgclass.read(arglist[0])
536 else:
537 die.die("Usage: cannot specify name both with -f and as last argument.")
538
539 if signal is None:
540 die.info("No signal file specified.")
541 print __doc__
542 die.exit(1)
543 try:
544 signal.column(column)
545 except KeyError:
546 die.info("File has %d columns" % signal.d.shape[0])
547 die.die("Bad column: %s" % str(column))
548
549 bctrs, neural, tshift = block_percep_spec(signal.column(0), signal.dt(), DT,
550 do_mod=do_mod, do_dissonance=dd,
551 do_peakalign=dpa, PlompBouman=pb,
552 blocksize=Blocksize)
553 o = numpy.transpose(neural)
554 if pylab:
555 pylab.matshow(numpy.transpose(o))
556 pylab.show()
557
558 hdr = signal.hdr.copy()
559 hdr['program'] = sys.argv[0]
560 hdr['ARGV'] = arglist0
561 hdr['input_file'] = signal.hdr.get('_NAME', '???')
562 hdr['column'] = column
563 hdr['CDELT2'] = DT
564 hdr['CRPIX2'] = 1
565 hdr['CRVAL2'] = signal.start() + tshift
566 hdr['CRPIX1'] = 1
567 hdr['CRVAL1'] = bctrs[0]['erb']
568 hdr['CDELT1'] = bctrs[1]['erb']-bctrs[0]['erb']
569 hdr['BITPIX'] = -32
570 hdr.update( extrahdr )
571 for (i,b) in enumerate(bctrs):
572 hdr['TTYPE%d' % (i+1)] = b['id']
573 gpkimgclass.gpk_img(hdr, o).write(outfile)
574