
Source Code for Module lib.percep_spec

#!/usr/bin/env python

"""Perceptual spectrum for speech."""

import sys
import math as M
from gmisclib import erb_scale
from gmisclib import gpkmisc
from gmisclib import die

import numpy

pylab = None

import gpk_voicing.voice_misc as VM
from gpk_voicing import power
from gpk_voicing import percep_spec_extras as PS
from gpk_voicing import gammatone

CBmax = erb_scale.f_to_erb(8000.0)
CBmin = erb_scale.f_to_erb(50.0)
BBSZ = 0.5
E = 0.333
Neural_Tick = 1e-4      # The best possible time resolution of the auditory
                        # system.  (E.g. from binaural direction measurements.)
DISS_BWF = 0.25
MAX_DISS_FREQ = 250.0
HP_DISS_FREQ = 30.0
BLOCK_EDGE = 0.3        # Seconds

# def SafeExp(x):
#       return Num.exp(Num.maximum(-200.0, x))

def tau_lp(fc):
    """Time constant of the modulation low-pass filter in the ear,
    as a function of the band's center frequency.
    From R. Plomp and M. A. Bouman, JASA 31(6), page 749ff, June 1959,
    'Relation of hearing threshold and duration of tone pulses'.
    """
    Tau200 = 0.4
    Tau10k = 0.13
    tmp = Tau200 + (M.log(fc/200.0)/M.log(1e4/200.0))*(Tau10k-Tau200)
    assert tmp > 0
    return tmp

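# Illustrative sketch (not part of the original module): tau_lp() interpolates
# log-linearly in frequency between its two anchor values, so it reproduces
# them at 200 Hz and 10 kHz and falls between them elsewhere.
def _tau_lp_example():
    assert abs(tau_lp(200.0) - 0.4) < 1e-12
    assert abs(tau_lp(1e4) - 0.13) < 1e-12
    assert 0.13 < tau_lp(1000.0) < 0.4
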
def cochlear_filter(fofc):
    """Transfer function taken from "Improved Audio Coding Using a
    Psychoacoustic Model Based on a Cochlear Filter Bank",
    IEEE Transactions on Speech and Audio Processing, 10(7), October 2002,
    pages 495-503.  au=Frank Baumgarte.
    """
    Q = 4.0
    Slp = 25.0/(20.0*M.log10(1.2))
    Shp = 8.0/(20.0*M.log10(1.2))
    fofc_Shp = fofc**Shp
    return fofc_Shp / ( (1+fofc**Slp) * (1 + (1j/Q)*(fofc**(Shp/2))
                                           - fofc_Shp)
                        )

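# Illustrative sketch (not from the original source): the magnitude response
# of cochlear_filter() as a function of f/fc is a band-pass shape peaking
# near f/fc = 1, with the low- and high-side slopes set by Slp and Shp.
def _cochlear_filter_example():
    for fofc in (0.25, 0.5, 1.0, 2.0, 4.0):
        print fofc, abs(cochlear_filter(fofc))
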
def threshold(f):
    """Hearing threshold, in units of pressure amplitude.
    Crudely taken from Handbook of Perception vol 4: Hearing,
    E. C. Carterette and M. P. Friedman, editors,
    Academic Press 1978, ISBN 0-12-161904-4.
    Curve near 70 dB used."""
    return M.sqrt((200.0/f)**4 + 1 + (f/8000.0)**16)

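# Spot checks (illustrative, not part of the original module): the threshold
# is near its minimum of 1 in the mid frequencies and rises steeply below
# about 200 Hz and above about 8 kHz.
def _threshold_example():
    assert threshold(1000.0) < 1.1
    assert threshold(100.0) > threshold(1000.0)
    assert threshold(16000.0) > threshold(1000.0)
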
def accum_abs_sub(a, b, o):
    """This does a little computation in a block-wise fashion
    to keep all the data within the processor's cache.
    It computes o += abs(a-b), element-wise.
    """
    BKS = 6000
    n = o.shape[0]
    assert a.shape == (n,)
    assert b.shape == (n,)
    assert o.shape == (n,)
    tmp = numpy.zeros((BKS,), numpy.float)
    for i in range(0, n, BKS):
        e = min(i+BKS, n)
        tmpi = tmp[:e-i]
        ai = a[i:e]
        bi = b[i:e]
        oi = o[i:e]
        numpy.subtract(ai, bi, tmpi)
        numpy.absolute(tmpi, tmpi)
        numpy.add(oi, tmpi, oi)
    return o

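# Equivalence sketch (illustrative, not in the original source):
# accum_abs_sub(a, b, o) is just a cache-friendly way of writing
# o += numpy.absolute(a - b).
def _accum_abs_sub_example():
    a = numpy.arange(10.0)
    b = a[::-1].copy()
    o = numpy.ones((10,), numpy.float)
    accum_abs_sub(a, b, o)
    assert numpy.allclose(o, 1.0 + numpy.absolute(a - b))
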
def list_accum_abs_sub_dly(tk, dly):
    """For each time t, accumulate |x(t) - x(t+dly)| across all tracks x in tk."""
    lng = tk[0][dly:].shape[0]
    o = numpy.zeros((lng,), numpy.float)
    for tkd in tk:
        a = tkd[:-dly]
        b = tkd[dly:]
        accum_abs_sub(a, b, o)
    return o

def process_voicing(tk, tick, Dt):
    """@param Dt: the sampling interval of the output vectors.
    @type Dt: C{float}
    @param tick: the sampling interval of the vectors in C{tk}.
    @type tick: C{float}
    @param tk: a list of vectors, each of which represents the neural firing
            of a particular point in the cochlea.
    @type tk: [ numpy.ndarray, ...]
    @return: (sum, n, min), where sum/n - min is the voicing estimator.
            sum: C{numpy.ndarray}, n: C{int}, min: C{numpy.ndarray}
    """
    LOWF = 50.0
    HIGHF = 400.0
    lowf = int(round(1.0/(tick*LOWF)))
    highf = int(round(1.0/(tick*HIGHF)))
    tmps = None
    tmpmin = None
    tmpn = 0
    for dly in range(highf, lowf):
        # dlytmp = list_accum_abs_sub_dly(tk, dly)
        dlytmp = PS.list_accum_abs_sub_dly(tk, dly)
        assert len(dlytmp.shape) == 1
        tmp2, t0 = power.smooth(dlytmp, tick, Dt, extra=1.0/(LOWF*M.sqrt(12.0)))

        if tmps is None:
            tmpn = 1
            tmps = tmp2
            tmpmin = numpy.array(tmp2, copy=True)
        else:
            tmpn += 1
            numpy.add(tmps, tmp2, tmps)
            numpy.minimum(tmpmin, tmp2, tmpmin)
    return (tmps, tmpn, tmpmin)

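# How the voicing estimator works (an illustrative sketch; `tracks`, `tick`
# and `Dt` are hypothetical inputs, not part of the original module): for a
# periodic signal, |x(t) - x(t+dly)| nearly vanishes when the lag dly matches
# the pitch period, so the minimum over lags drops far below the average over
# lags; for aperiodic noise the two stay close together.
def _voicing_example(tracks, tick, Dt):
    tksum, tkn, tkmin = process_voicing(tracks, tick, Dt)
    return tksum/tkn - tkmin        # large where the signal is voiced
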
class roughness_c(object):
    """This should be based on Hutchinson and Knopoff 1978;
    Kameoka and Kuriyagawa 1969; Viemeister 1977;
    Aures 1985; Plomp and Levelt 1965.
    But it isn't yet.

    Hutchinson, W. and Knopoff, L. 1978 The acoustic component of
            Western consonance.  Interface 7, 1-29.

    Kameoka, A. and Kuriyagawa, M. 1969 Consonance Theory: Part I.
            J. Acoustical Soc. America 45, 1451-1458
            (and 1459-1469 for Part II).

    Viemeister, N. F. 1977 Temporal factors in audition: A systems
            analysis approach.  In E. F. Evans and J. P. Wilson (eds),
            Psychophysics and Physiology of Hearing (pp. 419-429).
            London: Academic Press.

    Aures, W. 1985 Ein Berechnungsverfahren der Rauhigkeit
            [A roughness calculation method].  Acustica 58, 268-281.

    Plomp, R. and Levelt, W. 1965 Tonal consonance and critical
            bandwidth.  JASA 38, 548-560.
    """

    def __init__(self):
        self.y = None
        self.n = 0
        self.dt = None

    def add(self, y, dt, diss_bw):
        if self.dt is None:
            assert dt > 0
            self.dt = dt
        else:
            assert abs(dt-self.dt) < 0.0001*self.dt
        tmp = VM.lowpass_sym_butterworth(y, diss_bw*dt)
        tmp = VM.hipass_first_order(tmp, HP_DISS_FREQ*dt)
        tmp = numpy.absolute(tmp)
        if self.y is None:
            self.y = tmp
        else:
            numpy.add(self.y, tmp, self.y)
        self.n += 1

    def get(self, Dt):
        yfilt, t0 = power.smooth(self.y, self.dt, Dt)
        return yfilt*(0.1/self.n)

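# Hypothetical usage sketch (the band loop mirrors perceptual_spec() below;
# `band_signals`, `dt` and `Dt` are invented names for illustration): one
# smoothed per-band signal is added per cochlear channel, then get() returns
# the pooled roughness track at the output sampling interval Dt.
def _roughness_example(band_signals, dt, Dt):
    r = roughness_c()
    for (erb, y) in band_signals:
        diss_bw = min(DISS_BWF*erb_scale.ebw(erb), MAX_DISS_FREQ)
        r.add(y, dt, diss_bw)
    return r.get(Dt)
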
class peakalign_c(object):
    def __init__(self):
        self.y = None
        self.shifts = []
        self.dbg = []
        self.n = 0
        self.dt = None

    EDGE = 700

    def add(self, y, dt, fc):
        if self.dt is None:
            assert dt > 0
            self.dt = dt
        else:
            assert abs(dt-self.dt) < 0.0001*self.dt
        FF = -3.2
        sh = int(round(FF/(dt*fc))) + self.EDGE
        if self.y is None:
            self.y = numpy.zeros((y.shape[0]+2*self.EDGE,))
        numpy.add(self.y[sh:sh+y.shape[0]], y, self.y[sh:sh+y.shape[0]])
        if pylab and fc > 400:
            # Keep dbg and shifts in step, so they can be zipped in get().
            self.dbg.append(y)
            self.shifts.append(sh)
        self.n += 1

    def get(self, Dt):
        if pylab:
            for (i, (y, sh)) in enumerate(zip(self.dbg, self.shifts)):
                xpl = []
                ypl = []
                for dly in range(-100, 100):
                    ypl.append(numpy.sum(y*self.y[sh+dly:sh+y.shape[0]+dly]))
                    xpl.append(dly)
                pylab.plot(xpl, [q/max(ypl) for q in ypl], linestyle='-',
                           color=(float(i)/float(self.n), 0.5, float(self.n-i)/float(self.n)))
            pylab.show()
        cfilt, t0 = power.smooth(self.y*self.y, self.dt, Dt)
        return cfilt/self.n**2

def block_percep_spec(data, dt, Dt, **kwargs):
    """This computes the perceptual spectrum in blocks and glues them
    together.  It's useful when the data is too large to fit in memory.
    """
    if kwargs.get('blocksize', None) is None:
        return perceptual_spec(data, dt, Dt, **kwargs)
    assert BLOCK_EDGE >= Dt
    blocksize = Dt * int(round(kwargs['blocksize']/Dt))
    del kwargs['blocksize']
    assert blocksize > BLOCK_EDGE
    m = data.shape[0]
    ntau = int(M.ceil(m*dt/blocksize))
    nout = int(M.ceil(m*dt/Dt))
    pad = int(round(Dt*round(BLOCK_EDGE/Dt)/dt))

    neural = None
    tshift = None
    bctrs = None
    S0 = 0.0
    e = 0
    for i in range(ntau):
        s0 = int(round((m*i)/float(ntau)))
        se = int(round((m*(i+1))/float(ntau)))
        ps0 = max(0, s0-pad)
        pse = min(m, se+pad)
        tbc, x, ttshift = perceptual_spec(data[ps0:pse], dt, Dt, **kwargs)
        if tshift is None:
            tshift = ttshift
        if bctrs is None:
            bctrs = tbc
        else:
            assert tbc == bctrs
        if neural is None:
            neural = numpy.zeros((x.shape[0], nout+1), numpy.float)

        tau0 = int(M.floor((s0-ps0)*(dt/Dt)))
        nS = min((se-s0)*(dt/Dt), x.shape[1]-tau0)
        inS = int(M.ceil(nS))
        iS0 = int(M.floor(S0))
        assert iS0+inS <= nout, "S0=%.1f nS=%.1f S0+nS=%.1f nout=%d" % (S0, nS, S0+nS, nout)
        neural[:, iS0:iS0+inS] = x[:, tau0:tau0+inS]
        S0 += nS
        e = iS0+inS
    assert nout-2 <= e <= nout+1, "e=%d nout=%d" % (e, nout)
    return (bctrs, neural[:,:e], tshift)

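# Minimal usage sketch (illustrative; `x` and `dt` are hypothetical inputs):
# process a long mono signal in roughly 10 s blocks, with 10 ms output frames.
def _block_percep_spec_example(x, dt):
    bctrs, neural, tshift = block_percep_spec(x, dt, 0.01,
                                              do_mod=1, do_dissonance=True,
                                              blocksize=10.0)
    # neural[i,:] is the output channel described by bctrs[i];
    # tshift is the time of the first frame relative to the first sample.
    return bctrs, neural, tshift
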
class fft_filterbank(object):
    def __init__(self, data, dt, Dt):
        self.d = VM.dataprep_flat_real(data, dt, Dt)
        self.ss = numpy.fft.rfft( self.d )
        self.f = numpy.absolute(VM.real_fft_freq(self.ss.shape[0], d=dt))

    def filter(self, fc):
        xferfcn = cochlear_filter(self.f/fc)
        xferfcn[0] = xferfcn[0].real
        xferfcn[-1] = xferfcn[-1].real
        return numpy.fft.irfft( self.ss * xferfcn )

class gammatone_filterbank(object):
    def __init__(self, data, dt):
        self.dt = dt
        if not data.flags['C_CONTIGUOUS']:
            # This turns out to be necessary if you start with stereo
            # (multichannel) data and then select a column from it.
            # The selection means that the resulting array is not C_CONTIGUOUS,
            # even if the original data was.
            data = data.copy()
        assert data.flags['C_CONTIGUOUS'], "Gammatone filter bank needs C-contiguous data."
        self.data = data

    def filter(self, fc):
        return gammatone.gammatone(self.data, self.dt, fc, gammatone.M_BASILAR)

def perceptual_spec(data, dt, Dt, bmin=CBmin, bmax=CBmax, db=BBSZ,
                    do_mod=0, do_dissonance=False, PlompBouman=True,
                    do_peakalign=False, e=None):
    """This returns something roughly like the neural signals leaving the ear.
    It filters into 1-erb-wide bins, then takes the cube root of
    the amplitude.
    @return: (channel_info, data, time_offset), where
        - C{time_offset} is the time of the first output datum,
            relative to the time of the first input sample.
        - C{channel_info} is a list of dicts, one per output channel,
            describing that channel.
        - C{data} is a 2-D array; C{data[i,:]} is the channel described
            by C{channel_info[i]}.
    """
    if e is None:
        e = E
    do_mod = int(do_mod)
    Neural_Time_Resolution = max(Neural_Tick, dt)
    m = data.shape[0]
    assert m*dt > Dt, "Tiny amount of data: %d*%g=%.2gs < Dt=%2g" % (m, dt, m*dt, Dt)
    # ntau = int(round(m*dt/Dt))
    ntau = int(M.floor(m*dt/Dt))
    # fbank = fft_filterbank(data, dt, Dt)
    fbank = gammatone_filterbank(data, dt)
    nband0 = int(M.ceil((bmax-bmin)/db))
    nbands = nband0
    assert do_mod in [0, 1, 2], "Do_mod=%s" % str(do_mod)
    if do_mod:
        nbands += int(do_mod)
    tk = []
    if do_dissonance:
        nbands += 1
        diss = roughness_c()
    if do_peakalign:
        nbands += 1
        palign = peakalign_c()
    neural = numpy.zeros((nbands, ntau), numpy.float)
    bctrs = []
    t0x = []
    iband = 0
    b = bmin
    while b < bmax:
        fc = erb_scale.erb_to_f(b)
        filtered_sig = fbank.filter(fc)
        hair_cells = filtered_sig / threshold(fc)
        numpy.maximum(hair_cells, 0.0, hair_cells)
        numpy.power(hair_cells, 2*e, hair_cells)
        pwrfilt, t0 = power.smooth(hair_cells, dt, Dt)
        t0x.append(t0)
        bctr = {'type': 'band', 'fc': fc, 'erb': b, 'E': e, 'id': 'B%.1f' % b}
        if PlompBouman:
            pwrfilt = VM.lopass_first_order(pwrfilt, Dt/tau_lp(fc))
            bctr['smooth'] = tau_lp(fc)
        bctrs.append(bctr)
        numpy.maximum(pwrfilt[:ntau], 0.0, neural[iband,:])

        if do_mod or do_dissonance or do_peakalign:
            tkfilt, t0 = power.smooth(hair_cells, dt, Neural_Time_Resolution)
            if do_dissonance:
                diss_bw = min(DISS_BWF*erb_scale.ebw(b), MAX_DISS_FREQ)
                diss.add(tkfilt, Neural_Time_Resolution, diss_bw)
            if do_mod:
                tk.append(tkfilt)
            if do_peakalign:
                palign.add(tkfilt, Neural_Time_Resolution, fc)
        b += db
        iband += 1
    assert iband == nband0

    if do_mod == 1:
        tksum, tkn, tkmin = process_voicing(tk, Neural_Time_Resolution, Dt)
        neural[iband,:] = (tksum/tkn - tkmin)[:ntau]*0.15
        bctrs.append({'type': 'haspitch', 'id': 'haspitch1', 'variant': 1,
                      'Fentropy': -M.log(len(tk))
                      })
        iband += 1
        assert len(bctrs) == iband
    elif do_mod == 2:
        Voicing_divide = 0.5*(CBmax+CBmin)
        tk_low = []
        tk_high = []
        for (bctr, tki) in zip(bctrs, tk):
            assert tki.shape[0] > 0
            if bctr['erb'] > Voicing_divide:
                tk_high.append(tki)
            else:
                tk_low.append(tki)
        tksum_low, tkn_low, tkmin_low = process_voicing(tk_low, Neural_Time_Resolution, Dt)
        tksum_high, tkn_high, tkmin_high = process_voicing(tk_high, Neural_Time_Resolution, Dt)
        avgmmin = (tksum_low+tksum_high)/(tkn_low+tkn_high) - numpy.minimum(tkmin_low, tkmin_high)
        neural[iband,:] = avgmmin[:ntau]*0.15
        bctrs.append({'type': 'haspitch', 'id': 'haspitch_all',
                      'variant': 'all',
                      'Fentropy': -M.log(len(tk))
                      })
        iband += 1

        neural[iband,:] = (tksum_high/tkn_high - tkmin_high)[:ntau]*0.15
        bctrs.append({'type': 'haspitch', 'id': 'haspitch_high',
                      'variant': 'high',
                      'Fentropy': -M.log(len(tk_high))
                      })
        iband += 1
        assert len(bctrs) == iband
    if do_dissonance:
        neural[iband,:] = diss.get(Dt)[:ntau]*20.0
        bctrs.append({'type': 'roughness', 'id': 'roughness1',
                      'variant': 1,
                      'Fentropy': -M.log(diss.n)
                      })
        iband += 1
    if do_peakalign:
        neural[iband,:] = palign.get(Dt)[:ntau]
        bctrs.append({'type': 'peakalign', 'id': 'peakalign1',
                      'variant': 1,
                      'Fentropy': -M.log(palign.n)
                      })
        iband += 1
    assert iband == neural.shape[0], "iband=%d != shape=%s" % (iband, str(neural.shape))

    return (bctrs, neural, gpkmisc.median(t0x))

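# Minimal usage sketch (illustrative; `x` is a hypothetical 1-D numpy signal
# sampled every `dt` seconds): ask for the band channels plus the pitch
# ("haspitch") channel, with 10 ms output frames.
def _perceptual_spec_example(x, dt):
    bctrs, neural, t0 = perceptual_spec(x, dt, 0.01, do_mod=1)
    for (info, row) in zip(bctrs, neural):
        print info['id'], row.shape
    return bctrs, neural, t0
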
def test():
    import gpkimgclass          # Imported here; normally only the command-line code needs it.
    dt = 1e-4
    d = numpy.zeros((10000, 1))
    for i in range(0, 10000, 100):
        d[i, 0] = 1.0
    return gpkimgclass.gpk_img({'CDELT2': dt}, d)

__version__ = "$Revision: 1.43 $"
__date__ = "$Date: 2007/03/07 23:54:40 $"


if __name__ == '__main__':
    import gpkimgclass
    DT = 0.01       # Seconds.  Default output sampling interval.
    arglist = sys.argv[1:]
    Blocksize = 10.0
    arglist0 = list(arglist)
    do_mod, dpa, dd, pb = 1, True, True, True
    column = None
    signal = None
    outfile = "-"
    extrahdr = {}
    while arglist and arglist[0].startswith('-'):
        arg = arglist.pop(0)
        if arg == '--':
            break
        elif arg == '-dt':
            DT = float(arglist.pop(0))
        elif arg == '-f':
            signal = gpkimgclass.read(arglist.pop(0))
        elif arg == '-c':
            tmp = arglist.pop(0)
            try:
                column = int(tmp)
            except ValueError:
                column = tmp
        elif arg == '-b':
            Blocksize = float(arglist.pop(0))
        elif arg == '-plot':
            import pylab
        elif arg == '-pb':
            pb = False
        elif arg == '-dd':
            dd = False
        elif arg == '-dpa':
            dpa = False
        elif arg == '-do_mod':
            do_mod = 0
        elif arg == '-test':
            signal = test()
            column = 0
        elif arg == '-o':
            outfile = arglist.pop(0)
        elif arg == '-write':
            extrahdr = dict( [q.strip().split('=', 1)
                              for q in arglist.pop(0).split(';') ]
                             )
        else:
            die.info("Unrecognized flag: %s" % arg)
            print __doc__
            die.exit(1)
    if arglist:
        if signal is None:
            signal = gpkimgclass.read(arglist[0])
        else:
            die.die("Usage: cannot specify name both with -f and as last argument.")

    if signal is None:
        die.info("No signal file specified.")
        print __doc__
        die.exit(1)
    try:
        signal.column(column)
    except KeyError:
        die.info("File has %d columns" % signal.d.shape[1])
        die.die("Bad column: %s" % str(column))

    bctrs, neural, tshift = block_percep_spec(signal.column(column), signal.dt(), DT,
                                              do_mod=do_mod, do_dissonance=dd,
                                              do_peakalign=dpa, PlompBouman=pb,
                                              blocksize=Blocksize)
    o = numpy.transpose(neural)
    if pylab:
        pylab.matshow(numpy.transpose(o))
        pylab.show()

    hdr = signal.hdr.copy()
    hdr['program'] = sys.argv[0]
    hdr['ARGV'] = arglist0
    hdr['input_file'] = signal.hdr.get('_NAME', '???')
    hdr['column'] = column
    hdr['CDELT2'] = DT
    hdr['CRPIX2'] = 1
    hdr['CRVAL2'] = signal.start() + tshift
    hdr['CRPIX1'] = 1
    hdr['CRVAL1'] = bctrs[0]['erb']
    hdr['CDELT1'] = bctrs[1]['erb'] - bctrs[0]['erb']
    hdr['BITPIX'] = -32
    hdr.update(extrahdr)
    for (i, b) in enumerate(bctrs):
        hdr['TTYPE%d' % (i+1)] = b['id']
    gpkimgclass.gpk_img(hdr, o).write(outfile)