
Source Code for Module lib.percep_spec

#!/usr/bin/env python

"""Perceptual spectrum for speech."""

import sys
import math as M
from gmisclib import erb_scale
from gmisclib import gpkmisc
from gmisclib import die

import numpy

pylab = None

import gpk_voicing.voice_misc as VM
from gpk_voicing import power
from gpk_voicing import percep_spec_extras as PS
from gpk_voicing import gammatone

CBmax = erb_scale.f_to_erb(8000.0)
CBmin = erb_scale.f_to_erb(50.0)
BBSZ = 0.5
E = 0.333
Neural_Tick = 1e-4      # The best possible time resolution of the auditory
                        # system.  (E.g. from binaural direction measurements.)
DISS_BWF = 0.25
MAX_DISS_FREQ = 250.0
HP_DISS_FREQ = 30.0
BLOCK_EDGE = 0.3        # Seconds

# def SafeExp(x):
#       return Num.exp(Num.maximum(-200.0, x))

def tau_lp(fc):
    """Time constant of the modulation low-pass filter in the ear,
    as a function of the band's center frequency.
    From R. Plomp and M. A. Bouman, JASA 31(6), page 749ff, June 1959,
    'Relation of hearing threshold and duration of tone pulses'.
    """
    Tau200 = 0.4
    Tau10k = 0.13
    tmp = Tau200 + (M.log(fc/200.0)/M.log(1e4/200.0))*(Tau10k-Tau200)
    assert tmp > 0
    return tmp

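# Illustrative sketch (not part of the original module): tau_lp() interpolates
# log-linearly in frequency between its two anchor values, so it reproduces
# them at 200 Hz and 10 kHz and falls between them elsewhere.
def _tau_lp_example():
    assert abs(tau_lp(200.0) - 0.4) < 1e-12
    assert abs(tau_lp(1e4) - 0.13) < 1e-12
    assert 0.13 < tau_lp(1000.0) < 0.4
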
def cochlear_filter(fofc):
    """Transfer function taken from "Improved Audio Coding Using a
    Psychoacoustic Model Based on a Cochlear Filter Bank",
    IEEE Transactions on Speech and Audio Processing, 10(7), October 2002,
    pages 495-503.  au=Frank Baumgarte.
    """
    Q = 4.0
    Slp = 25.0/(20.0*M.log10(1.2))
    Shp = 8.0/(20.0*M.log10(1.2))
    fofc_Shp = fofc**Shp
    return fofc_Shp / ( (1+fofc**Slp) * (1 + (1j/Q)*(fofc**(Shp/2))
                                           - fofc_Shp)
                        )

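# Illustrative sketch (not from the original source): the magnitude response
# of cochlear_filter() as a function of f/fc is a band-pass shape peaking
# near f/fc = 1, with the low- and high-side slopes set by Slp and Shp.
def _cochlear_filter_example():
    for fofc in (0.25, 0.5, 1.0, 2.0, 4.0):
        print fofc, abs(cochlear_filter(fofc))
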
def threshold(f):
    """Hearing threshold, in units of pressure amplitude.
    Crudely taken from Handbook of Perception vol 4: Hearing,
    E. C. Carterette and M. P. Friedman, editors,
    Academic Press 1978, ISBN 0-12-161904-4.
    Curve near 70 dB used."""
    return M.sqrt((200.0/f)**4 + 1 + (f/8000.0)**16)

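# Spot checks (illustrative, not part of the original module): the threshold
# is near its minimum of 1 in the mid frequencies and rises steeply below
# about 200 Hz and above about 8 kHz.
def _threshold_example():
    assert threshold(1000.0) < 1.1
    assert threshold(100.0) > threshold(1000.0)
    assert threshold(16000.0) > threshold(1000.0)
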
def accum_abs_sub(a, b, o):
    """This does a little computation in a block-wise fashion
    to keep all the data within the processor's cache.
    It computes o += abs(a-b), element-wise.
    """
    BKS = 6000
    n = o.shape[0]
    assert a.shape == (n,)
    assert b.shape == (n,)
    assert o.shape == (n,)
    tmp = numpy.zeros((BKS,), numpy.float)
    for i in range(0, n, BKS):
        e = min(i+BKS, n)
        tmpi = tmp[:e-i]
        ai = a[i:e]
        bi = b[i:e]
        oi = o[i:e]
        numpy.subtract(ai, bi, tmpi)
        numpy.absolute(tmpi, tmpi)
        numpy.add(oi, tmpi, oi)
    return o

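# Equivalence sketch (illustrative, not in the original source):
# accum_abs_sub(a, b, o) is just a cache-friendly way of writing
# o += numpy.absolute(a - b).
def _accum_abs_sub_example():
    a = numpy.arange(10.0)
    b = a[::-1].copy()
    o = numpy.ones((10,), numpy.float)
    accum_abs_sub(a, b, o)
    assert numpy.allclose(o, 1.0 + numpy.absolute(a - b))
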
def list_accum_abs_sub_dly(tk, dly):
    """For each time t, accumulate |x(t) - x(t+dly)| across all tracks x in tk."""
    lng = tk[0][dly:].shape[0]
    o = numpy.zeros((lng,), numpy.float)
    for tkd in tk:
        a = tkd[:-dly]
        b = tkd[dly:]
        accum_abs_sub(a, b, o)
    return o

def process_voicing(tk, tick, Dt):
    """@param Dt: the sampling interval of the output vectors.
    @type Dt: C{float}
    @param tick: the sampling interval of the vectors in C{tk}.
    @type tick: C{float}
    @param tk: a list of vectors, each of which represents the neural firing
            of a particular point in the cochlea.
    @type tk: [ numpy.ndarray, ...]
    @return: (sum, n, min), where sum/n - min is the voicing estimator.
            sum: C{numpy.ndarray}, n: C{int}, min: C{numpy.ndarray}
    """
    LOWF = 50.0
    HIGHF = 400.0
    lowf = int(round(1.0/(tick*LOWF)))
    highf = int(round(1.0/(tick*HIGHF)))
    tmps = None
    tmpmin = None
    tmpn = 0
    for dly in range(highf, lowf):
        # dlytmp = list_accum_abs_sub_dly(tk, dly)
        dlytmp = PS.list_accum_abs_sub_dly(tk, dly)
        assert len(dlytmp.shape) == 1
        tmp2, t0 = power.smooth(dlytmp, tick, Dt, extra=1.0/(LOWF*M.sqrt(12.0)))

        if tmps is None:
            tmpn = 1
            tmps = tmp2
            tmpmin = numpy.array(tmp2, copy=True)
        else:
            tmpn += 1
            numpy.add(tmps, tmp2, tmps)
            numpy.minimum(tmpmin, tmp2, tmpmin)
    return (tmps, tmpn, tmpmin)

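# How the voicing estimator works (an illustrative sketch; `tracks`, `tick`
# and `Dt` are hypothetical inputs, not part of the original module): for a
# periodic signal, |x(t) - x(t+dly)| nearly vanishes when the lag dly matches
# the pitch period, so the minimum over lags drops far below the average over
# lags; for aperiodic noise the two stay close together.
def _voicing_example(tracks, tick, Dt):
    tksum, tkn, tkmin = process_voicing(tracks, tick, Dt)
    return tksum/tkn - tkmin        # large where the signal is voiced
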
class roughness_c(object):
    """This should be based on Hutchinson and Knopoff 1978;
    Kameoka and Kuriyagawa 1969; Viemeister 1977;
    Aures 1985; Plomp and Levelt 1965.
    But it isn't yet.

    Hutchinson, W. and Knopoff, L. 1978 The acoustic component of
            Western consonance.  Interface 7, 1-29.

    Kameoka, A. and Kuriyagawa, M. 1969 Consonance Theory: Part I.
            J. Acoustical Soc. America 45, 1451-1458
            (and 1459-1469 for Part II).

    Viemeister, N. F. 1977 Temporal factors in audition: A systems
            analysis approach.  In E. F. Evans and J. P. Wilson (eds),
            Psychophysics and Physiology of Hearing (pp. 419-429).
            London: Academic Press.

    Aures, W. 1985 Ein Berechnungsverfahren der Rauhigkeit
            [A roughness calculation method].  Acustica 58, 268-281.

    Plomp, R. and Levelt, W. 1965 Tonal consonance and critical
            bandwidth.  JASA 38, 548-560.
    """

    def __init__(self):
        self.y = None
        self.n = 0
        self.dt = None

    def add(self, y, dt, diss_bw):
        if self.dt is None:
            assert dt > 0
            self.dt = dt
        else:
            assert abs(dt-self.dt) < 0.0001*self.dt
        tmp = VM.lowpass_sym_butterworth(y, diss_bw*dt)
        tmp = VM.hipass_first_order(tmp, HP_DISS_FREQ*dt)
        tmp = numpy.absolute(tmp)
        if self.y is None:
            self.y = tmp
        else:
            numpy.add(self.y, tmp, self.y)
        self.n += 1

    def get(self, Dt):
        yfilt, t0 = power.smooth(self.y, self.dt, Dt)
        return yfilt*(0.1/self.n)

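# Hypothetical usage sketch (the band loop mirrors perceptual_spec() below;
# `band_signals`, `dt` and `Dt` are invented names for illustration): one
# smoothed per-band signal is added per cochlear channel, then get() returns
# the pooled roughness track at the output sampling interval Dt.
def _roughness_example(band_signals, dt, Dt):
    r = roughness_c()
    for (erb, y) in band_signals:
        diss_bw = min(DISS_BWF*erb_scale.ebw(erb), MAX_DISS_FREQ)
        r.add(y, dt, diss_bw)
    return r.get(Dt)
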
class peakalign_c(object):
    def __init__(self):
        self.y = None
        self.shifts = []
        self.dbg = []
        self.n = 0
        self.dt = None

    EDGE = 700

    def add(self, y, dt, fc):
        if self.dt is None:
            assert dt > 0
            self.dt = dt
        else:
            assert abs(dt-self.dt) < 0.0001*self.dt
        FF = -3.2
        sh = int(round(FF/(dt*fc))) + self.EDGE
        if self.y is None:
            self.y = numpy.zeros((y.shape[0]+2*self.EDGE,))
        numpy.add(self.y[sh:sh+y.shape[0]], y, self.y[sh:sh+y.shape[0]])
        if pylab and fc > 400:
            # Keep dbg and shifts in step, so they can be zipped in get().
            self.dbg.append(y)
            self.shifts.append(sh)
        self.n += 1

    def get(self, Dt):
        if pylab:
            for (i, (y, sh)) in enumerate(zip(self.dbg, self.shifts)):
                xpl = []
                ypl = []
                for dly in range(-100, 100):
                    ypl.append(numpy.sum(y*self.y[sh+dly:sh+y.shape[0]+dly]))
                    xpl.append(dly)
                pylab.plot(xpl, [q/max(ypl) for q in ypl], linestyle='-',
                           color=(float(i)/float(self.n), 0.5, float(self.n-i)/float(self.n)))
            pylab.show()
        cfilt, t0 = power.smooth(self.y*self.y, self.dt, Dt)
        return cfilt/self.n**2

def block_percep_spec(data, dt, Dt, **kwargs):
    """This computes the perceptual spectrum in blocks and glues them
    together.  It's useful when the data is too large to fit in memory.
    """
    if kwargs.get('blocksize', None) is None:
        return perceptual_spec(data, dt, Dt, **kwargs)
    assert BLOCK_EDGE >= Dt
    blocksize = Dt * int(round(kwargs['blocksize']/Dt))
    del kwargs['blocksize']
    assert blocksize > BLOCK_EDGE
    m = data.shape[0]
    ntau = int(M.ceil(m*dt/blocksize))
    nout = int(M.ceil(m*dt/Dt))
    pad = int(round(Dt*round(BLOCK_EDGE/Dt)/dt))

    neural = None
    tshift = None
    bctrs = None
    S0 = 0.0
    e = 0
    for i in range(ntau):
        s0 = int(round((m*i)/float(ntau)))
        se = int(round((m*(i+1))/float(ntau)))
        ps0 = max(0, s0-pad)
        pse = min(m, se+pad)
        tbc, x, ttshift = perceptual_spec(data[ps0:pse], dt, Dt, **kwargs)
        if tshift is None:
            tshift = ttshift
        if bctrs is None:
            bctrs = tbc
        else:
            assert tbc == bctrs
        if neural is None:
            neural = numpy.zeros((x.shape[0], nout+1), numpy.float)

        tau0 = int(M.floor((s0-ps0)*(dt/Dt)))
        nS = min((se-s0)*(dt/Dt), x.shape[1]-tau0)
        inS = int(M.ceil(nS))
        iS0 = int(M.floor(S0))
        assert iS0+inS <= nout, "S0=%.1f nS=%.1f S0+nS=%.1f nout=%d" % (S0, nS, S0+nS, nout)
        neural[:, iS0:iS0+inS] = x[:, tau0:tau0+inS]
        S0 += nS
        e = iS0+inS
    assert nout-2 <= e <= nout+1, "e=%d nout=%d" % (e, nout)
    return (bctrs, neural[:,:e], tshift)

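# Minimal usage sketch (illustrative; `x` and `dt` are hypothetical inputs):
# process a long mono signal in roughly 10 s blocks, with 10 ms output frames.
def _block_percep_spec_example(x, dt):
    bctrs, neural, tshift = block_percep_spec(x, dt, 0.01,
                                              do_mod=1, do_dissonance=True,
                                              blocksize=10.0)
    # neural[i,:] is the output channel described by bctrs[i];
    # tshift is the time of the first frame relative to the first sample.
    return bctrs, neural, tshift
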
class fft_filterbank(object):
    def __init__(self, data, dt, Dt):
        self.d = VM.dataprep_flat_real(data, dt, Dt)
        self.ss = numpy.fft.rfft( self.d )
        self.f = numpy.absolute(VM.real_fft_freq(self.ss.shape[0], d=dt))

    def filter(self, fc):
        xferfcn = cochlear_filter(self.f/fc)
        xferfcn[0] = xferfcn[0].real
        xferfcn[-1] = xferfcn[-1].real
        return numpy.fft.irfft( self.ss * xferfcn )

class gammatone_filterbank(object):
    def __init__(self, data, dt):
        self.dt = dt
        if not data.flags['C_CONTIGUOUS']:
            # This turns out to be necessary if you start with stereo
            # (multichannel) data and then select a column from it.
            # The selection means that the resulting array is not C_CONTIGUOUS,
            # even if the original data was.
            data = data.copy()
        assert data.flags['C_CONTIGUOUS'], "Gammatone filter bank needs C-contiguous data."
        self.data = data

    def filter(self, fc):
        return gammatone.gammatone(self.data, self.dt, fc, gammatone.M_BASILAR)

def perceptual_spec(data, dt, Dt, bmin=CBmin, bmax=CBmax, db=BBSZ,
                    do_mod=0, do_dissonance=False, PlompBouman=True,
                    do_peakalign=False, e=None):
    """This returns something roughly like the neural signals leaving the ear.
    It filters into 1-erb-wide bins, then takes the cube root of
    the amplitude.
    @return: (channel_info, data, time_offset), where
        - C{time_offset} is the time of the first output datum,
            relative to the time of the first input sample.
        - C{channel_info} is a list of dicts, one per output channel,
            describing that channel.
        - C{data} is a 2-D array; C{data[i,:]} is the channel described
            by C{channel_info[i]}.
    """
    if e is None:
        e = E
    do_mod = int(do_mod)
    Neural_Time_Resolution = max(Neural_Tick, dt)
    m = data.shape[0]
    assert m*dt > Dt, "Tiny amount of data: %d*%g=%.2gs < Dt=%2g" % (m, dt, m*dt, Dt)
    # ntau = int(round(m*dt/Dt))
    ntau = int(M.floor(m*dt/Dt))
    # fbank = fft_filterbank(data, dt, Dt)
    fbank = gammatone_filterbank(data, dt)
    nband0 = int(M.ceil((bmax-bmin)/db))
    nbands = nband0
    assert do_mod in [0, 1, 2], "Do_mod=%s" % str(do_mod)
    if do_mod:
        nbands += int(do_mod)
    tk = []
    if do_dissonance:
        nbands += 1
        diss = roughness_c()
    if do_peakalign:
        nbands += 1
        palign = peakalign_c()
    neural = numpy.zeros((nbands, ntau), numpy.float)
    bctrs = []
    t0x = []
    iband = 0
    b = bmin
    while b < bmax:
        fc = erb_scale.erb_to_f(b)
        filtered_sig = fbank.filter(fc)
        hair_cells = filtered_sig / threshold(fc)
        numpy.maximum(hair_cells, 0.0, hair_cells)
        numpy.power(hair_cells, 2*e, hair_cells)
        pwrfilt, t0 = power.smooth(hair_cells, dt, Dt)
        t0x.append(t0)
        bctr = {'type': 'band', 'fc': fc, 'erb': b, 'E': e, 'id': 'B%.1f' % b}
        if PlompBouman:
            pwrfilt = VM.lopass_first_order(pwrfilt, Dt/tau_lp(fc))
            bctr['smooth'] = tau_lp(fc)
        bctrs.append(bctr)
        numpy.maximum(pwrfilt[:ntau], 0.0, neural[iband,:])

        if do_mod or do_dissonance or do_peakalign:
            tkfilt, t0 = power.smooth(hair_cells, dt, Neural_Time_Resolution)
            if do_dissonance:
                diss_bw = min(DISS_BWF*erb_scale.ebw(b), MAX_DISS_FREQ)
                diss.add(tkfilt, Neural_Time_Resolution, diss_bw)
            if do_mod:
                tk.append(tkfilt)
            if do_peakalign:
                palign.add(tkfilt, Neural_Time_Resolution, fc)
        b += db
        iband += 1
    assert iband == nband0

    if do_mod == 1:
        tksum, tkn, tkmin = process_voicing(tk, Neural_Time_Resolution, Dt)
        neural[iband,:] = (tksum/tkn - tkmin)[:ntau]*0.15
        bctrs.append({'type': 'haspitch', 'id': 'haspitch1', 'variant': 1,
                      'Fentropy': -M.log(len(tk))
                      })
        iband += 1
        assert len(bctrs) == iband
    elif do_mod == 2:
        Voicing_divide = 0.5*(CBmax+CBmin)
        tk_low = []
        tk_high = []
        for (bctr, tki) in zip(bctrs, tk):
            assert tki.shape[0] > 0
            if bctr['erb'] > Voicing_divide:
                tk_high.append(tki)
            else:
                tk_low.append(tki)
        tksum_low, tkn_low, tkmin_low = process_voicing(tk_low, Neural_Time_Resolution, Dt)
        tksum_high, tkn_high, tkmin_high = process_voicing(tk_high, Neural_Time_Resolution, Dt)
        avgmmin = (tksum_low+tksum_high)/(tkn_low+tkn_high) - numpy.minimum(tkmin_low, tkmin_high)
        neural[iband,:] = avgmmin[:ntau]*0.15
        bctrs.append({'type': 'haspitch', 'id': 'haspitch_all',
                      'variant': 'all',
                      'Fentropy': -M.log(len(tk))
                      })
        iband += 1

        neural[iband,:] = (tksum_high/tkn_high - tkmin_high)[:ntau]*0.15
        bctrs.append({'type': 'haspitch', 'id': 'haspitch_high',
                      'variant': 'high',
                      'Fentropy': -M.log(len(tk_high))
                      })
        iband += 1
        assert len(bctrs) == iband
    if do_dissonance:
        neural[iband,:] = diss.get(Dt)[:ntau]*20.0
        bctrs.append({'type': 'roughness', 'id': 'roughness1',
                      'variant': 1,
                      'Fentropy': -M.log(diss.n)
                      })
        iband += 1
    if do_peakalign:
        neural[iband,:] = palign.get(Dt)[:ntau]
        bctrs.append({'type': 'peakalign', 'id': 'peakalign1',
                      'variant': 1,
                      'Fentropy': -M.log(palign.n)
                      })
        iband += 1
    assert iband == neural.shape[0], "iband=%d != shape=%s" % (iband, str(neural.shape))

    return (bctrs, neural, gpkmisc.median(t0x))

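# Minimal usage sketch (illustrative; `x` is a hypothetical 1-D numpy signal
# sampled every `dt` seconds): ask for the band channels plus the pitch
# ("haspitch") channel, with 10 ms output frames.
def _perceptual_spec_example(x, dt):
    bctrs, neural, t0 = perceptual_spec(x, dt, 0.01, do_mod=1)
    for (info, row) in zip(bctrs, neural):
        print info['id'], row.shape
    return bctrs, neural, t0
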
def test():
    import gpkimgclass          # Imported here; normally only the command-line code needs it.
    dt = 1e-4
    d = numpy.zeros((10000, 1))
    for i in range(0, 10000, 100):
        d[i, 0] = 1.0
    return gpkimgclass.gpk_img({'CDELT2': dt}, d)

__version__ = "$Revision: 1.43 $"
__date__ = "$Date: 2007/03/07 23:54:40 $"


if __name__ == '__main__':
    import gpkimgclass
    DT = 0.01       # Seconds.  Default output sampling interval.
    arglist = sys.argv[1:]
    Blocksize = 10.0
    arglist0 = list(arglist)
    do_mod, dpa, dd, pb = 1, True, True, True
    column = None
    signal = None
    outfile = "-"
    extrahdr = {}
    while arglist and arglist[0].startswith('-'):
        arg = arglist.pop(0)
        if arg == '--':
            break
        elif arg == '-dt':
            DT = float(arglist.pop(0))
        elif arg == '-f':
            signal = gpkimgclass.read(arglist.pop(0))
        elif arg == '-c':
            tmp = arglist.pop(0)
            try:
                column = int(tmp)
            except ValueError:
                column = tmp
        elif arg == '-b':
            Blocksize = float(arglist.pop(0))
        elif arg == '-plot':
            import pylab
        elif arg == '-pb':
            pb = False
        elif arg == '-dd':
            dd = False
        elif arg == '-dpa':
            dpa = False
        elif arg == '-do_mod':
            do_mod = 0
        elif arg == '-test':
            signal = test()
            column = 0
        elif arg == '-o':
            outfile = arglist.pop(0)
        elif arg == '-write':
            extrahdr = dict( [q.strip().split('=', 1)
                              for q in arglist.pop(0).split(';') ]
                             )
        else:
            die.info("Unrecognized flag: %s" % arg)
            print __doc__
            die.exit(1)
    if arglist:
        if signal is None:
            signal = gpkimgclass.read(arglist[0])
        else:
            die.die("Usage: cannot specify name both with -f and as last argument.")

    if signal is None:
        die.info("No signal file specified.")
        print __doc__
        die.exit(1)
    try:
        signal.column(column)
    except KeyError:
        die.info("File has %d columns" % signal.d.shape[1])
        die.die("Bad column: %s" % str(column))

    bctrs, neural, tshift = block_percep_spec(signal.column(column), signal.dt(), DT,
                                              do_mod=do_mod, do_dissonance=dd,
                                              do_peakalign=dpa, PlompBouman=pb,
                                              blocksize=Blocksize)
    o = numpy.transpose(neural)
    if pylab:
        pylab.matshow(numpy.transpose(o))
        pylab.show()

    hdr = signal.hdr.copy()
    hdr['program'] = sys.argv[0]
    hdr['ARGV'] = arglist0
    hdr['input_file'] = signal.hdr.get('_NAME', '???')
    hdr['column'] = column
    hdr['CDELT2'] = DT
    hdr['CRPIX2'] = 1
    hdr['CRVAL2'] = signal.start() + tshift
    hdr['CRPIX1'] = 1
    hdr['CRVAL1'] = bctrs[0]['erb']
    hdr['CDELT1'] = bctrs[1]['erb'] - bctrs[0]['erb']
    hdr['BITPIX'] = -32
    hdr.update(extrahdr)
    for (i, b) in enumerate(bctrs):
        hdr['TTYPE%d' % (i+1)] = b['id']
    gpkimgclass.gpk_img(hdr, o).write(outfile)