1
2
3 """Perceptual spectrum for speech."""
4
5 import sys
6 import math as M
7 from gmisclib import erb_scale
8 from gmisclib import gpkmisc
9 from gmisclib import die
10
11 import numpy
12
13 pylab = None
14
15 import gpk_voicing.voice_misc as VM
16 from gpk_voicing import power
17 from gpk_voicing import percep_spec_extras as PS
18 from gpk_voicing import gammatone2
19
# Filterbank limits, expressed on the erb (equivalent rectangular bandwidth) scale:
CBmax = erb_scale.f_to_erb(8000.0)  # highest band center (8 kHz)
CBmin = erb_scale.f_to_erb(50.0)  # lowest band center (50 Hz)
BBSZ = 0.5  # spacing of band centers, in erbs
E = 0.333  # amplitude-compression exponent (roughly a cube root)
Neural_Tick = 1e-4  # finest time resolution (seconds) for simulated neural signals

DISS_BWF = 0.25  # roughness band-pass width, as a fraction of each band's erb
MAX_DISS_FREQ = 250.0  # ceiling (Hz) on the roughness band-pass width
HP_DISS_FREQ = 30.0  # high-pass corner (Hz) used by the roughness computation
BLOCK_EDGE = 0.3  # overlap (seconds) between successive blocks in block_percep_spec
30
31
32
33
34
def tau_lp(fc):
	"""Time constant of the ear's modulation low-pass filter, by band.

	This is the cutoff frequency of the modulation transfer function
	in the ear, as a function of frequency.
	From R. Plomp and M. A. Bouman, JASA 31(6), page 749ff, June 1959
	'Relation of hearing threshold and duration of tone pulses'

	@param fc: band center frequency in Hz (must be positive).
	@return: time constant in seconds; 0.4s at 200 Hz falling
		log-linearly to 0.13s at 10 kHz.
	"""
	# NOTE: the header line for this function was missing from the damaged
	# source; the name is fixed by the calls in perceptual_spec().
	Tau200 = 0.4
	Tau10k = 0.13
	# Linear interpolation in log(frequency) between the two anchors.
	tmp = Tau200 + (M.log(fc/200.0)/M.log(1e4/200.0))*(Tau10k-Tau200)
	assert tmp > 0
	return tmp
46
47
def cochlear_filter(fofc):
	"""Complex transfer function of one cochlear band-pass filter.

	Transfer function taken from "Improved Audio Coding Using a
	Psychoacoustic Model Based on a Cochlear Filter Bank",
	IEEE Transactions of Speech and Audio Processing, 10(7) October 2002,
	Pages 495-503. au=Frank Baumgarte.

	@param fofc: frequency normalized by the filter's center frequency
		(f/fc); may be a float or a numpy array.
	@return: complex gain at that normalized frequency.
	"""
	Q = 4.0
	# Slopes (in the exponent) corresponding to 25 dB and 8 dB per
	# 1.2x frequency ratio, low side and high side respectively.
	Slp = 25.0/(20.0*M.log10(1.2))
	Shp = 8.0/(20.0*M.log10(1.2))
	fofc_Shp = fofc**Shp
	return (fofc_Shp) / ( (1+fofc**Slp) * (1 + (1j/Q)*(fofc**(Shp/2))
						- fofc_Shp)
				)
61
62
def threshold(f):
	"""Approximate hearing threshold, as a pressure amplitude.

	Crudely taken from Handbook of Perception vol 4: hearing,
	E.C.Carterette and M.P.Friedman, editors,
	Academic Press 1978, isbn 0-12-161904-4.
	Curve near 70db used.

	@param f: frequency in Hz (must be positive).
	@return: threshold amplitude; near 1 in midrange, rising steeply
		below ~200 Hz and above ~8 kHz.
	"""
	return M.sqrt((200.0/f)**4 + 1 + (f/8000.0)**16)
70
71
def accum_abs_sub(a, b, o):
	"""Accumulate C{abs(a-b)} element-wise into C{o}.

	This does a little computation in a block-wise fashion
	to keep all the data within the processor's cache.
	It computes C{o += abs(a-b)} (the original docstring said
	"sum(abs(a-b))", but the accumulation is per-element into C{o};
	the scalar sum happens in the caller, across channels).

	@param a: 1-D numpy array.
	@param b: 1-D numpy array, same length as C{a}.
	@param o: 1-D accumulator array, same length; modified in place.
	@return: C{o} (the same array, for convenience).
	"""
	BKS = 6000	# block length chosen to fit comfortably in cache
	n = o.shape[0]
	assert a.shape == (n,)
	assert b.shape == (n,)
	assert o.shape == (n,)
	# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
	tmp = numpy.zeros((BKS,), numpy.float64)
	for i in range(0, n, BKS):
		e = min(i+BKS, n)
		tmpi = tmp[:e-i]
		ai = a[i:e]
		bi = b[i:e]
		oi = o[i:e]
		numpy.subtract(ai, bi, tmpi)
		numpy.absolute(tmpi, tmpi)
		numpy.add(oi, tmpi, oi)
	return o
93
94
def list_accum_abs_sub_dly(tk, dly):
	"""Sum of C{abs(x[t] - x[t-dly])} over all channels in C{tk}.

	For each channel, compares the signal with a copy of itself delayed
	by C{dly} samples, and accumulates the absolute differences across
	channels.  Small values at some C{dly} indicate periodicity.

	NOTE(review): the def line was missing from the damaged source; the
	name is inferred from the parallel C{PS.list_accum_abs_sub_dly}
	called in process_voicing() -- confirm against the package.

	@param tk: list of 1-D numpy arrays, all the same length.
	@param dly: delay in samples; must be > 0 (a zero delay would make
		C{tkd[:-dly]} empty).
	@return: 1-D array of length C{len(tk[0])-dly}.
	"""
	lng = tk[0][dly:].shape[0]
	# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
	o = numpy.zeros((lng,), numpy.float64)
	for tkd in tk:
		a = tkd[:-dly]
		b = tkd[dly:]
		accum_abs_sub(a, b, o)
	return o
103
104
105
def process_voicing(tk, tick, Dt):
	"""Estimate voicing by autocorrelation-like delay comparisons.

	@param Dt: the sampling rate of the output vectors.
	@type Dt: C{float}
	@param tick: the sampling rate of the vectors in C{tk}.
	@type tick: C{float}
	@param tk: a list of vectors, each of which represents the neural firing
		of a particular point in the cochlea.
	@type tk: [ numpy.ndarray, ...]
	@return: C{(sum, n, min)} -- the per-delay smoothed difference signals,
		summed over delays, the number of delays, and the element-wise
		minimum over delays.  The caller forms the voicing estimator as
		C{sum/n - min}.  (The original docstring claimed "(avg, min)";
		the average is left to the caller.)
	"""
	LOWF = 50.0		# lowest pitch considered, Hz
	HIGHF = 400.0		# highest pitch considered, Hz
	lowf = int(round(1.0/(tick*LOWF)))	# longest delay, in samples
	highf = int(round(1.0/(tick*HIGHF)))	# shortest delay, in samples
	tmps = None
	tmpmin = None
	tmpn = 0
	for dly in range(highf, lowf):
		dlytmp = PS.list_accum_abs_sub_dly(tk, dly)
		assert len(dlytmp.shape) == 1
		# Smooth down to the output rate; 'extra' widens the window a bit.
		tmp2, t0 = power.smooth(dlytmp, tick, Dt, extra=1.0/(LOWF*M.sqrt(12.0)))
		if tmps is None:
			tmpn = 1
			tmps = tmp2
			tmpmin = numpy.array(tmp2, copy=True)
		else:
			tmpn += 1
			numpy.add(tmps, tmp2, tmps)
			numpy.minimum(tmpmin, tmp2, tmpmin)
	return (tmps, tmpn, tmpmin)
145
146
class roughness_c(object):
	"""Accumulates a roughness (dissonance) estimate across cochlear bands.

	This should be based on Hutchinson and Knopoff 1978;
	Kameoka and Kuriyagawa 1969a; Viemeister 1988;
	Aures 1985; Plomp and Levelt 1965.
	But it isn't yet.

	Hutchinson W. and Knopoff, L. 1978 The acoustic component of
	Western consonance. Interface 7, 1-29.

	Kameoka and Kuriyagawa 1969. Consonance Theory: Part 1.
	J. Acoustical Soc. America. 45 1451-1458
	(and 1459-1469 for part II).

	Viemeister, N. F. 1977 Temporal factors in audition: A system
	analysis approach. In E. F. Evans and J. P. Wilson (eds)
	Psychophysics and physiology of hearing (pp. 419-429)
	London: Academic Press

	Aures, W. 1985 Ein Berechnungsverfahren der Rauhigkeit
	[A roughness calculation method] Acustica 58 268-281

	Plomp R. and Levelt W. 1965 Tonal consonance and critical
	bandwidth. JASA 38, 548-560.
	"""

	def __init__(self):
		self.y = None	# running sum of per-band envelope signals
		self.n = 0	# number of bands accumulated
		self.dt = None	# common sampling interval of the added bands

	def add(self, y, dt, diss_bw):
		"""Add one band's signal.  All bands must share the same dt.

		@param y: the band's (smoothed) neural signal.
		@param dt: sampling interval of y, seconds.
		@param diss_bw: roughness bandwidth for this band, Hz.
		"""
		if self.dt is None:
			assert dt > 0
			self.dt = dt
		else:
			assert abs(dt-self.dt) < 0.0001*self.dt
		# Band-pass the envelope: keep fluctuations between
		# HP_DISS_FREQ and diss_bw, then rectify.
		tmp = VM.lowpass_sym_butterworth(y, diss_bw*dt)
		tmp = VM.hipass_first_order(tmp, HP_DISS_FREQ*dt)
		tmp = numpy.absolute(tmp)
		if self.y is None:
			self.y = tmp
		else:
			numpy.add(self.y, tmp, self.y)
		self.n += 1

	def get(self, Dt):
		"""Return the accumulated roughness, smoothed to sampling interval Dt.

		Must be called after at least one add() -- otherwise self.y is
		None and self.n is zero.
		"""
		yfilt, t0 = power.smooth(self.y, self.dt, Dt)
		return yfilt*(0.1/self.n)
202
203
class peakalign_c(object):
	"""Accumulates per-band signals after aligning them in time.

	Each band is shifted by an amount proportional to 1/fc (constant FF
	below) before summing, roughly compensating for the cochlear travelling
	wave delay, so that peaks from a common glottal pulse line up.
	"""

	EDGE = 700	# padding (samples) so shifted bands fit in the buffer

	def __init__(self):
		self.y = None	# shared accumulation buffer, allocated on first add
		self.shifts = []	# debug: shift applied to each recorded band
		self.dbg = []	# debug: recorded band signals (only when pylab is on)
		self.n = 0	# number of bands accumulated
		self.dt = None	# common sampling interval

	def add(self, y, dt, fc):
		"""Add one band's signal, shifted according to its center frequency.

		@param y: the band's (smoothed) neural signal.
		@param dt: sampling interval of y, seconds.
		@param fc: band center frequency, Hz.
		"""
		if self.dt is None:
			assert dt > 0
			self.dt = dt
		else:
			assert abs(dt-self.dt) < 0.0001*self.dt
		FF = -3.2	# shift, in periods of fc (negative: advance high bands)
		sh = int(round(FF/(dt*fc))) + self.EDGE
		if self.y is None:
			# NOTE(review): sized from the first y; later calls assume
			# all bands have the same length -- confirm against callers.
			self.y = numpy.zeros((y.shape[0]+2*self.EDGE,))
		numpy.add(self.y[sh:sh+y.shape[0]], y, self.y[sh:sh+y.shape[0]])
		if pylab and fc>400:
			self.dbg.append(y)
			self.shifts.append(sh)
		self.n += 1

	def get(self, Dt):
		"""Return the squared aligned sum, smoothed to sampling interval Dt.

		When pylab is loaded (-plot), also plots cross-correlations of the
		recorded bands against the accumulated sum, for debugging.
		"""
		if pylab:
			for (i, (y, sh)) in enumerate(zip(self.dbg, self.shifts)):
				xpl = []
				ypl = []
				for dly in range(-100, 100):
					ypl.append(numpy.sum(y*self.y[sh+dly:sh+y.shape[0]+dly]))
					xpl.append(dly)
				pylab.plot(xpl, [q/max(ypl) for q in ypl], linestyle='-',
						color=(float(i)/float(self.n), 0.5, float(self.n-i)/float(self.n)))
		if pylab:
			pylab.show()
		cfilt, t0 = power.smooth(self.y*self.y, self.dt, Dt)
		return cfilt/self.n**2
245
246
def block_percep_spec(data, dt, Dt, **kwargs):
	"""This computes the perceptual spectrum in blocks and glues them
	together. It's useful when the data is too large to fit in memory.

	Accepts the same keyword arguments as perceptual_spec(), plus
	C{blocksize} (seconds); when C{blocksize} is None or absent the whole
	signal is processed in one call.  Blocks are computed with
	C{BLOCK_EDGE} seconds of padding on each side, and only the unpadded
	center of each block is copied to the output, so filter edge effects
	are confined to the discarded margins.

	@param data: the acoustic waveform (1-D numpy array).
	@param dt: sampling interval of C{data}, seconds.
	@param Dt: sampling interval of the output, seconds.
	@return: C{(channel_info, data, time_offset)} as from perceptual_spec().
	"""
	if kwargs.get('blocksize', None) is None:
		return perceptual_spec(data, dt, Dt, **kwargs)
	assert BLOCK_EDGE >= Dt
	blocksize = Dt * int(round(kwargs['blocksize']/Dt))
	del kwargs['blocksize']
	assert blocksize > BLOCK_EDGE
	m = data.shape[0]
	ntau = int(M.ceil(m*dt/blocksize))	# number of blocks
	nout = int(M.ceil(m*dt/Dt))		# total output frames
	pad = int(round(Dt*round(BLOCK_EDGE/Dt)/dt))	# edge padding, in samples

	neural = None
	tshift = None
	bctrs = None
	S0 = 0.0	# running (fractional) output-frame position
	e = 0
	for i in range(ntau):
		s0 = int(round((m*i)/float(ntau)))
		se = int(round((m*(i+1))/float(ntau)))
		ps0 = max(0, s0-pad)
		pse = min(m, se+pad)
		tbc, x, ttshift = perceptual_spec(data[ps0:pse], dt, Dt, **kwargs)
		if tshift is None:
			tshift = ttshift
		if bctrs is None:
			bctrs = tbc
		else:
			# Every block must produce the identical channel layout.
			assert tbc == bctrs
		if neural is None:
			# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
			neural = numpy.zeros((x.shape[0], nout+1), numpy.float64)

		tau0 = int(M.floor((s0-ps0)*(dt/Dt)))	# skip the leading pad
		nS = min((se-s0)*(dt/Dt), x.shape[1]-tau0)	# frames to keep
		inS = int(M.ceil(nS))
		iS0 = int(M.floor(S0))
		assert iS0+inS <= nout, "S0=%.1f nS=%.1f S0+nS=%.1f nout=%d" % (S0, nS, S0+nS, nout)
		neural[:, iS0:iS0+inS] = x[:, tau0:tau0+inS]
		S0 += nS
		e = iS0+inS

	assert nout-2 <= e <= nout+1, "e=%d nout=%d" % (e, nout)
	return (bctrs, neural[:,:e], tshift)
297
298
class fft_filterbank_c(object):
	"""FFT-domain cochlear filterbank (apparently unused in this file).

	NOTE(review): the class and method headers were missing from the
	damaged source and nothing in this file references this class, so the
	class name, method name, and argument lists here are reconstructions
	inferred from the bodies -- confirm against the rest of the package
	before relying on them.
	"""

	def __init__(self, data, dt, Dt):
		self.d = VM.dataprep_flat_real(data, dt, Dt)
		# numpy.fft.real_fft/inverse_real_fft are pre-1.0 aliases that no
		# longer exist; rfft/irfft are the modern equivalents.
		self.ss = numpy.fft.rfft( self.d )
		self.f = numpy.absolute(VM.real_fft_freq(self.ss.shape[0], d=dt))

	def filter(self, fc):
		"""Apply the cochlear filter centered at fc and return the time signal."""
		xferfcn = cochlear_filter(self.f/fc)
		# DC and Nyquist bins of a real-signal spectrum must stay real.
		xferfcn[0] = xferfcn[0].real
		xferfcn[-1] = xferfcn[-1].real
		return numpy.fft.irfft( self.ss * xferfcn )
310
311
312
def perceptual_spec(data, dt, Dt, bmin=CBmin, bmax=CBmax, db=BBSZ,
			do_mod=0, do_dissonance=False, PlompBouman=True,
			do_peakalign=False, e=None):
	"""This returns something roughly like the neural signals leaving the ear.
	It filters into 1-erb-wide bins, then takes the cube root of
	the amplitude.

	@param data: the acoustic waveform (1-D numpy array).
	@param dt: sampling interval of C{data}, seconds.
	@param Dt: sampling interval of the output, seconds.
	@param bmin: lowest band center, on the erb scale.
	@param bmax: highest band center, on the erb scale.
	@param db: band spacing, on the erb scale.
	@param do_mod: 0 = no voicing channels; 1 = one combined voicing
		channel; 2 = two channels (all bands, and high bands only).
	@param do_dissonance: if true, append a roughness channel.
	@param PlompBouman: if true, low-pass each band with tau_lp(fc),
		after Plomp & Bouman (1959).
	@param do_peakalign: if true, append a peak-alignment channel.
	@param e: compression exponent (defaults to module constant E).
	@return: (channel_info, data, time_offset), where
		- C{time_offset} is the time of the the first output datum
			relative to the time of the first input sample.
		- C{channel_info} is a list of dicts, one per output channel.
		- C{data} is a 2-D array, channels x time.
	"""
	if e is None:
		e = E
	do_mod = int(do_mod)
	Neural_Time_Resolution = max(Neural_Tick, dt)
	m = data.shape[0]
	assert m*dt > Dt, "Tiny amount of data: %d*%g=%.2gs < Dt=%2g" % (m, dt, m*dt, Dt)

	ntau = int(M.floor(m*dt/Dt))	# number of output frames

	nband0 = int(M.ceil((bmax-bmin)/db))	# number of filterbank channels
	nbands = nband0
	assert do_mod in [0,1,2], "Do_mod=%s" % str(do_mod)
	if do_mod:
		nbands += int(do_mod)
		tk = []
	if do_dissonance:
		nbands += 1
		diss = roughness_c()
	if do_peakalign:
		nbands += 1
		palign = peakalign_c()
	# numpy.float was removed in NumPy 1.24; float64 is the same dtype.
	neural = numpy.zeros((nbands, ntau), numpy.float64)
	bctrs = []
	t0x = []
	iband = 0

	fclist = []
	b = bmin
	while b < bmax:
		fc = erb_scale.erb_to_f(b)
		# BUG FIX: was fclist.append(b, fc) -- list.append takes exactly
		# one argument, so this raised TypeError; append the tuple.
		fclist.append((b, fc))
		b += db

	hair_cells = gammatone2.gammatone_ear_bank(data, dt, e, [ (fc, threshold(fc)) for (b, fc) in fclist] )

	for (iband, (b, fc)) in enumerate(fclist):
		pwrfilt, t0 = power.smooth(hair_cells[iband], dt, Dt)
		t0x.append(t0)
		if PlompBouman:
			# Forward masking: low-pass with the per-band time constant.
			pwrfilt = numpy.maximum(VM.lopass_first_order(pwrfilt, Dt/tau_lp(fc)), 0.0)
		else:
			pwrfilt = numpy.maximum(pwrfilt, 0.0)
		neural[iband,:] = pwrfilt[:ntau]
		bctrs.append({'type':'band', 'smooth': tau_lp(fc), 'fc': fc,
				'erb': b, 'E': e, 'id': 'B%.1f' % b
				})

		if do_mod or do_dissonance or do_peakalign:
			# BUG FIX: was power.smooth(hair_cells, ...) -- that passed the
			# whole bank each iteration; downstream consumers (process_voicing,
			# roughness_c.add, peakalign_c.add) expect one band's 1-D signal.
			tkfilt, t0 = power.smooth(hair_cells[iband], dt, Neural_Time_Resolution)
			if do_dissonance:
				diss_bw = min(DISS_BWF*erb_scale.ebw(b), MAX_DISS_FREQ)
				diss.add(tkfilt, Neural_Time_Resolution, diss_bw)
			if do_mod:
				tk.append(tkfilt)
			if do_peakalign:
				palign.add(tkfilt, Neural_Time_Resolution, fc)
	# BUG FIX: after the enumerate loop iband == len(fclist)-1, which made
	# the assertion below fail and would have overwritten the last band.
	# Restore the old while-loop invariant: iband points past the bands.
	iband = len(fclist)
	assert iband == nband0

	if do_mod == 1:
		tksum, tkn, tkmin = process_voicing(tk, Neural_Time_Resolution, Dt)
		neural[iband,:] = (tksum/tkn-tkmin)[:ntau]*0.15
		bctrs.append({'type':'haspitch', 'id':'haspitch1', 'variant': 1,
				'Fentropy': -M.log(len(tk))
				})
		iband += 1
		assert len(bctrs) == iband
	elif do_mod == 2:
		Voicing_divide = 0.5*(CBmax+CBmin)
		tk_low = []
		tk_high = []
		for (bctr, tki) in zip(bctrs, tk):
			assert tki.shape[0] > 0
			if bctr['erb'] > Voicing_divide:
				tk_high.append(tki)
			else:
				tk_low.append(tki)
		# NOTE(review): if every band falls on one side of Voicing_divide,
		# tkn_low or tkn_high is zero and the division below fails.
		tksum_low, tkn_low, tkmin_low = process_voicing(tk_low, Neural_Time_Resolution, Dt)
		tksum_high, tkn_high, tkmin_high = process_voicing(tk_high, Neural_Time_Resolution, Dt)
		avgmmin = (tksum_low+tksum_high)/(tkn_low+tkn_high) - numpy.minimum(tkmin_low, tkmin_high)
		neural[iband,:] = avgmmin[:ntau]*0.15
		bctrs.append({'type':'haspitch', 'id':'haspitch_all',
				'variant': 'all',
				'Fentropy': -M.log(len(tk))
				})
		iband += 1

		neural[iband,:] = (tksum_high/tkn_high-tkmin_high)[:ntau]*0.15
		bctrs.append({'type':'haspitch', 'id':'haspitch_high',
				'variant': 'high',
				'Fentropy': -M.log(len(tk_high))
				})
		iband += 1
		assert len(bctrs) == iband
	if do_dissonance:
		neural[iband,:] = diss.get(Dt)[:ntau]*20.0
		bctrs.append({'type':'roughness', 'id':'roughness1',
				'variant': 1,
				'Fentropy': -M.log(diss.n)
				})
		iband += 1
	if do_peakalign:
		neural[iband,:] = palign.get(Dt)[:ntau]
		bctrs.append({'type':'peakalign', 'id':'peakalign1',
				'variant': 1,
				'Fentropy': -M.log(palign.n)
				})
		iband += 1
	assert iband == neural.shape[0], "iband=%d != shape=%s" % (iband, str(neural.shape))

	return (bctrs, neural, gpkmisc.median(t0x))
442
443
444
def test():
	"""Build a synthetic test signal: a 1-second impulse train at 100 Hz.

	@return: a gpkimgclass.gpk_img with sampling interval 1e-4 s.
		(gpkimgclass is imported in the __main__ block before this
		is called via the -test flag.)
	"""
	dt = 1e-4
	d = numpy.zeros((10000,1))
	for i in range(0, 10000, 100):
		d[i,0] = 1.0
	return gpkimgclass.gpk_img({'CDELT2': dt}, d)
451
452
453
454 __version__ = "$Revision: 1.43 $"
455 __date__ = "$Date: 2007/03/07 23:54:40 $"
456
457
458 if __name__ == '__main__':
459
460
461
462
463
464 import gpkimgclass
465 DT = 0.01
466 arglist = sys.argv[1:]
467 Blocksize = 10.0
468 arglist0 = arglist
469 do_mod, dpa, dd, pb = 1, True, True, True
470 column = None
471 signal = None
472 outfile = "-"
473 extrahdr = {}
474 while arglist and arglist[0].startswith('-'):
475 arg = arglist.pop(0)
476 if arg == '--':
477 break
478 elif arg == '-dt':
479 DT = float(arglist.pop(0))
480 elif arg == '-f':
481 signal = gpkimgclass.read(arglist.pop(0))
482 elif arg == '-c':
483 tmp = arglist.pop(0)
484 try:
485 column = int( tmp )
486 except ValueError:
487 column = tmp
488 elif arg == '-b':
489 Blocksize = float(arglist.pop(0))
490 elif arg == '-plot':
491 import pylab
492 elif arg == '-pb':
493 pb = False
494 elif arg == '-dd':
495 dd = False
496 elif arg == '-dpa':
497 dpa = False
498 elif arg == '-do_mod':
499 do_mod = 0
500 elif arg == '-test':
501 signal = test()
502 column = 0
503 elif arg == '-o':
504 outfile = arglist.pop(0)
505 elif arg == '-write':
506 extrahdr = dict( [q.strip().split('=', 1)
507 for q in arglist.pop(0).split(';') ]
508 )
509 else:
510 die.info("Unrecognized flag: %s" % arg)
511 print __doc__
512 die.exit(1)
513 if arglist:
514 if signal is None:
515 signal = gpkimgclass.read(arglist[0])
516 else:
517 die.die("Usage: cannot specify name both with -f and as last argument.")
518
519 if signal is None:
520 die.info("No signal file specified.")
521 print __doc__
522 die.exit(1)
523 try:
524 signal.column(column)
525 except KeyError:
526 die.info("File has %d columns" % signal.n[1])
527 die.die("Bad column: %s" % str(column))
528
529 bctrs, neural, tshift = block_percep_spec(signal.column(0), signal.dt(), DT,
530 do_mod=do_mod, do_dissonance=dd,
531 do_peakalign=dpa, PlompBouman=pb,
532 blocksize=Blocksize)
533 o = numpy.transpose(neural)
534 if pylab:
535 pylab.matshow(numpy.transpose(o))
536 pylab.show()
537
538 hdr = signal.hdr.copy()
539 hdr['program'] = sys.argv[0]
540 hdr['ARGV'] = arglist0
541 hdr['input_file'] = signal.hdr.get('_NAME', '???')
542 hdr['column'] = column
543 hdr['CDELT2'] = DT
544 hdr['CRPIX2'] = 1
545 hdr['CRVAL2'] = signal.start() + tshift
546 hdr['CRPIX1'] = 1
547 hdr['CRVAL1'] = bctrs[0]['erb']
548 hdr['CDELT1'] = bctrs[1]['erb']-bctrs[0]['erb']
549 hdr['BITPIX'] = -32
550 hdr.update( extrahdr )
551 for (i,b) in enumerate(bctrs):
552 hdr['TTYPE%d' % (i+1)] = b['id']
553 gpkimgclass.gpk_img(hdr, o).write(outfile)
554