1
2
3 """Duration estimator for speech.
4
5 Usage: s_slope [flags]
6 Flags: -f FFF opens file FFF for reading.
7 -c XXX selects column XXX (either a data name or an integer).
8 -o FFF sets the output file.
9 -dt #.## Sets the interval for calculating the output.
10
11 It takes a local spectrum,
12 bins it onto the Bark scale,
13 converts to perceptual loudness (via **E).
14 Then, it computes a measure of how far you can go
15 from each point before the spectrum changes too much.
16 """
17
18 import math
19 import numpy
20
21 from gmisclib import cache as CC
22 from gmisclib import gpkmisc
23 from gmisclib import erb_scale
24
25 from gpk_voicing import percep_spec as PS
26 from gpk_voicing import fv_misc as FVM
27 from gpk_voicing import voice_misc as VM
# Alias of the fv_misc exception, reachable under this module's namespace.
SillyWidthException = FVM.SillyWidthException

# Default for the ``Nsv`` argument of feature_vec() and feature_vec_guts().
NSV = 0.75

# Module-level store of window kernels; entries are added by the
# window-building code under a per-window key.
_wincache = {}
# NOTE(review): presumably the entry limit for _wincache -- confirm at its use.
_WIN_CACHE_SIZE = 20
50 rv = VM.cont_kernel(n, 0, norm=Norm, kernel=inner_window)
51 _wincache[key] = rv
52 return rv
53
54
56 ALPHA = -1.0
57 E = 0.333
58 f = erb_scale.erb_to_f(b)/1000.0
59 bw = erb_scale.ebw(b)/100.0
60 return (f**ALPHA * bw)**E
61
62
def feature_vec_guts(data, dt, Dt, do_irx=False,
                     LF=1.0, fdif2=0.0, fdif2x=0.0, Nsv=NSV,
                     cache_info=None, ps_cache_info=None
                     ):
    """Build the loudness feature and the per-band 'vowel' features.

    The input is handed to FVM.normalize_neural().  From its output, the
    rows whose descriptor has type 'band' and an 'erb' value strictly
    between erb_scale.f_to_erb(200.0) and erb_scale.f_to_erb(6000.0) are
    selected.  The first feature is the mean of the selected rows
    (described as type 'loudness'); it is followed by one feature per
    selected row (type 'vowel'): that row minus the sum of the selected
    rows, weighted by avg_spec(erb) divided by the total of avg_spec over
    the selected rows.

    Arguments:
      data, dt -- passed unchanged to FVM.normalize_neural().
      Dt -- output interval; must be positive.  Returned unchanged.
      do_irx -- if true, the window width is rounded to an integer >= 1.
      LF -- multiplies the window width, which is 0.04*LF/Dt.
      fdif2, fdif2x -- coefficients of the 5-point kernel
          [-fdif2x, -fdif2, 1+2*fdif2+2*fdif2x, -fdif2, -fdif2x]
          passed to FVM.normalize_neural() as f_kernel.
      Nsv -- passed unchanged to FVM.normalize_neural().
      cache_info, ps_cache_info -- ps_cache_info, when not None, is
          passed to FVM.normalize_neural() as its cache_info; otherwise
          cache_info is passed.

    Returns:
      (o, descr, Dt, t0) where o is the list of feature arrays, descr is
      the list of descriptor dictionaries (same length as o), and t0 is
      the first value returned by FVM.normalize_neural().

    Raises:
      ValueError -- if no band lies inside the selected ERB range.
      AssertionError -- if Dt is not positive.
    """
    FORMANT_LOW = erb_scale.f_to_erb(200.0)
    FORMANT_HIGH = erb_scale.f_to_erb(6000.0)
    assert Dt > 0.0 and float(Dt) > 0.0
    bmin = FORMANT_LOW - 0.5*FVM.DB
    bmax = FORMANT_HIGH + 0.5*FVM.DB
    width = 0.04 * LF / Dt
    if do_irx:
        width = max(1, int(round(width)))
    cs = win(width)
    css = cs.sum()
    f_kernel = numpy.array([-fdif2x, -fdif2, 1+2*fdif2+2*fdif2x, -fdif2, -fdif2x])

    # A dedicated perceptual-spectrum cache takes precedence when supplied.
    ic_i = ps_cache_info if ps_cache_info is not None else cache_info

    # The last two returned items are not used here.
    t0, neural, ectrs, _nneural, _nectrs = \
        FVM.normalize_neural(data, dt, Dt, bmin=bmin, bmax=bmax, db=FVM.DB,
                             do_mod=0, do_dissonance=False,
                             PlompBouman=False,
                             norm_kernel=cs/css, Nsv=Nsv, t_kernel=cs/css,
                             f_kernel=f_kernel,
                             cache_info=ic_i
                             )

    # One pass selects the bands and accumulates their sum; the row index,
    # descriptor and avg_spec weight are kept so that the per-band pass
    # below neither repeats the selection test nor recomputes avg_spec.
    bands = []
    nsum = numpy.zeros((neural.shape[1],))
    avss = 0.0
    for (i, e) in enumerate(ectrs):
        if e['type'] == 'band' and FORMANT_LOW < e['erb'] < FORMANT_HIGH:
            weight = avg_spec(e['erb'])
            numpy.add(nsum, neural[i, :], nsum)
            avss += weight
            bands.append((i, e, weight))
    nna = len(bands)
    if nna == 0:
        # Without this check the code below fails on an unbound descriptor
        # (or on log(0) / division by zero), which hides the real cause.
        raise ValueError("No 'band' entries between FORMANT_LOW and FORMANT_HIGH")

    # Depends only on the window, so it is computed once for all descriptors.
    k_entropy = gpkmisc.entropy(numpy.absolute(cs))

    o = []
    descr = []

    # NOTE(review): the loudness descriptor is templated on the last selected
    # band, minus its band-specific keys -- confirm against the original layout.
    dtmp = bands[-1][1].copy()
    del dtmp['erb']
    del dtmp['fc']
    dtmp['type'] = 'loudness'
    dtmp['width'] = width
    dtmp['Kentropy'] = k_entropy
    dtmp['Fentropy'] = math.log(nna)
    dtmp['t_symmetry'] = 1
    dtmp['id'] = '%s:%d' % (dtmp['type'], width)
    descr.append(dtmp)
    o.append(nsum/nna)

    for (i, e, weight) in bands:
        # Remove this band's share of the summed spectrum.
        tmp = neural[i, :] - nsum * (weight/avss)
        dtmp = e.copy()
        dtmp['type'] = 'vowel'
        dtmp['width'] = width
        dtmp['Kentropy'] = k_entropy
        dtmp['Fentropy'] = 0.0
        dtmp['t_symmetry'] = 1
        dtmp['a_scaling'] = 1
        o.append(tmp)
        dtmp['id'] = '%s:%d:%.1f' % (dtmp['type'], width, e['erb'])
        descr.append(dtmp)
    assert len(descr) == len(o), "Descriptor mismatch"
    return (o, descr, Dt, t0)
132
133
def feature_vec(data, dt, DT, LF=1.0, Nsv=NSV, fdif2=0.0, fdif2x=0.0,
                do_irx=False, cache_info=None, ps_cache_info=None):
    """Return the result of feature_vec_guts(), using a cache if one is given.

    When cache_info is not None it must be a CC.cache_info instance.  A
    cache entry is derived from it with the parameters
    (dt, DT, LF, Nsv, fdif2, fdif2x, do_irx) plus the tag 'fv201105-2'.
    A stored, non-None result is returned directly.  Otherwise the
    features are computed by feature_vec_guts() and handed to the entry's
    bg_dump() before being returned.

    Errors listed in the entry's ``Errors`` attribute that are raised while
    loading are treated as a cache miss.

    All other arguments are forwarded to feature_vec_guts(); DT is passed
    as its Dt argument.
    """
    entry = None
    if cache_info is not None:
        assert isinstance(cache_info, CC.cache_info)
        entry = cache_info.addinfo(dt, DT, LF, Nsv, fdif2, fdif2x, do_irx,
                                   'fv201105-2')
        try:
            cached = entry.load()
        except entry.Errors:
            # Unreadable or missing entry: fall through and recompute.
            cached = None
        if cached is not None:
            return cached

    result = feature_vec_guts(data, dt, DT, LF=LF, Nsv=Nsv,
                              do_irx=do_irx, fdif2=fdif2, fdif2x=fdif2x,
                              cache_info=cache_info,
                              ps_cache_info=ps_cache_info)
    if entry is not None:
        entry.bg_dump(result)
    return result
153
154
155
156
157 Opt_text = """
158 # RUN 1
159 # summarize_logs -uid UID -fromstart -best logS_advLImMDREf,-fv201102_4.av
160 # len(currentlist)= 26 tail= 0.0 nsamp= 26
161 # max(logS_advLImMDREf,-fv201102_4.av) = 376.74
162 # samples used = 1 filename= logS_advLImMDREf,-fv201102_4.av
163 376.74 logp
164 # n= 39
165 26.031 Scale,vowel:.*:10.4
166 0.650176 Scale,vowel:.*:11.1
167 0.994825 Scale,vowel:.*:11.9
168 3.65683 Scale,vowel:.*:12.6
169 12.2741 Scale,vowel:.*:13.3
170 0.23901 Scale,vowel:.*:14.0
171 1.53292 Scale,vowel:.*:14.7
172 7.75427 Scale,vowel:.*:15.4
173 0.784154 Scale,vowel:.*:16.1
174 3.23822 Scale,vowel:.*:16.8
175 3.84116 Scale,vowel:.*:17.5
176 12.1566 Scale,vowel:.*:18.2
177 6.9634 Scale,vowel:.*:18.9
178 4.90421 Scale,vowel:.*:19.6
179 3.72337 Scale,vowel:.*:20.3
180 0.0150866 Scale,vowel:.*:21.0
181 6.29358 Scale,vowel:.*:21.8
182 3.50677 Scale,vowel:.*:22.5
183 1.94812 Scale,vowel:.*:23.2
184 17.1486 Scale,vowel:.*:23.9
185 1.76959 Scale,vowel:.*:24.6
186 1.27658 Scale,vowel:.*:25.3
187 0.439394 Scale,vowel:.*:26.0
188 0.185877 Scale,vowel:.*:26.7
189 12.2699 Scale,vowel:.*:6.2
190 10.2536 Scale,vowel:.*:6.9
191 5.84667 Scale,vowel:.*:7.6
192 3.5585 Scale,vowel:.*:8.3
193 0.405896 Scale,vowel:.*:9.0
194 2.4074 Scale,vowel:.*:9.7
195 2.94796 lf,lf
196 -0.302882 norm,fdif2
197 1.28682 norm,nsv
198 4.824 norm,tdif
199 0.16849 norm,tdif2
200 #
201 # RUN 2
202 # summarize_logs -best -fromstart -uid UID logS_advLImMDREf,-fv201102_2.av
203 # len(currentlist)= 20 tail= 0.0 nsamp= 20
204 # max(logS_advLImMDREf,-fv201102_2.av) = 348.56
205 # samples used = 1 filename= logS_advLImMDREf,-fv201102_2.av
206 348.56 logp
207 # n= 39
208 0.367388 Scale,vowel:.*:10.4
209 0.668657 Scale,vowel:.*:11.1
210 3.73234 Scale,vowel:.*:11.9
211 9.59746 Scale,vowel:.*:12.6
212 45.2418 Scale,vowel:.*:13.3
213 12.4549 Scale,vowel:.*:14.0
214 5.18612 Scale,vowel:.*:14.7
215 8.10955 Scale,vowel:.*:15.4
216 16.1181 Scale,vowel:.*:16.1
217 8.2858 Scale,vowel:.*:16.8
218 6.15389 Scale,vowel:.*:17.5
219 2.11999 Scale,vowel:.*:18.2
220 8.04381 Scale,vowel:.*:18.9
221 3.69372 Scale,vowel:.*:19.6
222 0.302413 Scale,vowel:.*:20.3
223 3.42239 Scale,vowel:.*:21.0
224 18.7842 Scale,vowel:.*:21.8
225 7.25486 Scale,vowel:.*:22.5
226 0.435129 Scale,vowel:.*:23.2
227 2.46633 Scale,vowel:.*:23.9
228 0.828264 Scale,vowel:.*:24.6
229 0.604598 Scale,vowel:.*:25.3
230 15.0723 Scale,vowel:.*:26.0
231 2.12652 Scale,vowel:.*:26.7
232 0.258391 Scale,vowel:.*:6.2
233 3.00883 Scale,vowel:.*:6.9
234 19.7981 Scale,vowel:.*:7.6
235 0.098635 Scale,vowel:.*:8.3
236 0.144953 Scale,vowel:.*:9.0
237 21.9714 Scale,vowel:.*:9.7
238 3.96667 lf,lf
239 -0.523753 norm,fdif2
240 0.703901 norm,nsv
241 4.49354 norm,tdif
242 -0.147517 norm,tdif2
243 4000 FORMANT_HIGH
244 """
245
# Built by FVM.scale_xform() from the optimization results in Opt_text.
# NOTE(review): presumably the per-feature scaling applied to the vectors
# produced in this module -- confirm against fv_misc.scale_xform.
Scale = FVM.scale_xform(
    Opt_text,
    name="2011-04-17 optimization on mace and cayenne:/home/gpk/ItakuraSaitoDistance/IS-DTW/logS_advLImMDREf,-fv201102/{1,2}"
)
249