1
2 import math
3 import numpy
4
5 import die
6 import fiatio
7 import gpkmisc
8 import mcmc_logger
9 import mcmc_newlogger as LG
10 import mcmc_indexclass as IC
11
12
13
14 FILE_DROP_FAC = 0.2
15 TRIGGER = 'run_to_bottom finished'
16
17
# NOTE(review): the `def` line of this function was lost in extraction.
# The body reads free variables fnames, uid, Nsamp, tail, trigger, so the
# signature was presumably read_many(fnames, uid, Nsamp=..., tail=...,
# trigger=TRIGGER) -- confirm against revision history.  The author marked
# it OBSOLETE; see the newer loader below (LG.read_tail_uid based).
19 """OBSOLETE
20 @return: a dictionary mapping filenames onto "data" and header information.
21 Header information is from the last file read.
22 The "data" are L{onelog} class instances that encapsulate
23 log information from one log file.
24 @rtype: tuple(dict(str:onelog), dict(str:str))
25 """
26 hdr = {}
27 per_fn = {}
# With no uid, read every sample in each file via the old logger module.
28 if uid is None:
29 for fname in fnames:
30 hdr, indexers, logps = mcmc_logger.read_multisample(fname, Nsamp=Nsamp, tail=tail, trigger=trigger)
31 per_fn[fname] = onelog(logps, indexers, fname)
32 else:
# With a uid, first try the new-format reader; on a format error fall back
# to the "human" format and pull out just that uid's state.
33 for fname in fnames:
34 try:
35 hdr, indexers, logps = LG.read_multi_uid(fname, uid,
36 Nsamp=Nsamp, tail=tail,
37 trigger=trigger
38 )
39 except LG.BadFormatError, x:
40 try:
41 hdr, statedict, logps = LG.read_human_fmt(fname)
42 except LG.BadFormatError, y:
# Neither reader understood the file: report both failures at once.
43 raise LG.BadFormatError, "Unreadable in either format: {%s} or {%s}" % (x, y)
44 try:
45 indexers = [statedict[uid]]
46 logps = [logps[uid]]
47 except KeyError:
48 raise LG.BadFormatError, "human format: uid=%s not found" % uid
49 else:
50 per_fn[fname] = onelog(logps, indexers, fname)
51
# A file with no data is survivable: warn and continue with the rest.
52 except LG.NoDataError, x:
53 die.warn("No data in %s: %s" % (fname, x))
54 else:
55 per_fn[fname] = onelog(logps, indexers, fname)
56 return (per_fn, hdr)
57
58
60 """
61 @return: a dictionary mapping filenames onto "data" and header information.
62 Header information is from the last file read.
63 The "data" are L{onelog} class instances that encapsulate
64 log information from one log file.
65 @rtype: tuple(dict(str:onelog), dict(str:str))
66 """
67 hdr = {}
68 per_fn = {}
69 for fname in fnames:
70 try:
71 hdr, loglines = LG.read_tail_uid(fname, uid,
72 Nsamp=Nsamp, tail=tail,
73 trigger=trigger
74 )
75 per_fn[fname] = loglines
76 except LG.NoDataError, ex:
77 die.warn("No data from %s: %s" % (fname, str(ex)))
78 return (per_fn, hdr)
79
80
82 """
83 @type per_fn: dict(str: L{mcmc_newlogger.logline})
84 """
85 for ol in per_fn.values():
86 for ll in ol:
87 return ll.idxr().map
88 raise ValueError, "No data - cannot get_pmap."
89
90
92 """
93 @type per_fn: dict(str: L{mcmc_newlogger.logline})
94 """
95 w = fiatio.writer(out)
96 for ll in sample_selector(per_fn):
97 tmp = {'T': ll.T() }
98 for (k, v) in ll.idxr().get_all().items():
99 tmp[IC.index_base._fmt(k)] = v
100 w.datum(tmp)
101
102
104 """
105 @type per_fn: dict(str: L{mcmc_newlogger.logline})
106 """
107 idxr_map = get_pmap(per_fn)
108 ndim = len(idxr_map)
109 sum = numpy.zeros((ndim,))
110 sw = 0
111 for ll in sample_selector(per_fn):
112 idxr = ll.idxr()
113 if weight_by_T:
114 wt = 1.0/math.sqrt(ll.T())
115 numpy.add(sum, idxr.get_prms()*wt, sum)
116 else:
117 wt = 1.0
118 numpy.add(sum, idxr.get_prms(), sum)
119 sw += wt
120 assert sw > 0
121 m = sum/sw
122
123 n = 0
124 mxwt = 0.0
125 sum = numpy.zeros((ndim, ndim))
126 for ll in sample_selector(per_fn):
127 idxr = ll.idxr()
128 delta = idxr.get_prms() - m
129 assert len(delta.shape) == 1
130 if weight_by_T:
131 wt = 1.0/math.sqrt(ll.T())
132 numpy.multiply(delta, wt, delta)
133 if wt > mxwt:
134 mxwt = wt
135 else:
136 mxwt = 1.0
137 delta2 = numpy.outer(delta, delta)
138 assert len(delta2.shape) == 2
139 numpy.add(sum, delta2, sum)
140 n += 1
141 if sw > mxwt:
142 var = (1.0-mxwt/sw)*sum/n
143 else:
144 var = None
145 return (m, var, sw, idxr_map)
146
147
149 """
150 @type per_fn: dict(str: L{mcmc_newlogger.logline})
151 """
152 lp = []
153 T = []
154 for ll in sample_selector(per_fn):
155 lp.append( ll.logp() )
156 try:
157 T.append( ll.T() )
158 except ValueError:
159 pass
160 assert len(lp) > 0, "No data in average in logp_stdev"
161 rv = [ (('logp',),) + gpkmisc.mean_stdev(lp) ]
162 if len(T) > 1:
163 rv.append( (('T',),) + gpkmisc.mean_stdev(T) )
164 return rv
165
166
def all(per_fn, source=None):
    """Selector that yields every logged measurement from every file.

    @param per_fn: dict mapping filename -> list of loglines.
    @param source: optional accumulator; when given, its C{add(filename, n)}
        method is called once per file with the number of loglines yielded
        from that file.
    """
    for fname in per_fn:
        loglines = per_fn[fname]
        if source is not None:
            source.add(fname, len(loglines))
        for logline in loglines:
            yield logline
174
175
# NOTE(review): the `def` line was lost in extraction; the body reads free
# variables per_fn and source, matching the other selectors' signature
# (per_fn, source=None) -- confirm.  Keeps only samples whose logp is within
# 6*sqrt(ndim) of the best logp seen across ALL files (a global threshold).
177 """This selects which measurements will be used.
178 It looks after convergence, then throws out optimizations that haven't
179 converged.
180 """
181 maxes = []
182 for ol in per_fn.values():
183 maxes.append( max( q.logp() for q in ol ) )
184 ndim = len(get_pmap(per_fn))
185 assert ndim > 0
# Tolerance scales with sqrt(dimension), in the spirit of a chi-square width.
186 tol = 6 * math.sqrt(ndim)
187 threshold = max(maxes) - tol
188 for (fn, ol) in per_fn.items():
189 for ll in ol:
# Strictly greater than: samples exactly at the threshold are dropped.
190 if ll.logp() > threshold:
191 if source is not None:
192 source.add(fn, 1)
193 yield ll
195
# NOTE(review): the `def` line was lost in extraction; the body reads free
# variables per_fn and source, matching the other selectors' signature
# (per_fn, source=None) -- confirm.  Unlike the global-threshold selector
# above, the 6*sqrt(ndim) tolerance here is applied PER FILE, relative to
# that file's own best logp.
197 """This selects which measurements will be used.
198 It looks after convergence, then gives you the best few
199 results from each run.
200 @type per_fn: dict(str: L{mcmc_newlogger.logline})
201 """
202 ndim = len(get_pmap(per_fn))
203 assert ndim > 0
204 tol = 6 * math.sqrt(ndim)
205 for (fn,ol) in per_fn.items():
206 themax = max( q.logp() for q in ol )
207 threshold = themax - tol
208 for ll in ol:
209 if ll.logp() > threshold:
210 if source is not None:
211 source.add(fn, 1)
212 yield ll
213
214
215
def last(per_fn, source=None):
    """Selector that yields only the final logline of each file.

    @type per_fn: dict(str: L{mcmc_newlogger.logline})
    @param source: optional accumulator; C{add(filename, 1)} is called for
        each file whose last entry is yielded.
    """
    for fname in per_fn:
        loglines = per_fn[fname]
        if source is not None:
            source.add(fname, 1)
        # Note: an empty logline list would raise IndexError, exactly as
        # the original did.
        yield loglines[-1]
224
225
# NOTE(review): the `def` line was lost in extraction; the body reads free
# variables per_fn and source, matching the other selectors' signature
# (per_fn, source=None) -- confirm.  Yields the single highest-logp sample
# from each file.
227 """This selects which measurements will be used."""
228 HUGE = 1e30
229 for (fn, ol) in per_fn.items():
230 bestlogp = -HUGE
231 best = None
232 for ll in ol:
233 if ll.logp() > bestlogp:
234 best = ll
235 bestlogp = ll.logp()
# best can only stay None if every logp was below -HUGE (or ol was empty).
236 if best is not None:
237 if source is not None:
238 source.add(fn, 1)
239 yield best
240
241
242
# NOTE(review): the `def` line was lost in extraction; the body reads free
# variables per_fn and source, matching the other selectors' signature
# (per_fn, source=None) -- confirm.  Tracks the best logp across ALL files.
# NOTE(review): with the original indentation lost, it is ambiguous whether
# the final yield sits inside the per-file loop (yielding successively
# better candidates) or after it (yielding one overall winner) -- verify
# against the original before restructuring.
244 """This selects which measurements will be used."""
245 HUGE = 1e30
246 bestlogp = -HUGE
247 best = None
248 for (fn, ol) in per_fn.items():
249 for ll in ol:
250 if ll.logp() > bestlogp:
251 best = ll
252 bestlogp = ll.logp()
253 if best is not None:
254 if source is not None:
255 source.add(fn, 1)
256 yield best
257
258
259
260
# NOTE(review): the `def` line was lost in extraction; the body reads free
# variables per_fn, selector and weight_by_T -- presumably something like
# summary(per_fn, selector, weight_by_T=...); confirm.  Converts the
# mean/covariance from indexer_covar() into per-parameter (name, mean,
# stdev) rows.
262 """Return a summary of the properties of the selected indexers.
263 @param selector: A generator that selects some of the C{logline}s in C{per_fn} that meet certain
264 selection criteria.
265 @type selector: function(dict(filename: list(L{mcmc_newlogger.logline})))
266 that yields L{mcmc_newlogger.logline}
267 """
268 mean, covar, n, idxr_map = indexer_covar(per_fn, selector, weight_by_T=weight_by_T)
269 o = []
270 for (nm, i) in idxr_map.items():
# indexer_covar() reports covar=None when a variance is not meaningful;
# propagate that as stdev=None rather than crashing on sqrt.
271 if covar is None:
272 std = None
273 else:
274 std = math.sqrt(covar[i,i])
275 o.append( (nm, mean[i], std) )
276 return o
277
278
def print_index_error(ke):
    """Print a human-readable report for an index-key lookup failure.

    NOTE(review): the original ``def`` line was lost in extraction; the name
    and single argument are reconstructed from the call site in the Bayes
    code below (``print_index_error(ke)``) -- confirm.

    @param ke: exception (an L{IC.IndexKeyError} at the call site) whose
        C{args} are (missing_key, indexer); the indexer exposes a C{map}
        dict of known keys and a C{_fmt} key formatter.
    """
    # Parenthesized single-argument print calls behave identically under
    # Python 2 (parenthesized expression) and Python 3 (function call),
    # unlike the original Python-2-only print statements.
    print('Cannot find key= %s in index.' % str(ke.args[0]))
    idxr = ke.args[1]
    kl = sorted(idxr.map.keys())
    print('The index has these keys:')
    for k in kl:
        # %s reproduces the original "print 'Key=', value" output exactly.
        print('Key= %s' % idxr._fmt(k))
286
287
288
289
def __init__(self, logps, indexers, fname):
    """Bundle the log data that was read from a single file.

    @param logps: array of log-probabilities, one entry per sample.
    @type logps: numpy.ndarray
    @param indexers: indexer objects, parallel to C{logps}.
    @type indexers: list
    @param fname: name of the file the data came from.
    @type fname: str
    """
    # The two sequences must be parallel: one indexer per log-probability.
    assert len(indexers) == len(logps)
    assert isinstance(logps, numpy.ndarray)
    assert isinstance(indexers, list)
    self.fname = fname
    self.logps = logps
    self.indexers = indexers
    # Convergence status is not known at construction time.
    self.convergence = None
299
300
301
302
303
def drop_files(per_fn, FileDropFac, Trim=None, Stretch=None):
    """Discard the fraction of files whose best log-probability is worst.

    Files are ranked by the maximum logp over their loglines, the ranking is
    printed, and the lowest C{floor(FileDropFac*len(per_fn))} files are
    dropped.

    @param per_fn: dict mapping filename -> list of loglines, as produced by
        the read_*() loaders in this module.
    @param FileDropFac: fraction (0..1) of the files to drop.
    @param Trim: unused; kept for interface compatibility.
    @param Stretch: unused; kept for interface compatibility.
    @return: dict containing the surviving subset of C{per_fn}.
    """
    # Rank files by the best (largest) logp each one ever reached.
    mxvs = sorted((max(q.logp() for q in lls), k)
                  for (k, lls) in per_fn.items())
    for (lp, kk) in mxvs:
        # Parenthesized single-argument print: valid Python 2 and Python 3,
        # unlike the original Python-2-only print statement.
        print('# max(%s) = %g' % (kk, lp))
    ndrop = int(math.floor(FileDropFac * len(per_fn)))
    # Keep everything past the ndrop worst entries.
    rv = {}
    for (mxlogp, k) in mxvs[ndrop:]:
        rv[k] = per_fn[k]
    return rv
319
320
# NOTE(review): the `def` line was lost in extraction; the body reads two
# string arguments a and b -- presumably something like key_cmp_str(a, b);
# confirm.  Splits the comma-joined string form of two keys and delegates to
# the tuple comparator key_cmp() below.
322 """Compares the ASCII form of keys, for sorting purposes. Does a good
323 attempt at ASCII ordering for strings and numeric ordering for numbers.
324 """
325 asp = a.split(',')
326 bsp = b.split(',')
327 return key_cmp(asp, bsp)
328
329
def key_cmp(a, b):
    """Compare the tuple form of keys, for sorting purposes.

    Components that parse as numbers are ordered numerically; everything
    else is ordered as strings (a good attempt at ASCII ordering for strings
    and numeric ordering for numbers).

    NOTE(review): the original ``def`` line was lost in extraction; the name
    and two arguments are reconstructed from the call above
    (``key_cmp(asp, bsp)``) -- confirm.
    NOTE(review): as in the original, keys of unequal length with a common
    prefix compare equal (zip stops at the shorter one) -- confirm that is
    intended before changing it.

    @param a: sequence of key components.
    @param b: sequence of key components.
    @return: -1, 0 or 1, like the old C{cmp()} builtin.
    """
    for (aa, bb) in zip(a, b):
        # Try a numeric comparison first; fall back to string comparison.
        # (cmp() was removed in Python 3, so spell the comparison out.)
        for convert in (float, str):
            try:
                ca = convert(aa)
                cb = convert(bb)
            except ValueError:
                continue
            if ca < cb:
                return -1
            if ca > cb:
                return 1
            # Equal under this conversion: move to the next component.
            break
    return 0
344
345
# NOTE(review): the `def` line was lost in extraction.  From the call below
# (``P_bayes_list(per_fn, argv, m, arg, selector, hdr)``), the signature was
# P_bayes_list(per_fn, argv, m, arg, selector, hdr) -- confirm; `arg` is not
# referenced in the visible body.
347 """C{Log(PosteriorMarginalLikelihood)} is the posterior marginal likelihood of this model:
348 C{P_posterior[Data|model] = integral( P[D|params,model] * P[params|data,model] * d_params)}.
349 It's the average of P[D|params,model] over the posterior distribution
350 of P[params|data,model]*Prior(params).
351
352 References are:
353
354 - Rampal S. Etienne, Han Olff (2005) Confronting different models of community structure to species-abundance data: a Bayesian model comparison Ecology Letters 8 (5) , 493-504 doi:10.1111/j.1461-0248.2005.00745.x
355
356 and that references
357
358 - M. Aitkin 1991: Posterior Bayes Factors, J. of the Royal Statistical Soc. B 53: 111-142
359
360 - P. W. Laud and J. G. Ibrahim 1995: Predictive Model Selection, J. of the Royal Statistical Soc. B 57: 247-262.
361
362 - F. De Santis and F. Spezzaferri 1997: Alternative Bayes Factors for Model Selection, Canadian J. of Statistics 25: 503-515
363
364 - S. K. Upadhyay and M. Peswani 2003: Choice between Weibull and Lognormal Models: a simulation based Bayesian Study Communications in Statistics: Theory and Methods 32: 318-405
365
366 - P. K. Vlachos and A. E. Gelfand 2003: On the calibration of Bayeseian model choice criteria, J. of Statistical Planning and Inference 111: 223-234
367
368 - R. E. Kass and A. E. Rafferty 1995: Bayes Factors, J. of the American Statistical Assoc. 90: 773-795
369
370 The other thing, Log(BayesWeightedBayes)
371 is the average of P(D|params,M)*Prior(params) over the posterior distribution
372 of P[params|data,model]*Prior(params).
373 It has no real statistical backing, but it's a crude approximation
374 for the Bayes evidence itself,
375 the normalized P[params|data,model]*Prior(params).
376 """
# Build the problem definition, then score every selected sample under it.
377 pd = m.pd_factory(argv, hdr=hdr)
378 lplist = []
379 lblist = []
380 idxrs = []
381 for ll in selector(per_fn):
382 idxr = ll.idxr()
383 idxrs.append(idxr)
384 try:
385 lpdata = float(pd.logp_data_normalized(idxr))
386 lplist.append(lpdata)
# The prior term is optional: lblist stays empty (and BayesWeightedBayes is
# skipped downstream) when the problem definition cannot normalize its prior.
387 if hasattr(pd, 'logp_prior_normalized'):
388 lpprior = float(pd.logp_prior_normalized(idxr))
389 lblist.append(lpdata + lpprior)
# A bad key is reported in detail, then re-raised: this is fatal.
390 except IC.IndexKeyError, ke:
391 print_index_error(ke)
392 raise
393 return (lplist, lblist, idxrs)
394
395
def P_bayes(per_fn, argv, m, arg, selector, hdr):
    """Summarize Bayesian evidence estimates over the selected samples.

    Computes Log(PosteriorMarginalLikelihood) from the per-sample data
    log-likelihoods and -- when the problem definition supplies
    C{logp_prior_normalized()} -- Log(BayesWeightedBayes) from the
    likelihood+prior values.  See P_bayes_list() for references.

    @return: list of ((name,), value) rows matching the other summary
        reporters in this module.
    """
    lplist, lblist, idxrs = P_bayes_list(per_fn, argv, m, arg, selector, hdr)
    rv = []
    # log-mean-exp with the maximum factored out for numerical stability;
    # exponents are clipped at -100 to avoid underflow.
    mxlp = max(lplist)
    psum = 0.0
    for lp in lplist:
        psum += math.exp(max(-100.0, lp - mxlp))
    rv.append((('Log(PosteriorMarginalLikelihood)',),
               mxlp + math.log(psum / len(lplist)))
              )
    if lblist:
        # BUG FIX: the reference maximum for this sum must come from lblist,
        # not lplist.  The original used max(lplist), which mis-centers the
        # log-mean-exp: exponents could overflow or be wrongly clipped at
        # -100 whenever the two lists' maxima differ.
        mxlb = max(lblist)
        bsum = 0.0
        for lb in lblist:
            bsum += math.exp(max(-100.0, lb - mxlb))
        rv.append((('Log(BayesWeightedBayes)',),
                   mxlb + math.log(bsum / len(lblist)))
                  )
    else:
        die.info("You may want to have problem_definition.logp_prior_normalized() to compute the BayesWeightedBayes evidence.")
    return rv
417