Package gmisclib :: Module mcmc_logtools

Source Code for Module gmisclib.mcmc_logtools

import math
import numpy

import die
import fiatio
import gpkmisc
import mcmc_logger
import mcmc_newlogger as LG
import mcmc_indexclass as IC


FILE_DROP_FAC = 0.2
TRIGGER = 'run_to_bottom finished'

def read_many_files(fnames, uid, Nsamp, tail, trigger):
    """OBSOLETE
    @return: a dictionary mapping filenames onto "data" and header information.
        Header information is from the last file read.
        The "data" are L{onelog} class instances that encapsulate
        log information from one log file.
    @rtype: tuple(dict(str:onelog), dict(str:str))
    """
    hdr = {}
    per_fn = {}
    if uid is None:
        for fname in fnames:
            hdr, indexers, logps = mcmc_logger.read_multisample(fname, Nsamp=Nsamp, tail=tail, trigger=trigger)
            per_fn[fname] = onelog(logps, indexers, fname)
    else:
        for fname in fnames:
            try:
                hdr, indexers, logps = LG.read_multi_uid(fname, uid,
                                                         Nsamp=Nsamp, tail=tail,
                                                         trigger=trigger
                                                         )
            except LG.BadFormatError, x:
                try:
                    hdr, statedict, logps = LG.read_human_fmt(fname)
                except LG.BadFormatError, y:
                    raise LG.BadFormatError, "Unreadable in either format: {%s} or {%s}" % (x, y)
                try:
                    indexers = [statedict[uid]]
                    logps = [logps[uid]]
                except KeyError:
                    raise LG.BadFormatError, "human format: uid=%s not found" % uid
                else:
                    per_fn[fname] = onelog(logps, indexers, fname)
            except LG.NoDataError, x:
                die.warn("No data in %s: %s" % (fname, x))
            else:
                per_fn[fname] = onelog(logps, indexers, fname)
    return (per_fn, hdr)


def read_uid_many_files(fnames, uid, Nsamp, tail, trigger):
    """
    @return: a dictionary mapping filenames onto "data" and header information.
        Header information is from the last file read.
        The "data" are lists of L{mcmc_newlogger.logline} instances that
        encapsulate log information from one log file.
    @rtype: tuple(dict(str:list(logline)), dict(str:str))
    """
    hdr = {}
    per_fn = {}
    for fname in fnames:
        try:
            hdr, loglines = LG.read_tail_uid(fname, uid,
                                             Nsamp=Nsamp, tail=tail,
                                             trigger=trigger
                                             )
            per_fn[fname] = loglines
        except LG.NoDataError, ex:
            die.warn("No data from %s: %s" % (fname, str(ex)))
    return (per_fn, hdr)
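A minimal usage sketch follows. The file names, uid, and the Nsamp/tail values are hypothetical placeholders; only the module-level TRIGGER constant comes from this module:

    import mcmc_logtools as LT

    # Gather the tail of each optimization log for one uid; per_fn maps
    # each filename to its list of logline records, hdr holds the header
    # of the last file read.
    per_fn, hdr = LT.read_uid_many_files(['opt1.log', 'opt2.log'], 'job0',
                                         Nsamp=100, tail=0.5,
                                         trigger=LT.TRIGGER)
    for (fname, loglines) in per_fn.items():
        print '%s: %d samples' % (fname, len(loglines))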


def get_pmap(per_fn):
    """
    @type per_fn: dict(str: L{mcmc_newlogger.logline})
    """
    for ol in per_fn.values():
        for ll in ol:
            return ll.idxr().map
    raise ValueError, "No data - cannot get_pmap."


def list_prm_samples(per_fn, sample_selector, out):
    """Write the temperature and parameters of each selected sample to C{out}.
    @type per_fn: dict(str: L{mcmc_newlogger.logline})
    """
    w = fiatio.writer(out)
    for ll in sample_selector(per_fn):
        tmp = {'T': ll.T()}
        for (k, v) in ll.idxr().get_all().items():
            tmp[IC.index_base._fmt(k)] = v
        w.datum(tmp)


def indexer_covar(per_fn, sample_selector, weight_by_T=False):
    """Weighted mean and covariance of the parameter vectors of the selected samples.
    @type per_fn: dict(str: L{mcmc_newlogger.logline})
    """
    idxr_map = get_pmap(per_fn)
    ndim = len(idxr_map)
    psum = numpy.zeros((ndim,))
    sw = 0
    for ll in sample_selector(per_fn):
        idxr = ll.idxr()
        if weight_by_T:
            wt = 1.0/math.sqrt(ll.T())
            numpy.add(psum, idxr.get_prms()*wt, psum)
        else:
            wt = 1.0
            numpy.add(psum, idxr.get_prms(), psum)
        sw += wt
    assert sw > 0
    m = psum/sw

    n = 0
    mxwt = 0.0
    csum = numpy.zeros((ndim, ndim))
    for ll in sample_selector(per_fn):
        idxr = ll.idxr()
        delta = idxr.get_prms() - m
        assert len(delta.shape) == 1
        if weight_by_T:
            wt = 1.0/math.sqrt(ll.T())
            numpy.multiply(delta, wt, delta)
            if wt > mxwt:
                mxwt = wt
        else:
            mxwt = 1.0
        delta2 = numpy.outer(delta, delta)
        assert len(delta2.shape) == 2
        numpy.add(csum, delta2, csum)
        n += 1
    if sw > mxwt:
        var = (1.0 - mxwt/sw)*csum/n
    else:
        var = None
    return (m, var, sw, idxr_map)
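The C{weight_by_T} branch down-weights each sample by 1/sqrt(T), and the returned covariance carries a (1 - mxwt/sw) shrinkage factor. A self-contained sketch of the same arithmetic on plain arrays (the numbers are made up; no loglines involved):

    import numpy

    prms = numpy.array([[1.0, 2.0], [1.2, 1.8], [3.0, 0.5]])  # one parameter vector per sample
    T = numpy.array([1.0, 1.0, 10.0])                          # sampling temperatures
    wt = 1.0/numpy.sqrt(T)                                     # weights, as with weight_by_T=True

    sw = wt.sum()
    m = (prms*wt[:, numpy.newaxis]).sum(axis=0)/sw             # weighted mean
    delta = (prms - m)*wt[:, numpy.newaxis]                    # weighted residuals
    csum = numpy.dot(numpy.transpose(delta), delta)            # sum of outer products
    var = (1.0 - wt.max()/sw)*csum/len(prms)                   # same correction as indexer_covar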


def logp_stdev(per_fn, sample_selector):
    """Mean and standard deviation of C{logp} (and of C{T}, when available) over the selected samples.
    @type per_fn: dict(str: L{mcmc_newlogger.logline})
    """
    lp = []
    T = []
    for ll in sample_selector(per_fn):
        lp.append(ll.logp())
        try:
            T.append(ll.T())
        except ValueError:
            pass
    assert len(lp) > 0, "No data in average in logp_stdev"
    rv = [(('logp',),) + gpkmisc.mean_stdev(lp)]
    if len(T) > 1:
        rv.append((('T',),) + gpkmisc.mean_stdev(T))
    return rv


def all(per_fn, source=None):
    """This selects all measurements."""
    for (fn, ol) in per_fn.items():
        if source is not None:
            source.add(fn, len(ol))
        for ll in ol:
            yield ll


def some_after_convergence(per_fn, source=None):
    """This selects which measurements will be used.
    It keeps only samples whose C{logp} comes within C{6*sqrt(ndim)}
    of the best C{logp} seen in any file, thereby discarding
    optimizations that haven't converged.
    """
    maxes = []
    for ol in per_fn.values():
        maxes.append(max(q.logp() for q in ol))
    ndim = len(get_pmap(per_fn))
    assert ndim > 0
    tol = 6 * math.sqrt(ndim)
    threshold = max(maxes) - tol
    for (fn, ol) in per_fn.items():
        for ll in ol:
            if ll.logp() > threshold:
                if source is not None:
                    source.add(fn, 1)
                yield ll


def near_each_max(per_fn, source=None):
    """This selects which measurements will be used.
    It gives you the best few results from each run: samples whose
    C{logp} comes within C{6*sqrt(ndim)} of that run's own maximum.
    @type per_fn: dict(str: L{mcmc_newlogger.logline})
    """
    ndim = len(get_pmap(per_fn))
    assert ndim > 0
    tol = 6 * math.sqrt(ndim)
    for (fn, ol) in per_fn.items():
        themax = max(q.logp() for q in ol)
        threshold = themax - tol
        for ll in ol:
            if ll.logp() > threshold:
                if source is not None:
                    source.add(fn, 1)
                yield ll


def last(per_fn, source=None):
    """This selects which measurements will be used:
    the last sample from each file.
    @type per_fn: dict(str: L{mcmc_newlogger.logline})
    """
    for (fn, ol) in per_fn.items():
        if source is not None:
            source.add(fn, 1)
        yield ol[-1]


def each_best(per_fn, source=None):
    """This selects which measurements will be used:
    the single highest-C{logp} sample from each file.
    """
    HUGE = 1e30
    for (fn, ol) in per_fn.items():
        bestlogp = -HUGE
        best = None
        for ll in ol:
            if ll.logp() > bestlogp:
                best = ll
                bestlogp = ll.logp()
        if best is not None:
            if source is not None:
                source.add(fn, 1)
            yield best


def overall_best(per_fn, source=None):
    """This selects which measurements will be used:
    the single highest-C{logp} sample across all files.
    """
    HUGE = 1e30
    bestlogp = -HUGE
    best = None
    bestfn = None
    for (fn, ol) in per_fn.items():
        for ll in ol:
            if ll.logp() > bestlogp:
                best = ll
                bestlogp = ll.logp()
                bestfn = fn
    if best is not None:
        if source is not None:
            source.add(bestfn, 1)
        yield best


def indexer_stdev(per_fn, selector, weight_by_T=False):
    """Return a summary of the properties of the selected indexers.
    @param selector: A generator that selects some of the C{logline}s in C{per_fn}
        that meet certain selection criteria.
    @type selector: function(dict(filename: list(L{mcmc_newlogger.logline})))
        that yields L{mcmc_newlogger.logline}
    """
    mean, covar, n, idxr_map = indexer_covar(per_fn, selector, weight_by_T=weight_by_T)
    o = []
    for (nm, i) in idxr_map.items():
        if covar is None:
            std = None
        else:
            std = math.sqrt(covar[i, i])
        o.append((nm, mean[i], std))
    return o
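Any of the selector generators above (C{all}, C{some_after_convergence}, C{near_each_max}, C{last}, C{each_best}, C{overall_best}) can be passed as C{selector}. A hypothetical composition, assuming C{per_fn} was filled by C{read_uid_many_files}:

    # Mean/stdev of each parameter over the well-converged samples:
    for (name, mean, std) in indexer_stdev(per_fn, some_after_convergence):
        print name, mean, std

    # Mean/stdev of logp (and T, when available) over the same selection:
    for row in logp_stdev(per_fn, some_after_convergence):
        print row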


class onelog:
    def __init__(self, logps, indexers, fname):
        assert len(indexers) == len(logps)
        assert isinstance(logps, numpy.ndarray)
        assert isinstance(indexers, list)
        self.logps = logps
        self.indexers = indexers
        self.fname = fname
        self.convergence = None


def drop_files(per_fn, FileDropFac, Trim=None, Stretch=None):
    """Drop the fraction C{FileDropFac} of files whose best C{logp} is worst.
    @param per_fn: produced by L{read_uid_many_files}()
    """
    mxvs = []
    for (k, lls) in per_fn.items():
        mxvs.append((max([q.logp() for q in lls]), k))
    mxvs.sort()
    for (lp, kk) in mxvs:
        print '# max(%s) = %g' % (kk, lp)
    ndrop = int(math.floor(FileDropFac * len(per_fn)))
    rv = {}
    for (mxlogp, k) in mxvs[ndrop:]:
        rv[k] = per_fn[k]
    return rv
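A hypothetical call, discarding the worst-scoring fifth of the runs before any further averaging (FILE_DROP_FAC = 0.2 is the module default; C{per_fn} as returned by C{read_uid_many_files}):

    # Prints one '# max(filename) = logp' line per file, then returns
    # the surviving 80% of the entries.
    per_fn = drop_files(per_fn, FILE_DROP_FAC)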


def ascii_cmp(a, b):
    """Compares the ASCII form of keys, for sorting purposes. Makes a good
    attempt at ASCII ordering for strings and numeric ordering for numbers.
    """
    asp = a.split(',')
    bsp = b.split(',')
    return key_cmp(asp, bsp)


def key_cmp(a, b):
    """Compares the tuple form of keys, for sorting purposes. Makes a good
    attempt at ASCII ordering for strings and numeric ordering for numbers.
    """
    for (aa, bb) in zip(a, b):
        for fcn in [float, str]:
            try:
                c = cmp(fcn(aa), fcn(bb))
                if c != 0:
                    return c
                break  # c == 0
            except ValueError:
                pass
    return 0
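A quick illustration of the resulting order (Python 2, since C{cmp} and comparator-based C{sort} were removed in Python 3; the keys are made up):

    keys = ['a,10', 'a,9', 'b,1']
    keys.sort(ascii_cmp)
    print keys    # ['a,9', 'a,10', 'b,1']: numeric parts compare numerically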


def P_bayes_list(per_fn, argv, m, arg, selector, hdr):
    """C{Log(PosteriorMarginalLikelihood)} is the posterior marginal likelihood of this model:
    C{P_posterior[Data|model] = integral( P[D|params,model] * P[params|data,model] * d_params )}.
    It's the average of P[D|params,model] over the posterior distribution
    of P[params|data,model]*Prior(params).

    References are:

      - Rampal S. Etienne and Han Olff 2005: Confronting different models of
        community structure to species-abundance data: a Bayesian model comparison,
        Ecology Letters 8(5): 493-504, doi:10.1111/j.1461-0248.2005.00745.x

    and that references

      - M. Aitkin 1991: Posterior Bayes Factors, J. of the Royal Statistical Soc. B 53: 111-142

      - P. W. Laud and J. G. Ibrahim 1995: Predictive Model Selection, J. of the Royal Statistical Soc. B 57: 247-262

      - F. De Santis and F. Spezzaferri 1997: Alternative Bayes Factors for Model Selection, Canadian J. of Statistics 25: 503-515

      - S. K. Upadhyay and M. Peswani 2003: Choice between Weibull and Lognormal Models: a Simulation Based Bayesian Study, Communications in Statistics: Theory and Methods 32: 381-405

      - P. K. Vlachos and A. E. Gelfand 2003: On the calibration of Bayesian model choice criteria, J. of Statistical Planning and Inference 111: 223-234

      - R. E. Kass and A. E. Raftery 1995: Bayes Factors, J. of the American Statistical Assoc. 90: 773-795

    The other statistic, C{Log(BayesWeightedBayes)},
    is the average of P(D|params,M)*Prior(params) over the posterior distribution
    of P[params|data,model]*Prior(params).
    It has no real statistical backing, but it's a crude approximation
    to the Bayes evidence itself,
    the normalized P[params|data,model]*Prior(params).
    """
    pd = m.pd_factory(argv, hdr=hdr)
    lplist = []
    lblist = []
    idxrs = []
    for ll in selector(per_fn):
        idxr = ll.idxr()
        idxrs.append(idxr)
        try:
            lpdata = float(pd.logp_data_normalized(idxr))
            lplist.append(lpdata)
            if hasattr(pd, 'logp_prior_normalized'):
                lpprior = float(pd.logp_prior_normalized(idxr))
                lblist.append(lpdata + lpprior)
        except IC.IndexKeyError, ke:
            print_index_error(ke)
            raise
    return (lplist, lblist, idxrs)


def P_bayes(per_fn, argv, m, arg, selector, hdr):
    lplist, lblist, idxrs = P_bayes_list(per_fn, argv, m, arg, selector, hdr)
    rv = []
    mxlp = max(lplist)
    psum = 0.0
    for lp in lplist:
        psum += math.exp(max(-100.0, lp - mxlp))
    rv.append((('Log(PosteriorMarginalLikelihood)',),
               mxlp + math.log(psum/len(lplist)))
              )
    if lblist:
        mxlb = max(lblist)
        bsum = 0.0
        for lb in lblist:
            bsum += math.exp(max(-100.0, lb - mxlb))
        rv.append((('Log(BayesWeightedBayes)',),
                   mxlb + math.log(bsum/len(lblist)))
                  )
    else:
        die.info("You may want to define problem_definition.logp_prior_normalized() to compute the BayesWeightedBayes evidence.")
    return rv
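Both averages in C{P_bayes} use the standard log-sum-exp shift: subtracting the maximum before exponentiating keeps the terms within floating-point range (the clamp at -100 simply drops negligibly small terms). A minimal illustration with made-up log-likelihoods:

    import math

    lps = [-1234.0, -1236.5, -1240.0]    # far below the range where exp() is nonzero
    mx = max(lps)
    # Log of the mean likelihood, computed without underflowing to log(0):
    avg = mx + math.log(sum(math.exp(max(-100.0, lp - mx)) for lp in lps)/len(lps))
    print avg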