1
2 """Suppose there is a random variable with true distribution p.
3 Then (as we will see) we could represent that random variable with a code that has average length H(p).
4 However, due to incomplete information we do not know p;
5 instead we assume that the distribution of the random variable is q.
6 Then (as we will see) the code would need more bits to represent the random variable.
The difference in the number of bits is denoted as D(p||q).
The quantity D(p||q) comes up often enough that it has a name: it is known as the relative entropy::
9
10 The relative entropy or Kullback-Leibler distance between two
11 probability mass functions p(x) and q(x) is defined as
12 D(p||q) = Sum{x in X} p(x) log(p(x)/q(x))
13
14 Note that this is not symmetric, and the q (the second argument) appears only in the denominator.
15 """
16
17 import Num
18 import mcmc
19 import mcmc_helper
20 import gpkavg
21 import gpkmisc
22 import math
23 import random
24
29
30
43
44
def multinomial_fixer(x, c):
	# NOTE(review): the original 'def' line was lost from this chunk.  The
	# name is taken from the fixer=multinomial_fixer arguments passed to
	# mcmc.bootstepper() below; the (x, c) signature follows the mcmc fixer
	# convention -- confirm against the full file.
	"""Force every component of the parameter vector x to be positive.

	Components that are <= 0 are reflected about zero and nudged up by
	EPS; the loop repeats until all components are strictly positive
	(a reflection could in principle land exactly on zero).  The c
	argument is the pass-through data from the mcmc stepper and is
	not used here.  Returns x, which is modified in place.
	"""
	EPS = 1e-8
	while Num.sometrue(x <= 0.0):
		for i in Num.nonzero(x <= 0.0):
			x[i] = -x[i] + EPS
	return x
51
52
def kl_nonzero_probs(p, q):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name and (p, q) arguments are taken from the calls in kldist_vec()
	# and kl_nonzero_tr_probs() below -- confirm against the full file.
	"""Kullback-Leibler distance between two normalized,
	nonzero probability distributions.

	Sums the per-component contributions produced by
	kl_nonzero_prob_m() into a single scalar.
	"""
	rv = Num.sum(kl_nonzero_prob_m(p, q), axis=0)
	return rv
60
61
65
66
def kldist_vec(p, q, N=None, Fp=1.0, Fq=1.0, Clip=0.01):
	# BUG FIX: the original 'def' line began with a stray '-' (a leftover
	# diff marker), which made the module unparseable.
	"""Relative entropy or Kullback-Leibler distance between
	two frequency distributions p and q.

	Here, we assume that both p and q are counts derived from
	multinomial distributed data; they are not normalized to one.
	Two independent MCMC samplers explore the distributions consistent
	with the observed counts; each sample pair contributes one KL
	estimate, and the Clip-trimmed average is returned as (avg, sigma).
	"""
	p = Num.asarray(p, Num.Int)
	q = Num.asarray(q, Num.Int)
	assert p.shape == q.shape
	if N is None:
		# Default sample count scales with the number of cells.
		N = p.shape[0]**2 * 30
	assert Num.sum(p) > 0
	assert Num.sum(q) > 0
	# Add 0.5 to each count so the starting points are strictly positive.
	qstart = (0.5 + q) / Num.sum(0.5 + q, axis=0)
	pstart = (0.5 + p) / Num.sum(0.5 + p, axis=0)
	pV = 0.1 * Num.identity(p.shape[0]) / float(p.shape[0])**1.5
	qV = 0.1 * Num.identity(q.shape[0]) / float(q.shape[0])**1.5
	xq = mcmc.bootstepper(multinomial_logp, qstart, qV,
				c=(q, Fq), fixer=multinomial_fixer)
	xp = mcmc.bootstepper(multinomial_logp, pstart, pV,
				c=(p, Fp), fixer=multinomial_fixer)
	mcmchp = mcmc_helper.stepper(xp)
	mcmchq = mcmc_helper.stepper(xq)
	# Burn in, then decorrelate before taking samples.
	mcmchp.run_to_bottom()
	mcmchp.run_to_ergodic(5.0)
	mcmchq.run_to_bottom()
	mcmchq.run_to_ergodic(5.0)
	o = []
	for i in range(N):
		mcmchp.run_to_ergodic(1.0)
		mcmchq.run_to_ergodic(1.0)
		o.append(kl_nonzero_probs(P(xp.prms()), P(xq.prms())))
	avg, sigma = gpkavg.avg(o, None, Clip)
	return (avg, sigma)
103
104
def tr_from_obs(x):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name is taken from the tr_from_obs() call in estimate_tr_probs()
	# below, whose argument matches this docstring -- confirm against the
	# full file.
	"""Given a matrix of P(i and j) as x[i,j],
	we compute P(j given i) and return it as y[i,j].
	The result is a transition probability matrix
	where the first index is the input state, and the second
	index marks the result state.
	"""
	assert x.shape[0] == x.shape[1]
	y = Num.zeros(x.shape, Num.Float)
	for i in range(x.shape[0]):
		# Normalize each row by its total so the row sums to one.
		sxi = Num.sum(x[i], axis=0)
		assert sxi > 0
		y[i] = x[i] / sxi
	return y
121
122
def estimate_tr_probs(counts, N, F=1.0):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name and (counts, N) arguments come from the estimate_tr_probs(p, rN)
	# calls below.  F is forwarded to the sampler as c=(q, F); its default
	# is assumed -- confirm against the full file.
	"""Draw N MCMC samples of the transition-probability matrix
	consistent with an n-by-n matrix of observed bigram counts.

	Returns a list of N transition matrices, each produced by
	tr_from_obs() from one sampled joint-probability estimate.
	"""
	n = counts.shape[0]
	assert counts.shape == (n, n)
	q = Num.ravel(counts)
	assert Num.alltrue(Num.equal(Num.reshape(q, (n, n)), counts))
	assert Num.sum(q) > 0
	# Add 0.5 to each count so the starting point is strictly positive.
	qstart = (0.5 + q) / Num.sum(0.5 + q, axis=0)
	qV = 0.1 * Num.identity(q.shape[0]) / float(q.shape[0])**1.5
	xq = mcmc.bootstepper(multinomial_logp, qstart, qV,
				c=(q, F), fixer=multinomial_fixer)
	mcmch = mcmc_helper.stepper(xq)
	# Burn in, then decorrelate before taking samples.
	mcmch.run_to_bottom()
	mcmch.run_to_ergodic(5.0)
	o = []
	for i in range(N):
		mcmch.run_to_ergodic(1.0)
		# Scale so the cells sum to one, then convert the joint
		# distribution into a conditional (transition) matrix.
		est_p_obs = Num.reshape(xq.prms(), (n, n)) / Num.sum(xq.prms(), axis=0)
		tmptr = tr_from_obs(est_p_obs)
		o.append(tmptr)
	return o
145
146
# NOTE(review): these exception classes were lost from this chunk; they are
# reconstructed from the raise statements in the steady-state solver below.
# Confirm their base classes against the full file.
class NoConvergenceError(Exception):
	"""The steady-state power iteration failed to converge."""
	pass


class NotMarkovError(Exception):
	"""The iteration's normalization shows the input was not a
	transition-probability (Markov) matrix."""
	pass
def solve_for_pi(p):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name is taken from the solve_for_pi(pp) calls below -- confirm
	# against the full file.
	"""Given a transition probability matrix p,
	where the first index is the initial state
	and the second index is the resultant state,
	compute the steady-state probability distribution,
	assuming a Markov process.

	Raises NoConvergenceError if the power iteration does not settle
	within MAXITERS steps, and NotMarkovError if the iterate's
	normalization drifts away from one.
	"""
	MAXITERS = 50
	n = p.shape[0]
	assert p.shape == (n, n)
	# Start from the uniform distribution and repeatedly apply p.
	pi = Num.ones((n,), Num.Float) / float(n)
	nxtnorm = None
	iter = 0
	npi = None
	while iter < MAXITERS:
		nxt = Num.matrixmultiply(pi, p)
		nxtnorm = Num.sum(nxt, axis=0)
		# Renormalize to guard against numeric drift.
		npi = nxt / nxtnorm
		if Num.square(pi - npi).sum() < 1e-8:
			break
		pi = npi
		iter += 1
	# Parenthesized raise form replaces the Python-2-only
	# "raise X, args" syntax; both spellings are equivalent in py2.
	if iter == MAXITERS:
		raise NoConvergenceError((nxtnorm, Num.square(pi - npi).sum()))
	if abs(nxtnorm - 1) > 1e-3:
		raise NotMarkovError(nxtnorm)
	return npi
185
186
def kl_nonzero_tr_probs(pp, qq):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name and (pp, qq) arguments come from the calls below -- confirm
	# against the full file.
	"""KL distance, given a matrix of nonzero transition probabilities.
	Each matrix indexes states as pp[from,to], and contains
	P(to given from) as a conditional probability,
	where for any from, the Sum over to( pp[from,to]) = 1.

	The per-state distances are weighted by the steady-state
	probability pi of being in that state; states with negligible
	pi are skipped.
	"""
	EPS = 1e-6
	pi = solve_for_pi(pp)
	o = 0.0
	for (i, pi_i) in enumerate(pi):
		if pi_i > EPS:
			# Each row must already be a normalized distribution.
			assert abs(Num.sum(pp[i, :]) - 1) < 1e-4
			assert abs(Num.sum(qq[i, :]) - 1) < 1e-4
			d = kl_nonzero_probs(pp[i, :], qq[i, :])
			o += d * pi_i
	return o
208
209
def kl_nonzero_tr_prob_m(pp, qq):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name and (pp, qq) arguments come from the call below -- confirm
	# against the full file.
	"""Matrix of per-component KL contributions for two transition
	probability matrices pp and qq (indexed [from, to]).

	Row i of the result holds the componentwise contributions for
	state i, weighted by the steady-state probability pi_i of being
	in that state; rows with negligible pi_i stay zero.
	"""
	assert len(pp.shape) == 2
	assert pp.shape == qq.shape
	EPS = 1e-6
	pi = solve_for_pi(pp)
	o = Num.zeros(pp.shape, Num.Float)
	for (i, pi_i) in enumerate(pi):
		if pi_i > EPS:
			# Renormalize each row before comparing.
			pnext = pp[i, :] / Num.sum(pp[i, :])
			qnext = qq[i, :] / Num.sum(qq[i, :])
			d = kl_nonzero_prob_m(pnext, qnext) * pi_i
			Num.add(o[i], d, o[i])
	return o
223
224
def cross(a, b):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name and (a, b) arguments come from the cross(pP, qP) calls below --
	# confirm against the full file.
	"""Cartesian product: the list of all (x, y) pairs with x drawn
	from a and y drawn from b, in a-major order."""
	return [(aa, ab) for aa in a for ab in b]
231
232
def kldist_Markov(p, q, N=None):
	# NOTE(review): the original 'def' line is missing from this chunk; the
	# name and two-matrix call pattern come from the __main__ block below --
	# confirm against the full file.
	"""Kullback-Leibler distance between two matrices
	of bigram counts.

	Draws about sqrt(2N) MCMC transition-matrix samples for each count
	matrix, evaluates the steady-state-weighted KL distance on N random
	sample pairs, and returns the clipped average as (avg, sigma).
	"""
	p = Num.asarray(p, Num.Int)
	q = Num.asarray(q, Num.Int)
	assert p.shape == q.shape
	if N is None:
		# Default sample count scales with the number of cells.
		N = p.shape[0]**2 * 30
	# rN samples per side gives about rN**2 = 2N pairs to sample from.
	rN = int(round(math.sqrt(2 * N)))
	pP = estimate_tr_probs(p, rN)
	qP = estimate_tr_probs(q, rN)
	o = []
	for (pp, qq) in random.sample(cross(pP, qP), N):
		o.append(kl_nonzero_tr_probs(pp, qq))
	avg, sigma = gpkavg.avg(o, None, 0.0001)
	return (avg, sigma)
255
256
def kldist_Markov_m(p, q, N=None):
	# NOTE(review): the original 'def' line is missing from this chunk and
	# nothing visible calls this function; the name is a guess based on the
	# *_m naming pattern used elsewhere in the file -- TODO confirm against
	# the full file.
	"""Kullback-Leibler distance between two matrices
	of bigram counts.

	Like kldist_Markov(), but also accumulates the per-cell
	contribution matrices; returns (contribution matrix, avg, sigma).
	"""
	p = Num.asarray(p, Num.Int)
	q = Num.asarray(q, Num.Int)
	assert p.shape == q.shape
	if N is None:
		# Default sample count scales with the number of cells.
		N = p.shape[0]**2 * 30
	# rN samples per side gives about rN**2 = 2N pairs to sample from.
	rN = int(round(math.sqrt(2 * N)))
	pP = estimate_tr_probs(p, rN)
	qP = estimate_tr_probs(q, rN)
	n = p.shape[0]
	assert q.shape[0] == n
	o = Num.zeros((n, n), Num.Float)
	oo = []
	for (pp, qq) in random.sample(cross(pP, qP), N):
		tmp = kl_nonzero_tr_prob_m(pp, qq)
		o += tmp
		oo.append(kl_nonzero_tr_probs(pp, qq))
	avg, sigma = gpkavg.avg(oo, None, 0.0001)
	# NOTE(review): the sum of N matrices is divided by n*n rather than N;
	# preserved as written -- confirm the intended normalization.
	return (o / (n * n), avg, sigma)
284
285
def kldist_Markov_all(p, N):
	# NOTE(review): the original 'def' line is missing from this chunk and
	# nothing visible calls this function; the name is a guess -- TODO
	# confirm against the full file.
	"""List of Kullback-Leibler distances between all combinations
	of pairs of matrices of bigram counts.
	It returns a list of matrices of all the distances.
	Each item on the list is a sample of the distance histogram.
	"""
	# rN samples per matrix gives about rN**2 = 2N index pairs.
	rN = int(round(math.sqrt(2 * N)))
	pP = []
	n = None
	for pp in p:
		# BUG FIX: the original converted and measured 'p' (the whole
		# list of matrices) instead of 'pp' (the current matrix), which
		# would fail because a list has no .shape attribute.
		ppa = Num.asarray(pp, Num.Int)
		if n is None:
			n = ppa.shape[0]
		assert ppa.shape[0] == n
		pP.append(estimate_tr_probs(ppa, rN))

	samples = random.sample(cross(range(rN), range(rN)), N)
	o = []
	for (x, y) in samples:
		tmp = Num.zeros((len(p), len(p)), Num.Float)
		for (i, ppl) in enumerate(pP):
			for (j, qql) in enumerate(pP):
				tmp[i, j] = kl_nonzero_probs(ppl[x], qql[y])
		o.append(tmp)
	return o
313
314
315
if __name__ == '__main__':
	# Smoke test: a chain that always moves to state 2 versus a chain
	# that always stays put.  Parenthesized print works in both
	# Python 2 (parens around a single expression) and Python 3.
	print(kldist_Markov(
		[[0, 0, 10], [0, 0, 10], [0, 0, 10]],
		[[20, 0, 0], [0, 20, 0], [0, 0, 20]]
	))
322