gmisclib.g_localfit

1 """Fit a linear transform to a bunch of tt 2 input/output vectors. 3 4 @note: If you compute Pearson's R^2 via 1-localfit()/err_before_fit(), you get the "unadjusted" R^2 value. 5 There is also an "adjusted" R^2 value, which is 6 Ra^2 = 1-(1-R^2)*(n-1)/(n-p-1) where n is the number of data and p is the number of adjustable 7 parameters of the linear regression. 8 @note: When you are computing Pearson's r^2 by way of localfit()/err_before_fit() the two function calls 9 I{MUST} have the same data and flags. Specifically, C{constant} must be equal in both calls. 10 """ 11 12 13 import numpy 14 import warnings 15 from gmisclib import gpk_lsq 16 from gmisclib import gpk_rlsq 17 18 _Float = numpy.dtype('float') 19

20 -def err_before_fit(data, minsv=None, minsvr=None, constant=True):

21 """How much variation did the data have before the fit? 22 This is used to compare with the error after the fit, to 23 allow a F-test or ANOVA. Strictly speaking, we compute 24 the mean-squared error about a constant. 25 @return: Returns the per-coordinate sum-squared-error 26 of the output coordinates. 27 @rtype: C{numpy.ndarray} 28 @param data: See L{pack}. 29 @type data: See L{pack}. 30 """ 31 32 tmp = [] 33 for (ioc) in data: 34 if len(ioc) == 2: 35 ic, oc = ioc 36 tmp.append( (numpy.zeros((0,), _Float), oc) ) 37 elif len(ioc) == 3: 38 ic, oc, w = ioc 39 w = float(w) 40 assert w >= 0.0 41 tmp.append( (numpy.zeros((0,), _Float), oc, w) ) 42 A, B, errs, sv, rank = localfit(tmp, minsv=minsv, minsvr=minsvr, constant=constant) 43 return errs

44 45 46

47 -def pack(data, constant=True):

48 """Prepare the data array and (optionally) weight the data. 49 @param data: [(input_coordinates, output_coordinates), ...] 50 or [(input_coordinates, output_coordinates, weight), ...]. 51 A list of (in,out) tuples corresponding to the 52 independent and dependent parameters of the linear transform. 53 Both C{in} and C{out} are one-dimensional 54 numpy vectors. They don't need to have 55 the same length, though (obviously) all instances of "in" 56 need to have the same length and all instances of "out" 57 also need to match one another. 58 If C{weight} is given, it must be a scalar. 59 @type data: [(L{numpy.ndarray}, L{numpy.ndarray}), ...] or 60 [(L{numpy.ndarray}, L{numpy.ndarray}, float), ...] 61 """ 62 nd = len(data) 63 if not ( nd > 0): 64 raise ValueError, "No data: cannot deduce dimensionality." 65 ic, oc = data[0][:2] # We ignore the weight, if any. 66 m = len(oc) #: The number of output coordinates (data). 67 if not (m > 0): 68 raise ValueError, "Output coordinates have zero dimension." 69 n = len(ic) #: The number of input coordinates (parameters). 70 use_c = bool(constant) #: Should we add a constant term? 71 if not (n+use_c > 0): 72 warnings.warn("g_localfit: This is a zero-parameter model.") 73 i = numpy.zeros((nd, n+use_c), _Float) 74 o = numpy.zeros((nd, m), _Float) 75 try: 76 if use_c and len(data[0])==2: 77 for (j, iow) in enumerate(data): 78 ic, oc = iow 79 o[j,:] = oc 80 i[j,0] = 1.0 81 i[j,1:] = ic 82 elif use_c and len(data[0])==3: 83 for (j, iow) in enumerate(data): 84 ic, oc, w = iow 85 i[j,0] = 1.0 86 i[j,1:] = ic 87 w = float(w) 88 numpy.multiply(oc, w, o[j,1:]) 89 elif len(data[0])==2: 90 for (j, iow) in enumerate(data): 91 ic, oc = iow 92 o[j,:] = oc 93 i[j,:] = ic 94 elif len(data[0])==3: 95 for (j, iow) in enumerate(data): 96 ic, oc, w = iow 97 i[j,:] = ic 98 w = float(w) 99 numpy.multiply(oc, w, o[j,:]) 100 except ValueError, x: 101 if len(iow) != len(data[0]): 102 raise ValueError, "Data must either be uniformly weighted or not: see data[%d]" % j 103 if len(oc) != o.shape[1]: 104 raise ValueError, "Output data length must match: got %d on data[%d], expecting %d" % (len(oc), j, o.shape[1]) 105 if len(ic) != i.shape[1]-use_c: 106 raise ValueError, "Input data length must match: got %d on data[%d], expecting %d" % (len(oc), j, i.shape[1]-use_c) 107 except TypeError, x: 108 if 'float' in str(x): 109 raise TypeError, "Weight must be convertible to float in data[%d]: %s" % (j, x) 110 return (i, o, m, n)

111 112 113 114 115 # This is used to compute the matrix of derivitives, 116 # which is then used to compute the step in mcmcSQ.py

117 -def localfit(data, minsv=None, minsvr=None, constant=True):

118 """Does a linear fit to data via a singular value decomposition 119 algorithm. 120 It returns the matrix A and vector B such that 121 C{A*input_coordinates+B} is the best fit to C{output_coordinates}. 122 @param minsv: sets the minimum useable singular value; 123 @type minsv: float or None 124 @param minsvr: sets the minimum useable s.v. in terms of the largest s.v. 125 @type minsvr: float or None 126 @param constant: Do you want a constant built into the linear equations? 127 @type constant: Boolean 128 @rtype: (A, B, errs, sv, rank) where 129 - A is a 2-D C{numpy.ndarray} matrix. 130 - B is a 1-D C{numpy.ndarray} vector (if constant, else C{None}). 131 - errs is a C{numpy.ndarray} vector, one value for each output coordinate. 132 The length is the same as the C{out} vector (see L{pack}). 133 - sv is a C{numpy.ndarray} vector. 134 The length is the same as the C{in} vector (see L{pack}). 135 @return: (A, B, errs, sv, rank) where 136 - A is the matrix of coefficients 137 - B is a vector of constants. (Or L{None}) One constant per component of the 138 C{out} vector in L{pack}. 139 - errs Each component gives the sum of squared residuals for the corresponding 140 component of the C{out} vector. 141 - sv are the singular values, sorted into decreasing order. 142 @param data: list of tuples. See L{pack}. 143 @type data: See L{pack}. 144 """ 145 146 ii, oo, m, n = pack(data, constant=constant) 147 del data # Reclaim memory 148 soln = gpk_lsq.linear_least_squares(ii, oo, minsv=minsv, minsvr=minsvr, copy=False) 149 assert soln.q == m 150 del ii # Free unneeded memory. 151 del oo # Free unneeded memory. 152 errs = numpy.sum( numpy.square(soln.residual()), axis=0 ) 153 sv = soln.sv() 154 x = soln.x(copy=False) 155 rank = soln.rank() 156 assert sv.shape[0]>=rank and sv.shape[0]<=n+1, "sv.shape=%s rank=%d m=%d n=%d" % (str(sv.shape), rank, m, n) 157 assert rank <= 1+n 158 assert len(errs) == m 159 assert x.ndim == 2 160 if constant: 161 return ( x[1:,:].transpose(), x[0,:], 162 errs, numpy.sort(sv)[::-1], rank 163 ) 164 return ( x.transpose(), None, 165 errs, numpy.sort(sv)[::-1], rank 166 )

167 168 169

170 -def reg_localfit(data, regstr=0.0, regtgt=None, rscale=None, constant=True):

171 """Does a linear fit to data via a singular value decomposition 172 algorithm. 173 It returns the matrix A and vector B such that 174 C{A*input_coordinates+B} is the best fit to C{output_coordinates}. 175 @param constant: Do you want a constant built into the linear equations? 176 @type constant: Boolean 177 @return: (A, B, errs, sv, rank) where 178 - A is a 2-D C{numpy.ndarray} matrix. 179 - B is a 1-D C{numpy.ndarray} vector (if constant, else C{None}). 180 - errs is a C{numpy.ndarray} vector, one value for each output coordinate. 181 It gives the sum of squared residuals. 182 - sv are the singular values, sorted into decreasing order. 183 @param data: list of tuples. See L{pack}. 184 @type data: See L{pack}. 185 """ 186 187 ii, oo, m, n = pack(data, constant=constant) 188 del data # Reclaim memory 189 soln = gpk_lsq.reg_linear_least_squares(ii, oo, regstr=regstr, regtgt=regtgt, rscale=rscale, copy=False) 190 del ii # Free unneeded memory. 191 del oo # Free unneeded memory. 192 errs = numpy.sum( numpy.square(soln.residual()), axis=0 ) 193 sv = soln.sv_unreg() 194 x = soln.x(copy=False) 195 rank = soln.eff_rank() 196 # print 'm=', m, 'errs=', errs.shape, errs 197 assert len(errs) == m 198 assert x.ndim == 2 199 if constant: 200 return ( x[1:,:].transpose(), x[0,:], 201 errs, numpy.sort(sv)[::-1], rank 202 ) 203 return ( x.transpose(), None, 204 errs, numpy.sort(sv)[::-1], rank 205 )

206 207 # This is used to compute the matrix of derivitives, 208 # which is then used to compute the step in mcmcSQ.py

209 -def robust_localfit(data, minsv=None, minsvr=None, constant=True):

210 """Data is [ (input_coordinates, output_coordinates), ... ] 211 Minsv sets the minimum useable singular value; 212 minsvr sets the minimum useable s.v. in terms of the largest s.v.. 213 It returns the matrix A and vector B such that the best fit 214 is A*input_coordinates+B in a tuple 215 (A, B, errs, rank). 216 errs is a vector, one value for each output coordinate. 217 Rank is the minimum rank of the various fits. 218 219 Warning! Not tested. 220 """ 221 222 ii, oo, m, n = pack(data, constant=constant) 223 del data # Reclaim memory 224 errs = [] 225 rank = None 226 const = numpy.zeros((m,), _Float) 227 coef = numpy.zeros((n,m), _Float) 228 for j in range(m): 229 assert oo.shape[1] == m 230 soln = gpk_rlsq.robust_linear_fit(ii, oo[:,j], 1, minsv=minsv, minsvr=minsvr, copy=False) 231 errs.append( numpy.sum( numpy.square(soln.residual()) ) ) 232 if rank is None or rank > soln.rank(): 233 rank = soln.rank() 234 const[j] = soln.x()[0] 235 coef[:,j] = soln.x()[1:] 236 del ii # Free unneeded memory. 237 del oo # Free unneeded memory. 238 assert rank <= 1+n 239 assert len(errs) == m 240 return ( coef.transpose(), const, errs, rank)

241 242 243

244 -def fit_giving_sigmas(data, minsv=None, minsvr=None, constant=True):

245 """Does a linear fit to data via a singular value decomposition 246 algorithm. 247 It returns the matrix A and vector B such that 248 C{A*input_coordinates+B} is the best fit to C{output_coordinates}. 249 @param minsv: sets the minimum useable singular value; 250 @param minsvr: sets the minimum useable s.v. in terms of the largest s.v. 251 @type minsv: float or None 252 @type minsvr: float or None 253 @param constant: Do you want a constant built into the linear equations? 254 @type constant: Boolean 255 @return: (A, B, sigmaA, sigmaB) where 256 - A is a 2-D C{numpy.ndarray} matrix. 257 - B is a 1-D C{numpy.ndarray} vector (if constant, else C{None}). 258 ) 259 @param data: [(input_coordinates, output_coordinates), ...]. 260 A list of (in,out) tuples corresponding to the 261 independent and dependent parameters of the linear transform. 262 Both C{in} and C{out} are one-dimensional 263 L{numpy vectors<numpy.ndarray>}. They don't need to have 264 the same length, though (obviously) all instances of "in" 265 need to have the same length and all instances of "out" 266 also need to match one another. 267 @type data: [(L{numpy.ndarray}, L{numpy.ndarray}), ...] 268 """ 269 270 ii, oo, m, n = pack(data, constant=constant) 271 del data # Reclaim memory 272 soln = gpk_lsq.linear_least_squares(ii, oo, minsv=minsv, minsvr=minsvr) 273 del ii # Free unneeded memory. 274 del oo # Free unneeded memory. 275 x = soln.x() 276 sigmas = numpy.sqrt(soln.x_variances()) 277 assert sigmas.shape == x.shape 278 assert x.ndim == 2 279 if constant: 280 return ( x[1:,:].transpose(), x[0,:], 281 sigmas[1:,:].transpose(), sigmas[0,:] 282 ) 283 return ( x.transpose(), None, 284 sigmas.transpose(), None 285 )

286 287

288 -def leaktest():

289 import RandomArray 290 while 1: 291 d = [] 292 for i in range(100): 293 d.append( (RandomArray.standard_normal((30,)), 294 RandomArray.standard_normal((1000,)))) 295 localfit(d)

296

297 -def test0():

298 d = [( numpy.zeros((0,)), numpy.array((1,)) ), 299 ( numpy.zeros((0,)), numpy.array((2,)) ), 300 ( numpy.zeros((0,)), numpy.array((3,)) ) 301 ] 302 coef, const, errs, sv, rank = localfit(d, constant=False) 303 assert const is None 304 assert coef.shape == (1,0) 305 assert len(errs) == 1 306 assert abs(errs[0]-14) < 1e-4 307 assert rank==0 308 assert abs(err_before_fit(d, constant=False)-errs[0]) < 1e-4

309 310

311 -def test_localfit11(r):

312 d = [ (numpy.array((0,)), numpy.array((-1,))), 313 (numpy.array((1,)), numpy.array((0,))), 314 (numpy.array((2,)), numpy.array((1+1e-7,))), 315 (numpy.array((3,)), numpy.array((2,))), 316 ] 317 if r: 318 # INCOMPATIBILITY HERE! 319 coef, const, errs, sv, rank = r_localfit(d) 320 else: 321 coef, const, errs, sv, rank = localfit(d) 322 assert const.shape == (1,) 323 assert abs(const[0]-(-1)) < 1e-6 324 assert coef.shape == (1,1) 325 assert abs(coef[0,0]-1) < 1e-6 326 assert len(errs)==1 327 assert errs[0] < 1e-6 328 assert len(sv) == 2 329 assert rank == 2 330 assert numpy.alltrue(err_before_fit(d) >= errs)

331 332

333 -def test_localfit11e():

334 d = [ (numpy.array((0.0,)), numpy.array((0.0,))), 335 (numpy.array((1.0,)), numpy.array((1.0,))), 336 (numpy.array((2.0,)), numpy.array((0.0,))), 337 (numpy.array((3.0,)), numpy.array((-2.0,))), 338 (numpy.array((4.0,)), numpy.array((0.0,))), 339 (numpy.array((5.0,)), numpy.array((1.0,))), 340 (numpy.array((6.0,)), numpy.array((0.0,))), 341 ] 342 coef, const, errs, sv, rank = localfit(d) 343 print 'coef=', coef 344 print 'const=', const 345 print 'errs=', errs 346 assert const.shape == (1,) 347 assert abs(const[0]-0) < 1e-6 348 assert coef.shape == (1,1) 349 assert abs(coef[0,0]-0) < 1e-6 350 assert len(errs)==1 351 assert abs(errs[0]-6) < 1e-6 352 assert len(sv) == 2 353 assert rank == 2 354 assert numpy.absolute(err_before_fit(d)-errs).sum() < 1e-6

355 356

357 -def test_localfit21():

358 d = [ (numpy.array((0,0)), numpy.array((-1,))), 359 (numpy.array((1,2)), numpy.array((0,))), 360 (numpy.array((2,-1)), numpy.array((1,))) ] 361 coef, const, errs, sv, rank = localfit(d) 362 assert const.shape == (1,) 363 assert abs(const[0]-(-1)) < 1e-6 364 assert coef.shape == (1,2) 365 assert numpy.absolute(coef-[[1,0]]).sum() < 1e-6 366 assert len(errs)==1 367 assert errs[0] < 1e-6 368 assert len(sv) == 3 369 assert rank == 3 370 assert numpy.alltrue(err_before_fit(d) >= errs)

371

372 -def test_localfit21u():

373 d = [ (numpy.array((0,0)), numpy.array((-1,))), 374 (numpy.array((2,-1)), numpy.array((1,))) ] 375 coef, const, errs, sv, rank = localfit(d) 376 assert const.shape == (1,) 377 assert abs(const[0]-(-1)) < 1e-6 378 assert abs(const[0] + 2*coef[0,0]-coef[0,1] - 1) < 1e-6 379 assert coef.shape == (1,2) 380 assert len(errs)==1 381 assert errs[0] < 1e-6 382 assert len(sv) == 2 383 assert rank == 2 384 assert numpy.alltrue(err_before_fit(d) >= errs)

385 386

387 -def test_localfit22(r):

388 d = [ (numpy.array((0,0)), numpy.array((-1,0))), 389 (numpy.array((1,2)), numpy.array((0,1.0))), 390 (numpy.array((-1,2)), numpy.array((-2,1.0))), 391 (numpy.array((-1,1)), numpy.array((-2+1e-7,0.5+1e-7))), 392 (numpy.array((2,-1)), numpy.array((1,-0.5))) ] 393 394 if r: 395 coef, const, errs, sv, rank = r_localfit(d) 396 else: 397 coef, const, errs, sv, rank = localfit(d) 398 399 print const 400 print coef 401 print errs 402 print sv 403 print rank 404 assert const.shape == (2,) 405 assert numpy.absolute(const - [-1,0]).sum() < 1e-6 406 assert coef.shape == (2,2) 407 assert numpy.absolute(coef[0]-[1,0]).sum() < 1e-6 408 assert numpy.absolute(coef[1]-[0,0.5]).sum() < 1e-6 409 assert len(errs)==2 410 assert errs.sum() < 1e-6 411 assert len(sv) == 3 412 assert rank == 3 413 assert numpy.alltrue(err_before_fit(d) >= errs)

414 415

416 -def test_fgs1():

417 N = 2000 418 DSIG = 10.0 419 import random 420 import math 421 d = [] 422 for i in range(N): 423 s = random.normalvariate(0.0, DSIG) 424 d.append((numpy.array([-1.0]), numpy.array([1-s]))) 425 d.append((numpy.array([1.0]), numpy.array([1+s]))) 426 coef, const, scoef, sconst = fit_giving_sigmas(d) 427 assert abs(coef[0][0]) < 3*DSIG/math.sqrt(N) 428 assert abs(coef[0][0]) > 0.001*DSIG/math.sqrt(N) 429 assert abs(const[0]-1.0) < DSIG/math.sqrt(N) 430 assert sconst[0] < DSIG/math.sqrt(N) 431 assert abs(coef[0][0]) < 4*DSIG/2.0/math.sqrt(N) 432 assert scoef[0][0] < 1.3*DSIG/math.sqrt(N) 433 assert scoef[0][0] > 0.7*DSIG/math.sqrt(N) 434 print 'FGS1 OK'

435 436

437 -def test_wt():

438 N = 1000 439 DSIG = 1.0 440 import random 441 d = [] 442 for i in range(N): 443 s = random.normalvariate(0.0, DSIG) 444 x = random.normalvariate(0.0, DSIG) 445 w = random.expovariate(1.0) 446 d.append((numpy.array([-x]), numpy.array([1-s]), w)) 447 d.append((numpy.array([x]), numpy.array([1+s]), w)) 448 d.append((numpy.array([x]), numpy.array([1-s]), w)) 449 d.append((numpy.array([-x]), numpy.array([1+s]), w)) 450 var1 = err_before_fit(d) 451 A, B, errs, sv, rank = localfit(d) 452 assert numpy.absolute((var1-errs)).sum() < 0.1

453 454 455 if __name__ == '__main__': 456 test0() 457 test_wt() 458 test_localfit11(False) 459 test_localfit11e() 460 test_localfit21() 461 test_localfit21u() 462 for r in [False]: # Do Not test r_localfit() 463 test_localfit11(r) 464 test_localfit22(r) 465 test_fgs1() 466

Source Code for Module gmisclib.g_localfit