1 """Fit a linear transform to a bunch of tt
2 input/output vectors.
3
4 @note: If you compute Pearson's R^2 via 1-localfit()/err_before_fit(), you get the "unadjusted" R^2 value.
5 There is also an "adjusted" R^2 value, which is
6 Ra^2 = 1-(1-R^2)*(n-1)/(n-p-1) where n is the number of data and p is the number of adjustable
7 parameters of the linear regression.
8 @note: When you are computing Pearson's r^2 by way of localfit()/err_before_fit() the two function calls
9 I{MUST} have the same data and flags. Specifically, C{constant} must be equal in both calls.
10 """
11
12
13 import numpy
14 import warnings
15 from gmisclib import gpk_lsq
16 from gmisclib import gpk_rlsq
17
18 _Float = numpy.dtype('float')
19
21 """How much variation did the data have before the fit?
22 This is used to compare with the error after the fit, to
23 allow a F-test or ANOVA. Strictly speaking, we compute
24 the mean-squared error about a constant.
25 @return: Returns the per-coordinate sum-squared-error
26 of the output coordinates.
27 @rtype: C{numpy.ndarray}
28 @param data: See L{pack}.
29 @type data: See L{pack}.
30 """
31
32 tmp = []
33 for (ioc) in data:
34 if len(ioc) == 2:
35 ic, oc = ioc
36 tmp.append( (numpy.zeros((0,), _Float), oc) )
37 elif len(ioc) == 3:
38 ic, oc, w = ioc
39 w = float(w)
40 assert w >= 0.0
41 tmp.append( (numpy.zeros((0,), _Float), oc, w) )
42 A, B, errs, sv, rank = localfit(tmp, minsv=minsv, minsvr=minsvr, constant=constant)
43 return errs
44
45
46
47 -def pack(data, constant=True):
48 """Prepare the data array and (optionally) weight the data.
49 @param data: [(input_coordinates, output_coordinates), ...]
50 or [(input_coordinates, output_coordinates, weight), ...].
51 A list of (in,out) tuples corresponding to the
52 independent and dependent parameters of the linear transform.
53 Both C{in} and C{out} are one-dimensional
54 numpy vectors. They don't need to have
55 the same length, though (obviously) all instances of "in"
56 need to have the same length and all instances of "out"
57 also need to match one another.
58 If C{weight} is given, it must be a scalar.
59 @type data: [(L{numpy.ndarray}, L{numpy.ndarray}), ...] or
60 [(L{numpy.ndarray}, L{numpy.ndarray}, float), ...]
61 """
62 nd = len(data)
63 if not ( nd > 0):
64 raise ValueError, "No data: cannot deduce dimensionality."
65 ic, oc = data[0][:2]
66 m = len(oc)
67 if not (m > 0):
68 raise ValueError, "Output coordinates have zero dimension."
69 n = len(ic)
70 use_c = bool(constant)
71 if not (n+use_c > 0):
72 warnings.warn("g_localfit: This is a zero-parameter model.")
73 i = numpy.zeros((nd, n+use_c), _Float)
74 o = numpy.zeros((nd, m), _Float)
75 try:
76 if use_c and len(data[0])==2:
77 for (j, iow) in enumerate(data):
78 ic, oc = iow
79 o[j,:] = oc
80 i[j,0] = 1.0
81 i[j,1:] = ic
82 elif use_c and len(data[0])==3:
83 for (j, iow) in enumerate(data):
84 ic, oc, w = iow
85 i[j,0] = 1.0
86 i[j,1:] = ic
87 w = float(w)
88 numpy.multiply(oc, w, o[j,1:])
89 elif len(data[0])==2:
90 for (j, iow) in enumerate(data):
91 ic, oc = iow
92 o[j,:] = oc
93 i[j,:] = ic
94 elif len(data[0])==3:
95 for (j, iow) in enumerate(data):
96 ic, oc, w = iow
97 i[j,:] = ic
98 w = float(w)
99 numpy.multiply(oc, w, o[j,:])
100 except ValueError, x:
101 if len(iow) != len(data[0]):
102 raise ValueError, "Data must either be uniformly weighted or not: see data[%d]" % j
103 if len(oc) != o.shape[1]:
104 raise ValueError, "Output data length must match: got %d on data[%d], expecting %d" % (len(oc), j, o.shape[1])
105 if len(ic) != i.shape[1]-use_c:
106 raise ValueError, "Input data length must match: got %d on data[%d], expecting %d" % (len(oc), j, i.shape[1]-use_c)
107 except TypeError, x:
108 if 'float' in str(x):
109 raise TypeError, "Weight must be convertible to float in data[%d]: %s" % (j, x)
110 return (i, o, m, n)
111
112
113
114
115
116
117 -def localfit(data, minsv=None, minsvr=None, constant=True):
118 """Does a linear fit to data via a singular value decomposition
119 algorithm.
120 It returns the matrix A and vector B such that
121 C{A*input_coordinates+B} is the best fit to C{output_coordinates}.
122 @param minsv: sets the minimum useable singular value;
123 @type minsv: float or None
124 @param minsvr: sets the minimum useable s.v. in terms of the largest s.v.
125 @type minsvr: float or None
126 @param constant: Do you want a constant built into the linear equations?
127 @type constant: Boolean
128 @rtype: (A, B, errs, sv, rank) where
129 - A is a 2-D C{numpy.ndarray} matrix.
130 - B is a 1-D C{numpy.ndarray} vector (if constant, else C{None}).
131 - errs is a C{numpy.ndarray} vector, one value for each output coordinate.
132 The length is the same as the C{out} vector (see L{pack}).
133 - sv is a C{numpy.ndarray} vector.
134 The length is the same as the C{in} vector (see L{pack}).
135 @return: (A, B, errs, sv, rank) where
136 - A is the matrix of coefficients
137 - B is a vector of constants. (Or L{None}) One constant per component of the
138 C{out} vector in L{pack}.
139 - errs Each component gives the sum of squared residuals for the corresponding
140 component of the C{out} vector.
141 - sv are the singular values, sorted into decreasing order.
142 @param data: list of tuples. See L{pack}.
143 @type data: See L{pack}.
144 """
145
146 ii, oo, m, n = pack(data, constant=constant)
147 del data
148 soln = gpk_lsq.linear_least_squares(ii, oo, minsv=minsv, minsvr=minsvr, copy=False)
149 assert soln.q == m
150 del ii
151 del oo
152 errs = numpy.sum( numpy.square(soln.residual()), axis=0 )
153 sv = soln.sv()
154 x = soln.x(copy=False)
155 rank = soln.rank()
156 assert sv.shape[0]>=rank and sv.shape[0]<=n+1, "sv.shape=%s rank=%d m=%d n=%d" % (str(sv.shape), rank, m, n)
157 assert rank <= 1+n
158 assert len(errs) == m
159 assert x.ndim == 2
160 if constant:
161 return ( x[1:,:].transpose(), x[0,:],
162 errs, numpy.sort(sv)[::-1], rank
163 )
164 return ( x.transpose(), None,
165 errs, numpy.sort(sv)[::-1], rank
166 )
167
168
169
170 -def reg_localfit(data, regstr=0.0, regtgt=None, rscale=None, constant=True):
171 """Does a linear fit to data via a singular value decomposition
172 algorithm.
173 It returns the matrix A and vector B such that
174 C{A*input_coordinates+B} is the best fit to C{output_coordinates}.
175 @param constant: Do you want a constant built into the linear equations?
176 @type constant: Boolean
177 @return: (A, B, errs, sv, rank) where
178 - A is a 2-D C{numpy.ndarray} matrix.
179 - B is a 1-D C{numpy.ndarray} vector (if constant, else C{None}).
180 - errs is a C{numpy.ndarray} vector, one value for each output coordinate.
181 It gives the sum of squared residuals.
182 - sv are the singular values, sorted into decreasing order.
183 @param data: list of tuples. See L{pack}.
184 @type data: See L{pack}.
185 """
186
187 ii, oo, m, n = pack(data, constant=constant)
188 del data
189 soln = gpk_lsq.reg_linear_least_squares(ii, oo, regstr=regstr, regtgt=regtgt, rscale=rscale, copy=False)
190 del ii
191 del oo
192 errs = numpy.sum( numpy.square(soln.residual()), axis=0 )
193 sv = soln.sv_unreg()
194 x = soln.x(copy=False)
195 rank = soln.eff_rank()
196
197 assert len(errs) == m
198 assert x.ndim == 2
199 if constant:
200 return ( x[1:,:].transpose(), x[0,:],
201 errs, numpy.sort(sv)[::-1], rank
202 )
203 return ( x.transpose(), None,
204 errs, numpy.sort(sv)[::-1], rank
205 )
206
207
208
210 """Data is [ (input_coordinates, output_coordinates), ... ]
211 Minsv sets the minimum useable singular value;
212 minsvr sets the minimum useable s.v. in terms of the largest s.v..
213 It returns the matrix A and vector B such that the best fit
214 is A*input_coordinates+B in a tuple
215 (A, B, errs, rank).
216 errs is a vector, one value for each output coordinate.
217 Rank is the minimum rank of the various fits.
218
219 Warning! Not tested.
220 """
221
222 ii, oo, m, n = pack(data, constant=constant)
223 del data
224 errs = []
225 rank = None
226 const = numpy.zeros((m,), _Float)
227 coef = numpy.zeros((n,m), _Float)
228 for j in range(m):
229 assert oo.shape[1] == m
230 soln = gpk_rlsq.robust_linear_fit(ii, oo[:,j], 1, minsv=minsv, minsvr=minsvr, copy=False)
231 errs.append( numpy.sum( numpy.square(soln.residual()) ) )
232 if rank is None or rank > soln.rank():
233 rank = soln.rank()
234 const[j] = soln.x()[0]
235 coef[:,j] = soln.x()[1:]
236 del ii
237 del oo
238 assert rank <= 1+n
239 assert len(errs) == m
240 return ( coef.transpose(), const, errs, rank)
241
242
243
245 """Does a linear fit to data via a singular value decomposition
246 algorithm.
247 It returns the matrix A and vector B such that
248 C{A*input_coordinates+B} is the best fit to C{output_coordinates}.
249 @param minsv: sets the minimum useable singular value;
250 @param minsvr: sets the minimum useable s.v. in terms of the largest s.v.
251 @type minsv: float or None
252 @type minsvr: float or None
253 @param constant: Do you want a constant built into the linear equations?
254 @type constant: Boolean
255 @return: (A, B, sigmaA, sigmaB) where
256 - A is a 2-D C{numpy.ndarray} matrix.
257 - B is a 1-D C{numpy.ndarray} vector (if constant, else C{None}).
258 )
259 @param data: [(input_coordinates, output_coordinates), ...].
260 A list of (in,out) tuples corresponding to the
261 independent and dependent parameters of the linear transform.
262 Both C{in} and C{out} are one-dimensional
263 L{numpy vectors<numpy.ndarray>}. They don't need to have
264 the same length, though (obviously) all instances of "in"
265 need to have the same length and all instances of "out"
266 also need to match one another.
267 @type data: [(L{numpy.ndarray}, L{numpy.ndarray}), ...]
268 """
269
270 ii, oo, m, n = pack(data, constant=constant)
271 del data
272 soln = gpk_lsq.linear_least_squares(ii, oo, minsv=minsv, minsvr=minsvr)
273 del ii
274 del oo
275 x = soln.x()
276 sigmas = numpy.sqrt(soln.x_variances())
277 assert sigmas.shape == x.shape
278 assert x.ndim == 2
279 if constant:
280 return ( x[1:,:].transpose(), x[0,:],
281 sigmas[1:,:].transpose(), sigmas[0,:]
282 )
283 return ( x.transpose(), None,
284 sigmas.transpose(), None
285 )
286
287
289 import RandomArray
290 while 1:
291 d = []
292 for i in range(100):
293 d.append( (RandomArray.standard_normal((30,)),
294 RandomArray.standard_normal((1000,))))
295 localfit(d)
296
298 d = [( numpy.zeros((0,)), numpy.array((1,)) ),
299 ( numpy.zeros((0,)), numpy.array((2,)) ),
300 ( numpy.zeros((0,)), numpy.array((3,)) )
301 ]
302 coef, const, errs, sv, rank = localfit(d, constant=False)
303 assert const is None
304 assert coef.shape == (1,0)
305 assert len(errs) == 1
306 assert abs(errs[0]-14) < 1e-4
307 assert rank==0
308 assert abs(err_before_fit(d, constant=False)-errs[0]) < 1e-4
309
310
312 d = [ (numpy.array((0,)), numpy.array((-1,))),
313 (numpy.array((1,)), numpy.array((0,))),
314 (numpy.array((2,)), numpy.array((1+1e-7,))),
315 (numpy.array((3,)), numpy.array((2,))),
316 ]
317 if r:
318
319 coef, const, errs, sv, rank = r_localfit(d)
320 else:
321 coef, const, errs, sv, rank = localfit(d)
322 assert const.shape == (1,)
323 assert abs(const[0]-(-1)) < 1e-6
324 assert coef.shape == (1,1)
325 assert abs(coef[0,0]-1) < 1e-6
326 assert len(errs)==1
327 assert errs[0] < 1e-6
328 assert len(sv) == 2
329 assert rank == 2
330 assert numpy.alltrue(err_before_fit(d) >= errs)
331
332
334 d = [ (numpy.array((0.0,)), numpy.array((0.0,))),
335 (numpy.array((1.0,)), numpy.array((1.0,))),
336 (numpy.array((2.0,)), numpy.array((0.0,))),
337 (numpy.array((3.0,)), numpy.array((-2.0,))),
338 (numpy.array((4.0,)), numpy.array((0.0,))),
339 (numpy.array((5.0,)), numpy.array((1.0,))),
340 (numpy.array((6.0,)), numpy.array((0.0,))),
341 ]
342 coef, const, errs, sv, rank = localfit(d)
343 print 'coef=', coef
344 print 'const=', const
345 print 'errs=', errs
346 assert const.shape == (1,)
347 assert abs(const[0]-0) < 1e-6
348 assert coef.shape == (1,1)
349 assert abs(coef[0,0]-0) < 1e-6
350 assert len(errs)==1
351 assert abs(errs[0]-6) < 1e-6
352 assert len(sv) == 2
353 assert rank == 2
354 assert numpy.absolute(err_before_fit(d)-errs).sum() < 1e-6
355
356
358 d = [ (numpy.array((0,0)), numpy.array((-1,))),
359 (numpy.array((1,2)), numpy.array((0,))),
360 (numpy.array((2,-1)), numpy.array((1,))) ]
361 coef, const, errs, sv, rank = localfit(d)
362 assert const.shape == (1,)
363 assert abs(const[0]-(-1)) < 1e-6
364 assert coef.shape == (1,2)
365 assert numpy.absolute(coef-[[1,0]]).sum() < 1e-6
366 assert len(errs)==1
367 assert errs[0] < 1e-6
368 assert len(sv) == 3
369 assert rank == 3
370 assert numpy.alltrue(err_before_fit(d) >= errs)
371
373 d = [ (numpy.array((0,0)), numpy.array((-1,))),
374 (numpy.array((2,-1)), numpy.array((1,))) ]
375 coef, const, errs, sv, rank = localfit(d)
376 assert const.shape == (1,)
377 assert abs(const[0]-(-1)) < 1e-6
378 assert abs(const[0] + 2*coef[0,0]-coef[0,1] - 1) < 1e-6
379 assert coef.shape == (1,2)
380 assert len(errs)==1
381 assert errs[0] < 1e-6
382 assert len(sv) == 2
383 assert rank == 2
384 assert numpy.alltrue(err_before_fit(d) >= errs)
385
386
388 d = [ (numpy.array((0,0)), numpy.array((-1,0))),
389 (numpy.array((1,2)), numpy.array((0,1.0))),
390 (numpy.array((-1,2)), numpy.array((-2,1.0))),
391 (numpy.array((-1,1)), numpy.array((-2+1e-7,0.5+1e-7))),
392 (numpy.array((2,-1)), numpy.array((1,-0.5))) ]
393
394 if r:
395 coef, const, errs, sv, rank = r_localfit(d)
396 else:
397 coef, const, errs, sv, rank = localfit(d)
398
399 print const
400 print coef
401 print errs
402 print sv
403 print rank
404 assert const.shape == (2,)
405 assert numpy.absolute(const - [-1,0]).sum() < 1e-6
406 assert coef.shape == (2,2)
407 assert numpy.absolute(coef[0]-[1,0]).sum() < 1e-6
408 assert numpy.absolute(coef[1]-[0,0.5]).sum() < 1e-6
409 assert len(errs)==2
410 assert errs.sum() < 1e-6
411 assert len(sv) == 3
412 assert rank == 3
413 assert numpy.alltrue(err_before_fit(d) >= errs)
414
415
417 N = 2000
418 DSIG = 10.0
419 import random
420 import math
421 d = []
422 for i in range(N):
423 s = random.normalvariate(0.0, DSIG)
424 d.append((numpy.array([-1.0]), numpy.array([1-s])))
425 d.append((numpy.array([1.0]), numpy.array([1+s])))
426 coef, const, scoef, sconst = fit_giving_sigmas(d)
427 assert abs(coef[0][0]) < 3*DSIG/math.sqrt(N)
428 assert abs(coef[0][0]) > 0.001*DSIG/math.sqrt(N)
429 assert abs(const[0]-1.0) < DSIG/math.sqrt(N)
430 assert sconst[0] < DSIG/math.sqrt(N)
431 assert abs(coef[0][0]) < 4*DSIG/2.0/math.sqrt(N)
432 assert scoef[0][0] < 1.3*DSIG/math.sqrt(N)
433 assert scoef[0][0] > 0.7*DSIG/math.sqrt(N)
434 print 'FGS1 OK'
435
436
438 N = 1000
439 DSIG = 1.0
440 import random
441 d = []
442 for i in range(N):
443 s = random.normalvariate(0.0, DSIG)
444 x = random.normalvariate(0.0, DSIG)
445 w = random.expovariate(1.0)
446 d.append((numpy.array([-x]), numpy.array([1-s]), w))
447 d.append((numpy.array([x]), numpy.array([1+s]), w))
448 d.append((numpy.array([x]), numpy.array([1-s]), w))
449 d.append((numpy.array([-x]), numpy.array([1+s]), w))
450 var1 = err_before_fit(d)
451 A, B, errs, sv, rank = localfit(d)
452 assert numpy.absolute((var1-errs)).sum() < 0.1
453
454
455 if __name__ == '__main__':
456 test0()
457 test_wt()
458 test_localfit11(False)
459 test_localfit11e()
460 test_localfit21()
461 test_localfit21u()
462 for r in [False]:
463 test_localfit11(r)
464 test_localfit22(r)
465 test_fgs1()
466