1 import math
2 import numpy
3 import gmisclib.Numeric_gpk as NG
4
5 _Float = numpy.dtype('float')
6
9 self.ginv = None
10 self._fit = None
11 self._x = None
12 self._hatdiag = None
13 self._y = None
14 self.a = numpy.array(a, _Float, copy=copy)
15 if self.a.ndim != 2:
16 raise ValueError, "a needs to be 2-dimensional: shape=%s" % str(self.a.shape)
17 self.m, self.n = self.a.shape
18 self.q = None
19
20
21 - def set_y(self, y, copy=True):
22 if y is None:
23 return
24 self._fit = None
25 self._x = None
26 self._y = numpy.array(y, _Float, copy=copy)
27 if self._y.ndim == 1:
28 self.vector = True
29 self._y = self._y.reshape((self._y.shape[0], 1))
30
31 elif self._y.ndim > 2:
32 raise ValueError, "y needs to be 1- or 2-dimensional: shape=%s" % str(self._y.shape)
33 else:
34 self.vector = False
35 if self._y.shape[0] != self.m:
36 raise ValueError, "Matrix sizes do not match: (%d,%d) and %s" % (
37 self.m, self.n, str(self._y.shape)
38 )
39 self.q = self._y.shape[1]
40
41
42 - def y(self, copy=True):
43 assert self._y.shape == (self.m, self.q)
44 if self.vector:
45 return numpy.array(self._y[:,0], copy=copy)
46 return numpy.array(self._y, copy=copy)
47
48
49
51 raise RuntimeError, "Virtual Function"
52
53
54 - def hat(self, copy=True):
55 raise RuntimeError, "Virtual Function"
56
57
58 - def x(self, y=None, copy=True):
59 """
60 @raise numpy.linalg.linalg.LinAlgError: when the matrix is singular.
61 """
62 self.set_y(y)
63 assert self._y is not None, "Y not yet set"
64 if self._x is None:
65 self._x = self._solve()
66 if self.vector:
67 return numpy.array(self._x[:, 0], copy=copy)
68 return numpy.array(self._x, copy=copy)
69
70
71 - def fit(self, copy=False):
72 """
73 @raise numpy.linalg.linalg.LinAlgError: when the matrix is singular.
74 """
75 if self._fit is None:
76 if self._x is None:
77 self._x = self._solve()
78
79 self._fit = numpy.dot(self.a, self._x)
80
81 assert self._fit.shape == (self.m, self.q)
82 if self.vector:
83 return numpy.array(self._fit[:, 0], copy=copy)
84 return numpy.array(self._fit, copy=copy)
85
86
88 tmp = self.y(copy=False) - self.fit(copy=False)
89
90 assert tmp.shape[0] == self.m
91 return tmp
92
93
95 """Returns the estimator of the standard deviation
96 of the data about the fit.
97 @return: L{numpy.ndarray} with shape=(q,). Each entry corresponds
98 to one of the C{q} sets of equations that are being fit.
99 """
100 r2 = numpy.sum(numpy.square(self.residual()), axis=0)
101 assert self.vector and r2.shape==() or not self.vector and r2.shape==(self.q,)
102 return r2/(self.m-self.n)
103
105 """Returns something like the number of data, except that it looks at their
106 weighting and the structure of the problem. It counts how many data have a
107 relatively large effect on the solution, and if a datum has little influence,
108 it doesn't count for much.
109 @rtype: float
110 """
111 return _perplexity(self.hat())
112
114 """Returns something like the rank of the solution, but rather than counting
115 how many dimensions can be solved at all, it reports how many dimensions can be
116 solved with a relatively good accuracy.
117 @rtype: float
118 """
119 raise RuntimeError, "Virtual Method"
120
121
122
127
128
129
131 - def __init__(self, a, y=None, minsv=None, minsvr=None, copy=True):
132 """This solves the set of linear equations a*x = y,
133 and allows you to get properties of the fit via methods.
134 Normally, a.shape==(m,n) and y.shape==(m,q),
135 and the returned x.shape==(n,q).
136 where m is the number of constraints provided by the data,
137 n is the number of parameters to use in a fit
138 (equivalently, the number of basis functions),
139 and q is the number of separate sets of equations
140 that you are fitting.
141 Then, C{self.x()} has shape (n,q) and C{self.the_fit()} has shape (m,q).
142 Interpreting this as a linear regression, there are n
143 parameters in the model, and m measurements.
144 Q is the number of times you apply the model to a
145 different data set; each on yields a different solution.
146
147 The procedure uses a singular value decomposition algorithm,
148 and treats all singular values that are smaller than minsv
149 as zero (i.e. drops them).
150 If minsvr is specified, it treates all singular values that
151 are smaller than minsvr times the largest s.v. as zero.
152 The returned rank is the
153 rank of the solution, which is normally the number of
154 nonzero elements in x.
155 Note that the shape of the solution vector or matrix
156 is defined by a and y, and the rank can be smaller
157 than m.
158
159 @note: Y may be a 1-D matrix (a vector), in which case
160 the fit is a vector. This is the normal
161 case where you are fitting one equation.
162 If y is a 2-D matrix,
163 each column (second index) in y is a separate fit, and
164 each column in the solution is a separate result.
165 """
166
167 lls_base.__init__(self, a, copy=copy)
168 self.set_y(y, copy=copy)
169
170 assert minsv is None or minsv >= 0.0
171 assert minsvr is None or 0.0 <= minsvr <= 1.0
172 r = min(self.m, self.n)
173 if self.n > 0:
174 u, self.s, vh = numpy.linalg.svd(self.a, full_matrices=False)
175 else:
176 u = numpy.zeros((self.m, r))
177 vh = numpy.zeros((r, self.n))
178 self.s = numpy.zeros((r,))
179
180 assert u.shape == (self.m, r)
181
182 assert vh.shape == (r, self.n)
183 assert self.s.shape == (r,)
184
185
186
187
188
189 if minsv is not None and minsvr is not None:
190 svrls = max(minsv, minsvr*self.s[numpy.argmax(self.s)] )
191 elif minsv is not None:
192 svrls = minsv
193 elif minsvr is not None and self.s.shape[0]>0:
194 svrls = minsvr * self.s[numpy.argmax(self.s)]
195 else:
196 svrls = 0.0
197
198 self.sim = numpy.greater(self.s, svrls)
199 isi = numpy.where(self.sim, 1.0/numpy.where(self.sim, self.s, 1.0), 0.0)
200
201 ur = u[:,:r]
202 numpy.multiply(ur, isi, ur)
203
204 self.ginv = numpy.dot(ur, vh[:r]).transpose()
205
207 return numpy.dot(self.ginv, self._y)
208
211
213 return self.sim.sum()
214
215 - def hat(self, copy=True):
216 """Hat Matrix Diagonal
217 Data points that are far from the centroid of the X-space are potentially influential.
218 A measure of the distance between a data point, xi,
219 and the centroid of the X-space is the data point's associated diagonal
220 element hi in the hat matrix. Belsley, Kuh, and Welsch (1980) propose a cutoff of
221 2 p/n for the diagonal elements of the hat matrix, where n is the number
222 of observations used to fit the model, and p is the number of parameters in the model.
223 Observations with hi values above this cutoff should be investigated.
224 For linear models, the hat matrix
225
226 C{H = X inv(X'X) X'}
227
228 can be used as a projection matrix.
229 The hat matrix diagonal variable contains the diagonal elements
230 of the hat matrix
231
232 C{hi = xi inv(X'X) xi'}
233 """
234 if self._hatdiag is None:
235 aainv = numpy.dot(self.ginv, self.ginv.transpose())
236 hatdiag = numpy.zeros((self.a.shape[0],), _Float)
237 for i in range(hatdiag.shape[0]):
238 hatdiag[i] = NG.qform(self.a[i,:], aainv)
239 assert -0.001 < hatdiag[i] < 1.001
240 self._hatdiag = hatdiag
241 return numpy.array(self._hatdiag, copy=copy)
242
243
245 """Estimated standard deviations of the solution.
246 This is the diagonal of the solution covariance matrix.
247 """
248 aainv = numpy.dot(self.ginv, self.ginv.transpose())
249 vaf = self.variance_about_fit()
250 rv = numpy.outer(aainv.diagonal(), vaf)
251 assert rv.shape == self._x.shape
252 assert rv.shape == (self.n, self.q)
253 if self.vector:
254 return rv[0,:]
255 return rv
256
258 return _perplexity(self.sv())
259
260
261
263 - def __init__(self, a, y, regstr=0.0, regtgt=None, rscale=None, copy=True):
264 """This solves min! |a*x - y|^2 + |regstr*(x-regtgt)|^2,
265 and returns (x, the_fit, rank, s).
266 Normally, a.shape==(m,n) and y.shape==(m,q),
267 where m is the number of data to be fit,
268 n is the number of parameters to use in a fit
269 (equivalently, the number of basis functions),
270 and q is the number of separate sets of equations
271 that you are fitting.
272 Then, x has shape (n,q) and the_fit has shape (m,q).
273
274 The regularization target,
275 regtgt is the same shape as x, that is (n,q).
276 (It must be a vector if and only if y is a vector.)
277 Regstr, the strength of the regularization is
278 normally an (n,n) matrix, though (*,n) will work,
279 as will a scalar.
280
281 Y may be a 1-D matrix (a vector), in which case
282 the fit is a vector. This is the normal
283 case where you are fitting one equation.
284 If y is a 2-D matrix,
285 each column (second index) in y is a separate fit, and
286 each column in the solution is a separate result.
287 """
288 lls_base.__init__(self, a, copy=copy)
289 self.set_y(y, copy=copy)
290
291 if regtgt is None:
292 self.regtgt = numpy.zeros((self.n, self.q))
293 else:
294 self.regtgt = numpy.array(regtgt, _Float, copy=copy)
295 if self.vector:
296 if regtgt.ndim != 1:
297 raise ValueError, "regtgt must be a vector if y is a vector."
298 self.regtgt = self.regtgt.reshape((regtgt.shape[0],1))
299
300 assert self.regtgt.ndim == 2, "Bad dimensionality for regtgt: %d" % self.regtgt.ndim
301 if self.regtgt.shape[0] != self.n:
302 raise ValueError, "Regtgt shape must match the shape of a"
303
304 regstr = numpy.asarray(regstr, _Float)
305 if regstr.ndim == 0:
306 regstr = numpy.identity(self.n) * regstr
307 assert regstr.ndim == 2, "Wrong dimensionality for regstr: %d" % regstr.ndim
308 assert regstr.shape[1] == self.n
309
310 self.rr = numpy.dot(regstr.transpose(), regstr)
311 self.aa = numpy.dot(self.a.transpose(), self.a)
312 if rscale is None:
313 self.scale = 1.0
314 else:
315 self.scale = rscale * self.aa.trace()/self.rr.trace()
316
317
318
319
320
321
322 self.aareg = self.aa + self.rr*self.scale
323
324
326 """
327 @raise numpy.linalg.linalg.LinAlgError: when the matrix is singular.
328 """
329 ayreg = numpy.dot(self.a.transpose(), self._y) \
330 + numpy.dot(self.rr, self.regtgt)*self.scale
331
332
333
334 return numpy.linalg.solve(self.aareg, ayreg)
335
336
338 """Singular values of the regularized problem."""
339 tmp = numpy.linalg.eigvalsh(self.aareg)
340 if not numpy.greater_equal(tmp, 0.0).all():
341 raise ValueError, "Negative eigenvalue: %s..." % tmp[0]
342 return numpy.sqrt(tmp)
343
345 """Singular values of the unregularized problem."""
346 return numpy.sqrt(numpy.linalg.eigvalsh(self.aa))
347
349 return _perplexity(self.sv_reg())
350
351 - def hat(self, copy=True):
352 """Hat Matrix Diagonal
353 Data points that are far from the centroid of the X-space are potentially influential.
354 A measure of the distance between a data point, xi,
355 and the centroid of the X-space is the data point's associated diagonal
356 element hi in the hat matrix. Belsley, Kuh, and Welsch (1980) propose a cutoff of
357 2 p/n for the diagonal elements of the hat matrix, where n is the number
358 of observations used to fit the model, and p is the number of parameters in the model.
359 Observations with hi values above this cutoff should be investigated.
360 For linear models, the hat matrix
361
362 C{H = X (X'X)-1 X' }
363
364 can be used as a projection matrix.
365 The hat matrix diagonal variable contains the diagonal elements
366 of the hat matrix
367
368 C{hi = xi (X'X)-1 xi' }
369 """
370 if self._hatdiag is None:
371 hatdiag = numpy.zeros((self.aa.shape[0],), _Float)
372 iaareg = numpy.linalg.inv(self.aareg)
373 for i in range(hatdiag.shape[0]):
374 hatdiag[i] = NG.qform(self.a[i,:], iaareg)
375 assert -0.001 < hatdiag[i] < 1.001
376 self._hatdiag = hatdiag
377 return numpy.array(self._hatdiag, copy=copy)
378
379
380
381
383 a0 = numpy.array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
384 [0, 0, 0, 0, 0, 0, 0, -3, -2, -1, 0, 1, 2, 3, 0, 0, 0, 0, 0],
385 [0, 0, 0, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, 0, 0, 0, 0, 0]], _Float)
386 a1 = numpy.array([[1, 1, 1],
387 [-3, -2, -1],
388 [1, -1, 1]], _Float)
389 a2 = numpy.array([[1, 1, 1], [-3, -2, -1], [1, -1, 1], [0, 0, 2.2], [2, 0, 0], [0, 1, 0]],
390 _Float)
391 for a in [a0, a1, a2]:
392 u, s, vh = numpy.linalg.svd(a)
393 r = min(a.shape)
394
395 assert numpy.alltrue( s >= 0.0)
396 err = numpy.sum(numpy.square(numpy.dot(u[:,:r]*s, vh[:r]) - a))
397 assert err < 1e-6
398
399
405
407 y = numpy.array([0, 1, 2, 3, 4, 5], numpy.float)
408 basis = numpy.zeros((6,0))
409 soln = linear_least_squares(basis, y, 1e-6)
410 assert numpy.absolute(soln.fit()).sum() < 1e-4
411 assert numpy.absolute(soln.residual()-y).sum() < 1e-4
412 assert soln.rank() == 0
413 assert soln.x().shape==(0,)
414
415
417 basis = numpy.transpose([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
418 y = numpy.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], _Float)
419 soln = linear_least_squares(basis, y, 1e-6)
420 hat = soln.hat()
421 assert numpy.alltrue(hat > 1.0/y.shape[0])
422 assert numpy.alltrue(hat < 4.0/y.shape[0])
423 assert soln.rank() == 2
424 print 'fitshape=', soln.fit().shape, y.shape
425 print 'y=', y
426 print 'fit=', soln.fit()
427 err = numpy.sum(numpy.square(soln.fit()-y))
428 assert 0.0 <= err < 1e-6
429 print 'soln.residual=', soln.residual()
430 assert all_between(-1e-6, soln.residual(), 1e-6)
431 assert soln.sv().shape == (2,)
432 assert abs(soln.x()[0] - 1) < 1e-6
433 assert abs(soln.x()[1] + 1) < 1e-6
434 assert numpy.absolute(numpy.dot(basis, soln.x()) - soln.fit()).sum() < 1e-6
435
436
438 basis = numpy.transpose([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
439 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
440 y = numpy.array([0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0], _Float)
441
442 soln = linear_least_squares(basis, y, 1e-6)
443 assert soln.rank() == 2
444 err = numpy.sum((soln.fit()-y)**2)
445 avg = 10.0/11.0
446 epred = 6.0*avg**2 + 5.0*(2-avg)**2
447 assert soln.sv().shape == (2,)
448 assert abs(soln.x()[1] - avg) < 1e-6
449 assert abs(soln.x()[0]) < 1e-6
450 assert abs(err - epred) < 1e-4
451
452
453
455 basis = numpy.transpose([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
456 y = numpy.transpose([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
457 assert y.shape[1] == 1
458 soln = linear_least_squares(basis, y, 1e-6)
459 assert soln.x().shape[1] == 1
460 assert soln.fit().shape[1] == 1
461 assert soln.rank() == 2
462 err = soln.residual()**2
463 assert all_between(0, err, 1e-6)
464 assert soln.sv().shape == (2,)
465 assert abs(soln.x()[0,0] - 1) < 1e-6
466 assert abs(soln.x()[1,0] + 1) < 1e-6
467 assert numpy.absolute(numpy.ravel(numpy.dot(basis, soln.x())-y)).sum() < 1e-6
468
469
471 basis = numpy.transpose([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
472 y = numpy.transpose([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]])
473 assert y.shape[1] == 2
474 soln = linear_least_squares(basis, y, 1e-6)
475 assert soln.x().shape[1] == 2
476 assert soln.fit().shape[1] == 2
477 assert soln.rank() == 2
478 err = numpy.sum((soln.fit()-y)**2, axis=0)
479 assert err.shape[0]==2
480 assert all_between(0.0, err, 1e-6)
481 assert soln.sv().shape == (2,)
482 assert abs(soln.x()[0,0] - 1) < 1e-6
483 assert abs(soln.x()[1,0] + 1) < 1e-6
484 assert abs(soln.x()[0,1] + 1) < 1e-6
485 assert abs(soln.x()[1,1] - 10) < 1e-6
486 assert numpy.absolute(numpy.ravel(numpy.dot(basis, soln.x())-y)).sum() < 1e-6
487
488
490 basis = numpy.transpose([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
491 y = numpy.transpose([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]])
492 assert y.shape[1] == 2
493 soln = reg_linear_least_squares(basis, y, numpy.zeros((2,2), _Float), [[0.0, 0.0], [0.0, 0.0]])
494 hat = soln.hat()
495 assert numpy.alltrue(hat > 1.0/y.shape[0])
496 assert numpy.alltrue(hat < 4.0/y.shape[0])
497 assert soln.x().shape[1] == 2
498 assert soln.fit().shape[1] == 2
499 err = numpy.sum((soln.fit()-y)**2, axis=0)
500 assert err.shape[0]==2
501 assert all_between(0.0, err, 1e-6)
502 assert abs(soln.x()[0,0] - 1) < 1e-6
503 assert abs(soln.x()[1,0] + 1) < 1e-6
504 assert abs(soln.x()[0,1] + 1) < 1e-6
505 assert abs(soln.x()[1,1] - 10) < 1e-6
506 assert numpy.absolute(numpy.ravel(numpy.dot(basis, soln.x())-y)).sum() < 1e-6
507
508
510 basis = numpy.transpose([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
511 y = numpy.transpose([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]])
512 assert y.shape[1] == 2
513 soln = reg_linear_least_squares(basis, y, 1e6*numpy.identity(2), [[0.5, 0.5],[0.5,0.5]])
514 assert soln.x().shape[1] == 2
515 assert soln.fit().shape[1] == 2
516 err = numpy.sum((soln.fit()-y)**2, axis=0)
517 assert err.shape[0]==2
518 assert all_between(1.0, err, 500.0)
519 assert abs(soln.x()[0,0] - 0.5) < 1e-3
520 assert abs(soln.x()[1,0] - 0.5) < 1e-3
521 assert abs(soln.x()[0,1] - 0.5) < 1e-3
522 assert abs(soln.x()[1,1] - 0.5) < 1e-3
523 soln = reg_linear_least_squares(basis, y, 1e6, numpy.zeros((2,2)))
524 print 'soln.x=', soln.x()
525 assert abs(soln.x()[0,0]) < 1e-3
526 assert abs(soln.x()[1,0]) < 1e-3
527 assert abs(soln.x()[0,1]) < 1e-3
528 assert abs(soln.x()[1,1]) < 1e-3
529 soln = reg_linear_least_squares(basis, y, 1, [[0.5, 0.5],[0.5,0.5]], rscale=1e6)
530 assert abs(soln.x()[0,0] - 0.5) < 1e-3
531 assert abs(soln.x()[1,0] - 0.5) < 1e-3
532 assert abs(soln.x()[0,1] - 0.5) < 1e-3
533 assert abs(soln.x()[1,1] - 0.5) < 1e-3
534 print 's', soln.x()
535 print 'f', soln.fit()
536 print 'r', soln.eff_rank()
537
538
540 """Make sure that the hat function meets its definition."""
541 basis = numpy.transpose([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
542 y0 = numpy.transpose([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
543 s0 = linear_least_squares(basis, y0)
544 assert y0.shape[0] > 1
545 for i in range(y0.shape[0]):
546 y = numpy.array(y0)
547 y[i] += 1
548 s = linear_least_squares(basis, y)
549 ishift = s.fit()[i] - s0.fit()[i]
550 assert -0.001 <= ishift <= 1.001
551 assert abs(ishift - s0.hat()[i]) < 0.001
552
553
554 if __name__ == '__main__':
555 test0()
556 test_svd()
557 test_vec()
558 test_m1()
559 test_m2()
560 test_vec2()
561 test_m2r()
562 test_m2rR()
563 test_hat()
564