gmisclib.stats

1 """Some of these functions, specifically f_value(), fprob(), betai(), and betacf(), 2 are taken from stats.py "A collection of basic statistical functions for python." 3 by Gary Strangman. They are copyright 1999-2007 Gary Strangman; 4 All Rights Reserved and released under the MIT license. 5 6 The rest is copyright Greg Kochanski 2009. 7 """ 8 9 import math 10 11

12 -def f_value(ER, EF, dfR, dfF):

13 """Returns an F-statistic given the following: 14 @param ER: error associated with the null hypothesis (the Restricted model) 15 @param EF: error associated with the alternate hypothesis (the Full model) 16 @param dfR: degrees of freedom the Restricted model (null hypothesis) 17 @param dfF: degrees of freedom associated with the Full model 18 @return: f-statistic (not the probability) 19 @rtype: float 20 """ 21 if not ( ER >= EF ): 22 raise ValueError, "Error with the alternative hypothesis is larger: %s vs %s" % (ER, EF) 23 if not( dfR >= dfF ): 24 raise ValueError, "Degrees of freedom with the alternative hypothesis is larger: %s vs %s" % (dfR, dfF) 25 return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF)))

26 27 28 29 30

31 -def fprob (dfnum, dfden, F):

32 """Returns the (1-tailed) significance level (p-value) of an F 33 statistic given the degrees of freedom for the numerator (dfR-dfF) and 34 the degrees of freedom for the denominator (dfF). 35 36 Usage: lfprob(dfnum, dfden, F) where usually dfnum=dfbn, dfden=dfwn 37 """ 38 if not (dfnum>0 and dfden>0 and F>=0.0): 39 raise ValueError, "Bad arguments to fprob(%s, %s, %s)" % (dfnum, dfden, F) 40 p = betai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F)) 41 return p

42 43

44 -def betai(a,b,x):

45 """Returns the incomplete beta function: 46 47 I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) 48 49 where a>=0,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma 50 function of a. The continued fraction formulation is implemented here, 51 using the betacf function. (Adapted from: Numerical Recipes in C.) 52 53 Usage: betai(a,b,x) 54 """ 55 if not (a>=0 and b>=0): 56 raise ValueError, 'Bad a=%s or b=%s in betai' % (a, b) 57 if not (x>0.0 or x>1.0): 58 raise ValueError, 'Bad x in betai' 59 if (x==0.0 or x==1.0): 60 bt = 0.0 61 else: 62 bt = math.exp(gammaln(a+b)-gammaln(a)-gammaln(b)+a*math.log(x)+b* 63 math.log(1.0-x)) 64 if (x<(a+1.0)/(a+b+2.0)): 65 return bt*betacf(a,b,x)/float(a) 66 else: 67 return 1.0-bt*betacf(b,a,1.0-x)/float(b)

68 69 70 71

72 -def betacf(a,b,x):

73 """This function evaluates the continued fraction form of the incomplete 74 Beta function, betai. (Adapted from: Numerical Recipes in C.) 75 76 Usage: betacf(a,b,x) 77 """ 78 ITMAX = 200 79 EPS = 3.0e-7 80 81 bm = az = am = 1.0 82 qab = a+b 83 qap = a+1.0 84 qam = a-1.0 85 bz = 1.0-qab*x/qap 86 for i in range(ITMAX+1): 87 em = float(i+1) 88 tem = em + em 89 d = em*(b-em)*x/((qam+tem)*(a+tem)) 90 ap = az + d*am 91 bp = bz+d*bm 92 d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem)) 93 app = ap+d*az 94 bpp = bp+d*bz 95 aold = az 96 am = ap/bpp 97 bm = bp/bpp 98 az = app/bpp 99 bz = 1.0 100 if (abs(az-aold)<(EPS*abs(az))): 101 return az 102 raise RuntimeError, 'a or b too big, or ITMAX too small in Betacf.'

103 104 105

106 -def gammaln(x):

107 """Returns the gamma function of xx. 108 Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt. 109 (Adapted from: Numerical Recipies in C. via code 110 Copyright (c) 1999-2000 Gary Strangman and released under the LGPL.) 111 """ 112 if not (x > 0): 113 raise ValueError, "Range Error: Gamma(0) is undefined." 114 coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516, 115 0.120858003e-2, -0.536382e-5] 116 tmp = x + 4.5 117 tmp -= (x-0.5)*math.log(tmp) 118 ser = 1.0 119 for cj in coeff: 120 ser += cj/x 121 x += 1 122 return -tmp + math.log(2.50662827465*ser)

123 124 125 # def log_t_density(x, nu): 126 # c = gammaln((nu+1)/2.0) - 0.5*math.log(nu*math.pi) - gammaln(nu/2.0) 127 # c is not used! 128 # return math.log(1+x**2/nu)*(-(nu+1)/2.0) 129 130 131 132

133 -def test_fprob(dfnum, dfden):

134 N = 30000 135 # These mean and sigma values are taken from 136 # Wolfram Mathworld, http://mathworld.wolfram.com/F-Distribution.html 137 mean = float(dfden)/float(dfden-2) 138 sigma = math.sqrt(float(2*dfden**2*(dfden+dfnum-2))/ 139 float(dfnum*(dfden-2)**2*(dfden-4))) 140 assert abs(fprob(dfnum, dfden, 0.0)-1.0) < 0.001 141 assert abs(fprob(10, 20, 10.0)-0.0) < 0.001 142 m = 0.0 143 ss = 0.0 144 lpc = 1.0 145 lf = 0.0 146 ps = 0.0 147 i = 0 148 while lpc > 1.0/float(N): 149 nf = (mean+3*sigma)*((i+0.5)/float(N))**2 150 f = 0.5*(lf+nf) 151 lf = nf 152 pc = fprob(dfnum, dfden, nf) 153 dp = lpc-pc 154 lpc = pc 155 m += f*dp 156 ss += f**2 * dp 157 ps += dp 158 i += 1 159 m += lpc*f 160 ss += lpc*f**2 161 assert abs(ps-1.0) < 0.001 162 s = math.sqrt(ss-m**2) 163 print 'm=', m, mean 164 print 's=', s, sigma 165 assert abs(m-mean) < 0.03*mean 166 assert abs(s-sigma) < 0.03*sigma

167 168 169 t_Table = [(1, 0.5, 1.000), (1, 0.95, 12.71), (1, 0.98, 31.82), (1, 0.99, 63.66), 170 (1, 0.995, 127.3), (1, 0.998, 318.3), (1, 0.999, 636.6), 171 (2, 0.5, 0.816), (2, 0.95, 4.303), (2, 0.98, 6.965), (2, 0.99, 9.925), 172 (2, 0.999, 31.60), 173 (3, 0.5, 0.765), (3, 0.95, 3.182), (3, 0.99, 5.841), (3, 0.999, 12.92), 174 (4, 0.5, 0.741), (4, 0.95, 2.776), (4, 0.99, 3.797), (4, 0.999, 8.610), 175 (10, 0.5, 0.700), (10, 0.95, 2.228), (10, 0.99, 3.106), (10, 0.999, 4.587), 176 (120, 0.5, 0.677), (10, 0.95, 1.980), (120, 0.99, 2.617), (120, 0.999, 3.373), 177 (1e6, 0.5, 0.674), (1e6, 0.95, 1.960), (1e6, 0.99, 2.576), (1e6, 0.999, 3.291) 178 ] 179

180 -def t_value(ndof, p2sided=0.99):

181 for (n, p2s, t) in t_Table: 182 if n==ndof and p2s==p2sided: 183 return t 184 t_lower = None 185 n_lower = None 186 t_higher = None 187 n_higher = None 188 for (n, p2s, t) in t_Table: 189 if p2s==p2sided and n<=ndof and (n_lower is None or n>n_lower): 190 n_lower = n 191 t_lower = t 192 if p2s==p2sided and n>=ndof and (n_higher is None or n<n_higher): 193 n_higher = n 194 t_higher = t 195 if n_lower is not None and n_higher is not None: 196 return t_lower + ((t_higher-t_lower)*(math.log(n)-math.log(n_lower)) 197 / (math.log(n_higher)-math.log(n_lower))) 198 raise RuntimeError, "Sorry: not implemented yet"

199 200 201

202 -def ltqnorm( p ):

203 """ 204 205 Lower tail quantile for standard normal distribution function. 206 207 This function returns an approximation of the inverse cumulative 208 standard normal distribution function. I.e., given P, it returns 209 an approximation to the X satisfying P = Pr{Z <= X} where Z is a 210 random variable from the standard normal distribution. 211 212 The algorithm uses a minimax approximation by rational functions 213 and the result has a relative error whose absolute value is less 214 than 1.15e-9. 215 216 @author: Peter John Acklam 217 @note: Time-stamp: 2000-07-19 18:26:14 218 @contact: pjacklam@online.no 219 @see: http://home.online.no/~pjacklam 220 221 @type p: float (0,1) 222 @rtype: float 223 @raise ValueError: if the argument is out of range. 224 @note: Downloaded from http://home.online.no/~pjacklam/notes/invnorm/impl/field/ltqnorm.txt 225 GPK 4/22/2011. Documentation at http://home.online.no/~pjacklam/notes/invnorm/index.html, 226 which is part of this package at .../references/stats_invnorm_pjacklam_2011.html. 227 @contact: Greg Kochanski <gpk@kochanski.org> 228 @note: Modified from the author's original perl code (original comments follow below) 229 by dfield@yahoo-inc.com. May 3, 2004. 230 """ 231 232 if p <= 0 or p >= 1: 233 # The original perl code exits here, we'll throw an exception instead 234 raise ValueError( "Argument to ltqnorm %f must be in open interval (0,1)" % p ) 235 236 # Coefficients in rational approximations. 237 a = (-3.969683028665376e+01, 2.209460984245205e+02, \ 238 -2.759285104469687e+02, 1.383577518672690e+02, \ 239 -3.066479806614716e+01, 2.506628277459239e+00) 240 b = (-5.447609879822406e+01, 1.615858368580409e+02, \ 241 -1.556989798598866e+02, 6.680131188771972e+01, \ 242 -1.328068155288572e+01 ) 243 c = (-7.784894002430293e-03, -3.223964580411365e-01, \ 244 -2.400758277161838e+00, -2.549732539343734e+00, \ 245 4.374664141464968e+00, 2.938163982698783e+00) 246 d = ( 7.784695709041462e-03, 3.224671290700398e-01, \ 247 2.445134137142996e+00, 3.754408661907416e+00) 248 249 # Define break-points. 250 plow = 0.02425 251 phigh = 1 - plow 252 253 # Rational approximation for lower region: 254 if p < plow: 255 q = math.sqrt(-2*math.log(p)) 256 return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / \ 257 ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1) 258 259 # Rational approximation for upper region: 260 if phigh < p: 261 q = math.sqrt(-2*math.log(1-p)) 262 return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / \ 263 ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1) 264 265 # Rational approximation for central region: 266 q = p - 0.5 267 r = q*q 268 return (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q / \ 269 (((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1)

270 271 272 273 if __name__ == '__main__': 274 # test_fprob(2, 5) # This fails the test, but only slightly 275 test_fprob(2, 8) 276 test_fprob(4, 8) 277 test_fprob(7, 8) 278 test_fprob(7, 20) 279 test_fprob(2, 20) 280 test_fprob(18, 20) 281

Source Code for Module gmisclib.stats