classifiers.data

1 """ 2 @note: The split into training set and test set is completely deterministic, 3 depending only on each datum's identifier, the total number of data. 4 (If you use a single instance of L{bluedata} or L{bluedata_groups} to 5 make several splits, then each split will be different. But if you created 6 a new instance, you'd get the exact same sequence of splits.) 7 """ 8 9 import math 10 import random 11 import zlib 12 from gmisclib import gpkmisc 13 from gmisclib import dictops 14 from gmisclib import dict_vector as DV 15 from gmisclib import blue_data_selector as BDS 16 17

18 -def Hash(x):

19 Mask1 = 0xffffffff 20 Mask2 = 0xffffffffffff 21 rv = 0 22 for (s,i) in sorted(x): 23 tmp = zlib.crc32(s) & Mask1 24 rv = ((rv+tmp) * (17*i + 7)) & Mask2 25 return rv

26 27 assert Hash([('a', 3), ('b', 4)]) == Hash([('a', 3), ('b', 4)]) 28 assert Hash([('ss', 0), ('ta', 4)]) == Hash([('ss', 0), ('ta', 4)]) 29 30

31 -class bluedata(object):

32 """This is an extension of L{gmisclib.blue_data_selector.bluedata} which 33 is aware of the classids of data. It tries to split the 34 data to take the same fraction of each class. 35 """ 36

37 - def __init__(self, data, blueness = 2.0):

38 """@type data: a sequence of L{datum_tr} objects. 39 """ 40 r = random.Random(len(data)) 41 self.ntot = 0 42 per_class_data = dictops.dict_of_lists() 43 for datum in data: 44 per_class_data.add(datum.classid, datum) 45 ncltmp = [(nm, len(d)) for (nm,d) in per_class_data.items()] 46 # Start the random number generator in a random, data-dependent 47 # state, but one that is independent of the order of the data. 48 r.jumpahead(abs(Hash(ncltmp))) 49 self.r = r.getstate() 50 51 self.selector = [] 52 for (cid, cldata) in sorted(per_class_data.items()): 53 self.ntot += len(cldata) 54 self.selector.append( (cid, BDS.bluedata(cldata, blueness, rng=r)) )

55 56

57 - def _chooseint(self, n, r):

58 """This chooses integers math.floor(n) or math.ceil(n) with 59 appropriate probabilities so that the expectation value of 60 _chooseint(n) is n. 61 @type n: float 62 @rtype: int 63 @type r: L{random.Random} 64 @param r: random number generator. 65 """ 66 f = math.floor(n) 67 p = n - f 68 assert 0.0 <= p <= 1.0 69 i = int(f) 70 if r.random() < p: 71 return i+1 72 return i

73 74

75 - def split(self, n, seed=0):

76 """Split the stored data into a group of C{n} and the remainder. 77 Successive calls to split() will tend to (as nearly as possible) 78 put different data into the first list. 79 @type n: int or float 80 @return: tuple(list of length n(or nearly), list of remainder) 81 """ 82 frac = float(n)/float(self.ntot) 83 # First, we choose how many items to select from each class. 84 r = random.Random() 85 r.setstate(self.r) 86 r.jumpahead(abs(seed)) 87 while True: 88 # We pick random numbers for each class, then check to see if the 89 # total is right. 90 npc = {} 91 ntot = 0 92 for (cid, selector) in self.selector: 93 tmp = self._chooseint(frac*len(selector), r) 94 assert tmp >= 0 and tmp <= len(selector) 95 ntot += tmp 96 npc[cid] = tmp 97 if n-0.5 <= ntot <= n+0.5: 98 break 99 100 # Then, we use a blue data selector for each class to pick the right number 101 # of items from that class. 102 o1 = [] 103 o2 = [] 104 for (cid, selector) in self.selector: 105 t1, t2 = selector.split(npc[cid]) 106 o1.extend( t1 ) 107 o2.extend( t2 ) 108 assert len(o1) == ntot 109 # Finally, we shuffle the classes back together. 110 r.shuffle(o1) 111 r.shuffle(o2) 112 # ...and return. 113 return (o1, o2)

114 115

116 - def __len__(self):

117 return self.ntot

118 119

120 -class bluedata_groups(object):

121 - def __init__(self, d, gr):

122 """A 'grouper' function takes a DUID (a unique i.d. string 123 for a datum) and returns the name of the data group to which 124 it belongs. This group name is used in constructing the training 125 and test sets. 126 @param d: a list of data 127 @type d: list(L{datum_c}) 128 @return: A dictionary full of lists; each list corresponds to one group. 129 @rtype: L{dictops.dict_of_lists<dictops>} 130 """ 131 r = random.Random(len(d)) 132 grps = dictops.dict_of_lists() 133 self.ngc = {} 134 ncl = dictops.dict_of_accums() 135 self.ntot = len(d) 136 self.dused = {} 137 self.gmap = {} 138 for t in d: 139 g = gr(t) 140 grps.add(g, t) 141 ncl.add(t.classid, 1) 142 self.gmap[id(t)] = g 143 self.grps = sorted(grps.items()) 144 145 # Start the random number generator in a random, data-dependent 146 # state, but one that is independent of the order of the data. 147 ncltmp = [(nm, n) for (nm,n) in ncl.items()] 148 ngtmp = [(nm, len(t)) for (nm,t) in self.grps] 149 r.jumpahead(abs(Hash(ncltmp)^Hash(ngtmp))) 150 151 for t in d: 152 self.dused[id(t)] = r.random() 153 self.gused = {} 154 n_in_c = DV.dict_vector() 155 for (g, pgd) in self.grps: 156 self.gused[g] = 0.0 157 nc = dictops.dict_of_accums() 158 for t in pgd: 159 nc.add(t.classid, 1) 160 self.ngc[g] = DV.dict_vector(nc.items()) 161 n_in_c += self.ngc[g] 162 assert isinstance(self.ngc[g], DV.dict_vector) 163 assert (n_in_c > 0).all(), "Weird - zero total examples of a class we know about???" 164 self.ncl = DV.dict_vector(ncl.items()) 165 assert isinstance(self.ncl, DV.dict_vector) 166 self.r = r.getstate()

167 168

169 - def __len__(self):

170 return self.ntot

171 172 HUGE = 1e30 173

174 - def split(self, n, seed=0):

175 """Take groups until you have accumulated approximately n data. 176 The trick is to make all classifiers as identical as possible 177 by getting the same balance of classes in the training set. 178 We are willing to sacrifice some data to do that. 179 @param n: how many data in the testing set 180 @type n: float or int 181 @return: (testing set, training set, names in training set) 182 @rtype: tuple(list(L{datum_tr}), list(L{datum_tr}), list(str)) 183 """ 184 r = random.Random() 185 r.setstate(self.r) 186 r.jumpahead(abs(seed)) 187 assert 1 <= n <= self.ntot 188 ftr = 1.0 - float(n)/float(self.ntot) 189 assert isinstance(self.ncl, DV.dict_vector) 190 #: How many data from each class should the training set have? 191 tgt = self.ncl * ftr 192 assert isinstance(tgt, DV.dict_vector) 193 #: This is an accumulator of how many we've found in each class: 194 ta = DV.dict_vector() 195 196 #First, we select which groups to use in the training set. 197 myscore = self.HUGE 198 groups = set() 199 # Never put all your groups into the training set! 200 while len(groups)<len(self.grps)-1: 201 minused = min([self.gused[k] for (k,v) in self.grps 202 if k not in groups]) 203 q = [k for (k,v) in self.grps 204 if self.gused[k]<minused+r.random()**2 205 and k not in groups 206 ] 207 assert q 208 r.shuffle(q) 209 scores = DV.dict_vector() 210 for g in q: 211 scores[g] = self._score(ta + self.ngc[g] - tgt) 212 gfound, newscore = DV.min_within(scores) 213 if newscore < myscore: 214 myscore = newscore 215 groups.add(gfound) 216 ta += self.ngc[gfound] 217 else: 218 break 219 assert len(groups) > 0 220 # Then, we go through and select data from the previously selected 221 # groups. 222 tgt *= (ta/self.ncl.float()).median() 223 o2 = [] 224 o1 = [] 225 for (g, d) in self.grps: 226 if g in groups: 227 assert (ta >= self.ngc[g]).all() 228 o2.extend( self._sample(d, tgt, ta, self.ngc[g]) ) 229 else: 230 o1.extend(d) 231 r.shuffle(o1) 232 r.shuffle(o2) 233 return (o1, o2, groups)

234 235

236 - def _score(self, future_tgt):

237 return abs(future_tgt).sum()

238 239

240 - def _sample(self, d, tgt, ta, ngc):

241 """Here, we attempt to sample items from the list of data d, 242 taking care to sample less-used data first. We also 243 increment the group used counter (for later use in choosing 244 groups, in C{split()}. 245 Data items will be used at most once, so you may not get all 246 that you ask for. 247 248 @param d: list of data 249 @type d: list(data_tr) 250 @param tgt: the desired remaining number of data to select for the training set. 251 This is how many we will try to get out of the groups that have 252 not yet been C{_select()}ed. It is decremented at the end of the call. 253 @param ta: the total remaining number of data in each class. This 254 looks at the data within groups we have selected but where 255 we on which we have not yet called C{_sample()}. 256 It is decremented at the end of the call. 257 @param ngc: the number of data in each class, within our particular group. 258 @type tgt: L{dict_vector.dict_vector} 259 @type ta: L{dict_vector.dict_vector} 260 @type ngc: L{dict_vector.dict_vector} 261 """ 262 assert isinstance(ngc, DV.dict_vector), "ngc=%s" % str(ngc) 263 n = DV.round(tgt * (ngc/ta.float())) 264 n0 = n.copy() 265 tmp = [ (self.dused[id(t)], t) for t in d ] 266 tmp.sort() 267 f = 1.0/len(d) 268 o = [] 269 for (u, t) in tmp: 270 if n[t.classid] >= 0.5: 271 self.dused[id(t)] += 1 272 self.gused[self.gmap[id(t)]] += f 273 o.append(t) 274 n[t.classid] -= 1 275 tgt -= n0 - n 276 ta -= ngc 277 assert (ta >= 0).all() 278 ta.idropzero() 279 return o

280

Source Code for Module classifiers.data_splitter