1 """
2 @note: The split into training set and test set is completely deterministic,
3 depending only on each datum's identifier, the total number of data.
4 (If you use a single instance of L{bluedata} or L{bluedata_groups} to
5 make several splits, then each split will be different. But if you created
6 a new instance, you'd get the exact same sequence of splits.)
7 """
8
9 import math
10 import random
11 import zlib
12 from gmisclib import gpkmisc
13 from gmisclib import dictops
14 from gmisclib import dict_vector as DV
15 from gmisclib import blue_data_selector as BDS
16
17
19 Mask1 = 0xffffffff
20 Mask2 = 0xffffffffffff
21 rv = 0
22 for (s,i) in sorted(x):
23 tmp = zlib.crc32(s) & Mask1
24 rv = ((rv+tmp) * (17*i + 7)) & Mask2
25 return rv
26
27 assert Hash([('a', 3), ('b', 4)]) == Hash([('a', 3), ('b', 4)])
28 assert Hash([('ss', 0), ('ta', 4)]) == Hash([('ss', 0), ('ta', 4)])
29
30
32 """This is an extension of L{gmisclib.blue_data_selector.bluedata} which
33 is aware of the classids of data. It tries to split the
34 data to take the same fraction of each class.
35 """
36
37 - def __init__(self, data, blueness = 2.0):
38 """@type data: a sequence of L{datum_tr} objects.
39 """
40 r = random.Random(len(data))
41 self.ntot = 0
42 per_class_data = dictops.dict_of_lists()
43 for datum in data:
44 per_class_data.add(datum.classid, datum)
45 ncltmp = [(nm, len(d)) for (nm,d) in per_class_data.items()]
46
47
48 r.jumpahead(abs(Hash(ncltmp)))
49 self.r = r.getstate()
50
51 self.selector = []
52 for (cid, cldata) in sorted(per_class_data.items()):
53 self.ntot += len(cldata)
54 self.selector.append( (cid, BDS.bluedata(cldata, blueness, rng=r)) )
55
56
58 """This chooses integers math.floor(n) or math.ceil(n) with
59 appropriate probabilities so that the expectation value of
60 _chooseint(n) is n.
61 @type n: float
62 @rtype: int
63 @type r: L{random.Random}
64 @param r: random number generator.
65 """
66 f = math.floor(n)
67 p = n - f
68 assert 0.0 <= p <= 1.0
69 i = int(f)
70 if r.random() < p:
71 return i+1
72 return i
73
74
75 - def split(self, n, seed=0):
76 """Split the stored data into a group of C{n} and the remainder.
77 Successive calls to split() will tend to (as nearly as possible)
78 put different data into the first list.
79 @type n: int or float
80 @return: tuple(list of length n(or nearly), list of remainder)
81 """
82 frac = float(n)/float(self.ntot)
83
84 r = random.Random()
85 r.setstate(self.r)
86 r.jumpahead(abs(seed))
87 while True:
88
89
90 npc = {}
91 ntot = 0
92 for (cid, selector) in self.selector:
93 tmp = self._chooseint(frac*len(selector), r)
94 assert tmp >= 0 and tmp <= len(selector)
95 ntot += tmp
96 npc[cid] = tmp
97 if n-0.5 <= ntot <= n+0.5:
98 break
99
100
101
102 o1 = []
103 o2 = []
104 for (cid, selector) in self.selector:
105 t1, t2 = selector.split(npc[cid])
106 o1.extend( t1 )
107 o2.extend( t2 )
108 assert len(o1) == ntot
109
110 r.shuffle(o1)
111 r.shuffle(o2)
112
113 return (o1, o2)
114
115
118
119
122 """A 'grouper' function takes a DUID (a unique i.d. string
123 for a datum) and returns the name of the data group to which
124 it belongs. This group name is used in constructing the training
125 and test sets.
126 @param d: a list of data
127 @type d: list(L{datum_c})
128 @return: A dictionary full of lists; each list corresponds to one group.
129 @rtype: L{dictops.dict_of_lists<dictops>}
130 """
131 r = random.Random(len(d))
132 grps = dictops.dict_of_lists()
133 self.ngc = {}
134 ncl = dictops.dict_of_accums()
135 self.ntot = len(d)
136 self.dused = {}
137 self.gmap = {}
138 for t in d:
139 g = gr(t)
140 grps.add(g, t)
141 ncl.add(t.classid, 1)
142 self.gmap[id(t)] = g
143 self.grps = sorted(grps.items())
144
145
146
147 ncltmp = [(nm, n) for (nm,n) in ncl.items()]
148 ngtmp = [(nm, len(t)) for (nm,t) in self.grps]
149 r.jumpahead(abs(Hash(ncltmp)^Hash(ngtmp)))
150
151 for t in d:
152 self.dused[id(t)] = r.random()
153 self.gused = {}
154 n_in_c = DV.dict_vector()
155 for (g, pgd) in self.grps:
156 self.gused[g] = 0.0
157 nc = dictops.dict_of_accums()
158 for t in pgd:
159 nc.add(t.classid, 1)
160 self.ngc[g] = DV.dict_vector(nc.items())
161 n_in_c += self.ngc[g]
162 assert isinstance(self.ngc[g], DV.dict_vector)
163 assert (n_in_c > 0).all(), "Weird - zero total examples of a class we know about???"
164 self.ncl = DV.dict_vector(ncl.items())
165 assert isinstance(self.ncl, DV.dict_vector)
166 self.r = r.getstate()
167
168
171
172 HUGE = 1e30
173
174 - def split(self, n, seed=0):
175 """Take groups until you have accumulated approximately n data.
176 The trick is to make all classifiers as identical as possible
177 by getting the same balance of classes in the training set.
178 We are willing to sacrifice some data to do that.
179 @param n: how many data in the testing set
180 @type n: float or int
181 @return: (testing set, training set, names in training set)
182 @rtype: tuple(list(L{datum_tr}), list(L{datum_tr}), list(str))
183 """
184 r = random.Random()
185 r.setstate(self.r)
186 r.jumpahead(abs(seed))
187 assert 1 <= n <= self.ntot
188 ftr = 1.0 - float(n)/float(self.ntot)
189 assert isinstance(self.ncl, DV.dict_vector)
190
191 tgt = self.ncl * ftr
192 assert isinstance(tgt, DV.dict_vector)
193
194 ta = DV.dict_vector()
195
196
197 myscore = self.HUGE
198 groups = set()
199
200 while len(groups)<len(self.grps)-1:
201 minused = min([self.gused[k] for (k,v) in self.grps
202 if k not in groups])
203 q = [k for (k,v) in self.grps
204 if self.gused[k]<minused+r.random()**2
205 and k not in groups
206 ]
207 assert q
208 r.shuffle(q)
209 scores = DV.dict_vector()
210 for g in q:
211 scores[g] = self._score(ta + self.ngc[g] - tgt)
212 gfound, newscore = DV.min_within(scores)
213 if newscore < myscore:
214 myscore = newscore
215 groups.add(gfound)
216 ta += self.ngc[gfound]
217 else:
218 break
219 assert len(groups) > 0
220
221
222 tgt *= (ta/self.ncl.float()).median()
223 o2 = []
224 o1 = []
225 for (g, d) in self.grps:
226 if g in groups:
227 assert (ta >= self.ngc[g]).all()
228 o2.extend( self._sample(d, tgt, ta, self.ngc[g]) )
229 else:
230 o1.extend(d)
231 r.shuffle(o1)
232 r.shuffle(o2)
233 return (o1, o2, groups)
234
235
236 - def _score(self, future_tgt):
237 return abs(future_tgt).sum()
238
239
240 - def _sample(self, d, tgt, ta, ngc):
241 """Here, we attempt to sample items from the list of data d,
242 taking care to sample less-used data first. We also
243 increment the group used counter (for later use in choosing
244 groups, in C{split()}.
245 Data items will be used at most once, so you may not get all
246 that you ask for.
247
248 @param d: list of data
249 @type d: list(data_tr)
250 @param tgt: the desired remaining number of data to select for the training set.
251 This is how many we will try to get out of the groups that have
252 not yet been C{_select()}ed. It is decremented at the end of the call.
253 @param ta: the total remaining number of data in each class. This
254 looks at the data within groups we have selected but where
255 we on which we have not yet called C{_sample()}.
256 It is decremented at the end of the call.
257 @param ngc: the number of data in each class, within our particular group.
258 @type tgt: L{dict_vector.dict_vector}
259 @type ta: L{dict_vector.dict_vector}
260 @type ngc: L{dict_vector.dict_vector}
261 """
262 assert isinstance(ngc, DV.dict_vector), "ngc=%s" % str(ngc)
263 n = DV.round(tgt * (ngc/ta.float()))
264 n0 = n.copy()
265 tmp = [ (self.dused[id(t)], t) for t in d ]
266 tmp.sort()
267 f = 1.0/len(d)
268 o = []
269 for (u, t) in tmp:
270 if n[t.classid] >= 0.5:
271 self.dused[id(t)] += 1
272 self.gused[self.gmap[id(t)]] += f
273 o.append(t)
274 n[t.classid] -= 1
275 tgt -= n0 - n
276 ta -= ngc
277 assert (ta >= 0).all()
278 ta.idropzero()
279 return o
280