1 """This chooses samples such that the specified attributes are
2 broadly distributed.
3 """
4
5 import math
6 import random
7 from gmisclib import dictops
8
9
def _entropy_of_list(a):
    """Compute the entropy of a collection of counts.

    Each count is divided by the total of all counts to give a
    probability C{p}; the result is C{-sum(p*log(p))} using the natural
    logarithm.  Counts that are not strictly positive contribute nothing.

    @param a: the counts.  Any iterable of numbers will do; it is copied
        first, so a one-shot iterator is acceptable.
    @return: the entropy; zero if there are no positive counts.
    @rtype: float
    """
    # Two passes are needed (total, then terms), so materialise the input.
    counts = list(a)
    total = float(sum(counts))
    e = 0.0
    for d in counts:
        if d > 0:
            p = float(d) / total
            e += p * math.log(p)
    return -e
21
22
23
26 """@param inspector: a function on a datum that returns a list of attributes."""
27 self.inspector = inspector
28 self.data = list(data)
29 self.inspections = None
30 self.counts = None
31 self._peek = None
32 self._haspeek = False
33 self.na = None
34
35
37 for i in range(n):
38 yield self.pick_one()
39
40
def pick_one(self):
    """Pick one item from the data set and remove it.

    If a peeked item is being held (C{self._haspeek} is true), that item
    is returned and the flag is cleared.  Otherwise roughly
    C{sqrt(len(self.data))} candidates are drawn at random, each is
    scored with C{self._entropy()}, and the highest-scoring candidate is
    removed from C{self.data}, recorded via C{self._add()} and returned.

    @return: the chosen datum.
    @raise IndexError: if there is no data left to pick from.
    """
    if self.inspections is None:
        self._initialize()
    if self._haspeek:
        self._haspeek = False
        return self._peek
    assert len(self.inspections) == len(self.data)
    if not self.data:
        raise IndexError("no data left to pick from")
    # For non-empty data this is always at least one candidate.
    n = int(round(math.sqrt(len(self.data))))
    candidates = random.sample(range(len(self.data)), n)
    # Tuples compare by score first, so equal scores go to the larger index.
    _score, j = max((self._entropy(self.inspections[i]), i) for i in candidates)
    d = self.data.pop(j)
    ins = self.inspections.pop(j)
    self._add(ins)
    return d
61
62
def _initialize(self):
    """Inspect every datum and set up the per-attribute counters.

    On success this sets C{self.na} (the number of attributes the
    inspector returns per datum), C{self.inspections} (the inspector's
    result for each datum, parallel to C{self.data}) and C{self.counts}
    (one C{dictops.dict_of_accums} per attribute, into which every
    observed attribute value has been entered with C{add(value, 0)}).

    @raise IndexError: if C{self.data} is empty.
    """
    # Call the inspector exactly once per datum.
    inspections = [self.inspector(d) for d in self.data]
    if not inspections:
        raise IndexError("cannot initialize from an empty data set")
    na = len(inspections[0])
    counts = [dictops.dict_of_accums() for _ in range(na)]
    for alist in inspections:
        assert len(alist) == na
        for (i, a) in enumerate(alist):
            counts[i].add(a, 0)
    # Publish only after everything succeeded: a failure above leaves
    # self.inspections as it was, so the next pick retries initialization.
    self.na = na
    self.counts = counts
    self.inspections = inspections
74
def _entropy(self, xtra):
    """Score a candidate by the entropy the counts would have with it.

    For each attribute, a copy of that attribute's counter is taken,
    C{add(xtra[i], 1)} is applied to the copy, and the entropy of the
    copy's values is computed.  The real counters are not modified.

    @param xtra: the candidate's attribute list; must have C{self.na}
        entries.
    @return: the sum of the per-attribute entropies.
    @rtype: float
    """
    assert len(xtra) == self.na
    e = 0.0
    for i in range(self.na):
        # This is only a trial, so work on a copy of the counter.
        tmp = self.counts[i].copy()
        tmp.add(xtra[i], 1)
        e += _entropy_of_list(tmp.values())
    return e
84
def _add(self, xtra):
    """Record the attributes of an item in the running counts.

    For each attribute position C{i}, C{add(xtra[i], 1)} is called on
    C{self.counts[i]}.

    @param xtra: the item's attribute list; must have C{self.na} entries.
    """
    assert len(xtra) == self.na
    for i in range(self.na):
        self.counts[i].add(xtra[i], 1)
89
91 """Inspect (but do not remove) the next item to be picked.
92 @return: the next item to be picked.
93 @rtype: whatever (not a list!)
94 """
95 if not self._haspeek:
96 self._peek = self.pick()
97 self._haspeek = True
98 return self._peek
99
100
def add(self, datum):
    """Add another datum to be sampled.

    The datum is appended to C{self.data} and becomes a candidate for
    later picks.  This may be called before the first pick: in that case
    the datum is only stored, and it is inspected along with the rest of
    the data when initialization happens.

    @type datum: whatever
    @param datum: thing to be added.
    """
    if self.inspections is None:
        # Nothing has been inspected yet; the lazy initialization done
        # on the first pick will inspect all of self.data, this datum
        # included.
        self.data.append(datum)
        self._haspeek = False
        return
    alist = self.inspector(datum)
    # Validate before touching any state, so a bad datum changes nothing.
    assert len(alist) == self.na
    self.data.append(datum)
    self.inspections.append(alist)
    # NOTE(review): clearing the flag discards whatever is held in
    # self._peek.  If that item was already removed from self.data when
    # it was peeked, it will never be returned -- TODO confirm intended.
    self._haspeek = False
    for (i, a) in enumerate(alist):
        self.counts[i].add(a, 0)
114
115
117 return len(self.data)
118
119
121 """Forget prior history of usage. Choices after this
122 call are uncorrelated with choices before this call."""
123
124
125
126
137
# Run the module's self-test when executed as a script.
if __name__ == '__main__':
    test()
140