1 """This takes anticorrelated samples from a sequence of data.
2 In other words, it tends to choose data that have not been seen
3 frequently before. With blueness>=1, it never shows a sample
4 n times until all samples have been shown n-1 times.
5
6 (Note that it is possible for an item to appear twice in sucession,
7 if it appears as the Nth item in a set of N, then appears as the first
8 item in the second set of N.)
9 """
10
11 import heapq
12 import random
13 import math
14
15
16
18 - def __init__(self, data, blueness=2.0, rng=None):
19 """@param data: data to sample
20 @type data: list(whatever)
21 @param blueness: How much should you avoid choosing the same item
22 in succession? If C{-0.5 < blueness < 0}, then you are
23 more likely to choose the same item several times in a row.
24 If C{0 < blueness < 1}, you are unlikely to choose the same
25 item twice in succession (though it is possible).
26 If C{blueness => 1}, you will never get the same item twice
27 until you've run through the entire list of data.
28 @type blueness: float
29 @param rng: a random number generator, per the L{random} module.
30 You can generate repeated sequences of data by
31 repeatedly passing it a random number generator in the
32 same state.
33 """
34 assert blueness > -0.5
35 if rng is None:
36 self.r = random.Random()
37 else:
38 self.r = rng
39 self.blue = float(blueness)
40 self.q = [ (self.r.random(), datum) for datum in data ]
41 heapq.heapify(self.q)
42
44 """This exists to let you run some kind of read-only check on the stored data."""
45 for (x, v) in self.q:
46 fcn(v)
47
48 - def add(self, datum):
49 """Add another datum to be sampled.
50 @type datum: whatever
51 @param datum: thing to be added. It
52 has a probability of C{1/len(self)} of being the next sample.
53 """
54 heapq.heappush(self.q, (self.q[0][0]+self.r.random()-1.0/float(len(self.q)+1), datum) )
55
56
58 """Pick C{n} items from the data set.
59 @param n: how many items to pick.
60 @type n: int
61 """
62 for i in range(n):
63 yield self.pick1()
64
65
67 u, d = heapq.heappop(self.q)
68 heapq.heappush( self.q, (math.floor(u) + self.blue + self.r.random(), d) )
69 return d
70
71
73 """Pick C{n} items from the data set.
74 @param n: how many items to pick.
75 @type n: int
76 @rtype: list(whatever), with length==n
77 """
78 return list(self.pickiter(n))
79
80
82 """Inspect (but do not remove) the next item to be picked.
83 @return: the next item to be picked.
84 @rtype: whatever (not a list!)
85 """
86 return self.q[0][1]
87
88
90 """Split the data set into two parts, of size n and len(data)-n.
91 Successive calls to split will avoid having the same items appear
92 in the first part. For instance, given a bluedata of size 100,
93 with ten calls to split(10), each datum will appear in the first
94 part of the return tuple exactly once.
95 @rtype: tuple(list, list)
96 """
97 assert n <= len(self.q), "Split requires 0 <= n <= len."
98 o1 = []
99 o2 = []
100 nq = []
101 while len(o1) < n:
102 u, d = heapq.heappop(self.q)
103 o1.append( d )
104 heapq.heappush(nq, (int(u) + self.blue + self.r.random(), d) )
105 for (u, d) in self.q:
106 o2.append(d)
107 heapq.heappush(nq, (int(u) + self.r.random(), d) )
108 self.q = nq
109 return (o1, o2)
110
111
113 """This iterator will produce samples forever."""
114 while True:
115 u, d = heapq.heappop(self.q)
116 heapq.heappush(self.q, (int(u) + self.blue + self.r.random(), d) )
117 yield d
118
119
122
123
125 """Forget prior history of usage. Choices after this
126 call are uncorrelated with choices before this call."""
127 self.q = [ (self.r.random(), d) for (u, d) in self.q ]
128 heapq.heapify(self.q)
129
130
131
169
170 if __name__ == '__main__':
171 test()
172