Package gmisclib :: Module blue_data_selector
[frames] | no frames]

Source Code for Module gmisclib.blue_data_selector

  1  """This takes anticorrelated samples from a sequence of data. 
  2  In other words, it tends to choose data that have not been seen 
  3  frequently before.   With blueness>=1, it never shows a sample 
  4  n times until all samples have been shown n-1 times. 
  5   
  6  (Note that it is possible for an item to appear twice in sucession, 
  7  if it appears as the Nth item in a set of N, then appears as the first 
  8  item in the second set of N.) 
  9  """ 
 10   
 11  import heapq 
 12  import random 
 13  import math 
 14   
 15   
 16   
17 -class bluedata(object):
18 - def __init__(self, data, blueness=2.0, rng=None):
19 """@param data: data to sample 20 @type data: list(whatever) 21 @param blueness: How much should you avoid choosing the same item 22 in succession? If C{-0.5 < blueness < 0}, then you are 23 more likely to choose the same item several times in a row. 24 If C{0 < blueness < 1}, you are unlikely to choose the same 25 item twice in succession (though it is possible). 26 If C{blueness => 1}, you will never get the same item twice 27 until you've run through the entire list of data. 28 @type blueness: float 29 @param rng: a random number generator, per the L{random} module. 30 You can generate repeated sequences of data by 31 repeatedly passing it a random number generator in the 32 same state. 33 """ 34 assert blueness > -0.5 35 if rng is None: 36 self.r = random.Random() 37 else: 38 self.r = rng 39 self.blue = float(blueness) 40 self.q = [ (self.r.random(), datum) for datum in data ] 41 heapq.heapify(self.q)
42
43 - def check(self, fcn):
44 """This exists to let you run some kind of read-only check on the stored data.""" 45 for (x, v) in self.q: 46 fcn(v)
47
48 - def add(self, datum):
49 """Add another datum to be sampled. 50 @type datum: whatever 51 @param datum: thing to be added. It 52 has a probability of C{1/len(self)} of being the next sample. 53 """ 54 heapq.heappush(self.q, (self.q[0][0]+self.r.random()-1.0/float(len(self.q)+1), datum) )
55 56
57 - def pickiter(self, n):
58 """Pick C{n} items from the data set. 59 @param n: how many items to pick. 60 @type n: int 61 """ 62 for i in range(n): 63 yield self.pick1()
64 65
66 - def pick1(self):
67 u, d = heapq.heappop(self.q) 68 heapq.heappush( self.q, (math.floor(u) + self.blue + self.r.random(), d) ) 69 return d
70 71
72 - def pick(self, n):
73 """Pick C{n} items from the data set. 74 @param n: how many items to pick. 75 @type n: int 76 @rtype: list(whatever), with length==n 77 """ 78 return list(self.pickiter(n))
79 80
81 - def peek(self):
82 """Inspect (but do not remove) the next item to be picked. 83 @return: the next item to be picked. 84 @rtype: whatever (not a list!) 85 """ 86 return self.q[0][1]
87 88
89 - def split(self, n):
90 """Split the data set into two parts, of size n and len(data)-n. 91 Successive calls to split will avoid having the same items appear 92 in the first part. For instance, given a bluedata of size 100, 93 with ten calls to split(10), each datum will appear in the first 94 part of the return tuple exactly once. 95 @rtype: tuple(list, list) 96 """ 97 assert n <= len(self.q), "Split requires 0 <= n <= len." 98 o1 = [] 99 o2 = [] 100 nq = [] 101 while len(o1) < n: 102 u, d = heapq.heappop(self.q) 103 o1.append( d ) 104 heapq.heappush(nq, (int(u) + self.blue + self.r.random(), d) ) 105 for (u, d) in self.q: 106 o2.append(d) 107 heapq.heappush(nq, (int(u) + self.r.random(), d) ) 108 self.q = nq 109 return (o1, o2)
110 111
112 - def __iter__(self):
113 """This iterator will produce samples forever.""" 114 while True: 115 u, d = heapq.heappop(self.q) 116 heapq.heappush(self.q, (int(u) + self.blue + self.r.random(), d) ) 117 yield d
118 119
120 - def __len__(self):
121 return len(self.q)
122 123
124 - def reset(self):
125 """Forget prior history of usage. Choices after this 126 call are uncorrelated with choices before this call.""" 127 self.q = [ (self.r.random(), d) for (u, d) in self.q ] 128 heapq.heapify(self.q)
129 130 131
132 -def test():
133 data = [0, 1, 2, 3, 4, 5, 6] 134 x = bluedata(data) 135 data.sort() 136 o1 = x.pick(3) 137 o2 = x.pick(4) 138 o = o1 + o2 139 o.sort() 140 assert o == data 141 o1 = x.pick(2) 142 o2 = x.pick(5) 143 o = o1 + o2 144 o.sort() 145 assert o == data 146 o1 = x.pick(1) 147 o2 = x.pick(6) 148 o = o1 + o2 149 o.sort() 150 assert o == data 151 o1 = x.pick(7) 152 o2 = x.pick(0) 153 o = o1 + o2 154 o.sort() 155 assert o == data 156 o = x.pick(len(data)) 157 o.sort() 158 assert o == data 159 160 assert len(x) == len(data) 161 162 tmp = [] 163 for (i, q) in enumerate(x): 164 tmp.append(q) 165 if len(tmp) >= len(data): 166 break 167 tmp.sort() 168 assert tmp == data
169 170 if __name__ == '__main__': 171 test() 172