Package gmisclib :: Module blue_data_attributes
[frames | no frames]

Source Code for Module gmisclib.blue_data_attributes

  1  """This chooses samples such that the specified attributes are 
  2  broadly distributed. 
  3  """ 
  4   
  5  import math 
  6  import random 
  7  from gmisclib import dictops 
  8   
  9   
def _entropy_of_list(a):
    """Compute the Shannon entropy (in nats) of a collection of counts.

    The counts are normalized by their sum to give probabilities C{p}, and
    the result is C{-sum(p*log(p))} taken over the strictly positive counts.

    @param a: the counts; any iterable of numbers.  One-shot iterators
        are accepted because the values are copied before use.
    @return: the entropy; zero when there are no positive counts.
    @rtype: float
    """
    # Materialize once: the values are traversed twice (sum, then entropy),
    # which would silently yield zero entropy for a generator/iterator.
    counts = list(a)
    total = float(sum(counts))
    e = 0.0
    for d in counts:
        # Zero counts contribute nothing; skipping them also avoids log(0)
        # and a division by zero when every count is zero.
        if d > 0:
            p = float(d) / total
            e += p * math.log(p)
    return -e
21 22 23
class blue_attributes(object):
    """Sampler that picks data so that attribute usage stays broadly spread.

    C{inspector} maps each datum to a fixed-length list of attributes.  For
    every attribute position a table of usage counts is kept.  Each pick
    draws a random subset of roughly C{sqrt(len(data))} remaining data and
    selects the candidate whose selection would give the largest summed
    entropy of those usage counts; the chosen datum is removed from the pool.

    The count tables are C{dictops.dict_of_accums} objects.
    NOTE(review): this class assumes C{add(key, delta)} accumulates C{delta}
    under C{key} and that C{copy()} returns an object that also has C{add}
    -- confirm against C{gmisclib.dictops}.
    """

    def __init__(self, inspector, data):
        """@param inspector: a function on a datum that returns a list of attributes.
        @param data: iterable of the data to sample from; copied into a list.
        """
        self.inspector = inspector
        self.data = list(data)
        # Inspection results and count tables are built lazily on first pick.
        self.inspections = None
        self.counts = None
        # A peeked item has already been taken out of self.data; it is held
        # here (with its attribute list) until it is delivered or returned.
        self._peek = None
        self._peek_ins = None
        self._haspeek = False
        self.na = None

    def pick(self, n):
        """Lazily pick C{n} items from the data set.

        @param n: number of items to pick.
        @return: a generator yielding the picked items one at a time.
        """
        for _ in range(n):
            yield self.pick_one()

    def pick_one(self):
        """Pick one item from the data set.

        If L{peek} was called since the last pick, the peeked item is
        delivered.

        @return: the picked item, which is no longer in the pool.
        @raise IndexError: if there is nothing left to pick.
        """
        if self._haspeek:
            datum = self._peek
            self._haspeek = False
            self._peek = None
            self._peek_ins = None
            return datum
        return self._choose()[0]

    def _choose(self):
        """Select, remove and account for the next item.

        @return: tuple C{(datum, attribute_list)} for the selected item.
        @raise IndexError: if the pool is empty.
        """
        if not self.data:
            raise IndexError('no data left to pick from')
        if self.inspections is None:
            self._initialize()
        assert len(self.inspections) == len(self.data)
        n = int(round(math.sqrt(len(self.data))))
        candidates = random.sample(range(len(self.data)), n)
        # Highest resulting entropy wins; ties go to the larger index.
        e, j = max((self._entropy(self.inspections[k]), k) for k in candidates)
        d = self.data.pop(j)
        ins = self.inspections.pop(j)
        self._add(ins)
        return (d, ins)

    def _initialize(self):
        """Inspect every datum and create zeroed count tables."""
        inspections = [self.inspector(d) for d in self.data]
        na = len(inspections[0])
        counts = [dictops.dict_of_accums() for q in range(na)]
        for alist in inspections:
            assert len(alist) == na
            for (i, a) in enumerate(alist):
                counts[i].add(a, 0)
        # Publish only after everything succeeded, so a failing inspector
        # cannot leave the object half-initialized.
        self.na = na
        self.counts = counts
        self.inspections = inspections

    def _entropy(self, xtra):
        """Summed entropy of the usage counts if C{xtra} were used once more.

        @param xtra: attribute list of a candidate datum.
        @rtype: float
        """
        assert len(xtra) == self.na
        e = 0.0
        for i in range(self.na):
            # Work on a copy: this is only a "what if" evaluation.
            tmp = self.counts[i].copy()
            tmp.add(xtra[i], 1)
            e += _entropy_of_list(tmp.values())
        return e

    def _add(self, xtra):
        """Record one use of each attribute in C{xtra}."""
        assert len(xtra) == self.na
        for i in range(self.na):
            self.counts[i].add(xtra[i], 1)

    def _unpeek(self):
        """Return a peeked-but-undelivered item to the pool, undoing its use."""
        if not self._haspeek:
            return
        self.data.append(self._peek)
        self.inspections.append(self._peek_ins)
        for (i, a) in enumerate(self._peek_ins):
            self.counts[i].add(a, -1)
        self._haspeek = False
        self._peek = None
        self._peek_ins = None

    def peek(self):
        """Inspect (but do not remove) the next item to be picked.

        @return: the next item to be picked.
        @rtype: whatever (not a list!)
        @raise IndexError: if there is nothing left to pick.
        """
        if not self._haspeek:
            # Must pick a single item; pick() would require a count and
            # return a generator rather than a datum.
            self._peek, self._peek_ins = self._choose()
            self._haspeek = True
        return self._peek

    def add(self, datum):
        """Add another datum to be sampled.

        Any pending L{peek} is cancelled (the peeked item goes back into the
        pool) so that the new datum can compete to be the next sample.

        @type datum: whatever
        @param datum: thing to be added.
        """
        if self.inspections is None:
            # Not initialized yet: the datum is inspected with the rest of
            # the data on the first pick.
            self.data.append(datum)
            return
        alist = self.inspector(datum)
        # Validate before mutating so a bad datum leaves the state intact.
        assert len(alist) == self.na
        self._unpeek()
        self.data.append(datum)
        self.inspections.append(alist)
        for (i, a) in enumerate(alist):
            self.counts[i].add(a, 0)

    def __len__(self):
        """Number of items still available, including a peeked one."""
        if self._haspeek:
            return len(self.data) + 1
        return len(self.data)

    def reset(self):
        """Forget prior history of usage.  Choices after this
        call are uncorrelated with choices before this call."""
        if self.inspections is None:
            # Nothing has been picked yet, so there is no history.
            return
        # A pending peek was chosen using the old history; choose again.
        self._unpeek()
        fresh = [dictops.dict_of_accums() for q in range(self.na)]
        for alist in self.inspections:
            for (i, a) in enumerate(alist):
                fresh[i].add(a, 0)
        self.counts = fresh
123 124 125 126
def test():
    """Self-test: picks from a 10:1 unbalanced data set should stay balanced.

    Draws 20 and then 50 more samples from 1000 C{'a'} and 100 C{'b'}
    items and checks that the two values are picked almost equally often.

    NOTE: picks are randomized, so the tolerances below can in principle
    be exceeded on an unlucky run.
    """
    data = ['a'] * 1000 + ['b'] * 100
    tmp = blue_attributes(lambda x: (x,), data)
    c = dictops.dict_of_accums()
    for i in range(20):
        c.add(tmp.pick_one(), 1)
    assert abs(c['a'] - c['b']) <= 1
    # pick() returns a generator; iterate it so that the picked items
    # (not the generator object itself) are what gets counted.
    for datum in tmp.pick(50):
        c.add(datum, 1)
    assert abs(c['a'] - c['b']) <= 2
if __name__ == '__main__':
    # Executed as a script: run the module's self-test.
    test()