1
2 """This is a support module, used by many types of classifiers."""
3
4 import re
5 import math
6 import zlib
7
8 import numpy
9
10 from gmisclib import chunkio
11 from g_classifiers import data_splitter as DS
12 from gmisclib import die
13 from gmisclib import mcmc
14 from gmisclib import mcmc_helper
15 mcmc_helper.Debug = 0
16 mcmc.Debug = 0
17 from gmisclib import g_implements
18 from gmisclib import fiatio
19 from gmisclib import dictops
20 from gmisclib import gpkmisc
21 from gmisclib import dict_vector as DV
22 import gpkavg
23
24
25
# Module-level tuning constants.
ERGCOVER = 4.0		# Total MCMC ergodic coverage spread over the samples taken in forest_build().
D = False		# Debug flag: when True, extra per-classifier diagnostics are printed.
CONSTRAIN = 1e-6	# presumably the default `constrain` weight for Bayes evaluation -- TODO confirm (not referenced in visible code)
def Hash(dl):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the Hash(training) call sites in compute_self_class/compute_group_class.
    """@return: a hash of the UID's of data items.
    @rtype: str
    @param dl: A list of data
    @type dl: list(L{datum_c})
    """
    Mask1 = 0xffffffff		# keep each CRC in 32 bits (crc32 is signed on Python 2)
    Mask2 = 0xffffffffffff	# keep the running hash in 48 bits
    rv = 0
    # Sort so the hash is independent of the order of the data.
    for x in sorted(dl):
        tmp = zlib.crc32(x.uid) & Mask1
        rv = ((rv + tmp) * 13) & Mask2
    return '%d' % rv
43
def Hash1(l):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the module-level assert below and the call in datum_c.__init__.
    """@return: a hash of a vector.
    @rtype: str
    @param l: A list of data
    """
    Mask1 = 0xffffffff
    Mask2 = 0xffffffff
    rv = 0
    for t in l:
        # .encode('ascii') keeps the bytes identical on Python 2 (str(number)
        # is always ASCII) while making crc32 work on Python 3 as well.
        tmp = zlib.crc32(str(t).encode('ascii')) & Mask1
        rv = ((rv + tmp) * 11) & Mask2
    return '%d' % rv
57
58 assert Hash1([3.712, 3.71]) == Hash1([3.712, 3.71])
class datum_c(object):
    # NOTE(review): the class header was lost in extraction; the name is
    # grounded by datum_c.__init__ in datum_tr and the docstrings.
    """This is an unclassified datum, either in the
    test or training set.
    @ivar value: a feature vector
    @type value: L{numpy.ndarray}
    @ivar uid: a string that identfies a particular datum.
        This is not used for classification, but is passed
        through to the output. (It can also used to define groups
        of data.)
    @type uid: str
    """

    __slots__ = ['value', 'uid']

    def __init__(self, vector, uid=None):
        """@param vector: a vector of numbers
        @param uid: an arbitrary identifier for a datum
        @type uid: str
        """
        # numpy.float64 == builtin float; replaces the numpy.float alias
        # that was removed from modern numpy.
        self.value = numpy.asarray(vector, numpy.float64)
        if uid is None:
            # Derive a stable identifier from the feature vector itself.
            uid = Hash1(vector)
        self.uid = uid

    def __str__(self):
        return "<datum_c fv=%s uid=%s>" % (str(self.value), self.uid)

    def __cmp__(self, other):
        # Python 2 ordering hook: data sort by uid (used by Hash()).
        return cmp(self.uid, other.uid)
91
class datum_tr(datum_c):
    # NOTE(review): class header lost in extraction; name grounded by the
    # datum_tr(...) construction in read_data and isinstance checks elsewhere.
    """This is a datum where we know the true class, presumably in
    the training set.
    @ivar classid: the name of the class to which the datum belongs.
    @type classid: str
    """

    __slots__ = ['classid']

    def __init__(self, vector, classid, uid=None):
        """@param vector: a vector of numbers
        @param uid: an arbitrary identifier for a datum
        @type uid: str
        @param classid: is the true class
        @type classid: str
        """
        datum_c.__init__(self, vector, uid)
        self.classid = classid

    def __str__(self):
        return "<datum_tr class=%s fv=%s uid=%s>" % (self.classid,
                                                     str(self.value),
                                                     self.uid)
    __repr__ = __str__
117
118
119
120
121
def prior(training):
    """This computes the probability of correct classification,
    assuming you can't see the feature vector.
    It is used to compute P(chance).
    It assumes that you choose class C
    with probability 1 if P(C) is the
    biggest among all the classes.
    @param training: training data
    @type training: list(L{datum_tr})
    @rtype: float
    """
    # Count class populations with the stdlib instead of the
    # dictops/dict_vector round-trip; the result is identical:
    # (size of the most populous class) / (number of data).
    from collections import Counter
    counts = Counter(datum.classid for datum in training)
    chance = max(counts.values())
    return chance / float(len(training))
135
def max_correct(training, testing):
    # NOTE(review): the def line was lost in extraction; the name and
    # signature are grounded by the max_correct(training, testing) call
    # in compute_cross_class.
    """This is a hard, conservative upper limit
    for the probability of correct classification:
    a test datum whose true class never appears in the
    training set cannot possibly be classified correctly.
    @rtype: float
    """
    seen = set()
    for datum in training:
        seen.add(datum.classid)

    impossible = 0
    for datum in testing:
        if datum.classid not in seen:
            impossible += 1

    return float(len(testing) - impossible) / float(len(testing))
162
163
164
165
166
167
def _addwrongD(wrong, failures):
    """Merge a {uid: count} mapping of misclassifications into the
    running tally.
    @param wrong: accumulator with an add(key, increment) method
    @param failures: mapping from datum uid to failure count
    """
    for (uid, n) in failures.items():
        wrong.add(uid, n)
171
def _addwrong(wrong, failures):
    # NOTE(review): the def line was lost in extraction; the name is
    # grounded by the _addwrong(wrong, failures) call in compute_cross_class.
    """Count each failed uid once into the running tally of misclassifications.
    @param wrong: accumulator with an add(key, increment) method
    @param failures: iterable of datum uids that were misclassified
    """
    for uid in failures:
        wrong.add(uid, 1)
176
177
def _doclass(qc, testing, verbose=True):
    """This classifies the data.
    @return: yields a sequence of dictionaries,
        each of which contains a bunch of results and information.
    @param testing: a list of data.
    @param qc: a classifier.
    @type qc: subclass of L{classifier}.
    @param verbose: when True, also serialize the feature vector and the
        full probability table into each result dictionary.
    """
    info = qc.info.copy()
    for datum in testing:
        P = qc.P(datum)
        tmp = info.copy()
        # Pick the class with the largest probability.
        bestcl = None
        bestp = 0.0
        for (cl, p) in P:
            if p > bestp:
                bestp = p
                bestcl = cl
        tmp['compclass'] = bestcl

        if datum.uid is not None:
            tmp['Duid'] = datum.uid

        if verbose:
            tmp['V'] = chunkio.chunkstring_w().write_NumArray(datum.value, converter=lambda x: "%.6g" % x).close()
            tmp['P'] = chunkio.chunkstring_w().write_dict(dict(P), converter=lambda x: "%.3e" % x).close()
        # Test data may legitimately lack a true class label.
        try:
            tmp['trueclass'] = datum.classid
        except AttributeError:
            pass
        yield tmp
213
214
215
216
def read_data(fd, commentarray=None):
    """Reads in feature vectors where the first element is the true class.
    This is the main data input for l_classifier, qdg_classifier and qd_classifier.
    Lines starting with '#' are comments; a trailing '# ...' on a data line
    becomes that datum's uid.
    @type fd: L{file}
    @type commentarray: a L{list} or L{None}
    @rtype: L{list}(L{datum_tr})
    """
    d = []
    len_a = None
    # Single-argument print in call form behaves identically on Python 2 and 3.
    print('# Reading')
    ln = 0
    for l in fd:
        ln += 1
        if l.startswith('#'):
            if commentarray is not None:
                commentarray.append(l[1:])
            continue
        aa = l.split('#', 1)
        if len(aa) > 1:
            uid = aa[1].strip()
        else:
            uid = 'Line:%d' % ln
        a = aa[0].split()
        # All data lines must have the same number of fields.
        if len(a) != len_a:
            if len_a is None:
                len_a = len(a)
            else:
                die.die('Not all vectors have length=%d. Problem on line %d'
                        % (len_a - 1, ln)
                        )
        # Field 0 is the class label; the rest is the feature vector.
        d.append(datum_tr(numpy.array([float(x) for x in a[1:]]),
                          a[0], uid)
                 )
    return d
252
def get_dim(fd):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the get_dim(training)/get_dim(testing) calls in compute_cross_class.
    """This function takes a list of data (type datum_tr)
    and makes sure that they all have the same length feature
    vector. If so, it reports the length (dimension) of the
    feature vector.
    @type fd: list(L{datum_tr})
    @param fd: input data
    @return: length of vectors
    @rtype: int
    @raise ValueError: if the vectors do not all have the same length
    """
    dim = None
    for (i, x) in enumerate(fd):
        assert isinstance(x, datum_tr), "Whoops: needs a datum_tr, got %s" % str(x)
        if len(x.value) != dim:
            if dim is None:
                # First datum establishes the expected dimension.
                dim = len(x.value)
            if len(x.value) != dim:
                raise ValueError("Not all vectors are length=%d (len(vec[%d])=%d)" % (dim, i, len(x.value)))
    return dim
273
274
275
276 -def compute_cross_class(training, testing,
277 modelchoice=None, n_per_dim=None,
278 builder=None, classout=None,
279 trainingset_name=None, modify_class=None,
280 verbose=True):
281 """Build classifiers based on the training set,
282 and test them on the testing set.
283 Modelchoice here is the completed class object,
284 not a closure."""
285
286
287 if len(training) == 0:
288 die.die('No training data.')
289 if len(testing) == 0:
290 die.die('No data to classify.')
291
292 dim = get_dim(training)
293 if dim <= 0:
294 die.die('zero dimensional data')
295 assert get_dim(testing) == dim
296
297 nok = 0
298 total = 0
299 wrong = dictops.dict_of_accums()
300 k = None
301
302 priorchance = prior(training)
303 maxcorrect = max_correct(training, testing)
304 classifiers = builder(training, n_per_dim*dim,
305 modelchoice=modelchoice,
306 trainingset_name=trainingset_name)
307 for qc in classifiers:
308 if modify_class is not None:
309 modify_class(qc, count_classes(training), count_classes(testing))
310 failures = qc.list_wrong_classifications(testing)
311 qc.add_info('correct', len(testing)-len(failures))
312 qc.add_info('Ntests', len(testing))
313 qc.add_info('Chance_on_this_trainingset', priorchance)
314 _addwrong(wrong, failures)
315 if classout is not None:
316 classout.extend( _doclass(qc, testing, verbose=verbose) )
317 nok += len(failures)
318 total += len(testing)
319 pcorrect = float(total-nok)/float(total)
320
321 if priorchance < maxcorrect:
322 k = (pcorrect-priorchance)/(maxcorrect-priorchance)
323 print '#NOK=', nok, total, float(nok)/total
324
325 summary = {'nok': nok, 'total': total,
326 'Pcorrect': float(total-nok)/float(total),
327 'Chance': priorchance,
328 'Pperfect': maxcorrect,
329 'N_per_dim': n_per_dim
330 }
331 if k is not None:
332 summary['K'] = k
333
334 return (summary, classifiers, wrong)
335
336
337
338 -def compute_self_class(d, coverage=None, ftest=None,
339 modelchoice=None, n_per_dim=None, modify_class=None,
340 builder=None, classout=None, verbose=True):
341
342 """modelchoice here is expected to take one argument-- the data."""
343
344 if len(d) == 0:
345 die.die('No data to classify.')
346 dim = get_dim(d)
347 if dim <= 0:
348 die.die('zero dimensional data')
349 Ntry = int(round(coverage/ftest))
350 bdata = DS.bluedata(d)
351 nok = 0; total = 0;
352 wrong = dictops.dict_of_accums()
353 out = []; k = []; pch = []; pmx = []; pcrct = []
354 for tr in range(Ntry):
355 print '# Building'
356 testing, training = bdata.split(ftest*len(bdata), seed=tr)
357
358
359 if modelchoice is not None:
360 c_modelchoice = modelchoice(training)
361 else:
362 c_modelchoice = None
363 tsum, classifiers, twr = compute_cross_class(training, testing, c_modelchoice,
364 n_per_dim, builder, classout,
365 trainingset_name=Hash(training),
366 modify_class=modify_class,
367 verbose=verbose
368 )
369
370 pmx.append( tsum['Pperfect'] )
371 pch.append( tsum['Chance'] )
372 total += tsum['total']
373 nok += tsum['nok']
374 pcrct.append( 1.0-float(tsum['nok'])/tsum['total'] )
375 _addwrongD(wrong, twr)
376 out.extend(classifiers)
377 if 'K' in tsum:
378 k.append( tsum['K'] )
379
380 summary = {'nok': nok, 'total': total, 'Ftest': ftest,
381 'N_per_dim': n_per_dim, 'Coverage': coverage}
382
383 try:
384 summary['K'], ksigma = gpkavg.avg(k, None, 0.0001)
385 summary['KSigma'] = ksigma
386 except ValueError, x:
387 die.info('%s: K' % str(x))
388
389 try:
390 summary['Chance'], chsigma = gpkavg.avg(pch, None, 0.0001)
391 summary['ChSigma'] = chsigma
392 except ValueError, x:
393 die.info('%s: PChance' % str(x))
394 try:
395 summary['Perfection'], prfsigma = gpkavg.avg(pmx, None, 0.0001)
396 summary['PerfectionSigma'] = prfsigma
397 except ValueError, x:
398 die.info('%s: Pperfect' % str(x))
399
400 try:
401 summary['Pcorrect'], psigma = gpkavg.avg(pcrct, None, 0.0001)
402 summary['PSigma'] = psigma
403 except ValueError, x:
404 die.info('%s: Pcorrect' % str(x))
405
406
407 return (summary, out, wrong)
408
414 """A 'grouper' function takes a DUID (a unique i.d. string
415 for a datum) and returns the name of the data group to which
416 it belongs. This group name is used in constructing the training
417 and test sets.
418 """
419 - def __init__(self, pattern='^([^/]*)/', which=1):
420 self.pattern = re.compile(pattern)
421 self.which = which
422
424 """
425 @param x: a datum
426 @type x: L{datum_c}
427 @return: group name
428 @rtype: str
429 """
430 m = self.pattern.search(x.uid)
431 if m:
432 return m.group(self.which)
433 return None
434
438 groups = set()
439 for t in d:
440 groups.add(gr(t))
441 return groups
442
443
444
445
446
447
448 -def compute_group_class(dg, modelchoice=None, n_per_dim=None,
449 builder=None, classout=None, ftest=None,
450 grouper=None, coverage=None, modify_class=None,
451 verbose=True):
452 """This function makes sure that the training set and testing set
453 come from different groups. The 'grouper' returns a group
454 name, when given a datum. Modelchoice is expected to take
455 one argument, the training set.
456 @param grouper: function returning a group name for each datum
457 @type grouper: function from L{datum_tr} to C{str}
458 """
459
460 if len(dg) == 0:
461 die.die('No data to classify.')
462 dim = get_dim(dg)
463 if dim <= 0:
464 die.die('zero dimensional data')
465
466 Ntry = int(round(coverage/ftest))
467 nok = 0; total = 0;
468 wrong = dictops.dict_of_accums()
469 out = []; k = []; pch = []; pcrct = []; pmx = []
470
471 bdata = DS.bluedata_groups(dg, grouper)
472 for i in range(Ntry):
473 print '# Building'
474 testing, training, trn = bdata.split(ftest*len(bdata), seed=i)
475 print 'testing:', ftest*len(bdata), len(testing), "training:", len(training), "from:", len(bdata), len(dg)
476
477
478 if modelchoice is not None:
479 c_modelchoice = modelchoice(training)
480 else:
481 c_modelchoice = None
482 tsum, classifiers, twr = compute_cross_class(training, testing,
483 c_modelchoice, n_per_dim,
484 builder, classout, modify_class=modify_class,
485 trainingset_name=Hash(training),
486 verbose=verbose
487 )
488
489 pmx.append( tsum['Pperfect'] )
490 pch.append( tsum['Chance'] )
491 total += tsum['total']
492 nok += tsum['nok']
493 pcrct.append( 1.0-float(tsum['nok'])/tsum['total'] )
494 _addwrongD(wrong, twr)
495 out.extend(classifiers)
496 if 'K' in tsum:
497 k.append( tsum['K'] )
498
499 summary = {'nok': nok, 'total': total, 'Ftest': ftest,
500 'N_per_dim': n_per_dim, 'Coverage': coverage}
501
502 try:
503 summary['K'], ksigma = gpkavg.avg(k, None, 0.0001)
504 summary['KSigma'] = ksigma
505 except ValueError, x:
506 die.info('%s: K' % str(x))
507
508 try:
509 summary['Chance'], chsigma = gpkavg.avg(pch, None, 0.0001)
510 summary['ChSigma'] = chsigma
511 except ValueError, x:
512 die.info('%s: PChance' % str(x))
513 try:
514 summary['Perfection'], prfsigma = gpkavg.avg(pmx, None, 0.0001)
515 summary['PerfectionSigma'] = prfsigma
516 except ValueError, x:
517 die.info('%s: Pperfect' % str(x))
518
519 try:
520 summary['Pcorrect'], psigma = gpkavg.avg(pcrct, None, 0.0001)
521 summary['PSigma'] = psigma
522 except ValueError, x:
523 die.info('%s: Pcorrect' % str(x))
524
525 return (summary, out, wrong)
526
class model_template(object):
    # NOTE(review): the class header and most method headers were lost in
    # extraction.  The class name is grounded by "class qmodel(model_template)"
    # and "class lmodel(model_template)" below; the method names are
    # reconstructed from the matching concrete methods in those subclasses
    # (logp, tochunk, __str__, fromchunk) -- confirm against the original.
    """This class describes how to compute the relative probability that
    a datum is a member of a particular class. It also knows how to
    package up all necessary information for storage in a file.
    """

    def logp(self, datum):
        """Log of the (unnormalized) probability that datum is in this class."""
        raise RuntimeError("Virtual Function")

    def tochunk(self, chunkwriter):
        """Serialize this model to a chunkio writer."""
        raise RuntimeError("Virtual Function")

    def __str__(self):
        raise RuntimeError("Virtual Function")

    @staticmethod
    def fromchunk(chunk):
        """Deserialize a model from a chunkio reader."""
        raise RuntimeError("Virtual Function")
553
554
class qmodel(model_template):
    """Quadratic class model: the log-probability is a negative quadratic
    form centered on C{mu}, plus a constant C{bias}.
    """

    def __init__(self, mu, invsigma, offset):
        """@param mu: class center (1-d vector)
        @param invsigma: inverse covariance matrix, shape (len(mu), len(mu))
        @param offset: additive bias for the log-probability
        """
        # numpy.float64 replaces the numpy.float alias removed in modern numpy.
        self.mu = numpy.array(mu, numpy.float64, copy=True)
        self.invsigma = numpy.array(invsigma, numpy.float64, copy=True)
        assert len(self.mu.shape) == 1
        # tuple*2 concatenates: invsigma must be square with mu's length.
        assert self.invsigma.shape == (self.mu.shape * 2), "Shapes must match: %s vs %s" % (str(self.invsigma.shape), str(self.mu.shape))
        self.bias = offset

    def logp(self, datum):
        """Log-probability of a feature vector under this class model."""
        delta = datum - self.mu
        parab = gpkmisc.qform(delta, self.invsigma)
        return -parab/2.0 + self.bias

    def tochunk(self, chunkwriter):
        # NOTE(review): method name reconstructed -- grounded by the
        # "inverse of qmodel.tochunk()" docstring in fromchunk.
        """Serialize this model to a chunkio writer."""
        chunkwriter.groupstart('quadratic_class_model', b=1 )
        chunkwriter.comment('Offset:')
        chunkwriter.write_float(self.bias)
        chunkwriter.comment('Mu:')
        chunkwriter.write_NumArray(self.mu, b=1)
        chunkwriter.comment('Inverse(covariance):')
        chunkwriter.write_NumArray(self.invsigma, b=1)
        chunkwriter.groupend()

    def __str__(self):
        return '<qmodel: mu=%s invsigma=%s bias=%g>' % (
            str(self.mu), str(self.invsigma),
            self.bias)

    @staticmethod
    def fromchunk(chunk):
        """This the inverse of qmodel.tochunk(),
        except the group start is already read.
        """
        offset = chunk.read_float()
        mu = chunk.read_NumArray()
        if len(mu.shape) != 1:
            raise chunkio.BadFileFormat('mu must be 1-d array')
        invsigma = chunk.read_NumArray()
        if len(invsigma.shape) != 2:
            raise chunkio.BadFileFormat('invsigma must be 2-d array')
        if invsigma.shape != mu.shape*2:
            raise chunkio.BadFileFormat('sizes do not match')
        return qmodel(mu, invsigma, offset)
604
605
606
607
# Cache for quadratic zero-models; presumably keyed by model dimension --
# TODO confirm: the code that fills this cache is not visible in this extraction.
_qzmodel_cache = {}
620
621
622
623
class lmodel(model_template):
    """Linear class model: the log-probability is an affine function of the
    feature vector (a dot product with C{direction}, relative to a
    reference point, plus a constant C{bias}).
    """

    def __init__(self, direction, offset, reference_pt):
        """@param direction: gradient of the log-probability (1-d vector)
        @param offset: additive bias for the log-probability
        @param reference_pt: point where logp() == offset
        """
        # numpy.float64 replaces the numpy.float alias removed in modern numpy.
        self.direction = numpy.array(direction, numpy.float64, copy=True)
        self.bias = offset
        self.reference = reference_pt

    def logp(self, datum):
        """Log-probability of a feature vector under this class model."""
        return self.bias + numpy.dot(datum-self.reference, self.direction)

    def __str__(self):
        return '<lmodel: %s ref=%s bias=%g>' % (str(self.direction), str(self.reference), self.bias)

    def tochunk(self, chunkwriter):
        # NOTE(review): method name reconstructed -- grounded by the
        # "inverse of lmodel.tochunk()" docstring in fromchunk.
        """Serialize this model to a chunkio writer."""
        chunkwriter.groupstart('linear_class_description', b=1 )
        chunkwriter.comment('Offset:')
        chunkwriter.write_float(self.bias)
        chunkwriter.comment('dir:')
        chunkwriter.write_NumArray(self.direction, b=1)
        chunkwriter.comment('reference_pt:')
        chunkwriter.write_NumArray(self.reference, b=1)
        chunkwriter.groupend()

    @staticmethod
    def fromchunk(chunk):
        """This the inverse of lmodel.tochunk(),
        except the group start is already read.
        """
        offset = chunk.read_float()
        direction = chunk.read_NumArray()
        if len(direction.shape) != 1:
            raise chunkio.BadFileFormat('direction must be 1-d array')
        ref = chunk.read_NumArray()
        # BUGFIX: the original re-tested direction.shape here (copy-paste
        # error); the reference point is what this branch is meant to check.
        if len(ref.shape) != 1:
            raise chunkio.BadFileFormat('reference_point must be 1-d array')
        if direction.shape != ref.shape:
            raise chunkio.BadFileFormat('sizes do not match')
        return lmodel(direction, offset, ref)
669
670
# Cache for linear zero-models; presumably keyed by model dimension --
# TODO confirm: the code that fills this cache is not visible in this extraction.
_lzmodel_cache = {}
681
686 """This is a thing that describes and generates L{classifier}s.
687 """
688
689 - def __init__(self, list_of_classes, fvdim, evaluator=None, ftrim=None):
690 assert isinstance(fvdim, int)
691 assert fvdim > 0
692 self.ndim = fvdim
693
694
695 self.c = list(list_of_classes)
696
697 self.nc = len(self.c)
698 assert self.nc > 0, "No classes!"
699 self.evaluator = evaluator
700 self.ftrim = ftrim
701
702
704 """@return: The number of parameters required to define one class.
705 @rtype: int
706 """
707 raise RuntimeError, "Virtual Function."
708
710 """@return:The number of parameters required to define the classifier.
711 @rtype: int
712 """
713 raise RuntimeError, "Virtual Function."
714
715 - def unpack(self, prmvec, trainingset_name=None, uid=None):
716 """Produce a classifier from a parameter vector.
717 @param prmvec: a vector of parameters that describe a classifier model.
718 @type prmvec: numpy.ndarray
719 @return: the classifier.
720 @rtype: the corresponding subclass of L{classifier}.
721 """
722 raise RuntimeError, "Virtual Function."
723
725 """Starting position for Markov Chain Monte Carlo."""
726 raise RuntimeError, "Virtual Function."
727
729
730 return "<classifier_desc: ndim=%d nc=%d>" % (self.ndim, self.nc)
731 __repr__ = __str__
732
733
735 """@return: a string that names the subclass - what kind of L{classifier_desc} is it?
736 @rtype str
737 """
738 raise RuntimeError, "Virtual function"
739
740
741 -def _logp(x, (data, cdesc)):
747
748
749
750 -def forest_build(data, N, modelchoice=None, trainingset_name=None):
751 """Build a forest of classifiers.
752 @param data: data to train the classifiers on.
753 @type data: L{datum_c}
754 @param N: How many to build.
755 @type N: int,
756 @param modelchoice: what kind of classifier to build
757 @type modelchoice: subclass of L{model_template}
758 @param trainingset_name: (stored for later use).
759 @type trainingset_name: str
760 """
761
762 assert len(data) > 1, "Not enough data"
763 assert modelchoice is not None
764 start, V = modelchoice.start(data)
765 x = mcmc.bootstepper(_logp, start, V, c=(data, modelchoice))
766 mcmch = mcmc_helper.stepper(x)
767 nsteps = mcmch.run_to_bottom()
768 mcmch.run_to_ergodic(1.0)
769 if nsteps > 100:
770 print '#NSTEPS:', nsteps
771 o = []
772 for i in range(N):
773 mcmch.run_to_ergodic(ERGCOVER/float(N))
774 tmp = modelchoice.unpack( x.current().prms(),
775 trainingset_name=trainingset_name,
776 uid='%s:%d' % (trainingset_name,i)
777 )
778 if D:
779 print 'Forest evaluate=', tmp.evaluate(data), "for", tmp
780 o.append(tmp)
781 return o
782
class classifier(object):
    # NOTE(review): the class header and several def lines were lost in
    # extraction.  The class name is grounded by "isinstance(qc, classifier)"
    # below; logPv/bestc/list_wrong_classifications/writer/add_info are
    # grounded by visible call sites.  add_class and classes are
    # reconstructed names -- confirm against the original; add_info's body
    # is reconstructed from its qc.add_info(key, value) call sites.
    """This is the base class for all kinds of classifers."""

    def __init__(self, typename, models, info=None,
                 trainingset_name=None, uid=None, cdesc=None):
        """
        @param models: a dictionary containg a probabilistic model for each class
        @type models: dict(str: subclass of L{model_template})
        @param info: not used in the internal operation of the classifier,
            but it is stuff that is important to write out.
        @type info: dict(str: whatever)
        """
        self.typename = typename
        assert isinstance(models, dict)
        self.class_models = models
        if info is not None:
            self.info = info.copy()
        else:
            self.info = {}
        self.info['trainingset'] = trainingset_name
        self.info['Cuid'] = uid
        self.cdesc = cdesc

    def add_class(self, classname, model):
        """Add a class to an existing classifier."""
        g_implements.check(model, model_template)
        assert isinstance(classname, str)
        self.class_models[classname] = model

    def classes(self):
        """@return: the names of all known classes."""
        return self.class_models.keys()

    def add_info(self, key, value):
        """Attach a key/value pair to the information that gets written out."""
        self.info[key] = value

    def P(self, datum, whichclass=None):
        """Determine the probability of being in each class.
        If whichclass=None, then it returns a list of
        tuples [(classname,P), ...] for all classes.
        Otherwise, it returns the probability of the
        specified class.
        """
        if whichclass is None:
            return [ (cn, math.exp(lp)) for (cn, lp) in self.logPv(datum) ]
        return math.exp(self.logPw(datum, whichclass, 0.0))

    def logP(self, datum, whichclass=None):
        """Log-probability version of P()."""
        if whichclass is None:
            return self.logPv(datum)
        return self.logPw(datum, whichclass, 0.0)

    def logPw(self, datum, whichclass, constrain):
        """Determine the log-probability of C{datum} being in C{whichclass}.
        The running sum is kept relative to the biggest log-probability
        seen so far, to avoid overflow; C{constrain} penalizes a large
        spread between the biggest and smallest class log-probabilities.
        """
        Psum = 0.0
        bgst = -self.HUGE
        smlst = self.HUGE
        the_lP = None
        for (cn, cm) in self.class_models.items():
            lp = cm.logp(datum.value)
            if cn == whichclass:
                the_lP = lp
            if smlst > lp:
                smlst = lp
            if bgst < lp:
                # Rescale the accumulated sum to the new maximum.
                Psum *= math.exp(bgst-lp)
                bgst = lp
            Psum += math.exp(lp-bgst)
        return the_lP - bgst - math.log(Psum) - constrain*(bgst-smlst)**2

    def logPv(self, datum):
        """Determine the log-probability of being in each class.
        @return: a list of tuples [(classname, logP), ...] for all classes.
        """
        Psum = 0.0
        lcnP = []
        bgst = None
        for (cn, cm) in self.class_models.items():
            lp = cm.logp(datum.value)
            lcnP.append( (cn, lp) )
            if bgst is None:
                bgst = lp
            elif bgst < lp:
                # Rescale the accumulated sum to the new maximum.
                Psum *= math.exp(bgst-lp)
                bgst = lp
            Psum += math.exp(lp-bgst)
        q = bgst + math.log(Psum)	# log of the normalization constant
        return [ (cn, lp-q) for (cn, lp) in lcnP ]

    HUGE = 1e30		# effectively infinity for log-probability comparisons

    def bestc(self, datum):
        """Determine the best class for a datum."""
        bgst = -self.HUGE
        bgc = None
        for (c, qp) in self.class_models.items():
            tmp = qp.logp(datum.value)
            if tmp > bgst:
                bgst = tmp
                bgc = c
        return bgc

    def __str__(self):
        o = [ '<classifier: %s' % self.typename ]
        for (c, q) in self.class_models.items():
            o.append('%s: %s' % (c, str(q)))
        o.append( '>' )
        return '\n'.join(o)

    __repr__ = __str__

    def list_wrong_classifications(self, classdata):
        """@return: the uids of the data that are misclassified."""
        nok = []
        for datum in classdata:
            bestc = self.bestc(datum)
            if bestc != datum.classid:
                nok.append(datum.uid)
        return nok

    def writer(self, dcw):
        """Write this classifier out (usually to a data file).
        @type dcw: L{chunkio.chunk_w}
        """
        dcw.groupstart(self.typename)
        dcw.comment('Classifier:')
        dcw.write_dict_of(self.class_models,
                          lambda dcw, ac: ac.tochunk(dcw), b=1)
        dcw.write_dict(self.info)
        dcw.groupend()
928
929
930
931
932
933
934
935
936
937
938
939
940
941
def evaluate_match(cl, data):
    """This can be passed into a classifier descriptor as
    the evaluate argument.
    It returns the number of exact matches between the classified
    data and the input, true classification.
    @param cl: some classifier that has a bestc() method.
    @type cl: typically a subclass of L{classifier}.
    @param data: list of classes that describe data points.
    @type data: Typically a subclass of L{datum_c}.
    @rtype: int
    @return: the number of classification errors made
    """
    o = 0
    for datum in data:
        sbc = cl.bestc(datum)
        if sbc != datum.classid:
            o += 1
    return o
962
class evaluate_match_w_rare(object):
    # NOTE(review): the class header and def lines were lost in extraction.
    # The class name is grounded by the evaluate_match_w_rare() call in the
    # name-to-evaluator mapping below; _compute_weights is grounded by the
    # self._compute_weights(data) call; __init__ is reconstructed from the
    # "if self.weights is None" lazy-initialization in __call__.
    """This is called in the same way as L{evaluate_match}
    or L{evaluate_Bayes}. It pretends to be a function,
    except that you can weight the values of different
    classes when you construct the class.
    """
    __name__ = 'evaluate_match_w_rare'

    def __init__(self):
        # Weights are computed lazily from the first dataset seen.
        self.weights = None

    def _compute_weights(self, data):
        """Weight each class by -log of its relative frequency, so errors
        on rare classes cost more."""
        classcounts = dictops.dict_of_accums()
        for datum in data:
            classcounts.add(datum.classid, 1)

        ctotal = 0
        for counts in classcounts.values():
            ctotal += counts
        ctotal = float(ctotal)
        weights = {}
        for (classname, counts) in classcounts.items():
            weights[classname] = -math.log(float(counts)/ctotal)
        return weights

    def __call__(self, clssfr, data):
        """@return: the frequency-weighted sum of misclassifications."""
        if self.weights is None:
            self.weights = self._compute_weights(data)

        o = 0
        for datum in data:
            if clssfr.bestc(datum) != datum.classid:
                o += self.weights[datum.classid]
        return o
1000
1005 """Evaluates the negative log of the probability that the classifier would assign
1006 to the datum being in the observed class (i.e. whatever class
1007 is specified in the L{datum_tr}). Obviously, you
1008 want this to be a relatively small number.
1009 @param cl: a classifier
1010 @type cl: L{classifier}
1011 @param data: data
1012 @type data: list(L{datum_c})
1013 @rtype: float
1014 @return: the log of the probability of being in the observed class.
1015
1016 If C{cdesc.ftrim} is not None, we assume that some of the
1017 data in each class are dubious, and should
1018 be ignored if they are sufficiently improbable.
1019 We modify the probability scores of data that is
1020 among the worst (C{cl.cdesc.ftrim[0]} fraction),
1021 and limit those scores to be no larger than
1022 C{cl.cdesc.ftrim[1]} larger than the best score.
1023 This lets you limit by score or limit by fraction
1024 or any mixture in between. If C{cl.cdesc.ftrim} is C{None},
1025 then no limiting or trimming is done.
1026 """
1027 if cl.cdesc.ftrim is None:
1028 o = 0.0
1029 for datum in data:
1030 o += cl.logPw(datum, datum.classid, constrain)
1031 return -o
1032
1033
1034
1035 trimfrac, trimlevel = cl.cdesc.ftrim
1036 assert trimfrac>=0 and trimfrac<=1.0
1037 o = dictops.dict_of_lists()
1038 for datum in data:
1039 sb = cl.logPw(datum, datum.classid, constrain)
1040 assert isinstance(datum, datum_tr), "Use ftrim only with training set!"
1041 o.add(datum.classid, sb)
1042 oo = 0.0
1043 for classid, scorelist in o.items():
1044 scorelist.sort()
1045 n = int(round(trimfrac*(len(scorelist)-1)))
1046 tcut = scorelist[-1] - trimlevel
1047 assert n > 0
1048 for t1 in scorelist[n:]:
1049 if D and t1 < tcut:
1050 print 'Trimming %g %g in %s' % (t1, tcut, classid)
1051 oo += max(tcut, t1)
1052 if D:
1053 for t1 in scorelist[:n]:
1054 print 'Trimming %g (bot) in %s' % (t1, classid)
1055 return -oo
1056
1057
1058
1059
1060
1061 -def default_writer(summary, out, classout, wrong, fname="classes.chunk"):
1062 """This writes out classifiers to a data file.
1063 @attention: out needs to be a list, not an iterator, because we use it twice.
1064 """
1065
1066 def classifier_writer(dcw, a_classifier):
1067 """Helper function to work with L{chunkio}."""
1068 a_classifier.writer(dcw)
1069
1070 dc = chunkio.datachunk_w( open(fname, "w") )
1071 dc.comment('Header:')
1072 header = summary.copy()
1073 ctype = None
1074 for cl in out:
1075 if ctype is None:
1076 ctype = cl.typename
1077 else:
1078 assert cl.typename == ctype, 'Cannot handle mixtures of different classifiers.'
1079 header['classifier_type'] = ctype
1080 dc.write_dict( header )
1081 dc.comment('classifiers:')
1082 dc.write_array_of(out, classifier_writer, b=1)
1083 out = None
1084 dc = None
1085 for (uid, nfailures) in wrong.items():
1086 print 'WRONG', nfailures, summary['total'], uid
1087
def count_classes(data):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the count_classes(training)/count_classes(testing) calls in
    # compute_cross_class.
    """Count how many instances there are of each class.
    @type data: L{datum_c}
    @rtype: map from str to int
    """
    cids = dictops.dict_of_accums()
    for datum in data:
        assert isinstance(datum.classid, str)
        cids.add(datum.classid, 1)
    return cids
1099
def list_classes(data):
    # NOTE(review): the def line was lost in extraction and this function is
    # not called from visible code; the name is reconstructed from the
    # docstring -- confirm against external callers.
    """List the names of the classes in a dataset,
    with the most populus classes first.
    @type data: L{datum_c}
    @rtype: list(str)
    """
    # Negate the counts so an ascending sort yields most-populous-first,
    # with ties broken alphabetically by class name.
    cn = [(-n, cid) for (cid, n) in count_classes(data).items()]
    cn.sort()
    return [cid for (n, cid) in cn]
1110
1116 """Used to get the name of an evaluator, to write it
1117 to a file header.
1118 @rtype: str
1119 @param e:
1120 @type e: function, preferable with __name__ attribute.
1121 """
1122 if e is None:
1123 return evaluate_Bayes.__name__
1124 elif hasattr(e, '__name__'):
1125 return e.__name__
1126 else:
1127 return str(e)
1128
1132 """Maps a name to a function that will evaluate how well a classifier performs.
1133 @param nm: a printable name
1134 @type nm: str
1135 @return: a function
1136 """
1137 if nm is None or nm == 'Bayes' or nm == 'evaluate_Bayes':
1138 return evaluate_Bayes
1139 elif nm == 'match' or nm == 'evaluate_match':
1140 return evaluate_match
1141 elif nm == 'match_w_rare' or nm == 'evaluate_match_w_rare':
1142 return evaluate_match_w_rare()
1143 else:
1144 die.die('Bad name for evaluator: %s' % nm)
1145
1152 """Modifies a classifier so it isn't so dominated by the most frequent classes.
1153 @type qc: L{classifier}
1154 @param training_counts: how many data are there in each class in the training set
1155 @type training_counts: map str to int
1156 @param testing_counts: how many data are there in each class in the testing set
1157 @type testing_counts: map str to int
1158 """
1159 assert isinstance(qc, classifier)
1160 F = 0.5
1161 ntr = 0
1162 for n in training_counts.values():
1163 ntr += n
1164 nts = 0
1165 for n in testing_counts.values():
1166 nts += n
1167 ntyp = 0
1168 neff = {}
1169 for nm in qc.class_models.keys():
1170 tmp = (training_counts[nm]*nts+testing_counts[nm]*ntr)/(ntr+nts)
1171 ntyp += tmp
1172 neff[nm] = tmp
1173 ntyp = float(ntyp)/float(len(qc.class_models))
1174 for (nm, mod) in qc.class_models.items():
1175 mod.bias -= F*math.log(float(neff[nm])/float(ntyp))
1176