1
2 """This is a support module, used by many types of classifiers."""
3
4 import re
5 import math
6 import zlib
7
8 import numpy
9
10 from gmisclib import chunkio
11 from g_classifiers import data_splitter as DS
12 from gmisclib import die
13 from gmisclib import mcmc
14 from gmisclib import mcmc_helper
15 mcmc_helper.Debug = 0
16 mcmc.Debug = 0
17 from gmisclib import g_implements
18 from gmisclib import fiatio
19 from gmisclib import dictops
20 from gmisclib import gpkmisc
21 from gmisclib import dict_vector as DV
22 import gpkavg
23
24
25
# Module-level tuning constants.
ERGCOVER = 4.0		# Total MCMC ergodic coverage spread over the samples taken in forest_build().
D = False		# Debug flag: when True, extra per-classifier diagnostics are printed.
CONSTRAIN = 1e-6	# presumably the default `constrain` weight for Bayes evaluation -- TODO confirm (not referenced in visible code)
def Hash(dl):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the Hash(training) call sites in compute_self_class/compute_group_class.
    """@return: a hash of the UID's of data items.
    @rtype: str
    @param dl: A list of data
    @type dl: list(L{datum_c})
    """
    Mask1 = 0xffffffff		# keep each CRC in 32 bits (crc32 is signed on Python 2)
    Mask2 = 0xffffffffffff	# keep the running hash in 48 bits
    rv = 0
    # Sort so the hash is independent of the order of the data.
    for x in sorted(dl):
        tmp = zlib.crc32(x.uid) & Mask1
        rv = ((rv + tmp) * 13) & Mask2
    return '%d' % rv
43
def Hash1(l):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the module-level assert below and the call in datum_c.__init__.
    """@return: a hash of a vector.
    @rtype: str
    @param l: A list of data
    """
    Mask1 = 0xffffffff
    Mask2 = 0xffffffff
    rv = 0
    for t in l:
        # .encode('ascii') keeps the bytes identical on Python 2 (str(number)
        # is always ASCII) while making crc32 work on Python 3 as well.
        tmp = zlib.crc32(str(t).encode('ascii')) & Mask1
        rv = ((rv + tmp) * 11) & Mask2
    return '%d' % rv
57
58 assert Hash1([3.712, 3.71]) == Hash1([3.712, 3.71])
class datum_c(object):
    # NOTE(review): the class header was lost in extraction; the name is
    # grounded by datum_c.__init__ in datum_tr and the docstrings.
    """This is an unclassified datum, either in the
    test or training set.
    @ivar value: a feature vector
    @type value: L{numpy.ndarray}
    @ivar uid: a string that identfies a particular datum.
        This is not used for classification, but is passed
        through to the output. (It can also used to define groups
        of data.)
    @type uid: str
    """

    __slots__ = ['value', 'uid']

    def __init__(self, vector, uid=None):
        """@param vector: a vector of numbers
        @param uid: an arbitrary identifier for a datum
        @type uid: str
        """
        # numpy.float64 == builtin float; replaces the numpy.float alias
        # that was removed from modern numpy.
        self.value = numpy.asarray(vector, numpy.float64)
        if uid is None:
            # Derive a stable identifier from the feature vector itself.
            uid = Hash1(vector)
        self.uid = uid

    def __str__(self):
        return "<datum_c fv=%s uid=%s>" % (str(self.value), self.uid)

    def __cmp__(self, other):
        # Python 2 ordering hook: data sort by uid (used by Hash()).
        return cmp(self.uid, other.uid)
91
class datum_tr(datum_c):
    # NOTE(review): class header lost in extraction; name grounded by the
    # datum_tr(...) construction in read_data and isinstance checks elsewhere.
    """This is a datum where we know the true class, presumably in
    the training set.
    @ivar classid: the name of the class to which the datum belongs.
    @type classid: str
    """

    __slots__ = ['classid']

    def __init__(self, vector, classid, uid=None):
        """@param vector: a vector of numbers
        @param uid: an arbitrary identifier for a datum
        @type uid: str
        @param classid: is the true class
        @type classid: str
        """
        datum_c.__init__(self, vector, uid)
        self.classid = classid

    def __str__(self):
        return "<datum_tr class=%s fv=%s uid=%s>" % (self.classid,
                                                     str(self.value),
                                                     self.uid)
    __repr__ = __str__
117
118
119
120
121
def prior(training):
    """This computes the probability of correct classification,
    assuming you can't see the feature vector.
    It is used to compute P(chance).
    It assumes that you choose class C
    with probability 1 if P(C) is the
    biggest among all the classes.
    @param training: training data
    @type training: list(L{datum_tr})
    @rtype: float
    """
    # Count class populations with the stdlib instead of the
    # dictops/dict_vector round-trip; the result is identical:
    # (size of the most populous class) / (number of data).
    from collections import Counter
    counts = Counter(datum.classid for datum in training)
    chance = max(counts.values())
    return chance / float(len(training))
135
def max_correct(training, testing):
    # NOTE(review): the def line was lost in extraction; the name and
    # signature are grounded by the max_correct(training, testing) call
    # in compute_cross_class.
    """This is a hard, conservative upper limit
    for the probability of correct classification:
    a test datum whose true class never appears in the
    training set cannot possibly be classified correctly.
    @rtype: float
    """
    seen = set()
    for datum in training:
        seen.add(datum.classid)

    impossible = 0
    for datum in testing:
        if datum.classid not in seen:
            impossible += 1

    return float(len(testing) - impossible) / float(len(testing))
162
163
164
165
166
167
def _addwrongD(wrong, failures):
    """Merge a {uid: count} mapping of misclassifications into the
    running tally.
    @param wrong: accumulator with an add(key, increment) method
    @param failures: mapping from datum uid to failure count
    """
    for (uid, n) in failures.items():
        wrong.add(uid, n)
171
def _addwrong(wrong, failures):
    # NOTE(review): the def line was lost in extraction; the name is
    # grounded by the _addwrong(wrong, failures) call in compute_cross_class.
    """Count each failed uid once into the running tally of misclassifications.
    @param wrong: accumulator with an add(key, increment) method
    @param failures: iterable of datum uids that were misclassified
    """
    for uid in failures:
        wrong.add(uid, 1)
176
177
def _doclass(qc, testing, verbose=True):
    """This classifies the data.
    @return: yields a sequence of dictionaries,
        each of which contains a bunch of results and information.
    @param testing: a list of data.
    @param qc: a classifier.
    @type qc: subclass of L{classifier}.
    @param verbose: when True, also serialize the feature vector and the
        full probability table into each result dictionary.
    """
    info = qc.info.copy()
    for datum in testing:
        P = qc.P(datum)
        tmp = info.copy()
        # Pick the class with the largest probability.
        bestcl = None
        bestp = 0.0
        for (cl, p) in P:
            if p > bestp:
                bestp = p
                bestcl = cl
        tmp['compclass'] = bestcl

        if datum.uid is not None:
            tmp['Duid'] = datum.uid

        if verbose:
            tmp['V'] = chunkio.chunkstring_w().write_NumArray(datum.value, converter=lambda x: "%.6g" % x).close()
            tmp['P'] = chunkio.chunkstring_w().write_dict(dict(P), converter=lambda x: "%.3e" % x).close()
        # Test data may legitimately lack a true class label.
        try:
            tmp['trueclass'] = datum.classid
        except AttributeError:
            pass
        yield tmp
213
214
215
216
def read_data(fd, commentarray=None):
    """Reads in feature vectors where the first element is the true class.
    This is the main data input for l_classifier, qdg_classifier and qd_classifier.
    Lines starting with '#' are comments; a trailing '# ...' on a data line
    becomes that datum's uid.
    @type fd: L{file}
    @type commentarray: a L{list} or L{None}
    @rtype: L{list}(L{datum_tr})
    """
    d = []
    len_a = None
    # Single-argument print in call form behaves identically on Python 2 and 3.
    print('# Reading')
    ln = 0
    for l in fd:
        ln += 1
        if l.startswith('#'):
            if commentarray is not None:
                commentarray.append(l[1:])
            continue
        aa = l.split('#', 1)
        if len(aa) > 1:
            uid = aa[1].strip()
        else:
            uid = 'Line:%d' % ln
        a = aa[0].split()
        # All data lines must have the same number of fields.
        if len(a) != len_a:
            if len_a is None:
                len_a = len(a)
            else:
                die.die('Not all vectors have length=%d. Problem on line %d'
                        % (len_a - 1, ln)
                        )
        # Field 0 is the class label; the rest is the feature vector.
        d.append(datum_tr(numpy.array([float(x) for x in a[1:]]),
                          a[0], uid)
                 )
    return d
252
def get_dim(fd):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the get_dim(training)/get_dim(testing) calls in compute_cross_class.
    """This function takes a list of data (type datum_tr)
    and makes sure that they all have the same length feature
    vector. If so, it reports the length (dimension) of the
    feature vector.
    @type fd: list(L{datum_tr})
    @param fd: input data
    @return: length of vectors
    @rtype: int
    @raise ValueError: if the vectors do not all have the same length
    """
    dim = None
    for (i, x) in enumerate(fd):
        assert isinstance(x, datum_tr), "Whoops: needs a datum_tr, got %s" % str(x)
        if len(x.value) != dim:
            if dim is None:
                # First datum establishes the expected dimension.
                dim = len(x.value)
            if len(x.value) != dim:
                raise ValueError("Not all vectors are length=%d (len(vec[%d])=%d)" % (dim, i, len(x.value)))
    return dim
273
274
275
276 -def compute_cross_class(training, testing,
277 modelchoice=None, n_per_dim=None,
278 builder=None, classout=None,
279 trainingset_name=None, modify_class=None,
280 verbose=True):
281 """Build classifiers based on the training set,
282 and test them on the testing set.
283 Modelchoice here is the completed class object,
284 not a closure."""
285
286
287 if len(training) == 0:
288 die.die('No training data.')
289 if len(testing) == 0:
290 die.die('No data to classify.')
291
292 dim = get_dim(training)
293 if dim <= 0:
294 die.die('zero dimensional data')
295 assert get_dim(testing) == dim
296
297 nok = 0
298 total = 0
299 wrong = dictops.dict_of_accums()
300 k = None
301
302 priorchance = prior(training)
303 maxcorrect = max_correct(training, testing)
304 classifiers = builder(training, n_per_dim*dim,
305 modelchoice=modelchoice,
306 trainingset_name=trainingset_name)
307 for qc in classifiers:
308 if modify_class is not None:
309 modify_class(qc, count_classes(training), count_classes(testing))
310 failures = qc.list_wrong_classifications(testing)
311 qc.add_info('correct', len(testing)-len(failures))
312 qc.add_info('Ntests', len(testing))
313 qc.add_info('Chance_on_this_trainingset', priorchance)
314 _addwrong(wrong, failures)
315 if classout is not None:
316 classout.extend( _doclass(qc, testing, verbose=verbose) )
317 nok += len(failures)
318 total += len(testing)
319 pcorrect = float(total-nok)/float(total)
320
321 if priorchance < maxcorrect:
322 k = (pcorrect-priorchance)/(maxcorrect-priorchance)
323 print '#NOK=', nok, total, float(nok)/total
324
325 summary = {'nok': nok, 'total': total,
326 'Pcorrect': float(total-nok)/float(total),
327 'Chance': priorchance,
328 'Pperfect': maxcorrect,
329 'N_per_dim': n_per_dim
330 }
331 if k is not None:
332 summary['K'] = k
333
334 return (summary, classifiers, wrong)
335
336
337
338 -def compute_self_class(d, coverage=None, ftest=None,
339 modelchoice=None, n_per_dim=None, modify_class=None,
340 builder=None, classout=None, verbose=True):
341
342 """modelchoice here is expected to take one argument-- the data."""
343
344 if len(d) == 0:
345 die.die('No data to classify.')
346 dim = get_dim(d)
347 if dim <= 0:
348 die.die('zero dimensional data')
349 Ntry = int(round(coverage/ftest))
350 bdata = DS.bluedata(d)
351 nok = 0; total = 0;
352 wrong = dictops.dict_of_accums()
353 out = []; k = []; pch = []; pmx = []; pcrct = []
354 for tr in range(Ntry):
355 print '# Building'
356 testing, training = bdata.split(ftest*len(bdata), seed=tr)
357
358
359 if modelchoice is not None:
360 c_modelchoice = modelchoice(training)
361 else:
362 c_modelchoice = None
363 tsum, classifiers, twr = compute_cross_class(training, testing, c_modelchoice,
364 n_per_dim, builder, classout,
365 trainingset_name=Hash(training),
366 modify_class=modify_class,
367 verbose=verbose
368 )
369
370 pmx.append( tsum['Pperfect'] )
371 pch.append( tsum['Chance'] )
372 total += tsum['total']
373 nok += tsum['nok']
374 pcrct.append( 1.0-float(tsum['nok'])/tsum['total'] )
375 _addwrongD(wrong, twr)
376 out.extend(classifiers)
377 if 'K' in tsum:
378 k.append( tsum['K'] )
379
380 summary = {'nok': nok, 'total': total, 'Ftest': ftest,
381 'N_per_dim': n_per_dim, 'Coverage': coverage}
382
383 try:
384 summary['K'], ksigma = gpkavg.avg(k, None, 0.0001)
385 summary['KSigma'] = ksigma
386 except ValueError, x:
387 die.info('%s: K' % str(x))
388
389 try:
390 summary['Chance'], chsigma = gpkavg.avg(pch, None, 0.0001)
391 summary['ChSigma'] = chsigma
392 except ValueError, x:
393 die.info('%s: PChance' % str(x))
394 try:
395 summary['Perfection'], prfsigma = gpkavg.avg(pmx, None, 0.0001)
396 summary['PerfectionSigma'] = prfsigma
397 except ValueError, x:
398 die.info('%s: Pperfect' % str(x))
399
400 try:
401 summary['Pcorrect'], psigma = gpkavg.avg(pcrct, None, 0.0001)
402 summary['PSigma'] = psigma
403 except ValueError, x:
404 die.info('%s: Pcorrect' % str(x))
405
406
407 return (summary, out, wrong)
408
414 """A 'grouper' function takes a DUID (a unique i.d. string
415 for a datum) and returns the name of the data group to which
416 it belongs. This group name is used in constructing the training
417 and test sets.
418 """
419 - def __init__(self, pattern='^([^/]*)/', which=1):
420 self.pattern = re.compile(pattern)
421 self.which = which
422
424 """
425 @param x: a datum
426 @type x: L{datum_c}
427 @return: group name
428 @rtype: str
429 """
430 m = self.pattern.search(x.uid)
431 if m:
432 return m.group(self.which)
433 return None
434
438 groups = set()
439 for t in d:
440 groups.add(gr(t))
441 return groups
442
443
444
445
446
447
448 -def compute_group_class(dg, modelchoice=None, n_per_dim=None,
449 builder=None, classout=None, ftest=None,
450 grouper=None, coverage=None, modify_class=None,
451 verbose=True):
452 """This function makes sure that the training set and testing set
453 come from different groups. The 'grouper' returns a group
454 name, when given a datum. Modelchoice is expected to take
455 one argument, the training set.
456 @param grouper: function returning a group name for each datum
457 @type grouper: function from L{datum_tr} to C{str}
458 """
459
460 if len(dg) == 0:
461 die.die('No data to classify.')
462 dim = get_dim(dg)
463 if dim <= 0:
464 die.die('zero dimensional data')
465
466 Ntry = int(round(coverage/ftest))
467 nok = 0; total = 0;
468 wrong = dictops.dict_of_accums()
469 out = []; k = []; pch = []; pcrct = []; pmx = []
470
471 bdata = DS.bluedata_groups(dg, grouper)
472 for i in range(Ntry):
473 print '# Building'
474 testing, training, trn = bdata.split(ftest*len(bdata), seed=i)
475 print 'testing:', ftest*len(bdata), len(testing), "training:", len(training), "from:", len(bdata), len(dg)
476
477
478 if modelchoice is not None:
479 c_modelchoice = modelchoice(training)
480 else:
481 c_modelchoice = None
482 tsum, classifiers, twr = compute_cross_class(training, testing,
483 c_modelchoice, n_per_dim,
484 builder, classout, modify_class=modify_class,
485 trainingset_name=Hash(training),
486 verbose=verbose
487 )
488
489 pmx.append( tsum['Pperfect'] )
490 pch.append( tsum['Chance'] )
491 total += tsum['total']
492 nok += tsum['nok']
493 pcrct.append( 1.0-float(tsum['nok'])/tsum['total'] )
494 _addwrongD(wrong, twr)
495 out.extend(classifiers)
496 if 'K' in tsum:
497 k.append( tsum['K'] )
498
499 summary = {'nok': nok, 'total': total, 'Ftest': ftest,
500 'N_per_dim': n_per_dim, 'Coverage': coverage}
501
502 try:
503 summary['K'], ksigma = gpkavg.avg(k, None, 0.0001)
504 summary['KSigma'] = ksigma
505 except ValueError, x:
506 die.info('%s: K' % str(x))
507
508 try:
509 summary['Chance'], chsigma = gpkavg.avg(pch, None, 0.0001)
510 summary['ChSigma'] = chsigma
511 except ValueError, x:
512 die.info('%s: PChance' % str(x))
513 try:
514 summary['Perfection'], prfsigma = gpkavg.avg(pmx, None, 0.0001)
515 summary['PerfectionSigma'] = prfsigma
516 except ValueError, x:
517 die.info('%s: Pperfect' % str(x))
518
519 try:
520 summary['Pcorrect'], psigma = gpkavg.avg(pcrct, None, 0.0001)
521 summary['PSigma'] = psigma
522 except ValueError, x:
523 die.info('%s: Pcorrect' % str(x))
524
525 return (summary, out, wrong)
526
class model_template(object):
    # NOTE(review): the class header and most method headers were lost in
    # extraction.  The class name is grounded by "class qmodel(model_template)"
    # and "class lmodel(model_template)" below; the method names are
    # reconstructed from the matching concrete methods in those subclasses
    # (logp, tochunk, __str__, fromchunk) -- confirm against the original.
    """This class describes how to compute the relative probability that
    a datum is a member of a particular class. It also knows how to
    package up all necessary information for storage in a file.
    """

    def logp(self, datum):
        """Log of the (unnormalized) probability that datum is in this class."""
        raise RuntimeError("Virtual Function")

    def tochunk(self, chunkwriter):
        """Serialize this model to a chunkio writer."""
        raise RuntimeError("Virtual Function")

    def __str__(self):
        raise RuntimeError("Virtual Function")

    @staticmethod
    def fromchunk(chunk):
        """Deserialize a model from a chunkio reader."""
        raise RuntimeError("Virtual Function")
553
554
class qmodel(model_template):
    """Quadratic class model: the log-probability is a negative quadratic
    form centered on C{mu}, plus a constant C{bias}.
    """

    def __init__(self, mu, invsigma, offset):
        """@param mu: class center (1-d vector)
        @param invsigma: inverse covariance matrix, shape (len(mu), len(mu))
        @param offset: additive bias for the log-probability
        """
        # numpy.float64 replaces the numpy.float alias removed in modern numpy.
        self.mu = numpy.array(mu, numpy.float64, copy=True)
        self.invsigma = numpy.array(invsigma, numpy.float64, copy=True)
        assert len(self.mu.shape) == 1
        # tuple*2 concatenates: invsigma must be square with mu's length.
        assert self.invsigma.shape == (self.mu.shape * 2), "Shapes must match: %s vs %s" % (str(self.invsigma.shape), str(self.mu.shape))
        self.bias = offset

    def logp(self, datum):
        """Log-probability of a feature vector under this class model."""
        delta = datum - self.mu
        parab = gpkmisc.qform(delta, self.invsigma)
        return -parab/2.0 + self.bias

    def tochunk(self, chunkwriter):
        # NOTE(review): method name reconstructed -- grounded by the
        # "inverse of qmodel.tochunk()" docstring in fromchunk.
        """Serialize this model to a chunkio writer."""
        chunkwriter.groupstart('quadratic_class_model', b=1 )
        chunkwriter.comment('Offset:')
        chunkwriter.write_float(self.bias)
        chunkwriter.comment('Mu:')
        chunkwriter.write_NumArray(self.mu, b=1)
        chunkwriter.comment('Inverse(covariance):')
        chunkwriter.write_NumArray(self.invsigma, b=1)
        chunkwriter.groupend()

    def __str__(self):
        return '<qmodel: mu=%s invsigma=%s bias=%g>' % (
            str(self.mu), str(self.invsigma),
            self.bias)

    @staticmethod
    def fromchunk(chunk):
        """This the inverse of qmodel.tochunk(),
        except the group start is already read.
        """
        offset = chunk.read_float()
        mu = chunk.read_NumArray()
        if len(mu.shape) != 1:
            raise chunkio.BadFileFormat('mu must be 1-d array')
        invsigma = chunk.read_NumArray()
        if len(invsigma.shape) != 2:
            raise chunkio.BadFileFormat('invsigma must be 2-d array')
        if invsigma.shape != mu.shape*2:
            raise chunkio.BadFileFormat('sizes do not match')
        return qmodel(mu, invsigma, offset)
604
605
606
607
# Cache for quadratic zero-models; presumably keyed by model dimension --
# TODO confirm: the code that fills this cache is not visible in this extraction.
_qzmodel_cache = {}
620
621
622
623
class lmodel(model_template):
    """Linear class model: the log-probability is an affine function of the
    feature vector (a dot product with C{direction}, relative to a
    reference point, plus a constant C{bias}).
    """

    def __init__(self, direction, offset, reference_pt):
        """@param direction: gradient of the log-probability (1-d vector)
        @param offset: additive bias for the log-probability
        @param reference_pt: point where logp() == offset
        """
        # numpy.float64 replaces the numpy.float alias removed in modern numpy.
        self.direction = numpy.array(direction, numpy.float64, copy=True)
        self.bias = offset
        self.reference = reference_pt

    def logp(self, datum):
        """Log-probability of a feature vector under this class model."""
        return self.bias + numpy.dot(datum-self.reference, self.direction)

    def __str__(self):
        return '<lmodel: %s ref=%s bias=%g>' % (str(self.direction), str(self.reference), self.bias)

    def tochunk(self, chunkwriter):
        # NOTE(review): method name reconstructed -- grounded by the
        # "inverse of lmodel.tochunk()" docstring in fromchunk.
        """Serialize this model to a chunkio writer."""
        chunkwriter.groupstart('linear_class_description', b=1 )
        chunkwriter.comment('Offset:')
        chunkwriter.write_float(self.bias)
        chunkwriter.comment('dir:')
        chunkwriter.write_NumArray(self.direction, b=1)
        chunkwriter.comment('reference_pt:')
        chunkwriter.write_NumArray(self.reference, b=1)
        chunkwriter.groupend()

    @staticmethod
    def fromchunk(chunk):
        """This the inverse of lmodel.tochunk(),
        except the group start is already read.
        """
        offset = chunk.read_float()
        direction = chunk.read_NumArray()
        if len(direction.shape) != 1:
            raise chunkio.BadFileFormat('direction must be 1-d array')
        ref = chunk.read_NumArray()
        # BUGFIX: the original re-tested direction.shape here (copy-paste
        # error); the reference point is what this branch is meant to check.
        if len(ref.shape) != 1:
            raise chunkio.BadFileFormat('reference_point must be 1-d array')
        if direction.shape != ref.shape:
            raise chunkio.BadFileFormat('sizes do not match')
        return lmodel(direction, offset, ref)
669
670
# Cache for linear zero-models; presumably keyed by model dimension --
# TODO confirm: the code that fills this cache is not visible in this extraction.
_lzmodel_cache = {}
681
686 """This is a thing that describes and generates L{classifier}s.
687 """
688
689 - def __init__(self, list_of_classes, fvdim, evaluator=None, ftrim=None):
690 assert isinstance(fvdim, int)
691 assert fvdim > 0
692 self.ndim = fvdim
693
694
695 self.c = list(list_of_classes)
696
697 self.nc = len(self.c)
698 assert self.nc > 0, "No classes!"
699 self.evaluator = evaluator
700 self.ftrim = ftrim
701
702
704 """@return: The number of parameters required to define one class.
705 @rtype: int
706 """
707 raise RuntimeError, "Virtual Function."
708
710 """@return:The number of parameters required to define the classifier.
711 @rtype: int
712 """
713 raise RuntimeError, "Virtual Function."
714
715 - def unpack(self, prmvec, trainingset_name=None, uid=None):
716 """Produce a classifier from a parameter vector.
717 @param prmvec: a vector of parameters that describe a classifier model.
718 @type prmvec: numpy.ndarray
719 @return: the classifier.
720 @rtype: the corresponding subclass of L{classifier}.
721 """
722 raise RuntimeError, "Virtual Function."
723
725 """Starting position for Markov Chain Monte Carlo."""
726 raise RuntimeError, "Virtual Function."
727
729
730 return "<classifier_desc: ndim=%d nc=%d>" % (self.ndim, self.nc)
731 __repr__ = __str__
732
733
735 """@return: a string that names the subclass - what kind of L{classifier_desc} is it?
736 @rtype str
737 """
738 raise RuntimeError, "Virtual function"
739
740
741 -def _logp(x, (data, cdesc)):
747
748
749
750 -def forest_build(data, N, modelchoice=None, trainingset_name=None):
751 """Build a forest of classifiers.
752 @param data: data to train the classifiers on.
753 @type data: L{datum_c}
754 @param N: How many to build.
755 @type N: int,
756 @param modelchoice: what kind of classifier to build
757 @type modelchoice: subclass of L{model_template}
758 @param trainingset_name: (stored for later use).
759 @type trainingset_name: str
760 """
761
762 assert len(data) > 1, "Not enough data"
763 assert modelchoice is not None
764 start, V = modelchoice.start(data)
765 x = mcmc.bootstepper(_logp, start, V, c=(data, modelchoice))
766 mcmch = mcmc_helper.stepper(x)
767 nsteps = mcmch.run_to_bottom()
768 mcmch.run_to_ergodic(1.0)
769 if nsteps > 100:
770 print '#NSTEPS:', nsteps
771 o = []
772 for i in range(N):
773 mcmch.run_to_ergodic(ERGCOVER/float(N))
774 tmp = modelchoice.unpack( x.current().prms(),
775 trainingset_name=trainingset_name,
776 uid='%s:%d' % (trainingset_name,i)
777 )
778 if D:
779 print 'Forest evaluate=', tmp.evaluate(data), "for", tmp
780 o.append(tmp)
781 return o
782
class classifier(object):
    # NOTE(review): the class header and several def lines were lost in
    # extraction.  The class name is grounded by "isinstance(qc, classifier)"
    # below; logPv/bestc/list_wrong_classifications/writer/add_info are
    # grounded by visible call sites.  add_class and classes are
    # reconstructed names -- confirm against the original; add_info's body
    # is reconstructed from its qc.add_info(key, value) call sites.
    """This is the base class for all kinds of classifers."""

    def __init__(self, typename, models, info=None,
                 trainingset_name=None, uid=None, cdesc=None):
        """
        @param models: a dictionary containg a probabilistic model for each class
        @type models: dict(str: subclass of L{model_template})
        @param info: not used in the internal operation of the classifier,
            but it is stuff that is important to write out.
        @type info: dict(str: whatever)
        """
        self.typename = typename
        assert isinstance(models, dict)
        self.class_models = models
        if info is not None:
            self.info = info.copy()
        else:
            self.info = {}
        self.info['trainingset'] = trainingset_name
        self.info['Cuid'] = uid
        self.cdesc = cdesc

    def add_class(self, classname, model):
        """Add a class to an existing classifier."""
        g_implements.check(model, model_template)
        assert isinstance(classname, str)
        self.class_models[classname] = model

    def classes(self):
        """@return: the names of all known classes."""
        return self.class_models.keys()

    def add_info(self, key, value):
        """Attach a key/value pair to the information that gets written out."""
        self.info[key] = value

    def P(self, datum, whichclass=None):
        """Determine the probability of being in each class.
        If whichclass=None, then it returns a list of
        tuples [(classname,P), ...] for all classes.
        Otherwise, it returns the probability of the
        specified class.
        """
        if whichclass is None:
            return [ (cn, math.exp(lp)) for (cn, lp) in self.logPv(datum) ]
        return math.exp(self.logPw(datum, whichclass, 0.0))

    def logP(self, datum, whichclass=None):
        """Log-probability version of P()."""
        if whichclass is None:
            return self.logPv(datum)
        return self.logPw(datum, whichclass, 0.0)

    def logPw(self, datum, whichclass, constrain):
        """Determine the log-probability of C{datum} being in C{whichclass}.
        The running sum is kept relative to the biggest log-probability
        seen so far, to avoid overflow; C{constrain} penalizes a large
        spread between the biggest and smallest class log-probabilities.
        """
        Psum = 0.0
        bgst = -self.HUGE
        smlst = self.HUGE
        the_lP = None
        for (cn, cm) in self.class_models.items():
            lp = cm.logp(datum.value)
            if cn == whichclass:
                the_lP = lp
            if smlst > lp:
                smlst = lp
            if bgst < lp:
                # Rescale the accumulated sum to the new maximum.
                Psum *= math.exp(bgst-lp)
                bgst = lp
            Psum += math.exp(lp-bgst)
        return the_lP - bgst - math.log(Psum) - constrain*(bgst-smlst)**2

    def logPv(self, datum):
        """Determine the log-probability of being in each class.
        @return: a list of tuples [(classname, logP), ...] for all classes.
        """
        Psum = 0.0
        lcnP = []
        bgst = None
        for (cn, cm) in self.class_models.items():
            lp = cm.logp(datum.value)
            lcnP.append( (cn, lp) )
            if bgst is None:
                bgst = lp
            elif bgst < lp:
                # Rescale the accumulated sum to the new maximum.
                Psum *= math.exp(bgst-lp)
                bgst = lp
            Psum += math.exp(lp-bgst)
        q = bgst + math.log(Psum)	# log of the normalization constant
        return [ (cn, lp-q) for (cn, lp) in lcnP ]

    HUGE = 1e30		# effectively infinity for log-probability comparisons

    def bestc(self, datum):
        """Determine the best class for a datum."""
        bgst = -self.HUGE
        bgc = None
        for (c, qp) in self.class_models.items():
            tmp = qp.logp(datum.value)
            if tmp > bgst:
                bgst = tmp
                bgc = c
        return bgc

    def __str__(self):
        o = [ '<classifier: %s' % self.typename ]
        for (c, q) in self.class_models.items():
            o.append('%s: %s' % (c, str(q)))
        o.append( '>' )
        return '\n'.join(o)

    __repr__ = __str__

    def list_wrong_classifications(self, classdata):
        """@return: the uids of the data that are misclassified."""
        nok = []
        for datum in classdata:
            bestc = self.bestc(datum)
            if bestc != datum.classid:
                nok.append(datum.uid)
        return nok

    def writer(self, dcw):
        """Write this classifier out (usually to a data file).
        @type dcw: L{chunkio.chunk_w}
        """
        dcw.groupstart(self.typename)
        dcw.comment('Classifier:')
        dcw.write_dict_of(self.class_models,
                          lambda dcw, ac: ac.tochunk(dcw), b=1)
        dcw.write_dict(self.info)
        dcw.groupend()
928
929
930
931
932
933
934
935
936
937
938
939
940
941
def evaluate_match(cl, data):
    """This can be passed into a classifier descriptor as
    the evaluate argument.
    It returns the number of exact matches between the classified
    data and the input, true classification.
    @param cl: some classifier that has a bestc() method.
    @type cl: typically a subclass of L{classifier}.
    @param data: list of classes that describe data points.
    @type data: Typically a subclass of L{datum_c}.
    @rtype: int
    @return: the number of classification errors made
    """
    o = 0
    for datum in data:
        sbc = cl.bestc(datum)
        if sbc != datum.classid:
            o += 1
    return o
962
class evaluate_match_w_rare(object):
    # NOTE(review): the class header and def lines were lost in extraction.
    # The class name is grounded by the evaluate_match_w_rare() call in the
    # name-to-evaluator mapping below; _compute_weights is grounded by the
    # self._compute_weights(data) call; __init__ is reconstructed from the
    # "if self.weights is None" lazy-initialization in __call__.
    """This is called in the same way as L{evaluate_match}
    or L{evaluate_Bayes}. It pretends to be a function,
    except that you can weight the values of different
    classes when you construct the class.
    """
    __name__ = 'evaluate_match_w_rare'

    def __init__(self):
        # Weights are computed lazily from the first dataset seen.
        self.weights = None

    def _compute_weights(self, data):
        """Weight each class by -log of its relative frequency, so errors
        on rare classes cost more."""
        classcounts = dictops.dict_of_accums()
        for datum in data:
            classcounts.add(datum.classid, 1)

        ctotal = 0
        for counts in classcounts.values():
            ctotal += counts
        ctotal = float(ctotal)
        weights = {}
        for (classname, counts) in classcounts.items():
            weights[classname] = -math.log(float(counts)/ctotal)
        return weights

    def __call__(self, clssfr, data):
        """@return: the frequency-weighted sum of misclassifications."""
        if self.weights is None:
            self.weights = self._compute_weights(data)

        o = 0
        for datum in data:
            if clssfr.bestc(datum) != datum.classid:
                o += self.weights[datum.classid]
        return o
1000
1005 """Evaluates the negative log of the probability that the classifier would assign
1006 to the datum being in the observed class (i.e. whatever class
1007 is specified in the L{datum_tr}). Obviously, you
1008 want this to be a relatively small number.
1009 @param cl: a classifier
1010 @type cl: L{classifier}
1011 @param data: data
1012 @type data: list(L{datum_c})
1013 @rtype: float
1014 @return: the log of the probability of being in the observed class.
1015
1016 If C{cdesc.ftrim} is not None, we assume that some of the
1017 data in each class are dubious, and should
1018 be ignored if they are sufficiently improbable.
1019 We modify the probability scores of data that is
1020 among the worst (C{cl.cdesc.ftrim[0]} fraction),
1021 and limit those scores to be no larger than
1022 C{cl.cdesc.ftrim[1]} larger than the best score.
1023 This lets you limit by score or limit by fraction
1024 or any mixture in between. If C{cl.cdesc.ftrim} is C{None},
1025 then no limiting or trimming is done.
1026 """
1027 if cl.cdesc.ftrim is None:
1028 o = 0.0
1029 for datum in data:
1030 o += cl.logPw(datum, datum.classid, constrain)
1031 return -o
1032
1033
1034
1035 trimfrac, trimlevel = cl.cdesc.ftrim
1036 assert trimfrac>=0 and trimfrac<=1.0
1037 o = dictops.dict_of_lists()
1038 for datum in data:
1039 sb = cl.logPw(datum, datum.classid, constrain)
1040 assert isinstance(datum, datum_tr), "Use ftrim only with training set!"
1041 o.add(datum.classid, sb)
1042 oo = 0.0
1043 for classid, scorelist in o.items():
1044 scorelist.sort()
1045 n = int(round(trimfrac*(len(scorelist)-1)))
1046 tcut = scorelist[-1] - trimlevel
1047 assert n > 0
1048 for t1 in scorelist[n:]:
1049 if D and t1 < tcut:
1050 print 'Trimming %g %g in %s' % (t1, tcut, classid)
1051 oo += max(tcut, t1)
1052 if D:
1053 for t1 in scorelist[:n]:
1054 print 'Trimming %g (bot) in %s' % (t1, classid)
1055 return -oo
1056
1057
1058
1059
1060
1061 -def default_writer(summary, out, classout, wrong, fname="classes.chunk"):
1062 """This writes out classifiers to a data file.
1063 @attention: out needs to be a list, not an iterator, because we use it twice.
1064 """
1065
1066 def classifier_writer(dcw, a_classifier):
1067 """Helper function to work with L{chunkio}."""
1068 a_classifier.writer(dcw)
1069
1070 dc = chunkio.datachunk_w( open(fname, "w") )
1071 dc.comment('Header:')
1072 header = summary.copy()
1073 ctype = None
1074 for cl in out:
1075 if ctype is None:
1076 ctype = cl.typename
1077 else:
1078 assert cl.typename == ctype, 'Cannot handle mixtures of different classifiers.'
1079 header['classifier_type'] = ctype
1080 dc.write_dict( header )
1081 dc.comment('classifiers:')
1082 dc.write_array_of(out, classifier_writer, b=1)
1083 out = None
1084 dc = None
1085 for (uid, nfailures) in wrong.items():
1086 print 'WRONG', nfailures, summary['total'], uid
1087
def count_classes(data):
    # NOTE(review): the def line was lost in extraction; the name is grounded
    # by the count_classes(training)/count_classes(testing) calls in
    # compute_cross_class.
    """Count how many instances there are of each class.
    @type data: L{datum_c}
    @rtype: map from str to int
    """
    cids = dictops.dict_of_accums()
    for datum in data:
        assert isinstance(datum.classid, str)
        cids.add(datum.classid, 1)
    return cids
1099
def list_classes(data):
    # NOTE(review): the def line was lost in extraction and this function is
    # not called from visible code; the name is reconstructed from the
    # docstring -- confirm against external callers.
    """List the names of the classes in a dataset,
    with the most populus classes first.
    @type data: L{datum_c}
    @rtype: list(str)
    """
    # Negate the counts so an ascending sort yields most-populous-first,
    # with ties broken alphabetically by class name.
    cn = [(-n, cid) for (cid, n) in count_classes(data).items()]
    cn.sort()
    return [cid for (n, cid) in cn]
1110
1116 """Used to get the name of an evaluator, to write it
1117 to a file header.
1118 @rtype: str
1119 @param e:
1120 @type e: function, preferable with __name__ attribute.
1121 """
1122 if e is None:
1123 return evaluate_Bayes.__name__
1124 elif hasattr(e, '__name__'):
1125 return e.__name__
1126 else:
1127 return str(e)
1128
1132 """Maps a name to a function that will evaluate how well a classifier performs.
1133 @param nm: a printable name
1134 @type nm: str
1135 @return: a function
1136 """
1137 if nm is None or nm == 'Bayes' or nm == 'evaluate_Bayes':
1138 return evaluate_Bayes
1139 elif nm == 'match' or nm == 'evaluate_match':
1140 return evaluate_match
1141 elif nm == 'match_w_rare' or nm == 'evaluate_match_w_rare':
1142 return evaluate_match_w_rare()
1143 else:
1144 die.die('Bad name for evaluator: %s' % nm)
1145
1152 """Modifies a classifier so it isn't so dominated by the most frequent classes.
1153 @type qc: L{classifier}
1154 @param training_counts: how many data are there in each class in the training set
1155 @type training_counts: map str to int
1156 @param testing_counts: how many data are there in each class in the testing set
1157 @type testing_counts: map str to int
1158 """
1159 assert isinstance(qc, classifier)
1160 F = 0.5
1161 ntr = 0
1162 for n in training_counts.values():
1163 ntr += n
1164 nts = 0
1165 for n in testing_counts.values():
1166 nts += n
1167 ntyp = 0
1168 neff = {}
1169 for nm in qc.class_models.keys():
1170 tmp = (training_counts[nm]*nts+testing_counts[nm]*ntr)/(ntr+nts)
1171 ntyp += tmp
1172 neff[nm] = tmp
1173 ntyp = float(ntyp)/float(len(qc.class_models))
1174 for (nm, mod) in qc.class_models.items():
1175 mod.bias -= F*math.log(float(neff[nm])/float(ntyp))
1176