gmisclib.accent

"""This module provides a way of safely specifying accent
positions in running text.

If you have a transcription "I did not eat the orange ball.",
you can attach a "+" symbol to the word "eat" like this:
"+eat".

You enter an accent specification, which consists of words with
a prefix (the prefix is anything that ends in a punctuation mark).
The program matches up the words in the accent specification
to the words in the transcription.
You can have many words in the accent specification, if necessary.
You can also disambiguate things by putting context words into
the accent spec (without a prefix).
All matching is done left-to-right.
"""

import re

__version__ = "$Revision: 1.4 $"

# Raw strings keep the backslash in \W literal without relying on Python
# passing unknown escape sequences through (which newer versions warn about).
# A valid prefix is any non-empty string whose last character is not a
# letter, digit or underscore.
_goodprefix = re.compile(r"""^.*\W$""")
# A valid suffix is any non-empty string whose first character is not a
# letter, digit or underscore.
_goodsuffix = re.compile(r"""^\W.*$""")

class BadMatchError(RuntimeError):
    """Error raised when an accent spec cannot be matched to the text."""

    def __init__(self, x):
        # x is the message; it is handed unchanged to RuntimeError.
        # The two-argument super() form works on Python 2 and Python 3.
        super(BadMatchError, self).__init__(x)

28

def prefix(text_array, accent_spec, map_fcn=lambda x:x):
    """Match an accent spec to an array of words and report prefix accents.

    This function takes an array of words and an accent spec.
    It matches the accent spec to the words,
    and outputs an array of tuples which tells you
    where the accents are, and what kind.
    The optional map_fcn can be used to map other kinds
    of objects into an array of strings; it is called once,
    on the whole text_array, and its result is iterated.

    More specifically, an accent_spec is a whitespace-separated
    list of strings.   Each string is a word from the text_array,
    with an optional prefix.   The strings are matched in
    order to the words, and the output array is a list
    of (index_in_text_array, prefix_text) tuples.

    So, if you have text_array = ['my', 'cat', 'is', 'my', 'cat']
    and accent_spec="is +my", then prefix() will match
    "is" to text_array[2], and "+my" to text_array[3],
    and it will return [ (3, "+") ] .
    Note that "+is +my" does not imply that 'is' and 'my'
    are adjacent, and "+is +cat" simply returns [ (2, "+"), (4, "+") ].

    If the accent_spec were "+my", then it would match
    text_array[0], and return [ (0, "+") ].

    Prefixes can be multiple characters,
    but they cannot end in letters, digits, or underscore.

    An empty (or all-whitespace) accent_spec contains no accents and
    yields an empty list.

    Raises BadMatchError if the accent spec has more entries than
    text_array, or if the words run out before every entry of the
    accent spec has been matched.
    """
    pending = accent_spec.split()
    if not pending:
        # Nothing to align; without this guard pop(0) below would fail
        # with IndexError on the empty list.
        return []
    if len(pending) > len(text_array):
        raise BadMatchError("Accent spec longer than text array: %d vs. %d"
                            % (len(pending), len(text_array)))
    current = pending.pop(0)
    out = []
    for j, t in enumerate(map_fcn(text_array)):
        if current != t:
            # Not a plain context word; see whether it is prefix + word.
            if not current.endswith(t):
                continue
            # Slice by explicit length: current[:-len(t)] would be empty
            # when t is the empty string.
            tag = current[:len(current) - len(t)]
            if not _goodprefix.match(tag):
                continue
            out.append((j, tag))
        # Either kind of match consumes the current spec entry.
        if not pending:
            return out
        current = pending.pop(0)
    # Ran out of words while spec entries were still unmatched.
    raise BadMatchError("Accent_spec not consistent with text: %d/%d."
                        % (len(pending), len(text_array)))

82 83 84 85

def suffix(text_array, accent_spec, map_fcn=lambda x:x):
    """Match an accent spec to an array of words and report suffix accents.

    This works like prefix(), except that the accent mark follows the
    word instead of preceding it.  accent_spec is a whitespace-separated
    list of strings; each is either a bare word (used only for alignment)
    or a word followed by a suffix.  Entries are matched left-to-right
    against the words produced by calling map_fcn on the whole text_array.

    The result is a list of (index_in_text_array, suffix_text) tuples.
    For text_array = ['my', 'cat', 'is', 'my', 'cat'] and
    accent_spec = "my+", the result is [ (0, "+") ].

    Suffixes can be multiple characters,
    but they cannot begin with letters, digits, or underscore.

    An empty (or all-whitespace) accent_spec contains no accents and
    yields an empty list.

    Raises BadMatchError if the accent spec has more entries than
    text_array, or if the words run out before every entry of the
    accent spec has been matched.
    """
    pending = accent_spec.split()
    if not pending:
        # Nothing to align; without this guard pop(0) below would fail
        # with IndexError on the empty list.
        return []
    if len(pending) > len(text_array):
        raise BadMatchError("Accent spec longer than text array: %d vs. %d"
                            % (len(pending), len(text_array)))
    current = pending.pop(0)
    out = []
    for j, t in enumerate(map_fcn(text_array)):
        if current != t:
            # Not a plain context word; see whether it is word + suffix.
            if not current.startswith(t):
                continue
            tag = current[len(t):]
            if not _goodsuffix.match(tag):
                continue
            out.append((j, tag))
        # Either kind of match consumes the current spec entry.
        if not pending:
            return out
        current = pending.pop(0)
    # Ran out of words while spec entries were still unmatched.
    raise BadMatchError("Accent_spec not consistent with text: %d/%d."
                        % (len(pending), len(text_array)))

113 114 115

def preshow(text_array, alignment, map_fcn=lambda x:x):
    """Render an alignment as a list of words with their prefixes attached.

    map_fcn is applied to every element of text_array to build a new
    list; then, for each (index, prefix) pair in alignment, the prefix
    is put in front of the entry at that index.  The new list is returned.
    """
    shown = list(map(map_fcn, text_array))
    for position, accent in alignment:
        shown[position] = accent + shown[position]
    return shown

124 125 126

def sufshow(text_array, alignment, map_fcn=lambda x:x):
    """Render an alignment as a list of words with their suffixes attached.

    map_fcn is applied to every element of text_array to build a new
    list; then, for each (index, suffix) pair in alignment, the suffix
    is appended to the entry at that index.  The new list is returned.
    """
    rendered = []
    for item in text_array:
        rendered.append(map_fcn(item))
    for pair in alignment:
        where, mark = pair
        rendered[where] = rendered[where] + mark
    return rendered

135 136 137 138

def test():
    """Self-check: run each public function on a small fixed example."""
    words = ['my', 'cat', 'is', 'my', 'cat']
    # (function, positional arguments, expected result), checked in order.
    cases = (
        (prefix, (words, "is +my"), [(3, '+')]),
        (prefix, (words, "+my"), [(0, '+')]),
        (suffix, (words, "my+"), [(0, '+')]),
        (preshow, (['my', 'cat', 'Fred'], [(1, '!')]), ['my', '!cat', 'Fred']),
        (sufshow, (['my', 'cat'], [(1, '!')]), ['my', 'cat!']),
    )
    for fcn, args, expected in cases:
        assert fcn(*args) == expected

# Command-line use: the words of the text are the arguments (optionally
# preceded by -p for prefix accents, the default, or -s for suffix accents);
# the accent spec is read as one line from standard input.
if __name__ == '__main__':
    import sys

    # Run the built-in self-check before doing any real work.
    test()
    arglist = sys.argv[1:]
    pre = True
    if arglist and arglist[0] == '-s':
        pre = False
        arglist.pop(0)
    elif arglist and arglist[0] == '-p':
        pre = True
        arglist.pop(0)
    textarray = arglist
    accspec = sys.stdin.readline().strip()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if pre:
        alignment = prefix(textarray, accspec)
        print(' '.join(preshow(textarray, alignment)))
    else:
        alignment = suffix(textarray, accspec)
        print(' '.join(sufshow(textarray, alignment)))

Source Code for Module gmisclib.accent_spec