1 """This module provides a way of safely specifying accent
2 positions in running text.
3
4 If you have a transcription "I did not eat the orange ball.",
5 you can attach a "+" symbol to the word "eat" like this:
6 "+eat".
7
8 You enter an accent specification, which consists of words with
9 a prefix (the prefix is anything that ends in a punctuation mark).
10 The program matches up the words in the accent specification
11 to the words in the transcription.
12 You can have many words in the accent specification, if necessary.
13 You can also disambiguate things by putting context words into
14 the accent spec (without a prefix).
15 All matching is done left-to-right.
16 """
17
18 import re
19
__version__ = "$Revision: 1.4 $"

# Raw strings keep the regex escape \W intact without relying on Python
# passing an unrecognised string escape through unchanged.

# A usable accent prefix is any text whose last character is a non-word
# character (i.e. not a letter, digit, or underscore).
_goodprefix = re.compile(r"^.*\W$")
# A usable accent suffix is any text whose first character is a non-word
# character.
_goodsuffix = re.compile(r"^\W.*$")
24
28
def prefix(text_array, accent_spec, map_fcn=lambda x:x):
    """Match an accent spec to a word list and report the prefixed words.

    This function takes an array of words and an accent spec.
    It matches the accent spec to the words, and outputs an array of
    tuples which tells you where the accents are, and what kind.

    More specifically, an accent_spec is a whitespace-separated
    list of strings.  Each string is a word from the text_array,
    with an optional prefix.  The strings are matched in order
    (left to right) to the words, and the output is a list of
    (index_in_text_array, prefix_text) tuples.  Spec entries without
    a prefix only provide context and produce no output tuple.

    So, if you have text_array = ['my', 'cat', 'is', 'my', 'cat']
    and accent_spec = "is +my", then prefix() will match
    "is" to text_array[2], and "+my" to text_array[3],
    and it will return [ (3, "+") ].
    Note that "+is +my" does not imply that 'is' and 'my'
    are adjacent, and "+is +cat" simply returns [ (2, "+"), (4, "+") ].

    If the accent_spec were "+my", then it would match
    text_array[0], and return [ (0, "+") ].

    Prefixes can be multiple characters,
    but they cannot end in letters, digits, or underscore.

    Parameters:
        text_array -- sequence of words (or of objects that map_fcn
            turns into words).
        accent_spec -- the whitespace-separated accent specification.
        map_fcn -- optional function called once on the whole
            text_array; it must return an iterable of strings.

    Returns a list of (index, prefix) tuples.  An empty or
    whitespace-only accent_spec yields an empty list.

    Raises BadMatchError (expected to be defined at module level) if
    the spec has more entries than text_array, or if some spec entry
    cannot be matched.
    """
    tokens = accent_spec.split()
    if not tokens:
        # Nothing to align.  Without this guard the first token lookup
        # below would fail with an IndexError.
        return []
    if len(tokens) > len(text_array):
        raise BadMatchError("Accent spec longer than text array: %d vs. %d"
                            % (len(tokens), len(text_array)))

    out = []
    k = 0  # index of the spec entry currently waiting for its word
    for j, word in enumerate(map_fcn(text_array)):
        token = tokens[k]
        if token == word:
            # Bare context word: consumes the spec entry, emits nothing.
            pass
        elif token.endswith(word) and _goodprefix.match(token[:-len(word)]):
            out.append((j, token[:-len(word)]))
        else:
            continue  # this word does not satisfy the current entry
        k += 1
        if k == len(tokens):
            return out
    # Ran out of words with spec entries still unmatched.  The first
    # number reported is how many entries follow the one that failed.
    raise BadMatchError("Accent_spec not consistent with text: %d/%d."
                        % (len(tokens) - k - 1, len(text_array)))
82
83
84
85
def suffix(text_array, accent_spec, map_fcn=lambda x:x):
    """Match an accent spec to a word list and report the suffixed words.

    This is the counterpart of prefix() for marks that follow a word.
    The accent_spec is a whitespace-separated list of strings; each
    string is a word from text_array with an optional suffix.  The
    strings are matched in order (left to right) to the words.  Spec
    entries without a suffix only provide context.

    For text_array = ['my', 'cat', 'is', 'my', 'cat'] and
    accent_spec = "my+", the result is [ (0, "+") ].

    Suffixes can be multiple characters, but they cannot begin with
    letters, digits, or underscore.

    Parameters:
        text_array -- sequence of words (or of objects that map_fcn
            turns into words).
        accent_spec -- the whitespace-separated accent specification.
        map_fcn -- optional function called once on the whole
            text_array; it must return an iterable of strings.

    Returns a list of (index_in_text_array, suffix_text) tuples.  An
    empty or whitespace-only accent_spec yields an empty list.

    Raises BadMatchError (expected to be defined at module level) if
    the spec has more entries than text_array, or if some spec entry
    cannot be matched.
    """
    tokens = accent_spec.split()
    if not tokens:
        # Nothing to align.  Without this guard the first token lookup
        # below would fail with an IndexError.
        return []
    if len(tokens) > len(text_array):
        raise BadMatchError("Accent spec longer than text array: %d vs. %d"
                            % (len(tokens), len(text_array)))

    out = []
    k = 0  # index of the spec entry currently waiting for its word
    for j, word in enumerate(map_fcn(text_array)):
        token = tokens[k]
        if token == word:
            # Bare context word: consumes the spec entry, emits nothing.
            pass
        elif token.startswith(word) and _goodsuffix.match(token[len(word):]):
            out.append((j, token[len(word):]))
        else:
            continue  # this word does not satisfy the current entry
        k += 1
        if k == len(tokens):
            return out
    # Ran out of words with spec entries still unmatched.  The first
    # number reported is how many entries follow the one that failed.
    raise BadMatchError("Accent_spec not consistent with text: %d/%d."
                        % (len(tokens) - k - 1, len(text_array)))
113
114
115
def preshow(text_array, alignment, map_fcn=lambda x:x):
    """Render an alignment as a list of printable words.

    Each element of text_array is converted with map_fcn, and every
    (index, tag) pair in alignment puts its tag in front of the word
    at that index.  Pairs are applied in order, so several tags on
    one index stack up, the latest one outermost.  Returns the new
    list; text_array is not modified.
    """
    shown = list(map(map_fcn, text_array))
    for pos, mark in alignment:
        shown[pos] = mark + shown[pos]
    return shown
124
125
126
def sufshow(text_array, alignment, map_fcn=lambda x:x):
    """Render an alignment as a list of printable words.

    Each element of text_array is converted with map_fcn, and every
    (index, tag) pair in alignment puts its tag after the word at
    that index.  Pairs are applied in order, so several tags on one
    index stack up, the latest one outermost.  Returns the new list;
    text_array is not modified.
    """
    shown = list(map(map_fcn, text_array))
    for pos, mark in alignment:
        shown[pos] = shown[pos] + mark
    return shown
135
136
137
138
def test():
    """Self-check: exercise prefix, suffix, preshow and sufshow.

    Fails with AssertionError if any helper returns an unexpected
    result.  The script entry point calls this before doing any work.
    """
    # NOTE(review): the "def test():" header was absent from the listing
    # this file was recovered from; restored because the script entry
    # point calls test().
    ta = ['my', 'cat', 'is', 'my', 'cat']
    assert prefix(ta, "is +my") == [ (3, '+') ]
    assert prefix(ta, "+my") == [ (0, '+') ]
    assert suffix(ta, "my+") == [ (0, '+') ]
    assert preshow(['my', 'cat', 'Fred'], [(1, '!')]) == ['my', '!cat', 'Fred']
    assert sufshow(['my', 'cat'], [(1, '!')]) == ['my', 'cat!']
146
147
if __name__ == '__main__':
    # Command-line use:  [-p | -s] word word ...
    # The words of the text come from the command line; the accent
    # specification is read as one line from standard input.  The text
    # is printed with the accent marks attached.
    import sys

    test()  # run the built-in self-checks first

    arglist = sys.argv[1:]
    # Prefix mode is the default.  A leading -s selects suffix mode and
    # a leading -p selects prefix mode explicitly; either flag is
    # removed from the word list.
    pre = 1
    if arglist and arglist[0] in ('-s', '-p'):
        if arglist.pop(0) == '-s':
            pre = 0
    textarray = arglist
    accspec = sys.stdin.readline().strip()
    # print(...) with a single argument behaves the same on Python 2
    # and Python 3.
    if pre:
        alignment = prefix(textarray, accspec)
        print(' '.join(preshow(textarray, alignment)))
    else:
        alignment = suffix(textarray, accspec)
        print(' '.join(sufshow(textarray, alignment)))
167