Package gmisclib :: Module xmlmisc
[frames] | no frames]

Source Code for Module gmisclib.xmlmisc

  1  """This contains helper functions and classes for processing XML, 
  2  based on the ElementTree module. 
  3  """ 
  4   
  5  import xml.etree.cElementTree as elTree 
6 7 8 -def treestructures(elt):
9 """This gives you a view of what the XML hierarchy looks like. 10 It's intended to help you deduce the correct DTD, or just to understand 11 how the data is arranged. It shows you the kinds of leaf tags you have 12 and the kinds of paths through the XML tree necessary to get to each leaf. 13 @param elt: an XML tree structure 14 @type elt: an xml.etree.cElementTree Element 15 @return: a sequence of embedding lists. Each embedding list contains 16 the tags you go through en route to a leaf node. So, 17 C{<a> <b> <c>x</c> </b> </a>} would yield a list C{['a', 'b', 'c']} 18 and C{<a> <b>x</b> <c> y </c> </a>} would yield 19 C{['a', 'b']} and C{['a', 'c']}. 20 @rtype: iterator(list(str)) 21 """ 22 assert elTree.iselement(elt) 23 if len(elt) == 0: 24 return set( [ ( elt.tag,) ] ) 25 rv = set() 26 eltt = (elt.tag,) 27 for e in elt: 28 for re in treestructures(e): 29 rv.add(eltt + re) 30 return rv
31
32 33 -class _tmpnode(object):
34 - def __init__(self, el, parent, i):
35 self.e = el 36 self.p = parent 37 self.id = id(el) 38 self.i = i
39
40 - def __hash__(self):
41 return self.id
42
43 - def __eq__(self, other):
44 return self.e() is other.e()
45
46 - def check(self, e):
47 if self.e is not e: 48 raise ValueError, "Whoops" 49 return self
50
51 52 53 54 -class loc_finder(object):
55 """A class for specifying a path through an XML tree to a particular node. 56 Basically, this is a way of referring to a particular element that will 57 survive serialization of the XML tree. 58 """ 59
60 - def __init__(self, root_element):
61 """Create an instance of the L{loc_finder} class and initialize it from a 62 portion of an XML tree. 63 @param root_element: the top element of the tree (or subtree). All positions 64 will be computed relative to this element. 65 @type root_element: L{elTree.Element}. 66 @note: This walks the entire XML file and builds a cache, so it is expensive 67 to initialize, but fairly cheap to use. 68 @note: this works on L{elTree.Element} object, not on an L{elTree.ElementTree} 69 object. If you have x which is a L{elTree.ElementTree}, then use 70 loc_finder(x.get_root()). 71 """ 72 self.tree = root_element 73 self.up = {} 74 self.maxdepth = self.__walk(self.tree)
75 76
77 - def __walk(self, tree):
78 depth = 0 79 for (i,t) in enumerate(tree): 80 self.up[id(t)] = _tmpnode(t, tree, i) 81 td = self.__walk(t)+1 82 if td > depth: 83 depth = td 84 return depth + 1
85 86
87 - def find_up(self, el, tag):
88 """Search upwards (towards the document root) to find the 89 first tag of the specified type. 90 @type tag: L{str} 91 @param tag: the tag of an XML element. 92 @rtype: L{elTree.Element} 93 @return: The smallest enclosing element that has the specified tag. 94 """ 95 depth = 0 96 while el is not self.tree: 97 if el.tag == tag: 98 return el 99 tmp = self.up[id(el)].check(el) 100 el = tmp.p 101 if depth >= self.maxdepth: 102 raise ValueError, "Whoops" 103 depth += 1 104 return None
105 106
107 - def path(self, el):
108 """Describe the path through an XML file to a specific element. 109 @param el: The target element. 110 @return: A tuple, intended to be handed to L{walkto}. 111 @type el: L{elTree.Element} 112 @rtype: L{tuple}C{(L{int})} 113 """ 114 rv = [] 115 while el is not self.tree: 116 tmp = self.up[id(el)].check(el) 117 rv.append(tmp.i) 118 el = tmp.p 119 if len(rv) >= self.maxdepth: 120 raise ValueError, "Whoops" 121 rv.reverse() 122 return tuple(rv)
123 124 125 @staticmethod
126 - def walkto(elem, path):
127 """If you have an element in an elementTree and a path obtained from L{loc_finder}C{(elem)}.L{path}C{(X)}, 128 this will walk you back to element C{X}. 129 @param elem: an element in an XML tree. Normally, this is the root element. 130 @type elem: L{elTree.Element} 131 @param path: a location in the specified XML (sub)tree. 132 @type path: C{tuple(int)} 133 @rtype: L{elTree.Element} 134 @return: The element specified by the C{path}, in the (sub)tree. 135 """ 136 # if not path: 137 # return elem 138 # p0 = path[0] 139 # for (i,t) in enumerate(elem): 140 # if i == p0: 141 # return loc_finder.walkto(t, path[1:]) 142 for (lvl,p) in enumerate(path): 143 found = False 144 for (i, t) in enumerate(elem): 145 if i == p: 146 found = True 147 elem = t 148 break 149 if not found: 150 raise ValueError, "Path at level %d specifies element %d, but there are only %d" % (lvl, p, i+1) 151 return elem
152
153 154 155 -def test_loc_finder():
156 tree = elTree.Element('a') 157 t1 = elTree.Element('b') 158 t2 = elTree.Element('c') 159 tree.append(t1) 160 tree.append(t2) 161 top = elTree.Element('0') 162 top.append(tree) 163 164 lf = loc_finder(top) 165 assert loc_finder.walkto(top, lf.path(t2)) is t2 166 assert loc_finder.walkto(top, lf.path(t1)) is t1 167 assert loc_finder.walkto(top, lf.path(tree)) is tree 168 print list(loc_finder.walk_between(top, lf.path(t1), lf.path(t2))) 169 assert [t.tag for t in loc_finder.walk_between(top, lf.path(t1), lf.path(t2))] == ['a', 'b']
170
171 -def test():
172 test_loc_finder()
173 174 if __name__ == '__main__': 175 test() 176