00001
00002
00003
00004
00005 import re
00006 import os
00007 import xml.sax.handler
00008 import pprint
00009
class DataNode (object):
    """Container for one XML element's attributes, children and text.

    XML attributes and child elements are stored by name in ``_attrs``
    (see ``_add_xml_attr``); the element's text, if any, is stored in
    ``_data``.  Entries can be read with attribute access (``node.name``)
    or with string indexing (``node['name']``); both return ``None`` for
    names that are not present.  Integer indexing, ``len()`` and iteration
    treat the node as a one-element list holding itself.
    """

    # Extra indentation added after a 'name: ' label when pretty-printing.
    spaces = 4

    # String types: used to tell name lookups from integer indexing and to
    # keep strings out of the "iterable" branch of the pretty-printer.
    # 'basestring' only exists on Python 2.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__ (self, **kwargs):
        """Create an empty node.

        Keyword arguments:
        nameChangeDict - optional dict mapping a name passed to
                         _add_xml_attr to the name it is stored under.
        """
        self._attrs = {}
        self._data = None
        self._ncDict = kwargs.get ('nameChangeDict', {})


    def __len__ (self):
        """Always 1: a single node behaves like a list of one element."""
        return 1


    def __getitem__ (self, key):
        """Return the entry called 'key' (None if absent) when 'key' is a
        string; otherwise index the one-element list [self]."""
        if isinstance (key, DataNode._string_types):
            return self._attrs.get (key, None)
        return [self][key]


    def __contains__ (self, name):
        """True if an attribute or child called 'name' is stored."""
        return name in self._attrs


    def __nonzero__ (self):
        """True if the node holds any attribute, child or text."""
        return bool (self._attrs or self._data)

    # Python 3 spells the truth-value hook __bool__; without it the truth
    # value would fall back to __len__ and always be True.
    __bool__ = __nonzero__


    def __getattr__ (self, name):
        """Return the stored entry called 'name', or None if absent.

        Only called when normal attribute lookup fails.  Raises
        AttributeError for special ('__x') names and for the node's own
        storage attributes, so that an instance whose __init__ has not
        run cannot recurse forever while looking up '_attrs'.
        """
        if name.startswith ('__') or name in ('_attrs', '_data', '_ncDict'):
            raise AttributeError (name)
        return self._attrs.get (name, None)


    def _add_xml_attr (self, name, value):
        """Store 'value' under 'name', applying the name-change dict first.

        The first value for a name is stored as is; further values for
        the same name turn the entry into a list holding all of them.
        """
        change = self._ncDict.get (name)
        if change:
            name = change
        if name in self._attrs:
            children = self._attrs[name]
            if not isinstance (children, list):
                children = [children]
                self._attrs[name] = children
            children.append (value)
        else:
            self._attrs[name] = value


    def __str__ (self):
        """Pretty-printed form; see stringify()."""
        return self.stringify ()


    def __repr__ (self):
        """Compact one-line form: '{name:repr, ..., data:repr}', with the
        names sorted and the text (if any) appended last as 'data'."""
        items = sorted (self._attrs.items ())
        if self._data:
            items.append (('data', self._data))
        return u'{%s}' % ', '.join ([u'%s:%s' % (k, repr (v))
                                     for k, v in items])


    def attributes (self):
        """Return the underlying name -> value dict (not a copy)."""
        return self._attrs


    @staticmethod
    def isiterable (obj):
        """Return a true value if 'obj' has an '__iter__' attribute and is
        not a string, else False.

        Strings are excluded explicitly because on Python 3 they do have
        '__iter__', and they must be printed as plain values.
        """
        if isinstance (obj, DataNode._string_types):
            return False
        return getattr (obj, '__iter__', False)


    @staticmethod
    def _outputValues (obj, name, offset):
        """Format 'obj' for stringify(), indented by 'offset' spaces.

        If 'name' is non-empty the text starts with 'name: ' and nested
        content is indented further.  Lists are written one element per
        line between '[' and ']'; anything else goes through
        pprint.pformat.
        """
        retval = ' ' * offset
        if name:
            retval += '%s: ' % name
            offset += len (name) + DataNode.spaces
        if isinstance (obj, list):
            pad = ' ' * offset
            first = True
            for value in obj:
                if first:
                    retval += '[\n ' + pad
                    first = False
                else:
                    retval += ',\n ' + pad
                if isinstance (value, DataNode):
                    retval += value.stringify (offset=offset)
                elif DataNode.isiterable (value):
                    retval += DataNode._outputValues (value, '', offset)
                else:
                    retval += "%s" % value
            retval += '\n' + ' ' * (offset - 2) + ']\n'
            return retval
        retval += pprint.pformat (obj, indent=offset, width=1)
        return retval


    def stringify (self, name = '', offset = 0):
        """Return a multi-line, human-readable rendering of this node.

        name   - label printed in front of the node, if non-empty
        offset - number of spaces the rendering is indented by
        """
        # Text only, nothing stored by name: print the text on its own.
        if self._data and not self._attrs:
            return DataNode._outputValues (self._data, name, offset)
        retval = ''
        if name:
            retval += '\n' + ' ' * offset
            retval += '%s: ' % name
        tempspace = offset + 3
        first = True
        for key, value in sorted (self._attrs.items ()):
            if first:
                retval += '{ \n'
                first = False
            else:
                retval += ',\n'
            if isinstance (value, DataNode):
                retval += value.stringify (key, tempspace)
            else:
                retval += DataNode._outputValues (value, key, tempspace)
        # Text stored alongside named entries is printed last.
        if self._data:
            retval += ',\n'
            retval += DataNode._outputValues (self._data, name, tempspace)
        retval += '\n ' + ' ' * offset + '}'
        return retval
00156
00157
00158
class TreeBuilder (xml.sax.handler.ContentHandler):
    """SAX content handler that builds a tree of DataNode objects.

    Each element's XML attributes are stored on a new node.  When the
    element closes it is attached to its parent under its (mangled) tag
    name: as the node itself if it has any attributes or children, or as
    its stripped text (possibly '') otherwise.
    """

    # Any character that may not appear in a Python identifier.
    non_id_char = re.compile ('[^_0-9a-zA-Z]')

    def __init__ (self, **kwargs):
        """Set up an empty tree.

        Keyword arguments:
        nameChangeDict - optional dict passed on to every DataNode
                         created by this builder.
        """
        xml.sax.handler.ContentHandler.__init__ (self)
        self._stack = []          # (node, text parts) of open ancestors
        self._text_parts = []     # text chunks seen for the open element
        self._ncDict = kwargs.get ('nameChangeDict', {})
        self._root = DataNode (nameChangeDict = self._ncDict)
        self.current = self._root

    def startElement (self, name, attrs):
        """Open a new node and copy the element's XML attributes to it."""
        self._stack.append ((self.current, self._text_parts))
        self.current = DataNode (nameChangeDict = self._ncDict)
        self._text_parts = []
        for key, value in attrs.items ():
            self.current._add_xml_attr (TreeBuilder._name_mangle (key),
                                        value)

    def endElement (self, name):
        """Close the current node and attach it to its parent."""
        text = ''.join (self._text_parts).strip ()
        if text:
            self.current._data = text
        if self.current.attributes ():
            obj = self.current
        else:
            # Nothing stored by name: represent the element by its text.
            obj = text or ''
        self.current, self._text_parts = self._stack.pop ()
        self.current._add_xml_attr (TreeBuilder._name_mangle (name), obj)

    def characters (self, content):
        """Collect a chunk of text for the element currently open."""
        self._text_parts.append (content)

    def root (self):
        """Return the builder's root node (the parent of the top-level
        element)."""
        return self._root

    def topLevel (self):
        '''Returns top level object'''
        # list() is needed because dict.values() is a view, not a list,
        # on Python 3.  Raises IndexError if nothing has been parsed.
        return list (self._root.attributes ().values ())[0]


    @staticmethod
    def _name_mangle (name):
        """Replace every character outside [_0-9a-zA-Z] with '_'."""
        return TreeBuilder.non_id_char.sub ('_', name)
00204
00205
# Characters that are not allowed verbatim inside an XML attribute value,
# each paired with the XML entity that replaces it.  '&' must stay first:
# the later replacements insert '&' themselves and must not be escaped
# a second time.
regexList = [ (re.compile (r'&'), '&amp;'  ),
              (re.compile (r'<'), '&lt;'   ),
              (re.compile (r'>'), '&gt;'   ),
              (re.compile (r'"'), '&quot;' ),
              (re.compile (r"'"), '&apos;' )
              ]

# Matches  name = "value" :  group 1 is the last character of the name
# through the opening quote, group 2 is the (non-empty) value.
quoteRE = re.compile (r'(\w\s*=\s*")([^"]+)"')
00214
def fixQuoteValue (match):
    '''Substitution callback: runs every (pattern, replacement) pair of
    regexList, in order, over group 2 of the match and returns group 1,
    the rewritten text and a closing double quote.'''
    prefix = match.group (1)
    value = match.group (2)
    for pattern, replacement in regexList:
        value = pattern.sub (replacement, value)
    return prefix + value + '"'
00221
00222
def xml2obj (**kwargs):
    ''' Converts XML data into native Python object.  By default the
    input is parsed as is; illegal characters are only fixed when
    'filtering' is set.

    input source:  Exactly one of the three following is needed
       filehandle     - input from file handle
       contents       - input from string
       filename       - input from filename

    options:
       filtering      - boolean value telling code whether or not to
                        filter the input: the text inside every
                        double-quoted  name="value"  pair is passed
                        through fixQuoteValue before parsing.  The whole
                        input is read into memory, and a 'filehandle'
                        supplied by the caller is closed afterwards.
       nameChangeDict - dictionaries of names to change in python object

    Returns the object built for the top-level XML element.

    Raises RuntimeError if no input source or more than one is given,
    if 'filename' cannot be opened, or if 'filtering' is set and the
    file is empty.  Errors raised by the XML parser are not caught.'''

    filehandle = kwargs.get ('filehandle')
    contents   = kwargs.get ('contents')
    filename   = kwargs.get ('filename')
    given = [source for source in (filehandle, contents, filename)
             if source]
    if not given:
        raise RuntimeError ("You must provide 'filehandle', 'contents', or 'filename'")
    if len (given) > 1:
        raise RuntimeError ("You must provide only ONE of 'filehandle', 'contents', or 'filename'")

    # Only a file opened here is ours to close once parsing is done.
    openedHere = False
    if filename:
        try:
            filehandle = open (filename, 'r')
        except Exception:
            # Callers expect RuntimeError whatever the underlying cause.
            raise RuntimeError ("Failed to open '%s'" % filename)
        openedHere = True

    if kwargs.get ('filtering'):
        if filehandle:
            try:
                contents = ''.join (filehandle)
            finally:
                filehandle.close ()
            filehandle = None
            openedHere = False
            if not contents:
                raise RuntimeError ("No XML data found in input")
        contents = quoteRE.sub (fixQuoteValue, contents)

    builder = TreeBuilder (nameChangeDict = kwargs.get ('nameChangeDict', {}))
    if contents:
        xml.sax.parseString (contents, builder)
    else:
        try:
            xml.sax.parse (filehandle, builder)
        finally:
            if openedHere:
                filehandle.close ()
    return builder.topLevel ()