00001
00002
00003
00004
00005 import re
00006 import os
00007 import xml.sax.handler
00008
class DataNode(object):
    '''Container for one XML element's attributes, children and text.

    Attributes and child elements are stored together in a single
    name -> value mapping; a name that is added more than once is
    collected into a list.  The element's text, if any, is kept in
    ``_data``.  Lookups of unknown names return None rather than raising.
    '''

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere
    # so string keys are recognised by __getitem__ on either interpreter.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, **kwargs):
        '''Create an empty node.

        Keyword arguments:
        nameChangeDict - optional mapping of incoming names to the names
                         they should be stored under.
        '''
        self._attrs = {}
        self._data = None
        self._ncDict = kwargs.get('nameChangeDict', {})

    def __len__(self):
        # A node always behaves like a one-element sequence containing
        # itself (see __getitem__), so single and repeated children can be
        # iterated the same way.
        return 1

    def __getitem__(self, key):
        '''Return the stored value for a string key (None if absent);
        any other key indexes the one-element list ``[self]``.'''
        if isinstance(key, self._string_types):
            return self._attrs.get(key)
        return [self][key]

    def __contains__(self, name):
        '''Return True if *name* has been stored on this node.'''
        return name in self._attrs

    def __nonzero__(self):
        '''A node is true when it holds any attribute/child or text.'''
        return bool(self._attrs or self._data)

    # Python 3 consults __bool__; without it truthiness would fall back to
    # __len__ and every node would be true.
    __bool__ = __nonzero__

    def __getattr__(self, name):
        '''Expose stored names as attributes; unknown names give None.'''
        if name.startswith('__'):
            # Let special-method probes (copy, pickle, ...) fail normally.
            raise AttributeError(name)
        try:
            # Go through __dict__ so a node whose __init__ has not run
            # raises AttributeError instead of recursing into __getattr__.
            attrs = self.__dict__['_attrs']
        except KeyError:
            raise AttributeError(name)
        return attrs.get(name)

    def _add_xml_attr(self, name, value):
        '''Store *value* under *name*, applying the name-change mapping.

        The first value for a name is stored as-is; later values for the
        same name turn the entry into a list and are appended to it.
        '''
        change = self._ncDict.get(name)
        if change:
            name = change
        if name not in self._attrs:
            self._attrs[name] = value
            return
        # Repeated name: promote the existing single value to a list.
        children = self._attrs[name]
        if not isinstance(children, list):
            children = [children]
            self._attrs[name] = children
        children.append(value)

    def __str__(self):
        '''Return the node's text, or an empty string when it has none.'''
        return self._data or ''

    def __repr__(self):
        '''Return ``{name:repr(value), ...}`` sorted by name, with the text
        (when present) appended under the label ``data``.'''
        items = sorted(self._attrs.items())
        if self._data:
            items.append(('data', self._data))
        return u'{%s}' % ', '.join([u'%s:%s' % (k, repr(v)) for k, v in items])

    def attributes(self):
        '''Return the underlying name -> value dictionary (not a copy).'''
        return self._attrs
00063
00064
class TreeBuilder(xml.sax.handler.ContentHandler):
    '''SAX content handler that assembles a tree of DataNode objects.

    Each element becomes an entry on its parent node, keyed by the
    element name with every non-identifier character replaced by ``_``.
    Elements without attributes or children are stored as their stripped
    text; all others are stored as DataNode instances.
    '''

    # Anything that may not appear in a Python identifier.
    non_id_char = re.compile('[^_0-9a-zA-Z]')

    def __init__(self, **kwargs):
        '''Set up an empty tree.

        Keyword arguments:
        nameChangeDict - optional mapping of names to rename, passed on to
                         every DataNode that is created.
        '''
        # Explicit base call (not super()) because ContentHandler is an
        # old-style class on Python 2.
        xml.sax.handler.ContentHandler.__init__(self)
        self._stack = []
        self._text_parts = []
        self._ncDict = kwargs.get('nameChangeDict', {})
        self._root = DataNode(nameChangeDict=self._ncDict)
        self.current = self._root

    def startElement(self, name, attrs):
        '''Open a new node for the element and record its XML attributes.'''
        # Save the parent node and its pending text until endElement.
        self._stack.append((self.current, self._text_parts))
        self.current = DataNode(nameChangeDict=self._ncDict)
        self._text_parts = []
        for k, v in attrs.items():
            self.current._add_xml_attr(TreeBuilder._name_mangle(k), v)

    def endElement(self, name):
        '''Close the current element and attach it to its parent.'''
        text = ''.join(self._text_parts).strip()
        if text:
            self.current._data = text
        if self.current.attributes():
            obj = self.current
        else:
            # No attributes or children: store the plain (possibly empty)
            # text instead of a node.
            obj = text
        self.current, self._text_parts = self._stack.pop()
        self.current._add_xml_attr(TreeBuilder._name_mangle(name), obj)

    def characters(self, content):
        '''Collect a chunk of character data for the current element.'''
        self._text_parts.append(content)

    def root(self):
        '''Return the synthetic root node that holds the document element.'''
        return self._root

    def topLevel(self):
        '''Returns top level object'''
        # dict.values() is a non-indexable view on Python 3, so
        # materialise it before taking the first entry.
        return list(self._root.attributes().values())[0]

    @staticmethod
    def _name_mangle(name):
        '''Replace every non-identifier character in *name* with ``_``.'''
        return TreeBuilder.non_id_char.sub('_', name)
00110
00111
# (pattern, replacement) pairs that escape the five characters with
# predefined XML entities.  '&' must stay first so the ampersands
# introduced by the later replacements are not escaped a second time.
regexList = [(re.compile(r'&'), '&amp;'),
             (re.compile(r'<'), '&lt;'),
             (re.compile(r'>'), '&gt;'),
             (re.compile(r'"'), '&quot;'),
             (re.compile(r"'"), '&apos;'),
             ]

# Matches  name="value"  : group 1 is the last name character, the '=' and
# the opening quote; group 2 is the non-empty value between the quotes.
quoteRE = re.compile(r'(\w\s*=\s*")([^"]+)"')
00120
def fixQuoteValue(match):
    '''Substitution callback for quoteRE.

    Runs the quoted value (group 2) through every (pattern, replacement)
    pair in regexList, in order, and returns it re-attached to the
    untouched prefix (group 1) with the closing quote restored.
    '''
    prefix, value = match.group(1), match.group(2)
    for pattern, replacement in regexList:
        value = pattern.sub(replacement, value)
    return prefix + value + '"'
00127
00128
def xml2obj(**kwargs):
    '''Convert XML data into a native Python object.

    input source: exactly one of the following three is needed
       filehandle     - input from an open file handle
       contents       - input from a string
       filename       - input from the named file

    options:
       filtering      - boolean; when true the whole input is read into
                        memory and the characters & < > " ' inside
                        double-quoted attribute values are escaped before
                        parsing.  Without it the input is parsed as-is and
                        illegal characters are NOT fixed.
       nameChangeDict - dictionary of names to change in the python object

    Returns the object built for the document's top-level element.

    Raises RuntimeError if no input source or more than one is given, or
    if 'filename' cannot be opened.  Errors raised by the XML parser are
    not caught here.

    A file opened from 'filename' is always closed before returning.  A
    caller-supplied 'filehandle' is closed only when filtering is on.
    '''
    filehandle = kwargs.get('filehandle')
    contents = kwargs.get('contents')
    filename = kwargs.get('filename')
    given = [src for src in (filehandle, contents, filename) if src]
    if not given:
        raise RuntimeError("You must provide 'filehandle', 'contents', or 'filename'")
    if len(given) > 1:
        raise RuntimeError("You must provide only ONE of 'filehandle', 'contents', or 'filename'")

    filtering = kwargs.get('filtering')
    # Track the input kind explicitly so that an empty file read for
    # filtering is still parsed as a string instead of being mistaken for
    # "no input".
    from_string = bool(contents)
    opened_here = False
    if not from_string and not filehandle:
        try:
            filehandle = open(filename, 'r')
        except Exception:
            # Any failure to open is reported as RuntimeError, but
            # KeyboardInterrupt / SystemExit are left alone.
            raise RuntimeError("Failed to open '%s'" % filename)
        opened_here = True

    if filtering:
        if not from_string:
            # Filtering works on the complete text, so read it all in.
            try:
                contents = ''.join(filehandle)
            finally:
                filehandle.close()
            from_string = True
        contents = quoteRE.sub(fixQuoteValue, contents)

    builder = TreeBuilder(nameChangeDict=kwargs.get('nameChangeDict', {}))
    if from_string:
        xml.sax.parseString(contents, builder)
    else:
        try:
            xml.sax.parse(filehandle, builder)
        finally:
            # Only close what this function opened itself.
            if opened_here:
                filehandle.close()
    return builder.topLevel()