CMS 3D CMS Logo

/data/doxygen/doxygen-1.7.3/gen/CMSSW_4_2_8/src/FWCore/PythonUtilities/python/XML2Python.py

Go to the documentation of this file.
00001 ## Original version of code heavily based on recipe written by Wai Yip
00002 ## Tung, released under PSF license.
00003 ## https://code.activestate.com/recipes/534109/
00004 
00005 import re
00006 import os
00007 import xml.sax.handler
00008 import pprint
00009 
class DataNode (object):
    '''Container for one parsed XML element.

    XML attributes and child elements are stored together in ``_attrs``
    (name -> value; a name seen more than once maps to a list of
    values) and the element's text is stored in ``_data``.  Stored
    values can be read as ``node.name`` or ``node['name']``; a missing
    name yields ``None`` rather than raising.

    A single node also behaves like a list of length one (``len (node)
    == 1`` and ``node[0] is node``), so the same calling code works
    whether a name appeared once or several times.

    Keyword arguments:
    nameChangeDict - dictionary mapping a name to the name it should be
                     stored under instead
    '''

    # extra columns added to the indentation of a named value
    spaces = 4

    # String types, used to tell name lookups from index lookups and to
    # keep strings out of the "iterable" branch.  'basestring' only
    # exists on Python 2.
    try:
        _stringTypes = (basestring,)
    except NameError:
        _stringTypes = (str,)

    # Instance attributes created by __init__.  They must never be
    # resolved through __getattr__, otherwise an instance built without
    # __init__ would recurse forever looking up '_attrs'.
    _internalNames = ('_attrs', '_data', '_ncDict')


    def __init__ (self, **kwargs):
        self._attrs = {}     # XML attributes and child elements
        self._data  = None   # child text data
        self._ncDict = kwargs.get ('nameChangeDict', {})


    def __len__ (self):
        # treat single element as a list of 1
        return 1


    def __getitem__ (self, key):
        '''Returns the stored value for a string key (None if missing);
        any other key indexes the one-element list [self].'''
        if isinstance (key, DataNode._stringTypes):
            return self._attrs.get (key, None)
        return [self][key]


    def __contains__ (self, name):
        return name in self._attrs


    def __nonzero__ (self):
        '''A node is true when it holds any attribute, child or text.'''
        return bool (self._attrs or self._data)

    # Python 3 spelling of __nonzero__; without it truthiness would fall
    # back to __len__ and every node would be true.
    __bool__ = __nonzero__


    def __getattr__ (self, name):
        '''Returns the stored value called 'name', or None if there is
        none.  Special method names and this class's own instance
        attributes raise AttributeError instead.'''
        if name.startswith ('__') or name in DataNode._internalNames:
            raise AttributeError (name)
        return self._attrs.get (name, None)


    def _add_xml_attr (self, name, value):
        '''Stores 'value' under 'name' (after applying the name change
        dictionary).  A repeated name is collected into a list.'''
        name = self._ncDict.get (name) or name
        if name not in self._attrs:
            self._attrs[name] = value
            return
        # multiple attribute of the same name are represented by a list
        children = self._attrs[name]
        if not isinstance (children, list):
            children = [children]
            self._attrs[name] = children
        children.append (value)


    def __str__ (self):
        return self.stringify()


    def __repr__ (self):
        items = sorted (self._attrs.items())
        if self._data:
            items.append (('data', self._data))
        return u'{%s}' % ', '.join ([u'%s:%s' % (k, repr (v))
                                     for k, v in items])


    def attributes (self):
        '''Returns the dictionary of attributes and child elements
        (the live dictionary, not a copy).'''
        return self._attrs


    @staticmethod
    def isiterable (obj):
        '''Returns a true value if 'obj' is iterable and not a string.

        The true value is the object's __iter__ attribute; otherwise
        False is returned.  Strings are excluded explicitly because on
        Python 3 they do have __iter__.'''
        if isinstance (obj, DataNode._stringTypes):
            return False
        return getattr (obj, '__iter__', False)


    @staticmethod
    def _outputValues (obj, name, offset):
        '''Formats 'obj' as text indented by 'offset' columns, prefixed
        with "name: " when 'name' is not empty.

        Lists are written one item per line between brackets; anything
        else is formatted by pprint.'''
        retval = ' ' * offset
        if name:
            retval += '%s: ' % name
            offset += len (name) + DataNode.spaces
        if not isinstance (obj, list):
            return retval + pprint.pformat (obj,
                                            indent = offset,
                                            width = 1)
        pieces = []
        for value in obj:
            if isinstance (value, DataNode):
                pieces.append (value.stringify (offset = offset))
            elif DataNode.isiterable (value):
                pieces.append (DataNode._outputValues (value, '', offset))
            else:
                pieces.append ("%s" % value)
        # The opening bracket is written even for an empty list so that
        # the output always has balanced brackets.
        lead = ' ' * offset
        retval += '[\n ' + lead + (',\n ' + lead).join (pieces)
        retval += '\n' + ' ' * (offset - 2) + ']\n'
        return retval


    def stringify (self, name = '', offset = 0):
        '''Returns a multi-line text rendering of this node.

        name   - label written in front of the node, if not empty
        offset - number of columns to indent by
        '''
        # is this just data and nothing below
        if self._data and not self._attrs:
            return DataNode._outputValues (self._data, name, offset)
        # this has attributes
        retval = ''
        if name:
            retval += '\n' + ' ' * offset
            retval += '%s: ' % name
        tempspace = offset + 3
        pieces = []
        for key, value in sorted (self._attrs.items()):
            if isinstance (value, DataNode):
                pieces.append (value.stringify (key, tempspace))
            else:
                pieces.append (DataNode._outputValues (value, key, tempspace))
        # this has data too
        if self._data:
            pieces.append (DataNode._outputValues (self._data, name,
                                                   tempspace))
        # The opening brace is written even for an empty node so that
        # the braces are always balanced.
        retval += '{ \n' + ',\n'.join (pieces)
        retval += '\n ' + ' ' * offset + '}'
        return retval
00156         
00157 
00158 
class TreeBuilder (xml.sax.handler.ContentHandler):
    '''SAX content handler that builds a tree of DataNode objects.

    Each element becomes a DataNode that is attached to its parent under
    the element's (mangled) name.  An element with no XML attributes and
    no child elements is stored as its plain text instead of a DataNode.

    Keyword arguments:
    nameChangeDict - dictionary of names to change, handed to every
                     DataNode that is created
    '''

    # any character that is not valid in a python identifier
    non_id_char = re.compile('[^_0-9a-zA-Z]')

    def __init__ (self, **kwargs):
        xml.sax.handler.ContentHandler.__init__ (self)
        # (node, text parts) pairs for the elements that are still open
        self._stack = []
        # pieces of character data seen so far for the current element
        self._text_parts = []
        self._ncDict = kwargs.get ('nameChangeDict', {})
        self._root = DataNode (nameChangeDict = self._ncDict)
        self.current = self._root

    def startElement (self, name, attrs):
        '''Starts a new node for this element, saving the enclosing
        node and its pending text on the stack.'''
        self._stack.append( (self.current, self._text_parts))
        self.current = DataNode (nameChangeDict = self._ncDict)
        self._text_parts = []
        # xml attributes --> python attributes
        for k, v in attrs.items():
            self.current._add_xml_attr (TreeBuilder._name_mangle(k), v)

    def endElement (self, name):
        '''Finishes the current node and attaches it to its parent.'''
        text = ''.join (self._text_parts).strip()
        if text:
            self.current._data = text
        if self.current.attributes():
            obj = self.current
        else:
            # a text only node is simply represented by the string
            # (the empty string when the element had no text either)
            obj = text
        self.current, self._text_parts = self._stack.pop()
        self.current._add_xml_attr (TreeBuilder._name_mangle(name), obj)

    def characters (self, content):
        # character data may arrive in several pieces; joined in endElement
        self._text_parts.append(content)

    def root (self):
        '''Returns the root node that holds the top level element.'''
        return self._root

    def topLevel (self):
        '''Returns top level object.

        Raises IndexError if no element has been parsed yet.'''
        # list() is needed because dictionary views cannot be indexed
        # on Python 3
        values = list (self._root.attributes().values())
        if not values:
            raise IndexError ("no top level element has been parsed")
        return values[0]


    @staticmethod
    def _name_mangle (name):
        '''Replaces every character of 'name' that is not a letter,
        digit or underscore by an underscore.'''
        return TreeBuilder.non_id_char.sub('_', name)
00204 
00205 
# Ordered (pattern, replacement) pairs applied to the text of a double
# quoted attribute value.  The ampersand rule must stay first: the later
# rules insert ampersands of their own.  It only matches an ampersand
# that does not already start an entity or character reference, so
# input that is already escaped (e.g. '&amp;') is not escaped twice.
regexList = [ (re.compile (r'&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)'),
               '&amp;'  ),
              (re.compile (r'<'), '&lt;'   ),
              (re.compile (r'>'), '&gt;'   ),
              (re.compile (r'"'), '&quot;' ),
              (re.compile (r"'"), '&#39;'  )
              ]

# group 1: end of the name, '=' and the opening quote
# group 2: the (non-empty) text between the double quotes
quoteRE = re.compile (r'(\w\s*=\s*")([^"]+)"')

def fixQuoteValue (match):
    '''Returns the text matched by quoteRE with the quoted value escaped.

    Meant to be used as the replacement function of quoteRE.sub():
    every rule of regexList is applied, in order, to the text between
    the quotes (group 2) while the prefix (group 1) and the closing
    quote are kept as they are.'''
    quote = match.group (2)
    for regex, replacement in regexList:
        quote = regex.sub (replacement, quote)
    return match.group (1) + quote + '"'
00221 
00222 
def xml2obj (**kwargs):
    ''' Converts XML data into native Python object.  Takes a file
    handle, a string or a file name as input.  Illegal characters are
    only fixed when 'filtering' is requested.

    input source:  Exactly one of the three following is needed
    filehandle     - input from file handle
    contents       - input from string
    filename       - input from filename

    options:
    filtering      - boolean value telling code whether or not to filter
                     input selection to remove illegal XML characters
                     (inside double quoted attribute values)
    nameChangeDict - dictionaries of names to change in python object

    Returns the object built for the top level XML element.

    Raises RuntimeError if not exactly one input source is given or if
    'filename' can not be opened.  Errors raised by the XML parser are
    not caught.'''

    # make sure we have exactly 1 input source
    filehandle = kwargs.get ('filehandle')
    contents   = kwargs.get ('contents')
    filename   = kwargs.get ('filename')
    numSources = len ([source for source in (filehandle, contents, filename)
                       if source])
    if not numSources:
        raise RuntimeError ("You must provide 'filehandle', 'contents', or 'filename'")
    if numSources > 1:
        raise RuntimeError ("You must provide only ONE of 'filehandle', 'contents', or 'filename'")

    # open the file ourselves if all we were given is its name
    openedHere = False
    if filename:
        try:
            filehandle = open (filename, 'r')
        except (IOError, OSError):
            raise RuntimeError ("Failed to open '%s'" % filename)
        openedHere = True

    try:
        ncDict = kwargs.get ('nameChangeDict', {})
        builder = TreeBuilder (nameChangeDict = ncDict)
        # are we filtering?
        if kwargs.get ('filtering'):
            # if we are filtering, we need to read in the contents to
            # modify them
            if not contents:
                contents = ''.join (filehandle)
                # the handle is closed here even when it was supplied
                # by the caller; callers may rely on that
                filehandle.close()
            contents = quoteRE.sub (fixQuoteValue, contents)
            # Parse the string even if it turned out to be empty, so
            # that the error comes from the parser and not from a
            # second attempt to open a file.
            xml.sax.parseString (contents, builder)
        elif contents:
            xml.sax.parseString (contents, builder)
        else:
            xml.sax.parse (filehandle, builder)
    finally:
        # never leak a file that was opened by this function
        if openedHere:
            filehandle.close()
    return builder.topLevel()