Reference Manual

Go to the documentation of this file.
00001 """Beautiful Soup
00002 Elixir and Tonic
00003 "The Screen-Scraper's Friend"
00004 http://www.crummy.com/software/BeautifulSoup/
00005 
00006 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
00007 tree representation. It provides methods and Pythonic idioms that make
00008 it easy to navigate, search, and modify the tree.
00009 
00010 A well-formed XML/HTML document yields a well-formed data
00011 structure. An ill-formed XML/HTML document yields a correspondingly
00012 ill-formed data structure. If your document is only locally
00013 well-formed, you can use this library to find and process the
00014 well-formed part of it.
00015 
00016 Beautiful Soup works with Python 2.2 and up. It has no external
00017 dependencies, but you'll have more success at converting data to UTF-8
00018 if you also install these three packages:
00019 
00020 * chardet, for auto-detecting character encodings
00021   http://chardet.feedparser.org/
00022 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
00023   by stock Python.
00024   http://cjkpython.i18n.org/
00025 
00026 Beautiful Soup defines classes for two main parsing strategies:
00027 
00028  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
00029    language that kind of looks like XML.
00030 
00031  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
00032    or invalid. This class has web browser-like heuristics for
00033    obtaining a sensible parse tree in the face of common HTML errors.
00034 
00035 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
00036 the encoding of an HTML or XML document, and converting it to
00037 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
00038 
00039 For more than you ever wanted to know about Beautiful Soup, see the
00040 documentation:
00041 http://www.crummy.com/software/BeautifulSoup/documentation.html
00042 
00043 Here, have some legalese:
00044 
00045 Copyright (c) 2004-2009, Leonard Richardson
00046 
00047 All rights reserved.
00048 
00049 Redistribution and use in source and binary forms, with or without
00050 modification, are permitted provided that the following conditions are
00051 met:
00052 
00053   * Redistributions of source code must retain the above copyright
00054     notice, this list of conditions and the following disclaimer.
00055 
00056   * Redistributions in binary form must reproduce the above
00057     copyright notice, this list of conditions and the following
00058     disclaimer in the documentation and/or other materials provided
00059     with the distribution.
00060 
00061   * Neither the name of the Beautiful Soup Consortium and All
00062     Night Kosher Bakery nor the names of its contributors may be
00063     used to endorse or promote products derived from this software
00064     without specific prior written permission.
00065 
00066 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00067 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00068 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00069 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
00070 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00071 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00072 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00073 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00074 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00075 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00076 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
00077 
00078 """
00079 from __future__ import generators
00080 
00081 __author__ = "Leonard Richardson (leonardr@segfault.org)"
00082 __version__ = "3.1.0.1"
00083 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
00084 __license__ = "New-style BSD"
00085 
00086 import codecs
00087 import markupbase
00088 import types
00089 import re
00090 from HTMLParser import HTMLParser, HTMLParseError
00091 try:
00092     from htmlentitydefs import name2codepoint
00093 except ImportError:
00094     name2codepoint = {}
00095 try:
00096     set
00097 except NameError:
00098     from sets import Set as set
00099 
# These hacks make Beautiful Soup able to parse XML with namespaces.
# markupbase's private ``_declname_match`` hook is overridden with a matcher
# whose name pattern accepts ':' (as well as '-', '_' and '.') after the
# leading letter, followed by optional trailing whitespace.
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Default value for the ``encoding`` / ``eventualEncoding`` parameters of the
# rendering methods defined in this module.
DEFAULT_OUTPUT_ENCODING = "utf-8"
00104 
00105 # First, the classes that represent markup elements.
00106 
def sob(unicode, encoding):
    """String-or-bytes helper.

    Hands back the Unicode string untouched when `encoding` is None;
    otherwise hands back the result of encoding it with `encoding`.
    """
    if encoding is not None:
        return unicode.encode(encoding)
    return unicode
00113 
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    setup() initialises five links on every element: `parent`,
    `previousSibling` / `nextSibling` (neighbours inside the parent's
    `contents` list) and `previous` / `next` (the chain followed by
    nextGenerator / previousGenerator). A link that has no target is
    None.

    insert() and append() read and modify `self.contents`, so they only
    work on elements that own a `contents` list.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        `parent` and `previous` are stored as given; `next` and both
        sibling links start out as None. If the parent already has
        children, its last child becomes this element's previous sibling
        (and this element becomes that child's next sibling). The
        element is NOT added to parent.contents here.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _indexInParent(self):
        """Returns the position of this exact object in
        self.parent.contents.

        The lookup is by identity, not equality: list.index() and
        list.remove() compare with ==, which would pick an earlier
        sibling that merely compares equal (two identical strings, or
        two tags with the same name, attributes and contents).

        Raises ValueError if the element is not among its parent's
        contents.
        """
        siblings = self.parent.contents
        for position in range(len(siblings)):
            if siblings[position] is self:
                return position
        raise ValueError("element is not in its parent's contents")

    def replaceWith(self, replaceWith):
        """Removes this element from the tree and puts `replaceWith`
        (an element or a plain string) in the slot it occupied.

        Raises ValueError if this element is not in its parent's
        contents.
        """
        oldParent = self.parent
        myIndex = self._indexInParent()
        self.extract()
        # No index fix-up is needed here when the replacement is one of
        # our own siblings: insert() already compensates for the shift
        # caused by extracting a child that sits before the target slot.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree.

        The element (together with everything beneath it) is unlinked
        from its parent, its siblings and the previous/next chain.
        Returns the element itself.
        """
        if self.parent:
            try:
                del self.parent.contents[self._indexInParent()]
            except ValueError:
                # Already missing from the parent's list; still fix up
                # the links below.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        # Links are tested against None rather than for truthiness: an
        # empty text node is falsy but is still a real element.
        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts `newChild` into self.contents at `position`.

        A plain string is wrapped in a NavigableString first. A position
        past the end is clamped to the end. If `newChild` is already
        attached somewhere it is extracted first; when it is already a
        child of this element, `position` refers to the contents as they
        are before that extraction.
        """
        if (isinstance(newChild, basestring)
            and not isinstance(newChild, NavigableString)):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                try:
                    index = newChild._indexInParent()
                except ValueError:
                    index = None
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Appending: the element that follows the new subtree is the
            # next sibling of the nearest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None: # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling is not None:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document, or None."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document,
        or None."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document, or None."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document,
        or None."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Keyword criteria are forwarded so that e.g. findParent('div',
        # id='x') filters on them like every other find* method does.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls `method` with a limit of 1 and returns its first
        result, or None when it found nothing."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        `name` may be a ready-made SoupStrainer; otherwise one is built
        from name, attrs, text and kwargs. Collection stops once `limit`
        (if truthy) results have been gathered. Returns a ResultSet.
        """

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators end with a None; falsy items are not
            # offered to the strainer.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Each one follows a single link from this element until the link is
    #None. The terminating None is itself yielded as the last item. The
    #loops test "is not None" so that an empty (falsy) text node does
    #not cut the walk short.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns `str` with every "%SOUP-ENCODING%" placeholder
        replaced by `encoding` ("utf-8" when no encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, the result is `s` (stringified first if it is
        not already a string) encoded in that encoding. Without one, the
        result is `s` converted to Unicode.
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                # NOTE(review): on Python 2, encoding a byte string first
                # decodes it implicitly as ASCII, so non-ASCII bytes will
                # raise here -- confirm whether that is intended.
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
00402 
class NavigableString(unicode, PageElement):
    """A piece of text in the tree: a Unicode string that also carries
    the PageElement navigation methods."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        """Returns the constructor arguments used when this object is
        copied or pickled: its text as a plain unicode object."""
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError."""
        if attr == 'string':
            return self
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns this text encoded as a byte string in `encoding`.

        The unicode implementation is called directly. Going through
        self.decode() first, as this method used to, makes Python 2
        round-trip the text through the ASCII codec, which raises for
        any non-ASCII character.
        """
        return unicode.encode(self, encoding)

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the markup for this text node. Plain text is its own
        markup, so the encoding is not used here."""
        return self
00434 
class CData(NavigableString):
    """Text that is rendered as a CDATA section."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text wrapped in the CDATA open and close markers.
        The encoding argument is not used."""
        return u''.join((u'<![CDATA[', self, u']]>'))
00439 
class ProcessingInstruction(NavigableString):
    """Text that is rendered as a processing instruction."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text between '<?' and '?>'. If the text holds the
        %SOUP-ENCODING% placeholder, it is first filled in through
        substituteEncoding() with the given encoding."""
        body = self
        if u'%SOUP-ENCODING%' in body:
            body = self.substituteEncoding(body, eventualEncoding)
        return u''.join((u'<?', body, u'?>'))
00447 
class Comment(NavigableString):
    """Text that is rendered as a markup comment."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text between the comment open and close markers.
        The encoding argument is not used."""
        return u''.join((u'<!--', self, u'-->'))
00451 
class Declaration(NavigableString):
    """Text that is rendered as a markup declaration."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text between '<!' and '>'. The encoding argument
        is not used."""
        return u''.join((u'<!', self, u'>'))
00455 
00456 class Tag(PageElement):
00457 
00458     """Represents a found HTML tag with its attributes and contents."""
00459 
00460     def _invert(h):
00461         "Cheap function to invert a hash."
00462         i = {}
00463         for k,v in h.items():
00464             i[v] = k
00465         return i
00466 
00467     XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
00468                                       "quot" : '"',
00469                                       "amp" : "&",
00470                                       "lt" : "<",
00471                                       "gt" : ">" }
00472 
00473     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
00474 
00475     def _convertEntities(self, match):
00476         """Used in a call to re.sub to replace HTML, XML, and numeric
00477         entities with the appropriate Unicode characters. If HTML
00478         entities are being converted, any unrecognized entities are
00479         escaped."""
00480         x = match.group(1)
00481         if self.convertHTMLEntities and x in name2codepoint:
00482             return unichr(name2codepoint[x])
00483         elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
00484             if self.convertXMLEntities:
00485                 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
00486             else:
00487                 return u'&%s;' % x
00488         elif len(x) > 0 and x[0] == '#':
00489             # Handle numeric entities
00490             if len(x) > 1 and x[1] == 'x':
00491                 return unichr(int(x[2:], 16))
00492             else:
00493                 return unichr(int(x[1:]))
00494 
00495         elif self.escapeUnrecognizedEntities:
00496             return u'&amp;%s;' % x
00497         else:
00498             return u'&%s;' % x
00499 
00500     def __init__(self, parser, name, attrs=None, parent=None,
00501                  previous=None):
00502         "Basic constructor."
00503 
00504         # We don't actually store the parser object: that lets extracted
00505         # chunks be garbage-collected
00506         self.parserClass = parser.__class__
00507         self.isSelfClosing = parser.isSelfClosingTag(name)
00508         self.name = name
00509         if attrs == None:
00510             attrs = []
00511         self.attrs = attrs
00512         self.contents = []
00513         self.setup(parent, previous)
00514         self.hidden = False
00515         self.containsSubstitutions = False
00516         self.convertHTMLEntities = parser.convertHTMLEntities
00517         self.convertXMLEntities = parser.convertXMLEntities
00518         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
00519 
00520         def convert(kval):
00521             "Converts HTML, XML and numeric entities in the attribute value."
00522             k, val = kval
00523             if val is None:
00524                 return kval
00525             return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
00526                               self._convertEntities, val))
00527         self.attrs = map(convert, self.attrs)
00528 
00529     def get(self, key, default=None):
00530         """Returns the value of the 'key' attribute for the tag, or
00531         the value given for 'default' if it doesn't have that
00532         attribute."""
00533         return self._getAttrMap().get(key, default)
00534 
00535     def has_key(self, key):
00536         return self._getAttrMap().has_key(key)
00537 
00538     def __getitem__(self, key):
00539         """tag[key] returns the value of the 'key' attribute for the tag,
00540         and throws an exception if it's not there."""
00541         return self._getAttrMap()[key]
00542 
00543     def __iter__(self):
00544         "Iterating over a tag iterates over its contents."
00545         return iter(self.contents)
00546 
00547     def __len__(self):
00548         "The length of a tag is the length of its list of contents."
00549         return len(self.contents)
00550 
00551     def __contains__(self, x):
00552         return x in self.contents
00553 
00554     def __nonzero__(self):
00555         "A tag is non-None even if it has no contents."
00556         return True
00557 
00558     def __setitem__(self, key, value):
00559         """Setting tag[key] sets the value of the 'key' attribute for the
00560         tag."""
00561         self._getAttrMap()
00562         self.attrMap[key] = value
00563         found = False
00564         for i in range(0, len(self.attrs)):
00565             if self.attrs[i][0] == key:
00566                 self.attrs[i] = (key, value)
00567                 found = True
00568         if not found:
00569             self.attrs.append((key, value))
00570         self._getAttrMap()[key] = value
00571 
00572     def __delitem__(self, key):
00573         "Deleting tag[key] deletes all 'key' attributes for the tag."
00574         for item in self.attrs:
00575             if item[0] == key:
00576                 self.attrs.remove(item)
00577                 #We don't break because bad HTML can define the same
00578                 #attribute multiple times.
00579             self._getAttrMap()
00580             if self.attrMap.has_key(key):
00581                 del self.attrMap[key]
00582 
00583     def __call__(self, *args, **kwargs):
00584         """Calling a tag like a function is the same as calling its
00585         findAll() method. Eg. tag('a') returns a list of all the A tags
00586         found within this tag."""
00587         return apply(self.findAll, args, kwargs)
00588 
00589     def __getattr__(self, tag):
00590         #print "Getattr %s.%s" % (self.__class__, tag)
00591         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
00592             return self.find(tag[:-3])
00593         elif tag.find('__') != 0:
00594             return self.find(tag)
00595         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
00596 
00597     def __eq__(self, other):
00598         """Returns true iff this tag has the same name, the same attributes,
00599         and the same contents (recursively) as the given tag.
00600 
00601         NOTE: right now this will return false if two tags have the
00602         same attributes in a different order. Should this be fixed?"""
00603         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
00604             return False
00605         for i in range(0, len(self.contents)):
00606             if self.contents[i] != other.contents[i]:
00607                 return False
00608         return True
00609 
00610     def __ne__(self, other):
00611         """Returns true iff this tag is not identical to the other tag,
00612         as defined in __eq__."""
00613         return not self == other
00614 
00615     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
00616         """Renders this tag as a string."""
00617         return self.decode(eventualEncoding=encoding)
00618 
00619     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
00620                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
00621                                            + ")")
00622 
00623     def _sub_entity(self, x):
00624         """Used with a regular expression to substitute the
00625         appropriate XML entity for an XML special character."""
00626         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
00627 
00628     def __unicode__(self):
00629         return self.decode()
00630 
00631     def __str__(self):
00632         return self.encode()
00633 
00634     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
00635                prettyPrint=False, indentLevel=0):
00636         return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
00637 
    def decode(self, prettyPrint=False, indentLevel=0,
               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
        """Returns a string rendering of this tag and its contents.

        prettyPrint -- when true, the tag is indented and line breaks
            are added around the tag and its contents.
        indentLevel -- indentation level of this tag when pretty
            printing; the contents are rendered one level deeper.
        eventualEncoding -- the encoding name substituted for the
            %SOUP-ENCODING% placeholder in attribute values (only when
            containsSubstitutions is set) and passed on to
            decodeContents(). Nothing is encoded here.

        A hidden tag is rendered as its contents only, without the
        start and end tags."""

        # Render each attribute as key="value" (or a bare key when the
        # value is None).
        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if (self.containsSubstitutions
                        and eventualEncoding is not None
                        and '%SOUP-ENCODING%' in val):
                        val = self.substituteEncoding(val, eventualEncoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            # NOTE(review): "&squot;" is not one of the
                            # entities in XML_ENTITIES_TO_SPECIAL_CHARS;
                            # confirm whether "&apos;" or "&#39;" was
                            # meant before changing the output.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
                if val is None:
                    # Handle boolean attributes.
                    decoded = key
                else:
                    decoded = fmt % (key, val)
                attrs.append(decoded)
        # A self-closing tag gets ' /' inside its start tag and no end
        # tag; any other tag gets a separate end tag.
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % self.name

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            # 'space' is only defined, and only used, when pretty printing.
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.decodeContents(prettyPrint, indentContents,
                                       eventualEncoding)
        if self.hidden:
            s = contents
        else:
            # Assemble the pieces in output order and join them once.
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s
00722 
00723     def decompose(self):
00724         """Recursively destroys the contents of this tree."""
00725         contents = [i for i in self.contents]
00726         for i in contents:
00727             if isinstance(i, Tag):
00728                 i.decompose()
00729             else:
00730                 i.extract()
00731         self.extract()
00732 
00733     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
00734         return self.encode(encoding, True)
00735 
00736     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
00737                        prettyPrint=False, indentLevel=0):
00738         return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
00739 
00740     def decodeContents(self, prettyPrint=False, indentLevel=0,
00741                        eventualEncoding=DEFAULT_OUTPUT_ENCODING):
00742         """Renders the contents of this tag as a string in the given
00743         encoding. If encoding is None, returns a Unicode string.."""
00744         s=[]
00745         for c in self:
00746             text = None
00747             if isinstance(c, NavigableString):
00748                 text = c.decodeGivenEventualEncoding(eventualEncoding)
00749             elif isinstance(c, Tag):
00750                 s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
00751             if text and prettyPrint:
00752                 text = text.strip()
00753             if text:
00754                 if prettyPrint:
00755                     s.append(" " * (indentLevel-1))
00756                 s.append(text)
00757                 if prettyPrint:
00758                     s.append("\n")
00759         return ''.join(s)
00760 
00761     #Soup methods
00762 
00763     def find(self, name=None, attrs={}, recursive=True, text=None,
00764              **kwargs):
00765         """Return only the first child of this Tag matching the given
00766         criteria."""
00767         r = None
00768         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
00769         if l:
00770             r = l[0]
00771         return r
00772     findChild = find
00773 
00774     def findAll(self, name=None, attrs={}, recursive=True, text=None,
00775                 limit=None, **kwargs):
00776         """Extracts a list of Tag objects that match the given
00777         criteria.  You can specify the name of the Tag and any
00778         attributes you want the Tag to have.
00779 
00780         The value of a key-value pair in the 'attrs' map can be a
00781         string, a list of strings, a regular expression object, or a
00782         callable that takes a string and returns whether or not the
00783         string matches for some custom definition of 'matches'. The
00784         same is true of the tag name."""
00785         generator = self.recursiveChildGenerator
00786         if not recursive:
00787             generator = self.childGenerator
00788         return self._findAll(name, attrs, text, limit, generator, **kwargs)
00789     findChildren = findAll
00790 
    # Pre-3.x compatibility methods. Will go away in 4.0.
    # 'first' is another name for find() and 'fetch' is another name
    # for findAll(); both are plain aliases with identical behaviour.
    first = find
    fetch = findAll
00794 
00795     def fetchText(self, text=None, recursive=True, limit=None):
00796         return self.findAll(text=text, recursive=recursive, limit=limit)
00797 
00798     def firstText(self, text=None, recursive=True):
00799         return self.find(text=text, recursive=recursive)
00800 
00801     # 3.x compatibility methods. Will go away in 4.0.
00802     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
00803                        prettyPrint=False, indentLevel=0):
00804         if encoding is None:
00805             return self.decodeContents(prettyPrint, indentLevel, encoding)
00806         else:
00807             return self.encodeContents(encoding, prettyPrint, indentLevel)
00808 
00809 
00810     #Private methods
00811 
00812     def _getAttrMap(self):
00813         """Initializes a map representation of this tag's attributes,
00814         if not already initialized."""
00815         if not getattr(self, 'attrMap'):
00816             self.attrMap = {}
00817             for (key, value) in self.attrs:
00818                 self.attrMap[key] = value
00819         return self.attrMap
00820 
00821     #Generator methods
00822     def recursiveChildGenerator(self):
00823         if not len(self.contents):
00824             raise StopIteration
00825         stopNode = self._lastRecursiveChild().next
00826         current = self.contents[0]
00827         while current is not stopNode:
00828             yield current
00829             current = current.next
00830 
00831     def childGenerator(self):
00832         if not len(self.contents):
00833             raise StopIteration
00834         current = self.contents[0]
00835         while current:
00836             yield current
00837             current = current.nextSibling
00838         raise StopIteration
00839 
00840 # Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer holds three criteria: a tag name, a map of attribute
    criteria and a text criterion. Each criterion may be True, a
    callable, a regular expression object, a list, or a plain value;
    see _matches() for how each kind is applied.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store the search criteria.

        If attrs is a string it is treated as a criterion for the
        'class' attribute. Extra keyword arguments are merged into a
        copy of attrs as additional attribute criteria.
        """
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy first so the caller's map (or the shared default
                # argument) is never modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        """Return the text criterion if there is one, otherwise
        'name|attrs'."""
        if self.text:
            if isString(self.text):
                return self.text
            # __str__ must return a string even when the text criterion
            # is True, a list, a regular expression or a callable.
            return str(self.text)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Check a tag against the name and attribute criteria.

        markupName is either a Tag object or a tag name; in the latter
        case markupAttrs supplies the attributes, as a map or as a
        sequence of (key, value) pairs. Returns the Tag (or the name
        that was passed in) on a match, and None otherwise.
        """
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name criterion is called with (name, attrs) only
        # when we were not handed a Tag object; a Tag goes through
        # _matches() instead.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                # self.attrs is None when the strainer was built with
                # attrs=None and no keyword criteria; treat that as
                # "no attribute criteria" rather than failing.
                for attr, matchAgainst in (self.attrs or {}).items():
                    if not markupAttrMap:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Check a piece of markup against this strainer.

        markup may be a list of elements, a Tag, or a string. Returns
        the matching element, or None when nothing matches. Raises
        Exception for any other kind of object.
        """
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            # Call syntax for raise works on every Python version; the
            # exception type and message are unchanged.
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Decide whether markup satisfies the criterion matchAgainst.

        * True matches anything that is not None.
        * A callable is called with markup and its result is used.
        * An object with a 'match' attribute is treated as a regular
          expression and searched against the string form of markup.
        * A list matches if it contains markup.
        * Anything else is compared to markup for equality.

        The result is truthy on a match, but is not always a bool.
        """
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup is not None and not isString(markup):
                markup = unicode(markup)
            #Now we know that markup is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif (isList(matchAgainst)
                  and (markup is not None or not isString(matchAgainst))):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # A map-like criterion that isList() did not catch:
                # match on its keys. The lookup has to be made on the
                # map; markup is a string or None at this point and has
                # no has_key() of its own.
                result = matchAgainst.has_key(markup)
            elif matchAgainst and isString(markup):
                # Bring the criterion to the same string type as the
                # markup so the equality test below compares like with
                # like.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
00960 
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Create an empty result list and record its source in
        self.source."""
        # Initialise this instance; the previous call initialised a
        # throwaway list literal and never touched self.
        list.__init__(self)
        self.source = source
00967 
00968 # Now, some helper functions.
00969 
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything that has an __iter__ attribute and is not a string counts
    as listlike, and so does any list or tuple.
    """
    if hasattr(l, '__iter__') and not isString(l):
        return True
    # isinstance() rather than an exact type() comparison; this also
    # removes the need for the 'types' module here.
    return isinstance(l, (list, tuple))
00975 
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        if isinstance(s, unicode):
            return True
        return isinstance(s, basestring)
    except NameError:
        # One of the names above does not exist on this interpreter;
        # fall back to the plain string type.
        return isinstance(s, str)
00983 
def buildTagMap(default, *args):
    """Combine maps, lists and scalars into a single map.

    Maps are merged in as they are; every item of a list, and every
    scalar, becomes a key mapped to `default`. Later arguments
    overwrite earlier ones. Used to build the SELF_CLOSING_TAGS,
    NESTABLE_TAGS, and NESTING_RESET_TAGS maps out of lists and
    partial maps.
    """
    tagMap = {}
    for piece in args:
        if hasattr(piece, 'items'):
            # A map: take its entries unchanged.
            tagMap.update(piece.items())
            continue
        if isList(piece) and not isString(piece):
            # A list: each item becomes a key.
            keys = piece
        else:
            # A scalar: it is the only key.
            keys = [piece]
        for key in keys:
            tagMap[key] = default
    return tagMap
01002 
01003 # Now, the parser classes.
01004 
class HTMLParserBuilder(HTMLParser):
    """Adapter that forwards HTMLParser events to a soup object.

    Start tags, end tags and character data are passed straight to the
    soup; comments, processing instructions, declarations and CDATA
    sections are added to the tree as NavigableString subclasses.
    """

    def __init__(self, soup):
        HTMLParser.__init__(self)
        self.soup = soup

    # We inherit feed() and reset().

    def handle_starttag(self, name, attrs):
        """Forward a start tag to the soup; <meta> tags go through
        the soup's extractCharsetFromMeta() instead."""
        if name == 'meta':
            self.soup.extractCharsetFromMeta(attrs)
        else:
            self.soup.unknown_starttag(name, attrs)

    def handle_endtag(self, name):
        """Forward an end tag to the soup."""
        self.soup.unknown_endtag(name)

    def handle_data(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        # Flush whatever text is pending, then add this piece on its
        # own so that it alone is wrapped in `subclass`.
        self.soup.endData()
        self.handle_data(text)
        self.soup.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        """Handle character references as data.

        When the soup converts entities, the reference is turned into
        the character it names. Both decimal ('65') and hexadecimal
        ('x41') references are understood; a reference that cannot be
        converted is passed through unchanged as '&#...;'.
        """
        data = None
        if self.soup.convertEntities:
            try:
                if ref[:1] in ('x', 'X'):
                    # HTMLParser hands hexadecimal references over
                    # with their leading 'x'.
                    codepoint = int(ref[1:], 16)
                else:
                    codepoint = int(ref)
                data = unichr(codepoint)
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point.
                data = None
        if data is None:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.soup.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.soup.convertXMLEntities:
            data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.soup.convertHTMLEntities and \
            not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object.

        Returns the index at which parsing should continue.
        """
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated section: take everything that is left.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = HTMLParser.parse_declaration(self, i)
            except HTMLParseError:
                # The declaration is malformed; hand the rest of the
                # input over as plain data.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
01118 
01119 
class BeautifulStoneSoup(Tag):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # (compiled regex, replacement function) pairs applied to the
    # markup before it is fed to the builder.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False,
                 builder=HTMLParserBuilder):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        HTMLParser will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        HTMLParser, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke HTMLParser:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter. Everything starts out switched off, so the three
        # flags are defined even if convertEntities holds a value that
        # is not one of the known constants.
        self.convertXMLEntities = False
        self.convertHTMLEntities = False
        self.escapeUnrecognizedEntities = False
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        self.builder = builder(self)
        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed.
        self.builder = None                # So can the builder.

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert self.markup to Unicode, massage it, and run it
        through the builder, closing any tags left open at the end."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # Anything that is not a list (e.g. True) selects the
                # default massage rules.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.builder.reset()

        self.builder.feed(markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Re-initialize this object as an empty, hidden root tag and
        clear all parser state."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the innermost open tag off the stack and return the tag
        that is current afterwards."""
        self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append tag to the current tag's contents (if there is a
        current tag) and make it the new current tag."""
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Turn the text collected so far into a containerClass object
        and append it to the current tag.

        Text made up only of ASCII whitespace is collapsed to a single
        newline or space, unless one of the open tags is listed in
        PRESERVE_WHITESPACE_TAGS. When parseOnlyThese is set, top-level
        text that it does not match is discarded.
        """
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            # Link the new string into the chain of parsed elements.
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the result of the last popTag() call, or None if
        nothing was popped."""
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Search from the innermost tag outwards; index 0 (the root)
        # is never a candidate.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers is not None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle a start tag: create a Tag, link it into the tree and
        push it on the stack.

        Inside a quoted section the tag is not real and is added back
        as text instead. Returns the new Tag, or None when no tag was
        created."""
        if self.quoteStack:
            #This is not a real tag.
            # Rebuild the tag's source text from its name and attrs.
            attrs = ''.join([' %s="%s"' % (x, y) for (x, y) in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Everything up to the matching end tag is literal text.
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handle an end tag by popping the stack back to the matching
        start tag. Inside a quoted section, an end tag that does not
        close the section is added back as text."""
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Collect a piece of text; endData() joins the pieces later."""
        self.currentData.append(data)

    def extractCharsetFromMeta(self, attrs):
        """Treat a <meta> tag like any other start tag."""
        self.unknown_starttag('meta', attrs)
01445 
01446 
class BeautifulSoup(BeautifulStoneSoup):

    """A parser that knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (e.g. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Build an HTML parser.

        Accepts the same arguments as BeautifulStoneSoup. Unless the
        caller supplies 'smartQuotesTo', it is set to
        self.HTML_ENTITIES; 'isHTML' is always forced to True.
        """
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # Tags that are closed as soon as they are opened.
    SELF_CLOSING_TAGS = buildTagMap(
        None,
        ['br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame',
         'base'])

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Tags whose contents are kept as literal text rather than parsed.
    QUOTE_TAGS = {'script': None, 'textarea': None}

    # According to the HTML standard, each of these inline tags can
    # contain another tag of the same type. Furthermore, it's common
    # to actually use these tags this way.
    NESTABLE_INLINE_TAGS = [
        'span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']

    # According to the HTML standard, these block tags can contain
    # another tag of the same type. Furthermore, it's common
    # to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    # Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = {
        'ol': [],
        'ul': [],
        'li': ['ul', 'ol'],
        'dl': [],
        'dd': ['dl'],
        'dt': ['dl'],
    }

    # Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {
        'table': [],
        'tr': ['table', 'tbody', 'tfoot', 'thead'],
        'td': ['tr'],
        'th': ['tr'],
        'thead': ['table'],
        'tbody': ['table'],
        'tfoot': ['table'],
    }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    # If one of these tags is encountered, all tags up to the next tag of
    # this type are popped.
    RESET_NESTING_TAGS = buildTagMap(
        None,
        NESTABLE_BLOCK_TAGS,
        'noscript',
        NON_NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap(
        [],
        NESTABLE_INLINE_TAGS,
        NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see
    # extractCharsetFromMeta.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def extractCharsetFromMeta(self, attrs):
        """Handle a META tag, looking for a declared charset.

        Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        attrs is the tag's list of (name, value) pairs; the 'content'
        entry may be replaced in place. Raises StopParsing after
        re-feeding the document with a newly discovered encoding.
        """
        httpEquiv = None
        contentType = None
        contentTypeIndex = None

        for index, (key, value) in enumerate(attrs):
            lowered = key.lower()
            if lowered == 'http-equiv':
                httpEquiv = value
            elif lowered == 'content':
                contentType = value
                contentTypeIndex = index

        needsSubstitution = False
        match = None
        if httpEquiv and contentType:
            # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)

        if match:
            if (self.declaredHTMLEncoding is not None or
                self.originalEncoding == self.fromEncoding):
                # An HTML encoding was sniffed while converting
                # the document to Unicode, or an HTML encoding was
                # sniffed during a previous pass through the
                # document, or an encoding was specified
                # explicitly and it worked. Rewrite the meta tag.
                newAttr = self.CHARSET_RE.sub(
                    lambda found: found.group(1) + "%SOUP-ENCODING%",
                    contentType)
                attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                           newAttr)
                needsSubstitution = True
            else:
                # This is our first pass through the document.
                # Go through it again with the encoding information.
                newCharset = match.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    self._feed(self.declaredHTMLEncoding)
                    raise StopParsing

        tag = self.unknown_starttag("meta", attrs)
        if tag and needsSubstitution:
            tag.containsSubstitutions = True
01599 
01600 
class StopParsing(Exception):
    """Raised to abandon the parse that is currently in progress."""
01603 
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that this class additionally treats as nestable.
    # Each name is listed once (the list previously repeated 'strong'
    # and 'big').
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    # Block tags that this class additionally treats as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Everything BeautifulSoup considers nestable, plus the tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
01639 
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Every other buildTagMap call in this file passes the default
    # value first and the tag names after it, so 'noscript' has to be
    # the second argument; passed alone it would be taken as the
    # default and no tag would be registered.
    # NOTE(review): buildTagMap is defined elsewhere -- confirm.
    RESET_NESTING_TAGS = buildTagMap(None, 'noscript')
    NESTABLE_TAGS = {}
01652 
class BeautifulSOAP(BeautifulStoneSoup):
    """A parser that copies single-string child tags up into their
    parent as attributes. The attribute's name is the child tag's
    name, and its value is the child's string. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Pop the current tag, first promoting it to a parent attribute.

        The promotion happens only when the tag being popped holds
        exactly one child, that child is a NavigableString, and the
        parent has no attribute of that name yet.
        """
        stack = self.tagStack
        if len(stack) > 1:
            parent, child = stack[-2], stack[-1]
            parent._getAttrMap()
            if isinstance(child, Tag) and len(child.contents) == 1:
                onlyChild = child.contents[0]
                if (isinstance(onlyChild, NavigableString)
                        and child.name not in parent.attrMap):
                    parent[child.name] = onlyChild
        BeautifulStoneSoup.popTag(self)
01683 
01684 #Enterprise class names! It has come to our attention that some people
01685 #think the names of the Beautiful Soup parser classes are too silly
01686 #and "unprofessional" for use in enterprise screen-scraping. We feel
01687 #your pain! For such-minded folk, the Beautiful Soup Consortium And
01688 #All-Night Kosher Bakery recommends renaming this file to
01689 #"RobustParser.py" (or, in cases of extreme enterprisiness,
01690 #"RobustParserBeanInterface.class") and using the following
01691 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
01702 
01703 ######################################################
01704 #
01705 # Bonus library: Unicode, Dammit
01706 #
01707 # This class forces XML data into a standard format (usually to UTF-8
01708 # or Unicode).  It is heavily based on code from Mark Pilgrim's
01709 # Universal Feed Parser. It does not rewrite the XML or HTML to
01710 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
01711 # (XML) and BeautifulSoup.extractCharsetFromMeta (HTML).
01712 
01713 # Autodetects character encodings.
01714 # Download from http://chardet.feedparser.org/
01715 try:
01716     import chardet
01717 #    import chardet.constants
01718 #    chardet.constants._debug = 1
01719 except ImportError:
01720     chardet = None
01721 
01722 # cjkcodecs and iconv_codec make Python know about more character encodings.
01723 # Both are available from http://cjkpython.i18n.org/
01724 # They're built in if you use Python 2.4.
01725 try:
01726     import cjkcodecs.aliases
01727 except ImportError:
01728     pass
01729 try:
01730     import iconv_codec
01731 except ImportError:
01732     pass
01733 
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction:
      unicode              -- the converted document, or None if every
                              attempted encoding failed.
      originalEncoding     -- the codec name that worked, or None.
      declaredHTMLEncoding -- the encoding named inside the document
                              when isHTML is true and one was found,
                              else None.
      triedEncodings       -- the codec names attempted, in order.
    """

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=(),
                 smartQuotesTo='xml', isHTML=False):
        """Convert markup to Unicode, trying candidate encodings in order.

        markup            -- the document, as a byte string or a
                             unicode string.
        overrideEncodings -- encodings to try before anything else.
        smartQuotesTo     -- 'xml' to turn MS smart quotes into numeric
                             character references, any other true value
                             to turn them into named entities, or a
                             false value to leave them alone.
        isHTML            -- if true, also look for a charset in a META
                             tag when sniffing the declared encoding.

        Encodings are tried in this order: overrideEncodings, the
        encoding declared in the document, the encoding sniffed from
        its first bytes, chardet's guess (when chardet is installed),
        and finally utf-8 and windows-1252.
        """
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        match is a regular-expression match whose group 1 is the
        character to replace. Entries of MS_CHARS that are tuples
        become '&#xHEX;' when smartQuotesTo is 'xml' and '&NAME;'
        otherwise; plain-string entries are returned as they are.
        """
        orig = match.group(1)
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
            else:
                sub = '&'.encode() + sub[0].encode() + ';'.encode()
        else:
            sub = sub.encode()
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the proposed encoding.

        Returns the decoded markup on success, having stored it in
        self.markup and recorded the codec in self.originalEncoding.
        Returns None if the encoding is empty, was already tried, or
        the conversion fails.
        """
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            smart_quotes_re = "([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._subMSChar, markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # Deliberately best-effort: a failed conversion just means
            # the caller moves on to the next candidate encoding.
            return None
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases

        A byte order mark at the start of data takes precedence over
        the encoding argument and is stripped before decoding.'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns a tuple (xml_data, xml_encoding, sniffed_xml_encoding):
        the document (re-encoded as UTF-8 when a multi-byte encoding
        or byte order mark was recognised), the encoding declared
        inside the document (or None), and the encoding guessed from
        its first bytes (or None). When isHTML is true, a declared
        encoding is also stored in self.declaredHTMLEncoding.
        """
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except Exception:
            # Sniffing is best-effort: if re-encoding fails, carry on
            # with whatever was established before the failure.
            pass
        xml_encoding_re = '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode()
        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
        if not xml_encoding_match and isHTML:
            meta_re = '<\\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
            regexp = re.compile(meta_re, re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii').lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            # For these declared names, use the sniffed encoding
            # instead whenever one was found.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Return a codec name Python knows for charset.

        Tries the CHARSET_ALIASES entry (or charset itself), then
        charset with hyphens removed, then with hyphens replaced by
        underscores. Falls back to returning charset unchanged.
        """
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return charset if codecs.lookup accepts it, else None.

        A false charset is returned unchanged.
        """
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table, built on first use and cached on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the byte string s through the EBCDIC-to-ASCII table."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacements for the bytes 0x80-0x9f. A tuple holds (HTML entity
    # name, hex code point) and is turned into an entity by
    # _subMSChar; a plain string is substituted as it is.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # The hex code used to be empty, which produced the
                 # invalid reference '&#x;' in XML mode.
                 '\x9f' : ('Yuml', '178'),}
01992 
01993 #######################################################################
01994 
01995 
01996 #By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    # Run as a script: parse the HTML arriving on standard input and
    # print the prettified document.
    import sys
    print(BeautifulSoup(sys.stdin).prettify())
/afs/cern.ch/work/a/aaltunda/public/www/CMSSW_5_3_13_patch3/src/Documentation/ReferenceManualScripts/doxygen/utils/splitter/BeautifulSoup.py