00001 """Beautiful Soup
00002 Elixir and Tonic
00003 "The Screen-Scraper's Friend"
00004 https://www.crummy.com/software/BeautifulSoup/
00005
00006 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
00007 tree representation. It provides methods and Pythonic idioms that make
00008 it easy to navigate, search, and modify the tree.
00009
00010 A well-formed XML/HTML document yields a well-formed data
00011 structure. An ill-formed XML/HTML document yields a correspondingly
00012 ill-formed data structure. If your document is only locally
00013 well-formed, you can use this library to find and process the
00014 well-formed part of it.
00015
00016 Beautiful Soup works with Python 2.2 and up. It has no external
00017 dependencies, but you'll have more success at converting data to UTF-8
00018 if you also install these three packages:
00019
00020 * chardet, for auto-detecting character encodings
00021 https://chardet.feedparser.org/
00022 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
00023 by stock Python.
00024 https://cjkpython.i18n.org/
00025
00026 Beautiful Soup defines classes for two main parsing strategies:
00027
00028 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
00029 language that kind of looks like XML.
00030
00031 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
00032 or invalid. This class has web browser-like heuristics for
00033 obtaining a sensible parse tree in the face of common HTML errors.
00034
00035 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
00036 the encoding of an HTML or XML document, and converting it to
00037 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
00038
00039 For more than you ever wanted to know about Beautiful Soup, see the
00040 documentation:
00041 https://www.crummy.com/software/BeautifulSoup/documentation.html
00042
00043 Here, have some legalese:
00044
00045 Copyright (c) 2004-2009, Leonard Richardson
00046
00047 All rights reserved.
00048
00049 Redistribution and use in source and binary forms, with or without
00050 modification, are permitted provided that the following conditions are
00051 met:
00052
00053 * Redistributions of source code must retain the above copyright
00054 notice, this list of conditions and the following disclaimer.
00055
00056 * Redistributions in binary form must reproduce the above
00057 copyright notice, this list of conditions and the following
00058 disclaimer in the documentation and/or other materials provided
00059 with the distribution.
00060
00061 * Neither the name of the Beautiful Soup Consortium and All
00062 Night Kosher Bakery nor the names of its contributors may be
00063 used to endorse or promote products derived from this software
00064 without specific prior written permission.
00065
00066 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00067 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00068 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00069 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
00070 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00071 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00072 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00073 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00074 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00075 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00076 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
00077
00078 """
00079 from __future__ import generators
00080
00081 __author__ = "Leonard Richardson (leonardr@segfault.org)"
00082 __version__ = "3.1.0.1"
00083 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
00084 __license__ = "New-style BSD"
00085
00086 import codecs
00087 import markupbase
00088 import types
00089 import re
00090 from HTMLParser import HTMLParser, HTMLParseError
00091 try:
00092 from htmlentitydefs import name2codepoint
00093 except ImportError:
00094 name2codepoint = {}
00095 try:
00096 set
00097 except NameError:
00098 from sets import Set as set
00099
00100
# Replace markupbase's declaration-name matcher with a pattern that accepts
# a letter followed by any mix of letters, digits, '-', '_', '.' and ':',
# plus any trailing whitespace.
# NOTE(review): presumably the ':' is allowed so that namespaced names
# parse -- confirm against the markupbase default.
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Default encoding argument for the encode/render methods in this module.
DEFAULT_OUTPUT_ENCODING = "utf-8"
00104
00105
00106
def sob(unicode, encoding):
    """Return the given Unicode string, or that string encoded.

    When `encoding` is None the `unicode` argument is handed back
    untouched; otherwise the result of unicode.encode(encoding) is
    returned.
    """
    if encoding is not None:
        return unicode.encode(encoding)
    return unicode
00113
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element keeps five links: `parent`, `previous`/`next` (the
    neighbouring elements in the flattened chain that `setup`, `insert`
    and `extract` maintain) and `previousSibling`/`nextSibling` (the
    neighbours inside the parent's `contents` list).  The methods that
    touch `self.contents` (`insert`, `append`, `_indexByIdentity`) only
    work on elements that define that list.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has children, this element is linked in as
        the sibling following the parent's current last child.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _indexByIdentity(self, element):
        """Returns the position of `element` in self.contents.

        Children are compared with `is`, not `==`: list.index() would
        stop at the first child that merely compares equal (for
        instance an identical piece of text), which may be a different
        node.  Raises ValueError if `element` is not a child.
        """
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("element is not a child of this object")

    def replaceWith(self, replaceWith):
        """Puts `replaceWith` in this element's place in the tree.

        This element is extracted from its parent and `replaceWith` is
        inserted at the position this element used to occupy.  If
        `replaceWith` is currently a sibling of this element, insert()
        takes care of the index shift caused by moving it.
        """
        oldParent = self.parent
        myIndex = oldParent._indexByIdentity(self)
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree.

        Returns this element, detached from its parent, its siblings and
        the previous/next chain.  Elements beneath it stay attached to it.
        """
        if self.parent:
            try:
                # Remove exactly this node, not the first equal-looking one.
                del self.parent.contents[self.parent._indexByIdentity(self)]
            except ValueError:
                # Not listed in the parent's contents; nothing to remove.
                pass

        # Splice this element and everything beneath it out of the
        # previous/next chain.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts `newChild` into self.contents at `position`.

        Plain strings are wrapped in a NavigableString first.  A
        `position` beyond the end appends.  If `newChild` already has a
        parent it is extracted from it first; when that parent is this
        object and the child currently sits before `position`, the
        target index is shifted down by one to account for the removal.
        """
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that already lives somewhere.
            if newChild.parent is self:
                try:
                    index = self._indexByIdentity(newChild)
                except ValueError:
                    index = None
                if index is not None and index < position:
                    # Extracting the child shifts everything after it
                    # (including the target slot) down by one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child in the chain is the
            # next sibling of the closest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent:
                    # Ran out of ancestors: nothing follows the new child.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if there is no such parent."""
        # Forward the keyword criteria too, so that attribute filters
        # given as keywords are honoured just as in findParents().
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls `method` with a limit of 1 and returns the single
        result, or None if nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."
        if isinstance(name, SoupStrainer):
            # A ready-made strainer was passed in place of a name.
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The navigation generators end by yielding None; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    # These generators walk the tree in one direction.  Each yields the
    # value it has just stepped to *before* testing it, so the final
    # item produced is the None that terminated the walk.

    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    def substituteEncoding(self, str, encoding=None):
        """Replaces every %SOUP-ENCODING% placeholder in `str` with
        `encoding` ("utf-8" when no encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, `s` (converted with str() first if it is not
        already a string) is encoded; without one, `s` is converted to
        unicode."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
00402
class NavigableString(unicode, PageElement):
    """A piece of text in the tree: a unicode string that also carries
    the PageElement navigation links."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        """Returns the constructor arguments used when this object is
        copied or pickled: the text as a plain unicode object."""
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError."""
        if attr == 'string':
            return self
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns this text encoded in `encoding`.

        The text is encoded directly.  Going through self.decode()
        first would make Python 2 round-trip the unicode text through
        the default (ASCII) codec and fail on any non-ASCII character.
        """
        return unicode.encode(self, encoding)

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text to emit for this string when rendering;
        plain text is emitted as-is and `eventualEncoding` is unused."""
        return self
00434
class CData(NavigableString):
    """Text that is rendered inside a CDATA section."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns this text wrapped in the CDATA markers;
        `eventualEncoding` is not used."""
        opening = u'<![CDATA['
        closing = u']]>'
        return opening + self + closing
00439
class ProcessingInstruction(NavigableString):
    """Text that is rendered as a processing instruction."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns this text between '<?' and '?>', with any
        %SOUP-ENCODING% placeholder filled in from `eventualEncoding`."""
        body = self
        if u'%SOUP-ENCODING%' in body:
            body = self.substituteEncoding(body, eventualEncoding)
        return u'<?' + body + u'?>'
00447
class Comment(NavigableString):
    """Text that is rendered as a markup comment."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns this text between the comment delimiters;
        `eventualEncoding` is not used."""
        opening = u'<!--'
        closing = u'-->'
        return opening + self + closing
00451
class Declaration(NavigableString):
    """Text that is rendered as a markup declaration."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns this text between '<!' and '>'; `eventualEncoding`
        is not used."""
        opening = u'<!'
        closing = u'>'
        return opening + self + closing
00455
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Numeric reference: hexadecimal when introduced by '#x',
            # decimal otherwise.
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))
        elif self.escapeUnrecognizedEntities:
            # Escape the ampersand so the unknown entity survives as
            # literal text instead of looking like a real entity.
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        `parser` supplies the self-closing rule for `name` and the
        entity-conversion settings; `attrs` is a list of (key, value)
        pairs whose string values have their entities converted.
        `parent` and `previous` are handed to setup().
        """
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        def convert(kval):
            "Converts HTML, XML and numeric entities in the attribute value."
            k, val = kval
            if val is None:
                return kval
            return (k, re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
                              self._convertEntities, val))
        self.attrs = [convert(kval) for kval in self.attrs]

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        """Returns whether the tag has a 'key' attribute."""
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        """Returns whether `x` is one of this tag's direct children."""
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()[key] = value
        found = False
        for i, item in enumerate(self.attrs):
            # Update every occurrence: bad markup may repeat a key.
            if item[0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Rebuild the list in place instead of removing while iterating,
        # which would skip the entry following each removed one.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Treats a missing attribute as a search for a child tag:
        tag.fooTag and tag.foo both return self.find('foo').  Names that
        start with a double underscore raise AttributeError instead."""
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if (not hasattr(other, 'name')
            or not hasattr(other, 'attrs')
            or not hasattr(other, 'contents')
            or self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
            return False
        for mine, theirs in zip(self.contents, other.contents):
            if mine != theirs:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string.

        The markup is returned encoded in `encoding`, because repr()
        would otherwise push a unicode result through the default
        (ASCII) codec and fail on non-ASCII content.  Passing None
        returns the unencoded rendering."""
        if encoding is None:
            return self.decode(eventualEncoding=encoding)
        return self.encode(encoding)

    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __unicode__(self):
        return self.decode()

    def __str__(self):
        return self.encode()

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               prettyPrint=False, indentLevel=0):
        """Returns the rendering produced by decode(), encoded in
        `encoding`."""
        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)

    def decode(self, prettyPrint=False, indentLevel=0,
               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the markup for this tag and its contents as text.

        `eventualEncoding` is not applied here; it is only the name
        substituted for %SOUP-ENCODING% placeholders (pass None to
        leave placeholders in attribute values untouched).  With
        `prettyPrint`, tags are put on their own lines and indented
        according to `indentLevel`."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if (self.containsSubstitutions
                        and eventualEncoding is not None
                        and '%SOUP-ENCODING%' in val):
                        val = self.substituteEncoding(val, eventualEncoding)

                    # Values are double-quoted unless they contain a
                    # double quote, in which case single quotes are
                    # used and any single quotes inside are replaced.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # NOTE(review): "&squot;" is not a standard
                            # entity name; kept as-is because changing
                            # it would alter the output.
                            val = val.replace("'", "&squot;")

                    # Escape angle brackets and ampersands that are not
                    # already part of an entity.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
                if val is None:
                    # An attribute without a value is emitted bare.
                    decoded = key
                else:
                    decoded = fmt % (key, val)
                attrs.append(decoded)
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % self.name

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.decodeContents(prettyPrint, indentContents,
                                       eventualEncoding)
        if self.hidden:
            # A hidden tag contributes only its contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        # Iterate over a copy: extract() edits self.contents.
        for i in list(self.contents):
            if isinstance(i, Tag):
                i.decompose()
            else:
                i.extract()
        self.extract()

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the pretty-printed rendering of this tag, encoded in
        `encoding`."""
        return self.encode(encoding, True)

    def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Returns the rendering of this tag's contents, encoded in
        `encoding`."""
        # Pass the encoding along so %SOUP-ENCODING% placeholders name
        # the encoding actually used, exactly as encode() does.
        return self.decodeContents(prettyPrint, indentLevel,
                                   encoding).encode(encoding)

    def decodeContents(self, prettyPrint=False, indentLevel=0,
                       eventualEncoding=DEFAULT_OUTPUT_ENCODING):
        """Renders the contents of this tag as text (not yet encoded).

        `eventualEncoding` is handed to each child so that
        %SOUP-ENCODING% placeholders can be filled in."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.decodeGivenEventualEncoding(eventualEncoding)
            elif isinstance(c, Tag):
                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Older names for the same methods.
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Returns the contents encoded in `encoding`, or the unencoded
        rendering when `encoding` is None."""
        if encoding is None:
            return self.decodeContents(prettyPrint, indentLevel, encoding)
        else:
            return self.encodeContents(encoding, prettyPrint, indentLevel)

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly: a plain getattr()
        # on a missing 'attrMap' would fall into __getattr__ and search
        # the subtree for a child tag with that name.
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    def recursiveChildGenerator(self):
        """Yields everything beneath this tag, following the `next`
        links from the first child up to the last element inside it."""
        if not len(self.contents):
            return
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next

    def childGenerator(self):
        """Yields the direct children of this tag, following the
        `nextSibling` links from the first child."""
        if not len(self.contents):
            return
        current = self.contents[0]
        while current:
            yield current
            current = current.nextSibling
00839
00840
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Stores the match criteria.

        A string passed as `attrs` is treated as a value to match
        against the 'class' attribute.  Keyword arguments are merged
        into (a copy of) `attrs` as further attribute criteria.
        """
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        elif attrs is None:
            # searchTag() iterates self.attrs.items(); keep it a mapping.
            attrs = {}
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches a tag against the name and attribute criteria.

        `markupName` is either a Tag, or a tag name accompanied by
        `markupAttrs` (a mapping or a sequence of (key, value) pairs).
        Returns the Tag (or the name) on a match, None otherwise.
        """
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name is invoked with the raw name and attributes,
        # but only when no Tag object is available.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Build the attribute lookup only when needed.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Returns `markup` (or, for a list, its first matching string)
        if it satisfies this strainer, None otherwise.

        Raises Exception for objects that are neither lists, Tags nor
        strings."""
        found = None
        if isList(markup) and not isinstance(markup, Tag):
            # A list: look for the first string in it that matches.
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        elif isinstance(markup, Tag):
            # A tag can only match when no text criterion is set.
            if not self.text:
                found = self.searchTag(markup)
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Returns whether `markup` satisfies the single criterion
        `matchAgainst` (True, a callable, a regular expression, a
        list, a mapping or a plain value)."""
        result = False
        if matchAgainst is True:
            # True means "present at all".
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            # From here on a tag is represented by its name, and any
            # other non-string value by its unicode form.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup is not None and not isString(markup):
                markup = unicode(markup)

            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif (isList(matchAgainst)
                  and (markup is not None or not isString(matchAgainst))):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # A mapping: test whether the markup is one of its keys.
                # (The markup is a string or None here and has no
                # has_key of its own.)
                result = matchAgainst.has_key(markup)
            elif matchAgainst and isString(markup):
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
00960
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list; `source` is stored on the
        `source` attribute."""
        # Initialise this list itself rather than a throwaway one.
        list.__init__(self)
        self.source = source
00967
00968
00969
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything iterable that is not a string counts, as does any object
    whose exact type is list or tuple."""
    if hasattr(l, '__iter__') and not isString(l):
        return True
    return type(l) in (types.ListType, types.TupleType)
00975
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike.

    Falls back to a plain `str` check when the interpreter lacks the
    `unicode` or `basestring` names."""
    try:
        if isinstance(s, unicode):
            return True
        return isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)
00983
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    Maps are copied in key by key; every item of a list, and every
    scalar, becomes a key whose value is `default`."""
    tagMap = {}
    for part in args:
        if hasattr(part, 'items'):
            # A map: take its entries over unchanged.
            for key, value in part.items():
                tagMap[key] = value
            continue
        if isList(part) and not isString(part):
            # A list: each item becomes a key.
            for key in part:
                tagMap[key] = default
            continue
        # A scalar: it is the key.
        tagMap[part] = default
    return tagMap
01002
01003
01004
class HTMLParserBuilder(HTMLParser):
    """HTMLParser subclass that relays parse events to a soup object.

    Every handler forwards to the ``soup`` given to the constructor;
    feed() and reset() are inherited from HTMLParser unchanged."""

    def __init__(self, soup):
        HTMLParser.__init__(self)
        self.soup = soup

    def handle_starttag(self, name, attrs):
        """Forward a start tag to the soup. <meta> tags are routed
        through extractCharsetFromMeta instead of unknown_starttag."""
        if name == 'meta':
            self.soup.extractCharsetFromMeta(attrs)
        else:
            self.soup.unknown_starttag(name, attrs)

    def handle_endtag(self, name):
        """Forward an end tag to the soup."""
        self.soup.unknown_endtag(name)

    def handle_data(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        # Flush pending text first so that `text` ends up alone in the
        # node created by the second endData() call.
        self.soup.endData()
        self.handle_data(text)
        self.soup.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        """Handle character references as data.

        ``ref`` is the text between '&#' and ';'. HTMLParser passes
        hexadecimal references with their leading 'x'/'X' (for example
        'x27'), so both decimal and hexadecimal forms are decoded when
        the soup converts entities. A reference that cannot be turned
        into a character is passed through as literal text instead of
        raising."""
        if self.soup.convertEntities:
            try:
                if ref[:1] in ('x', 'X'):
                    codepoint = int(ref[1:], 16)
                else:
                    codepoint = int(ref)
                data = unichr(codepoint)
            except (ValueError, OverflowError):
                # Malformed number or a code point outside the range
                # unichr() accepts: keep the reference as written.
                data = '&#%s;' % ref
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.soup.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                # Not a known HTML entity name; try the next strategy.
                pass

        if not data and self.soup.convertXMLEntities:
            data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.soup.convertHTMLEntities and \
           not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # HTML conversion was requested but the name is neither a
            # known HTML entity nor an XML one. The reference is emitted
            # as an ampersand plus the name, with no trailing semicolon.
            # NOTE(review): the soup's escapeUnrecognizedEntities flag
            # suggests the ampersand may have been meant to be escaped
            # here -- confirm against upstream before changing.
            data = "&%s" % ref
        if not data:
            # Nothing converted the reference: pass it through in its
            # original '&name;' form.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object.

        ``i`` is the offset of the declaration in self.rawdata; the
        return value is the offset at which parsing should resume."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA section: consume the rest of the
                # buffer.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = HTMLParser.parse_declaration(self, i)
            except HTMLParseError:
                # The base parser rejected the declaration: hand the
                # remaining input to the soup as plain text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
01118
01119
class BeautifulStoneSoup(Tag):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # (compiled regex, replacement function) pairs applied to the markup
    # before parsing: put a space before '/>' and remove whitespace
    # directly after '<!'.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"

    ALL_ENTITIES = XHTML_ENTITIES

    # Translation table (code point -> None) that deletes tab, LF, FF,
    # CR and space. endData() uses it to decide whether a text node
    # consists solely of these characters.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False,
                 builder=HTMLParserBuilder):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        HTMLParser will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        HTMLParser, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke HTMLParser:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities

        # Decide how entities are treated. The flags are always assigned
        # first, so a truthy but unrecognised convertEntities value no
        # longer leaves them undefined.
        self.convertXMLEntities = False
        self.convertHTMLEntities = False
        self.escapeUnrecognizedEntities = False
        if self.convertEntities:
            # Entities are being converted, so smart-quote substitution
            # is switched off.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        self.builder = builder(self)
        self.reset()

        if hasattr(markup, 'read'):
            # File-like object: read the whole document.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        # Release the references once parsing is finished.
        self.markup = None
        self.builder = None

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert self.markup to Unicode, apply the markup massage,
        run it through the builder and close every tag left open."""
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # Any non-list truthy value selects the default rules.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # The attribute is removed once the massage is applied.
                # NOTE(review): a later _feed() call on the same object
                # reads self.markupMassage again -- confirm how the base
                # class resolves the missing attribute.
                del self.markupMassage
        self.builder.reset()

        self.builder.feed(markup)

        # Flush pending text and close every tag still open.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Re-initialise this object as the hidden root tag and clear
        all parser state."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Close the innermost open tag and return the tag that is
        current afterwards."""
        self.tagStack.pop()

        # A tag whose only child is a string exposes that child as its
        # 'string' attribute.
        closing = self.currentTag
        if len(closing.contents) == 1 and \
           isinstance(closing.contents[0], NavigableString):
            closing.string = closing.contents[0]

        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append *tag* to the current tag's contents (if there is a
        current tag) and make it the current tag."""
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Turn the text collected so far into a ``containerClass``
        node attached to the current tag."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Text made up solely of ASCII whitespace collapses to one
            # newline or one space, unless an open tag is listed in
            # PRESERVE_WHITESPACE_TAGS.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # With a parseOnlyThese filter, top-level text is dropped
            # unless the filter has a text criterion that matches it.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Search from the innermost tag outwards; index 0 (the root) is
        # never considered.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers is not None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                # A non-nestable tag closes its previous occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):
                # Reached one of this tag's own reset triggers, or (for
                # a generic reset tag) another reset tag: pop up to but
                # not including it.
                popTo = p.name
                inclusive = False
                break
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle a start tag: create the Tag, link it into the tree
        and push it on the stack. Returns the new Tag, or None when the
        tag was treated as quoted text or filtered out."""
        if self.quoteStack:
            # Inside a quote tag: re-serialise the tag as literal text.
            attrs = ''.join([' %s="%s"' % (key, value)
                             for key, value in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Until the matching end tag, markup is collected as text.
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handle an end tag by popping the stack to the matching start
        tag, or by emitting it as text while inside a quote tag."""
        if self.quoteStack and self.quoteStack[-1] != name:
            # Inside a quote tag and this is not its end tag: keep the
            # end tag as literal text.
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Buffer a piece of text until the next endData() call."""
        self.currentData.append(data)

    def extractCharsetFromMeta(self, attrs):
        """Treat a <meta> tag like any other start tag; this class does
        not inspect it for a charset."""
        self.unknown_starttag('meta', attrs)
01445
01446
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same arguments as BeautifulStoneSoup, except that
        smartQuotesTo defaults to HTML_ENTITIES and isHTML is always
        forced to True."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    # Inline tags that may be nested inside themselves.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    # Block tags that may be nested inside themselves; they are also
    # listed in RESET_NESTING_TAGS below.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    # List tags, each mapped to the tags that reset its nesting.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    # Table tags, each mapped to the tags that reset its nesting.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    # Tags that reset the nesting of tags which have no reset triggers
    # of their own (see _smartPop).
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Finds the 'charset=' part of a META content attribute. Group 1 is
    # everything up to and including 'charset=', group 3 is the charset.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def extractCharsetFromMeta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        ``attrs`` is the list of (name, value) pairs of the META tag.
        It may be modified in place: when the encoding is already
        settled, the charset in the 'content' attribute is replaced by
        the %SOUP-ENCODING% placeholder. Raises StopParsing after a
        successful re-parse so the outer parse stops."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i, (key, value) in enumerate(attrs):
            key = key.lower()
            # The HTML attribute is 'http-equiv'. The previous
            # comparison against 'https-equiv' could never match, which
            # disabled charset detection from META tags entirely.
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType:
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # The encoding is already settled (one was declared
                    # earlier, or the caller's encoding was the one
                    # used), so only rewrite the tag with a placeholder.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # No encoding settled yet: if the tag names a charset
                    # other than the one in use, re-parse with it.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
01599
01600
class StopParsing(Exception):
    """Raised to abandon the parse that is currently in progress.

    BeautifulSoup.extractCharsetFromMeta raises it after re-feeding the
    document, and BeautifulStoneSoup.__init__ catches and ignores it."""
01603
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Extra inline tags treated as nestable by this class. The repeated
    # names are harmless: the list only supplies dictionary keys.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = [
        'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
        'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
        'big',
    ]

    # Extra block tags treated as nestable by this class.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # The parent's nestable tags plus the two lists above, each new tag
    # mapped to an empty list of reset triggers.
    NESTABLE_TAGS = buildTagMap(
        [],
        BeautifulSoup.NESTABLE_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
01639
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): 'noscript' is passed as buildTagMap's `default`
    # argument with no further portions, so this evaluates to an empty
    # map -- confirm whether buildTagMap(None, 'noscript') was intended.
    RESET_NESTING_TAGS = buildTagMap('noscript')

    # No tag is treated as nestable.
    NESTABLE_TAGS = dict()
01652
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copy a single-string child tag onto its parent as an
        attribute, then pop as the base class does.

        Returns whatever BeautifulStoneSoup.popTag returns, so that
        _popToTag receives the new current tag just as it does with the
        base implementation (the override used to return None)."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Make sure parent.attrMap exists before it is consulted.
            parent._getAttrMap()
            # Only copy when the parent has no attribute of that name
            # yet, so existing attributes are never overwritten.
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)
01683
01684
01685
01686
01687
01688
01689
01690
01691
class RobustXMLParser(BeautifulStoneSoup):
    """BeautifulStoneSoup under another name; adds no behaviour."""
class RobustHTMLParser(BeautifulSoup):
    """BeautifulSoup under another name; adds no behaviour."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """ICantBelieveItsBeautifulSoup under another name; adds no
    behaviour."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """MinimalSoup under another name; adds no behaviour."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """BeautifulSOAP under another name; adds no behaviour."""
01702
01703
01704
01705
01706
01707
01708
01709
01710
01711
01712
01713
01714
01715 try:
01716 import chardet
01717
01718
01719 except ImportError:
01720 chardet = None
01721
01722
01723
01724
01725 try:
01726 import cjkcodecs.aliases
01727 except ImportError:
01728 pass
01729 try:
01730 import iconv_codec
01731 except ImportError:
01732 pass
01733
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # Charset names mapped to the codec name that find_codec() tries
    # first for them.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        """Try to convert ``markup`` to Unicode.

        Encodings are tried in this order: ``overrideEncodings``, the
        encoding declared in the document, the encoding sniffed from
        its first bytes, chardet's guess (if chardet was imported),
        then utf-8 and windows-1252.

        On return ``self.unicode`` holds the converted text (or a falsy
        value if every attempt failed), ``self.originalEncoding`` the
        encoding that worked (or None), and ``self.declaredHTMLEncoding``
        the encoding declared in the document when isHTML is true.
        ``overrideEncodings`` is only iterated, never modified."""
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # Still nothing: ask chardet, when it is available.
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # Last resort: two common encodings.
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        ``match`` is a regex match whose group 1 is the character to
        replace. Tuple entries of MS_CHARS become a numeric reference
        when smartQuotesTo is 'xml' and a named reference otherwise;
        plain-string entries are returned as they are."""
        orig = match.group(1)
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
            else:
                sub = '&'.encode() + sub[0].encode() + ';'.encode()
        else:
            sub = sub.encode()
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the ``proposed`` encoding.

        Returns the decoded text and records the encoding on success.
        Returns None when the encoding is empty, was already tried, or
        decoding fails."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Replace the \x80-\x9f characters before decoding when the
        # proposed encoding is one of the three listed here.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            smart_quotes_re = "([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._subMSChar, markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # Best effort: any failure just means this encoding is not
            # the right one.
            return None

        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # A byte-order mark overrides the requested encoding and is
        # stripped before decoding.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns a tuple (data, declared encoding, sniffed encoding).
        The data is re-encoded as UTF-8 when a UTF-16/32 signature or a
        UTF-8 BOM is recognised, and translated to ASCII when the
        EBCDIC signature is recognised."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE, no BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE, no BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE, no BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE, no BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except Exception:
            # Sniffing is best effort; a failed conversion just leaves
            # the data as it was. Narrowed from a bare except so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            pass
        xml_encoding_re = r"""^<\?.*encoding=['"](.*?)['"].*\?>""".encode()
        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
        if not xml_encoding_match and isHTML:
            # No XML declaration: look for a charset in a <meta> tag.
            meta_re = r"""<\s*meta[^>]+charset=([^>]*?)[;'">]""".encode()
            regexp = re.compile(meta_re, re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii').lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            # A generic multi-byte name is replaced by the concrete
            # variant sniffed from the first bytes.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Return a codec name Python knows for ``charset``.

        Tries the CHARSET_ALIASES entry (or the name itself), then the
        name with hyphens removed, then with hyphens turned into
        underscores. Falls back to returning ``charset`` unchanged."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return ``charset`` if codecs.lookup() accepts it, else None.
        A falsy ``charset`` is returned unchanged."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table built on first use by _ebcdic_to_ascii() and
    # cached on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the byte string ``s`` through the EBCDIC-to-ASCII
        table, building the table on the first call."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacements for the characters \x80-\x9f. A tuple is
    # (HTML entity name, hexadecimal code point); a plain string is
    # substituted as it is.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # The hex code used to be empty, which produced the
                 # invalid reference '&#x;' in 'xml' mode. Yuml is
                 # U+0178.
                 '\x9f' : ('Yuml', '178'),}
01992
01993
01994
01995
01996
if __name__ == '__main__':
    # Script mode: parse the document read from standard input and
    # print its prettified form.
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())