3 "The Screen-Scraper's Friend" 4 http://www.crummy.com/software/BeautifulSoup/ 6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a 7 tree representation. It provides methods and Pythonic idioms that make 8 it easy to navigate, search, and modify the tree. 10 A well-formed XML/HTML document yields a well-formed data 11 structure. An ill-formed XML/HTML document yields a correspondingly 12 ill-formed data structure. If your document is only locally 13 well-formed, you can use this library to find and process the 14 well-formed part of it. 16 Beautiful Soup works with Python 2.2 and up. It has no external 17 dependencies, but you'll have more success at converting data to UTF-8 18 if you also install these three packages: 20 * chardet, for auto-detecting character encodings 21 http://chardet.feedparser.org/ 22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported 24 http://cjkpython.i18n.org/ 26 Beautiful Soup defines classes for two main parsing strategies: 28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 29 language that kind of looks like XML. 31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 32 or invalid. This class has web browser-like heuristics for 33 obtaining a sensible parse tree in the face of common HTML errors. 35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 36 the encoding of an HTML or XML document, and converting it to 37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 
39 For more than you ever wanted to know about Beautiful Soup, see the 41 http://www.crummy.com/software/BeautifulSoup/documentation.html 43 Here, have some legalese: 45 Copyright (c) 2004-2010, Leonard Richardson 49 Redistribution and use in source and binary forms, with or without 50 modification, are permitted provided that the following conditions are 53 * Redistributions of source code must retain the above copyright 54 notice, this list of conditions and the following disclaimer. 56 * Redistributions in binary form must reproduce the above 57 copyright notice, this list of conditions and the following 58 disclaimer in the documentation and/or other materials provided 59 with the distribution. 61 * Neither the name of the the Beautiful Soup Consortium and All 62 Night Kosher Bakery nor the names of its contributors may be 63 used to endorse or promote products derived from this software 64 without specific prior written permission. 66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 79 from __future__
import generators
81 __author__ =
"Leonard Richardson (leonardr@segfault.org)" 83 __copyright__ =
"Copyright (c) 2004-2012 Leonard Richardson" 84 __license__ =
"New-style BSD" 86 from sgmllib
import SGMLParser, SGMLParseError
93 from htmlentitydefs
import name2codepoint
99 from sets
import Set
as set
102 sgmllib.tagfind = re.compile(
'[a-zA-Z][-_.:a-zA-Z0-9]*')
103 markupbase._declname_match = re.compile(
r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
105 DEFAULT_OUTPUT_ENCODING =
"utf-8" 108 """Build a RE to match the given CSS class.""" 109 return re.compile(
r"(^|.*\s)%s($|\s)" % str)
114 """Contains the navigational information for some part of the page 115 (either a tag or a piece of text)""" 118 "Cheap function to invert a hash." 120 for k,v
in h.items():
124 XML_ENTITIES_TO_SPECIAL_CHARS = {
"apos" :
"'",
130 XML_SPECIAL_CHARS_TO_ENTITIES =
_invert(XML_ENTITIES_TO_SPECIAL_CHARS)
132 def setup(self, parent=None, previous=None):
133 """Sets up the initial relations between this element and 147 if hasattr(replaceWith,
"parent")\
148 and replaceWith.parent
is self.
parent:
150 index = replaceWith.parent.index(replaceWith)
151 if index
and index < myIndex:
155 myIndex = myIndex - 1
157 oldParent.insert(myIndex, replaceWith)
163 reversedChildren = list(self.contents)
164 reversedChildren.reverse()
165 for child
in reversedChildren:
166 myParent.insert(myIndex, child)
169 """Destructively rips this element out of the tree.""" 180 nextElement = lastChild.next
185 nextElement.previous = self.
previous 187 lastChild.next =
None 198 "Finds the last element beneath this object to be parsed." 200 while hasattr(lastChild,
'contents')
and lastChild.contents:
201 lastChild = lastChild.contents[-1]
205 if isinstance(newChild, basestring) \
206 and not isinstance(newChild, NavigableString):
209 position =
min(position, len(self.contents))
210 if hasattr(newChild,
'parent')
and newChild.parent
is not None:
213 if newChild.parent
is self:
214 index = self.index(newChild)
220 position = position - 1
223 newChild.parent = self
226 newChild.previousSibling =
None 227 newChild.previous = self
229 previousChild = self.contents[position-1]
230 newChild.previousSibling = previousChild
231 newChild.previousSibling.nextSibling = newChild
232 newChild.previous = previousChild._lastRecursiveChild()
233 if newChild.previous:
234 newChild.previous.next = newChild
236 newChildsLastElement = newChild._lastRecursiveChild()
238 if position >= len(self.contents):
239 newChild.nextSibling =
None 242 parentsNextSibling =
None 243 while not parentsNextSibling:
244 parentsNextSibling = parent.nextSibling
245 parent = parent.parent
248 if parentsNextSibling:
249 newChildsLastElement.next = parentsNextSibling
251 newChildsLastElement.next =
None 253 nextChild = self.contents[position]
254 newChild.nextSibling = nextChild
255 if newChild.nextSibling:
256 newChild.nextSibling.previousSibling = newChild
257 newChildsLastElement.next = nextChild
259 if newChildsLastElement.next:
260 newChildsLastElement.next.previous = newChildsLastElement
261 self.contents.
insert(position, newChild)
264 """Appends the given tag to the contents of this tag.""" 265 self.
insert(len(self.contents), tag)
267 def findNext(self, name=None, attrs={}, text=None, **kwargs):
268 """Returns the first item that matches the given criteria and 269 appears after this Tag in the document.""" 272 def findAllNext(self, name=None, attrs={}, text=None, limit=None,
274 """Returns all items that match the given criteria and appear 275 after this Tag in the document.""" 280 """Returns the closest sibling to this Tag that matches the 281 given criteria and appears after this Tag in the document.""" 287 """Returns the siblings of this Tag that match the given 288 criteria and appear after this Tag in the document.""" 289 return self.
_findAll(name, attrs, text, limit,
291 fetchNextSiblings = findNextSiblings
294 """Returns the first item that matches the given criteria and 295 appears before this Tag in the document.""" 300 """Returns all items that match the given criteria and appear 301 before this Tag in the document.""" 304 fetchPrevious = findAllPrevious
307 """Returns the closest sibling to this Tag that matches the 308 given criteria and appears before this Tag in the document.""" 313 limit=None, **kwargs):
314 """Returns the siblings of this Tag that match the given 315 criteria and appear before this Tag in the document.""" 316 return self.
_findAll(name, attrs, text, limit,
318 fetchPreviousSiblings = findPreviousSiblings
321 """Returns the closest parent of this Tag that matches the given 332 """Returns the parents of this Tag that match the given 337 fetchParents = findParents
341 def _findOne(self, method, name, attrs, text, **kwargs):
343 l =
method(name, attrs, text, 1, **kwargs)
348 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
349 "Iterates over a generator looking for things that match." 351 if isinstance(name, SoupStrainer):
354 elif text
is None and not limit
and not attrs
and not kwargs:
357 return [element
for element
in generator()
358 if isinstance(element, Tag)]
360 elif isinstance(name, basestring):
361 return [element
for element
in generator()
362 if isinstance(element, Tag)
and 363 element.name == name]
374 except StopIteration:
377 found = strainer.search(i)
379 results.append(found)
380 if limit
and len(results) >= limit:
407 i = i.previousSibling
418 encoding = encoding
or "utf-8" 419 return str.replace(
"%SOUP-ENCODING%", encoding)
422 """Encodes an object to a string in some encoding, or to Unicode. 424 if isinstance(s, unicode):
426 s = s.encode(encoding)
427 elif isinstance(s, str):
429 s = s.encode(encoding)
439 BARE_AMPERSAND_OR_BRACKET = re.compile(
"([<>]|" 440 +
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 444 """Used with a regular expression to substitute the 445 appropriate XML entity for an XML special character.""" 452 """Create a new NavigableString. 454 When unpickling a NavigableString, this method is called with 455 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 456 passed in to the superclass's __new__ or the superclass won't know 457 how to handle non-ASCII characters. 459 if isinstance(value, unicode):
460 return unicode.__new__(cls, value)
461 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
464 return (NavigableString.__str__(self),)
467 """text.string gives you text. This is for backwards 468 compatibility for Navigable*String, but for CData* it lets you 469 get the string without the CData wrapper.""" 473 raise AttributeError,
"'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
476 return str(self).
decode(DEFAULT_OUTPUT_ENCODING)
478 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
482 return data.encode(encoding)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this node as a CDATA section in the given encoding.

    The underlying text is produced by NavigableString.__str__ and
    then wrapped in the literal CDATA delimiters.
    """
    inner = NavigableString.__str__(self, encoding)
    return "<![CDATA[%s]]>" % inner
492 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
494 if "%SOUP-ENCODING%" in output:
496 return "<?%s?>" % self.
toEncoding(output, encoding)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this node as an HTML/XML comment in the given encoding."""
    body = NavigableString.__str__(self, encoding)
    return "<!--%s-->" % body
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this node as an SGML declaration, bracketed by <! and >."""
    decl = NavigableString.__str__(self, encoding)
    return "<!%s>" % decl
508 """Represents a found HTML tag with its attributes and contents.""" 511 """Used in a call to re.sub to replace HTML, XML, and numeric 512 entities with the appropriate Unicode characters. If HTML 513 entities are being converted, any unrecognized entities are 517 return unichr(name2codepoint[x])
523 elif len(x) > 0
and x[0] ==
'#':
525 if len(x) > 1
and x[1] ==
'x':
526 return unichr(
int(x[2:], 16))
528 return unichr(
int(x[1:]))
531 return u'&%s;' % x
535 def __init__(self, parser, name, attrs=None, parent=None,
546 elif isinstance(attrs, dict):
547 attrs = attrs.items()
550 self.
setup(parent, previous)
558 convert = lambda(k, val): (k,
559 re.sub(
"&(#\d+|#x[0-9a-fA-F]+|\w+);",
566 and isinstance(self.
contents[0], NavigableString)):
570 """Replace the contents of the tag with a string""" 574 string = property(getString, setString)
582 while current
is not stopNode:
583 if isinstance(current, NavigableString):
584 strings.append(current.strip())
585 current = current.next
586 return separator.join(strings)
588 text = property(getText)
590 def get(self, key, default=None):
591 """Returns the value of the 'key' attribute for the tag, or 592 the value given for 'default' if it doesn't have that 597 """Extract all children.""" 602 for i, child
in enumerate(self.
contents):
605 raise ValueError(
"Tag.index: element not in tag")
611 """tag[key] returns the value of the 'key' attribute for the tag, 612 and throws an exception if it's not there.""" 616 "Iterating over a tag iterates over its contents." 620 "The length of a tag is the length of its list of contents." 627 "A tag is non-None even if it has no contents." 631 """Setting tag[key] sets the value of the 'key' attribute for the 637 if self.
attrs[i][0] == key:
638 self.
attrs[i] = (key, value)
645 "Deleting tag[key] deletes all 'key' attributes for the tag." 646 for item
in self.
attrs:
656 """Calling a tag like a function is the same as calling its 657 findAll() method. Eg. tag('a') returns a list of all the A tags 658 found within this tag.""" 663 if len(tag) > 3
and tag.rfind(
'Tag') == len(tag)-3:
664 return self.
find(tag[:-3])
665 elif tag.find(
'__') != 0:
666 return self.
find(tag)
667 raise AttributeError,
"'%s' object has no attribute '%s'" % (self.__class__, tag)
670 """Returns true iff this tag has the same name, the same attributes, 671 and the same contents (recursively) as the given tag. 673 NOTE: right now this will return false if two tags have the 674 same attributes in a different order. Should this be fixed?""" 677 if not hasattr(other,
'name')
or not hasattr(other,
'attrs')
or not hasattr(other,
'contents')
or self.
name != other.name
or self.
attrs != other.attrs
or len(self) != len(other):
680 if self.
contents[i] != other.contents[i]:
685 """Returns true iff this tag is not identical to the other tag, 686 as defined in __eq__.""" 687 return not self == other
689 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
690 """Renders this tag as a string.""" 696 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
697 prettyPrint=False, indentLevel=0):
698 """Returns a string or Unicode representation of this tag and 699 its contents. To get Unicode, pass None for encoding. 701 NOTE: since Python's HTML parser consumes whitespace, this 702 method is not certain to reproduce the whitespace present in 703 the original string.""" 709 for key, val
in self.
attrs:
711 if isinstance(val, basestring):
735 val = val.replace(
"'",
"&squot;")
743 attrs.append(fmt % (self.
toEncoding(key, encoding),
750 closeTag =
'</%s>' % encodedName
752 indentTag, indentContents = 0, 0
754 indentTag = indentLevel
755 space = (
' ' * (indentTag-1))
756 indentContents = indentTag + 1
757 contents = self.
renderContents(encoding, prettyPrint, indentContents)
764 attributeString =
' ' +
' '.
join(attrs)
767 s.append(
'<%s%s%s>' % (encodedName, attributeString, close))
771 if prettyPrint
and contents
and contents[-1] !=
"\n":
773 if prettyPrint
and closeTag:
782 """Recursively destroys the contents of this tree.""" 787 while current
is not None:
789 if isinstance(current, Tag):
790 del current.contents[:]
791 current.parent =
None 792 current.previous =
None 793 current.previousSibling =
None 795 current.nextSibling =
None 798 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
799 return self.
__str__(encoding,
True)
802 prettyPrint=False, indentLevel=0):
803 """Renders the contents of this tag as a string in the given 804 encoding. If encoding is None, returns a Unicode string..""" 808 if isinstance(c, NavigableString):
809 text = c.__str__(encoding)
810 elif isinstance(c, Tag):
811 s.append(c.__str__(encoding, prettyPrint, indentLevel))
812 if text
and prettyPrint:
816 s.append(
" " * (indentLevel-1))
824 def find(self, name=None, attrs={}, recursive=True, text=None,
826 """Return only the first child of this Tag matching the given 829 l = self.
findAll(name, attrs, recursive, text, 1, **kwargs)
835 def findAll(self, name=None, attrs={}, recursive=True, text=None,
836 limit=None, **kwargs):
837 """Extracts a list of Tag objects that match the given 838 criteria. You can specify the name of the Tag and any 839 attributes you want the Tag to have. 841 The value of a key-value pair in the 'attrs' map can be a 842 string, a list of strings, a regular expression object, or a 843 callable that takes a string and returns whether or not the 844 string matches for some custom definition of 'matches'. The 845 same is true of the tag name.""" 849 return self.
_findAll(name, attrs, text, limit, generator, **kwargs)
850 findChildren = findAll
def fetchText(self, text=None, recursive=True, limit=None):
    """Old-style alias matching the other fetch* names: gather the
    text elements matching `text` by delegating straight to findAll()."""
    search = dict(text=text, recursive=recursive, limit=limit)
    return self.findAll(**search)
860 return self.
find(text=text, recursive=recursive)
865 """Initializes a map representation of this tag's attributes, 866 if not already initialized.""" 867 if not getattr(self,
'attrMap'):
869 for (key, value)
in self.
attrs:
883 while current
is not stopNode:
885 current = current.next
890 """Encapsulates a number of ways of matching a markup element (tag or 893 def __init__(self, name=None, attrs={}, text=None, **kwargs):
895 if isinstance(attrs, basestring):
916 if isinstance(markupName, Tag):
919 callFunctionWithTagData = callable(self.
name) \
920 and not isinstance(markupName, Tag)
923 or callFunctionWithTagData \
925 or (
not markup
and self.
_matches(markupName, self.
name)):
926 if callFunctionWithTagData:
927 match = self.
name(markupName, markupAttrs)
932 if not markupAttrMap:
933 if hasattr(markupAttrs,
'get'):
934 markupAttrMap = markupAttrs
937 for k,v
in markupAttrs:
939 attrValue = markupAttrMap.get(attr)
940 if not self.
_matches(attrValue, matchAgainst):
955 if hasattr(markup,
"__iter__") \
956 and not isinstance(markup, Tag):
957 for element
in markup:
958 if isinstance(element, NavigableString) \
964 elif isinstance(markup, Tag):
968 elif isinstance(markup, NavigableString)
or \
969 isinstance(markup, basestring):
973 raise Exception,
"I don't know how to match against a %s" \
980 if matchAgainst
is True:
981 result = markup
is not None 982 elif callable(matchAgainst):
983 result = matchAgainst(markup)
987 if isinstance(markup, Tag):
989 if markup
and not isinstance(markup, basestring):
992 if hasattr(matchAgainst,
'match'):
994 result = markup
and matchAgainst.search(markup)
995 elif hasattr(matchAgainst,
'__iter__'):
996 result = markup
in matchAgainst
997 elif hasattr(matchAgainst,
'items'):
998 result = markup.has_key(matchAgainst)
999 elif matchAgainst
and isinstance(markup, basestring):
1000 if isinstance(markup, unicode):
1001 matchAgainst =
unicode(matchAgainst)
1003 matchAgainst =
str(matchAgainst)
1006 result = matchAgainst == markup
1010 """A ResultSet is just a list that keeps track of the SoupStrainer 1019 """Turns a list of maps, lists, or scalars into a single map. 1020 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and 1021 NESTING_RESET_TAGS maps out of lists and partial maps.""" 1023 for portion
in args:
1024 if hasattr(portion,
'items'):
1026 for k,v
in portion.items():
1028 elif hasattr(portion,
'__iter__'):
1034 built[portion] = default
1041 """This class contains the basic parser and search code. It defines 1042 a parser that knows nothing about tag behavior except for the 1045 You can't close a tag without closing all the tags it encloses. 1046 That is, "<foo><bar></foo>" actually means 1047 "<foo><bar></bar></foo>". 1049 [Another possible explanation is "<foo><bar /></foo>", but since 1050 this class defines no SELF_CLOSING_TAGS, it will never use that 1053 This class is useful for parsing XML or made-up markup languages, 1054 or when BeautifulSoup makes an assumption counter to what you were 1057 SELF_CLOSING_TAGS = {}
1059 RESET_NESTING_TAGS = {}
1061 PRESERVE_WHITESPACE_TAGS = []
1063 MARKUP_MASSAGE = [(re.compile(
'(<[^<>]*)/>'),
1064 lambda x: x.group(1) +
' />'),
1065 (re.compile(
'<!\s+([^<>]*)>'),
1066 lambda x:
'<!' + x.group(1) +
'>')
1069 ROOT_TAG_NAME =
u'[document]' 1071 HTML_ENTITIES =
"html" 1072 XML_ENTITIES =
"xml" 1073 XHTML_ENTITIES =
"xhtml" 1075 ALL_ENTITIES = XHTML_ENTITIES
1081 STRIP_ASCII_SPACES = { 9:
None, 10:
None, 12:
None, 13:
None, 32:
None, }
1083 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1084 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1085 convertEntities=None, selfClosingTags=None, isHTML=False):
1086 """The Soup object is initialized as the 'root tag', and the 1087 provided markup (which can be a string or a file-like object) 1088 is fed into the underlying parser. 1090 sgmllib will process most bad HTML, and the BeautifulSoup 1091 class has some tricks for dealing with some HTML that kills 1092 sgmllib, but Beautiful Soup can nonetheless choke or lose data 1093 if your data uses self-closing tags or declarations 1096 By default, Beautiful Soup uses regexes to sanitize input, 1097 avoiding the vast majority of these problems. If the problems 1098 don't apply to you, pass in False for markupMassage, and 1099 you'll get better performance. 1101 The default parser massage techniques fix the two most common 1102 instances of invalid HTML that choke sgmllib: 1104 <br/> (No space between name of closing tag and tag close) 1105 <! --Comment--> (Extraneous whitespace in declaration) 1107 You can pass in a custom list of (RE object, replace method) 1108 tuples to get Beautiful Soup to scrub your input the way you 1140 SGMLParser.__init__(self)
1142 if hasattr(markup,
'read'):
1143 markup = markup.read()
1147 self.
_feed(isHTML=isHTML)
1153 """This method fixes a bug in Python's SGMLParser.""" 1158 if not 0 <= n <= 127 :
1160 return self.convert_codepoint(n)
1162 def _feed(self, inDocumentEncoding=None, isHTML=False):
1165 if isinstance(markup, unicode):
1166 if not hasattr(self,
'originalEncoding'):
1169 dammit = UnicodeDammit\
1172 markup = dammit.unicode
1180 markup = fix.sub(m, markup)
1189 SGMLParser.feed(self, markup)
1196 """This method routes method call requests to either the SGMLParser 1197 superclass or the Tag superclass, depending on the method name.""" 1200 if methodName.startswith(
'start_')
or methodName.startswith(
'end_') \
1201 or methodName.startswith(
'do_'):
1202 return SGMLParser.__getattr__(self, methodName)
1203 elif not methodName.startswith(
'__'):
1204 return Tag.__getattr__(self, methodName)
1206 raise AttributeError
1209 """Returns true iff the given string is the name of a 1210 self-closing tag according to this parser.""" 1217 SGMLParser.reset(self)
1239 def endData(self, containerClass=NavigableString):
1245 if '\n' in currentData:
1254 o = containerClass(currentData)
1263 """Pops the tag stack up to and including the most recent 1264 instance of the given tag. If inclusivePop is false, pops the tag 1265 stack up to but *not* including the most recent instance of 1272 mostRecentTag =
None 1277 if not inclusivePop:
1278 numPops = numPops - 1
1280 for i
in range(0, numPops):
1281 mostRecentTag = self.
popTag()
1282 return mostRecentTag
1286 """We need to pop up to the previous tag of this type, unless 1287 one of this tag's nesting reset triggers comes between this 1288 tag and the previous tag of this type, OR unless this tag is a 1289 generic nesting trigger and another generic nesting trigger 1290 comes between this tag and the previous tag of this type. 1293 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. 1294 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. 1295 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. 1297 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. 1298 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' 1299 <td><tr><td> *<td>* should pop to 'tr', not the first 'td' 1303 isNestable = nestingResetTriggers !=
None 1309 if (
not p
or p.name == name)
and not isNestable:
1314 if (nestingResetTriggers
is not None 1315 and p.name
in nestingResetTriggers) \
1316 or (nestingResetTriggers
is None and isResetNesting
1335 attrs =
''.
join([
' %s="%s"' % (x, y)
for x, y
in attrs])
1377 """Adds a certain piece of text to the tree as a NavigableString 1384 """Handle a processing instruction as a ProcessingInstruction 1385 object, possibly one with a %SOUP-ENCODING% slot into which an 1386 encoding will be plugged later.""" 1387 if text[:3] ==
"xml":
1388 text =
u"xml version='1.0' encoding='%SOUP-ENCODING%'" 1392 "Handle comments as Comment objects." 1396 "Handle character references as data." 1398 data = unichr(
int(ref))
1400 data =
'&#%s;' % ref
1404 """Handle entity references as data, possibly converting known 1405 HTML and/or XML entity references to the corresponding Unicode 1410 data = unichr(name2codepoint[ref])
1435 data =
"&%s" % ref
1447 "Handle DOCTYPEs and the like as Declaration objects." 1451 """Treat a bogus SGML declaration as raw data. Treat a CDATA 1452 declaration as a CData object.""" 1454 if self.rawdata[i:i+9] ==
'<![CDATA[':
1455 k = self.rawdata.
find(
']]>', i)
1457 k = len(self.rawdata)
1458 data = self.rawdata[i+9:k]
1463 j = SGMLParser.parse_declaration(self, i)
1464 except SGMLParseError:
1465 toHandle = self.rawdata[i:]
1467 j = i + len(toHandle)
1472 """This parser knows the following facts about HTML: 1474 * Some tags have no closing tag and should be interpreted as being 1475 closed as soon as they are encountered. 1477 * The text inside some tags (ie. 'script') may contain tags which 1478 are not really part of the document and which should be parsed 1479 as text, not tags. If you want to parse the text as tags, you can 1480 always fetch it and parse it explicitly. 1482 * Tag nesting rules: 1484 Most tags can't be nested at all. For instance, the occurance of 1485 a <p> tag should implicitly close the previous <p> tag. 1488 should be transformed into: 1489 <p>Para1</p><p>Para2 1491 Some tags can be nested arbitrarily. For instance, the occurance 1492 of a <blockquote> tag should _not_ implicitly close the previous 1495 Alice said: <blockquote>Bob said: <blockquote>Blah 1496 should NOT be transformed into: 1497 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah 1499 Some tags can be nested, but the nesting is reset by the 1500 interposition of other tags. For instance, a <tr> tag should 1501 implicitly close the previous <tr> tag within the same <table>, 1502 but not close a <tr> tag in another table. 1504 <table><tr>Blah<tr>Blah 1505 should be transformed into: 1506 <table><tr>Blah</tr><tr>Blah 1508 <tr>Blah<table><tr>Blah 1509 should NOT be transformed into 1510 <tr>Blah<table></tr><tr>Blah 1512 Differing assumptions about tag nesting rules are a major source 1513 of problems with the BeautifulSoup class. If BeautifulSoup is not 1514 treating as nestable a tag your page author treats as nestable, 1515 try ICantBelieveItsBeautifulSoup, MinimalSoup, or 1516 BeautifulStoneSoup before writing your own subclass.""" 1519 if not kwargs.has_key(
'smartQuotesTo'):
1521 kwargs[
'isHTML'] =
True 1522 BeautifulStoneSoup.__init__(self, *args, **kwargs)
1525 (
'br' ,
'hr',
'input',
'img',
'meta',
1526 'spacer',
'link',
'frame',
'base',
'col'))
1528 PRESERVE_WHITESPACE_TAGS = set([
'pre',
'textarea'])
1530 QUOTE_TAGS = {
'script' :
None,
'textarea' :
None}
1535 NESTABLE_INLINE_TAGS = (
'span',
'font',
'q',
'object',
'bdo',
'sub',
'sup',
1541 NESTABLE_BLOCK_TAGS = (
'blockquote',
'div',
'fieldset',
'ins',
'del')
1544 NESTABLE_LIST_TAGS = {
'ol' : [],
1546 'li' : [
'ul',
'ol'],
1552 NESTABLE_TABLE_TAGS = {
'table' : [],
1553 'tr' : [
'table',
'tbody',
'tfoot',
'thead'],
1556 'thead' : [
'table'],
1557 'tbody' : [
'table'],
1558 'tfoot' : [
'table'],
1561 NON_NESTABLE_BLOCK_TAGS = (
'address',
'form',
'p',
'pre')
1565 RESET_NESTING_TAGS =
buildTagMap(
None, NESTABLE_BLOCK_TAGS,
'noscript',
1566 NON_NESTABLE_BLOCK_TAGS,
1568 NESTABLE_TABLE_TAGS)
1570 NESTABLE_TAGS =
buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1571 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1574 CHARSET_RE = re.compile(
"((^|;)\s*charset=)([^;]*)", re.M)
1577 """Beautiful Soup can detect a charset included in a META tag, 1578 try to convert the document to that charset, and re-parse the 1579 document from the beginning.""" 1582 contentTypeIndex =
None 1583 tagNeedsEncodingSubstitution =
False 1585 for i
in range(0, len(attrs)):
1586 key, value = attrs[i]
1588 if key ==
'http-equiv':
1590 elif key ==
'content':
1592 contentTypeIndex = i
1594 if httpEquiv
and contentType:
1605 return match.group(1) +
"%SOUP-ENCODING%" 1606 newAttr = self.
CHARSET_RE.sub(rewrite, contentType)
1607 attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1609 tagNeedsEncodingSubstitution =
True 1613 newCharset = match.group(3)
1620 if tag
and tagNeedsEncodingSubstitution:
1621 tag.containsSubstitutions =
True 1628 """The BeautifulSoup class is oriented towards skipping over 1629 common HTML errors like unclosed tags. However, sometimes it makes 1630 errors of its own. For instance, consider this fragment: 1632 <b>Foo<b>Bar</b></b> 1634 This is perfectly valid (if bizarre) HTML. However, the 1635 BeautifulSoup class will implicitly close the first b tag when it 1636 encounters the second 'b'. It will think the author wrote 1637 "<b>Foo<b>Bar", and didn't close the first 'b' tag, because 1638 there's no real-world reason to bold something that's already 1639 bold. When it encounters '</b></b>' it will close two more 'b' 1640 tags, for a grand total of three tags closed instead of two. This 1641 can throw off the rest of your document structure. The same is 1642 true of a number of other tags, listed below. 1644 It's much more common for someone to forget to close a 'b' tag 1645 than to actually use nested 'b' tags, and the BeautifulSoup class 1646 handles the common case. This class handles the not-so-common 1647 case: where you can't believe someone wrote what they did, but 1648 it's valid HTML and BeautifulSoup screwed up by assuming it 1651 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1652 (
'em',
'big',
'i',
'small',
'tt',
'abbr',
'acronym',
'strong',
1653 'cite',
'code',
'dfn',
'kbd',
'samp',
'strong',
'var',
'b',
1656 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = (
'noscript',)
1659 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1660 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1663 """The MinimalSoup class is for parsing HTML that contains 1664 pathologically bad markup. It makes no assumptions about tag 1665 nesting, but it does know which tags are self-closing, that 1666 <script> tags contain Javascript and should not be parsed, that 1667 META tags may contain encoding information, and so on. 1669 This also makes it better for subclassing than BeautifulStoneSoup 1670 or BeautifulSoup.""" 1676 """This class will push a tag with only a single string child into 1677 the tag's parent as an attribute. The attribute's name is the tag 1678 name, and the value is the string child. An example should give 1679 the flavor of the change: 1681 <foo><bar>baz</bar></foo> 1683 <foo bar="baz"><bar>baz</bar></foo> 1685 You can then access fooTag['bar'] instead of fooTag.barTag.string. 1687 This is, of course, useful for scraping structures that tend to 1688 use subelements instead of attributes, such as SOAP messages. Note 1689 that it modifies its input, so don't print the modified version 1692 I'm not sure how many people really want to use this class; let me 1693 know if you do. Mainly I like the name.""" 1699 parent._getAttrMap()
1700 if (isinstance(tag, Tag)
and len(tag.contents) == 1
and 1701 isinstance(tag.contents[0], NavigableString)
and 1702 not parent.attrMap.has_key(tag.name)):
1703 parent[tag.name] = tag.contents[0]
1704 BeautifulStoneSoup.popTag(self)
1748 import cjkcodecs.aliases
1757 """A class for detecting the encoding of a *ML document and 1758 converting it to a Unicode string. If the source encoding is 1759 windows-1252, can replace MS smart quotes with their HTML or XML 1766 CHARSET_ALIASES = {
"macintosh" :
"mac-roman",
1767 "x-sjis" :
"shift-jis" }
1769 def __init__(self, markup, overrideEncodings=[],
1770 smartQuotesTo='xml', isHTML=False):
1772 self.
markup, documentEncoding, sniffedEncoding = \
1776 if markup ==
'' or isinstance(markup, unicode):
1782 for proposedEncoding
in overrideEncodings:
1786 for proposedEncoding
in (documentEncoding, sniffedEncoding):
1791 if not u
and chardet
and not isinstance(self.
markup, unicode):
1796 for proposed_encoding
in (
"utf-8",
"windows-1252"):
def _subMSChar(self, orig):
    """Replace one MS smart-quote character with an XML or HTML entity
    (or with the plain-string fallback recorded in MS_CHARS)."""
    replacement = self.MS_CHARS.get(orig)
    if isinstance(replacement, tuple):
        name, codepoint = replacement
        if self.smartQuotesTo == 'xml':
            replacement = '&#x%s;' % codepoint
        else:
            replacement = '&%s;' % name
    return replacement
def _convertFrom(self, proposed):
    """Try to decode self.markup with the proposed encoding; return the
    Unicode result on success, None on failure.

    NOTE(review): only two lines of this method survived in this copy;
    the rest is restored from the upstream BeautifulSoup 3 source -- verify.
    """
    proposed = self.find_codec(proposed)
    if not proposed or proposed in self.triedEncodings:
        return None
    self.triedEncodings.append(proposed)
    markup = self.markup

    # These encodings can carry MS smart quotes; rewrite them as
    # entities before decoding so they survive the transcoding.
    if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                   "iso-8859-1",
                                                   "iso-8859-2"):
        markup = re.compile("([\x80-\x9f])").sub(
            lambda match: self._subMSChar(match.group(1)), markup)

    try:
        u = self._toUnicode(markup, proposed)
        self.markup = u
        self.originalEncoding = proposed
    except Exception:
        # Wrong guess -- let the caller try the next candidate.
        return None
    return self.markup
def _toUnicode(self, data, encoding):
    '''Given a string and its encoding, decodes the string into Unicode.
    %encoding is a string recognized by encodings.aliases'''

    # Strip a Byte Order Mark if present; where the BOM is unambiguous
    # it also overrides the caller's encoding.
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
           and (data[2:4] != '\x00\x00'):
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
             and (data[2:4] != '\x00\x00'):
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        encoding = 'utf-32le'
        data = data[4:]
    return unicode(data, encoding)
def _detectEncoding(self, xml_data, isHTML=False):
    """Given a document, tries to detect its XML encoding.

    Returns (xml_data, declared_encoding, sniffed_encoding); the data may
    have been transcoded to UTF-8 when a UTF-16/32 pattern was sniffed.

    NOTE(review): elided lines in this copy were restored from the
    upstream BeautifulSoup 3 source -- verify against a pristine copy.
    """
    xml_encoding = sniffed_xml_encoding = None
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE, no BOM ("<?" pattern)
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                 and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE, no BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                 (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE, no BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE, no BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            sniffed_xml_encoding = 'ascii'
    except:
        # Sniffing blew up; fall through and scan the raw bytes anyway.
        xml_encoding_match = None
    # Look for an explicit declaration in the XML prolog...
    xml_encoding_match = re.compile(
        '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    if not xml_encoding_match and isHTML:
        # ...or, for HTML, in a META tag.
        regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
        xml_encoding_match = regexp.search(xml_data)
    if xml_encoding_match is not None:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        if isHTML:
            self.declaredHTMLEncoding = xml_encoding
    # A concretely sniffed byte pattern beats a generic declared family.
    if sniffed_xml_encoding and \
       (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                         'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                         'utf-16', 'utf-32', 'utf_16', 'utf_32',
                         'utf16', 'u16')):
        xml_encoding = sniffed_xml_encoding
    return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
    """Map a declared charset name onto a codec name Python actually has.

    Tries, in order: the CHARSET_ALIASES translation (or the name
    itself), the name with '-' removed, the name with '-' replaced by
    '_'; falls back to returning the name unchanged.
    """
    found = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not found and charset:
        found = self._codec(charset.replace("-", ""))
    if not found and charset:
        found = self._codec(charset.replace("-", "_"))
    return found or charset
def _codec(self, charset):
    """Return charset unchanged if Python has a codec for it, else None
    (falsy charset values are passed straight through)."""
    if not charset:
        return charset
    try:
        codecs.lookup(charset)
    except (LookupError, ValueError):
        return None
    return charset
# Translation table from EBCDIC bytes to ASCII, built lazily on first
# use and cached on the class so all instances share it.
EBCDIC_TO_ASCII_MAP = None

def _ebcdic_to_ascii(self, s):
    """Translate an EBCDIC byte string into its ASCII equivalent."""
    cls = self.__class__
    if not cls.EBCDIC_TO_ASCII_MAP:
        emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                250,251,252,253,254,255)
        import string
        # NOTE(review): the maketrans argument line was elided in this
        # copy; restored per the upstream BeautifulSoup 3 source -- verify.
        cls.EBCDIC_TO_ASCII_MAP = string.maketrans(
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(cls.EBCDIC_TO_ASCII_MAP)
# Windows-1252 "smart" characters (bytes 0x80-0x9f), each mapped to an
# (entity-name, hex-codepoint) pair, or to a plain replacement string
# where no entity exists.
# NOTE(review): the '\x81'/'\x8D'/'\x8F'/'\x90'/'\x9D' filler entries were
# elided in this copy (their line numbers are skipped); restored from the
# upstream BeautifulSoup 3 source -- verify.
MS_CHARS = {'\x80': ('euro', '20AC'),
            '\x81': ' ',
            '\x82': ('sbquo', '201A'),
            '\x83': ('fnof', '192'),
            '\x84': ('bdquo', '201E'),
            '\x85': ('hellip', '2026'),
            '\x86': ('dagger', '2020'),
            '\x87': ('Dagger', '2021'),
            '\x88': ('circ', '2C6'),
            '\x89': ('permil', '2030'),
            '\x8A': ('Scaron', '160'),
            '\x8B': ('lsaquo', '2039'),
            '\x8C': ('OElig', '152'),
            '\x8D': '?',
            '\x8E': ('#x17D', '17D'),
            '\x8F': '?',
            '\x90': '?',
            '\x91': ('lsquo', '2018'),
            '\x92': ('rsquo', '2019'),
            '\x93': ('ldquo', '201C'),
            '\x94': ('rdquo', '201D'),
            '\x95': ('bull', '2022'),
            '\x96': ('ndash', '2013'),
            '\x97': ('mdash', '2014'),
            '\x98': ('tilde', '2DC'),
            '\x99': ('trade', '2122'),
            '\x9a': ('scaron', '161'),
            '\x9b': ('rsaquo', '203A'),
            '\x9c': ('oelig', '153'),
            '\x9d': '?',
            '\x9e': ('#x17E', '17E'),
            '\x9f': ('Yuml', '')}
# By default, act as an HTML pretty-printer: read markup from standard
# input and write the prettified parse tree to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())
def unknown_endtag(self, name)
def fetchText(self, text=None, recursive=True, limit=None)
def get(self, key, default=None)
def _convertFrom(self, proposed)
def _findOne(self, method, name, attrs, text, kwargs)
def findNext(self, name=None, attrs={}, text=None, kwargs)
def findPrevious(self, name=None, attrs={}, text=None, kwargs)
def _toStringSubclass(self, text, subclass)
def toEncoding(self, s, encoding=None)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
def handle_pi(self, text)
def _matches(self, markup, matchAgainst)
def _popToTag(self, name, inclusivePop=True)
def _codec(self, charset)
def findPreviousSibling(self, name=None, attrs={}, text=None, kwargs)
def __getattr__(self, methodName)
def unknown_starttag(self, name, attrs, selfClosing=0)
def setString(self, string)
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0)
def handle_decl(self, data)
def getText(self, separator=u"")
def isSelfClosingTag(self, name)
def _toUnicode(self, data, encoding)
def parse_declaration(self, i)
BARE_AMPERSAND_OR_BRACKET
def _subMSChar(self, orig)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def _smartPop(self, name)
XML_ENTITIES_TO_SPECIAL_CHARS
def __init__(self, args, kwargs)
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING)
def __init__(self, parser, name, attrs=None, parent=None, previous=None)
def recursiveChildGenerator(self)
def findParents(self, name=None, attrs={}, limit=None, kwargs)
double intersection(double r12)
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def searchTag(self, markupName=None, markupAttrs={})
def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, kwargs)
def _lastRecursiveChild(self)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0)
def __getattr__(self, tag)
bool decode(bool &, std::string const &)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def buildTagMap(default, args)
def nextSiblingGenerator(self)
def findParent(self, name=None, attrs={}, kwargs)
def _ebcdic_to_ascii(self, s)
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, kwargs)
def _findAll(self, name, attrs, text, limit, generator, kwargs)
def find_codec(self, charset)
def handle_comment(self, text)
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, kwargs)
def __call__(self, args, kwargs)
def handle_data(self, data)
def findNextSibling(self, name=None, attrs={}, text=None, kwargs)
def __getitem__(self, key)
def firstText(self, text=None, recursive=True)
escapeUnrecognizedEntities
def start_meta(self, attrs)
def previousGenerator(self)
def parentGenerator(self)
def setup(self, parent=None, previous=None)
def __init__(self, source)
def __getattr__(self, attr)
static std::string join(char **cmd)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def findAllNext(self, name=None, attrs={}, text=None, limit=None, kwargs)
def __contains__(self, x)
def replaceWithChildren(self)
def remove(d, key, TELL=False)
def _feed(self, inDocumentEncoding=None, isHTML=False)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False)
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, kwargs)
def insert(self, position, newChild)
def find(self, name=None, attrs={}, recursive=True, text=None, kwargs)
def replaceWith(self, replaceWith)
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, convertEntities=None, selfClosingTags=None, isHTML=False)
def previousSiblingGenerator(self)
def __init__(self, name=None, attrs={}, text=None, kwargs)
def handle_entityref(self, ref)
def convert_charref(self, name)
def _detectEncoding(self, xml_data, isHTML=False)
def __setitem__(self, key, value)
def handle_charref(self, ref)
def __delitem__(self, key)
def _match_css_class(str)
def _convertEntities(self, match)
def substituteEncoding(self, str, encoding=None)
XML_SPECIAL_CHARS_TO_ENTITIES
def endData(self, containerClass=NavigableString)