3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2010, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79 from __future__
import generators
81 __author__ =
"Leonard Richardson (leonardr@segfault.org)"
83 __copyright__ =
"Copyright (c) 2004-2012 Leonard Richardson"
84 __license__ =
"New-style BSD"
86 from sgmllib
import SGMLParser, SGMLParseError
93 from htmlentitydefs
import name2codepoint
99 from sets
import Set
as set
102 sgmllib.tagfind = re.compile(
'[a-zA-Z][-_.:a-zA-Z0-9]*')
103 markupbase._declname_match = re.compile(
r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
105 DEFAULT_OUTPUT_ENCODING =
"utf-8"
108 """Build a RE to match the given CSS class."""
109 return re.compile(
r"(^|.*\s)%s($|\s)" % str)
114 """Contains the navigational information for some part of the page
115 (either a tag or a piece of text)"""
118 "Cheap function to invert a hash."
120 for k,v
in h.items():
124 XML_ENTITIES_TO_SPECIAL_CHARS = {
"apos" :
"'",
130 XML_SPECIAL_CHARS_TO_ENTITIES =
_invert(XML_ENTITIES_TO_SPECIAL_CHARS)
132 def setup(self, parent=None, previous=None):
133 """Sets up the initial relations between this element and
147 if hasattr(replaceWith,
"parent")\
148 and replaceWith.parent
is self.
parent:
150 index = replaceWith.parent.index(replaceWith)
151 if index
and index < myIndex:
155 myIndex = myIndex - 1
157 oldParent.insert(myIndex, replaceWith)
163 reversedChildren = list(self.contents)
164 reversedChildren.reverse()
165 for child
in reversedChildren:
166 myParent.insert(myIndex, child)
169 """Destructively rips this element out of the tree."""
180 nextElement = lastChild.next
185 nextElement.previous = self.
previous
187 lastChild.next =
None
198 "Finds the last element beneath this object to be parsed."
200 while hasattr(lastChild,
'contents')
and lastChild.contents:
201 lastChild = lastChild.contents[-1]
205 if isinstance(newChild, basestring) \
206 and not isinstance(newChild, NavigableString):
209 position =
min(position, len(self.contents))
210 if hasattr(newChild,
'parent')
and newChild.parent
is not None:
213 if newChild.parent
is self:
214 index = self.index(newChild)
220 position = position - 1
223 newChild.parent = self
226 newChild.previousSibling =
None
227 newChild.previous = self
229 previousChild = self.contents[position-1]
230 newChild.previousSibling = previousChild
231 newChild.previousSibling.nextSibling = newChild
232 newChild.previous = previousChild._lastRecursiveChild()
233 if newChild.previous:
234 newChild.previous.next = newChild
236 newChildsLastElement = newChild._lastRecursiveChild()
238 if position >= len(self.contents):
239 newChild.nextSibling =
None
242 parentsNextSibling =
None
243 while not parentsNextSibling:
244 parentsNextSibling = parent.nextSibling
245 parent = parent.parent
248 if parentsNextSibling:
249 newChildsLastElement.next = parentsNextSibling
251 newChildsLastElement.next =
None
253 nextChild = self.contents[position]
254 newChild.nextSibling = nextChild
255 if newChild.nextSibling:
256 newChild.nextSibling.previousSibling = newChild
257 newChildsLastElement.next = nextChild
259 if newChildsLastElement.next:
260 newChildsLastElement.next.previous = newChildsLastElement
261 self.contents.
insert(position, newChild)
264 """Appends the given tag to the contents of this tag."""
265 self.
insert(len(self.contents), tag)
267 def findNext(self, name=None, attrs={}, text=None, **kwargs):
268 """Returns the first item that matches the given criteria and
269 appears after this Tag in the document."""
274 """Returns all items that match the given criteria and appear
275 after this Tag in the document."""
280 """Returns the closest sibling to this Tag that matches the
281 given criteria and appears after this Tag in the document."""
287 """Returns the siblings of this Tag that match the given
288 criteria and appear after this Tag in the document."""
289 return self.
_findAll(name, attrs, text, limit,
291 fetchNextSiblings = findNextSiblings
294 """Returns the first item that matches the given criteria and
295 appears before this Tag in the document."""
300 """Returns all items that match the given criteria and appear
301 before this Tag in the document."""
304 fetchPrevious = findAllPrevious
307 """Returns the closest sibling to this Tag that matches the
308 given criteria and appears before this Tag in the document."""
313 limit=None, **kwargs):
314 """Returns the siblings of this Tag that match the given
315 criteria and appear before this Tag in the document."""
316 return self.
_findAll(name, attrs, text, limit,
318 fetchPreviousSiblings = findPreviousSiblings
321 """Returns the closest parent of this Tag that matches the given
332 """Returns the parents of this Tag that match the given
337 fetchParents = findParents
341 def _findOne(self, method, name, attrs, text, **kwargs):
343 l =
method(name, attrs, text, 1, **kwargs)
348 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
349 "Iterates over a generator looking for things that match."
351 if isinstance(name, SoupStrainer):
354 elif text
is None and not limit
and not attrs
and not kwargs:
357 return [element
for element
in generator()
358 if isinstance(element, Tag)]
360 elif isinstance(name, basestring):
361 return [element
for element
in generator()
362 if isinstance(element, Tag)
and
363 element.name == name]
374 except StopIteration:
377 found = strainer.search(i)
379 results.append(found)
380 if limit
and len(results) >= limit:
407 i = i.previousSibling
418 encoding = encoding
or "utf-8"
419 return str.replace(
"%SOUP-ENCODING%", encoding)
422 """Encodes an object to a string in some encoding, or to Unicode.
424 if isinstance(s, unicode):
426 s = s.encode(encoding)
427 elif isinstance(s, str):
429 s = s.encode(encoding)
439 BARE_AMPERSAND_OR_BRACKET = re.compile(
"([<>]|"
440 +
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
444 """Used with a regular expression to substitute the
445 appropriate XML entity for an XML special character."""
452 """Create a new NavigableString.
454 When unpickling a NavigableString, this method is called with
455 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
456 passed in to the superclass's __new__ or the superclass won't know
457 how to handle non-ASCII characters.
459 if isinstance(value, unicode):
460 return unicode.__new__(cls, value)
461 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
464 return (NavigableString.__str__(self),)
467 """text.string gives you text. This is for backwards
468 compatibility for Navigable*String, but for CData* it lets you
469 get the string without the CData wrapper."""
473 raise AttributeError,
"'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
476 return str(self).
decode(DEFAULT_OUTPUT_ENCODING)
478 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
482 return data.encode(encoding)
488 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
489 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
492 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
494 if "%SOUP-ENCODING%" in output:
496 return "<?%s?>" % self.
toEncoding(output, encoding)
499 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
500 return "<!--%s-->" % NavigableString.__str__(self, encoding)
503 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
504 return "<!%s>" % NavigableString.__str__(self, encoding)
508 """Represents a found HTML tag with its attributes and contents."""
511 """Used in a call to re.sub to replace HTML, XML, and numeric
512 entities with the appropriate Unicode characters. If HTML
513 entities are being converted, any unrecognized entities are
517 return unichr(name2codepoint[x])
523 elif len(x) > 0
and x[0] ==
'#':
525 if len(x) > 1
and x[1] ==
'x':
526 return unichr(
int(x[2:], 16))
528 return unichr(
int(x[1:]))
531 return u'&%s;' % x
535 def __init__(self, parser, name, attrs=None, parent=None,
546 elif isinstance(attrs, dict):
547 attrs = attrs.items()
558 convert = lambda(k, val): (k,
559 re.sub(
"&(#\d+|#x[0-9a-fA-F]+|\w+);",
566 and isinstance(self.
contents[0], NavigableString)):
570 """Replace the contents of the tag with a string"""
574 string = property(getString, setString)
582 while current
is not stopNode:
583 if isinstance(current, NavigableString):
584 strings.append(current.strip())
585 current = current.next
586 return separator.join(strings)
588 text = property(getText)
590 def get(self, key, default=None):
591 """Returns the value of the 'key' attribute for the tag, or
592 the value given for 'default' if it doesn't have that
597 """Extract all children."""
602 for i, child
in enumerate(self.
contents):
605 raise ValueError(
"Tag.index: element not in tag")
611 """tag[key] returns the value of the 'key' attribute for the tag,
612 and throws an exception if it's not there."""
616 "Iterating over a tag iterates over its contents."
620 "The length of a tag is the length of its list of contents."
627 "A tag is non-None even if it has no contents."
631 """Setting tag[key] sets the value of the 'key' attribute for the
637 if self.
attrs[i][0] == key:
638 self.
attrs[i] = (key, value)
645 "Deleting tag[key] deletes all 'key' attributes for the tag."
646 for item
in self.
attrs:
656 """Calling a tag like a function is the same as calling its
657 findAll() method. Eg. tag('a') returns a list of all the A tags
658 found within this tag."""
663 if len(tag) > 3
and tag.rfind(
'Tag') == len(tag)-3:
664 return self.
find(tag[:-3])
665 elif tag.find(
'__') != 0:
666 return self.
find(tag)
667 raise AttributeError,
"'%s' object has no attribute '%s'" % (self.__class__, tag)
670 """Returns true iff this tag has the same name, the same attributes,
671 and the same contents (recursively) as the given tag.
673 NOTE: right now this will return false if two tags have the
674 same attributes in a different order. Should this be fixed?"""
677 if not hasattr(other,
'name')
or not hasattr(other,
'attrs')
or not hasattr(other,
'contents')
or self.
name != other.name
or self.
attrs != other.attrs
or len(self) != len(other):
680 if self.
contents[i] != other.contents[i]:
685 """Returns true iff this tag is not identical to the other tag,
686 as defined in __eq__."""
687 return not self == other
689 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
690 """Renders this tag as a string."""
696 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
697 prettyPrint=False, indentLevel=0):
698 """Returns a string or Unicode representation of this tag and
699 its contents. To get Unicode, pass None for encoding.
701 NOTE: since Python's HTML parser consumes whitespace, this
702 method is not certain to reproduce the whitespace present in
703 the original string."""
709 for key, val
in self.
attrs:
711 if isinstance(val, basestring):
735 val = val.replace(
"'",
"&squot;")
743 attrs.append(fmt % (self.
toEncoding(key, encoding),
750 closeTag =
'</%s>' % encodedName
752 indentTag, indentContents = 0, 0
754 indentTag = indentLevel
755 space = (
' ' * (indentTag-1))
756 indentContents = indentTag + 1
757 contents = self.
renderContents(encoding, prettyPrint, indentContents)
764 attributeString =
' ' +
' '.
join(attrs)
767 s.append(
'<%s%s%s>' % (encodedName, attributeString, close))
771 if prettyPrint
and contents
and contents[-1] !=
"\n":
773 if prettyPrint
and closeTag:
782 """Recursively destroys the contents of this tree."""
787 while current
is not None:
789 if isinstance(current, Tag):
790 del current.contents[:]
791 current.parent =
None
792 current.previous =
None
793 current.previousSibling =
None
795 current.nextSibling =
None
798 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
799 return self.
__str__(encoding,
True)
802 prettyPrint=False, indentLevel=0):
803 """Renders the contents of this tag as a string in the given
804 encoding. If encoding is None, returns a Unicode string."""
808 if isinstance(c, NavigableString):
809 text = c.__str__(encoding)
810 elif isinstance(c, Tag):
811 s.append(c.__str__(encoding, prettyPrint, indentLevel))
812 if text
and prettyPrint:
816 s.append(
" " * (indentLevel-1))
824 def find(self, name=None, attrs={}, recursive=True, text=None,
826 """Return only the first child of this Tag matching the given
829 l = self.
findAll(name, attrs, recursive, text, 1, **kwargs)
835 def findAll(self, name=None, attrs={}, recursive=True, text=None,
836 limit=None, **kwargs):
837 """Extracts a list of Tag objects that match the given
838 criteria. You can specify the name of the Tag and any
839 attributes you want the Tag to have.
841 The value of a key-value pair in the 'attrs' map can be a
842 string, a list of strings, a regular expression object, or a
843 callable that takes a string and returns whether or not the
844 string matches for some custom definition of 'matches'. The
845 same is true of the tag name."""
849 return self.
_findAll(name, attrs, text, limit, generator, **kwargs)
850 findChildren = findAll
856 def fetchText(self, text=None, recursive=True, limit=None):
857 return self.
findAll(text=text, recursive=recursive, limit=limit)
860 return self.
find(text=text, recursive=recursive)
865 """Initializes a map representation of this tag's attributes,
866 if not already initialized."""
867 if not getattr(self,
'attrMap'):
869 for (key, value)
in self.
attrs:
883 while current
is not stopNode:
885 current = current.next
890 """Encapsulates a number of ways of matching a markup element (tag or
893 def __init__(self, name=None, attrs={}, text=None, **kwargs):
895 if isinstance(attrs, basestring):
916 if isinstance(markupName, Tag):
919 callFunctionWithTagData = callable(self.
name) \
920 and not isinstance(markupName, Tag)
923 or callFunctionWithTagData \
925 or (
not markup
and self.
_matches(markupName, self.
name)):
926 if callFunctionWithTagData:
927 match = self.
name(markupName, markupAttrs)
932 if not markupAttrMap:
933 if hasattr(markupAttrs,
'get'):
934 markupAttrMap = markupAttrs
937 for k,v
in markupAttrs:
939 attrValue = markupAttrMap.get(attr)
940 if not self.
_matches(attrValue, matchAgainst):
955 if hasattr(markup,
"__iter__") \
956 and not isinstance(markup, Tag):
957 for element
in markup:
958 if isinstance(element, NavigableString) \
964 elif isinstance(markup, Tag):
968 elif isinstance(markup, NavigableString)
or \
969 isinstance(markup, basestring):
973 raise Exception,
"I don't know how to match against a %s" \
980 if matchAgainst
is True:
981 result = markup
is not None
982 elif callable(matchAgainst):
983 result = matchAgainst(markup)
987 if isinstance(markup, Tag):
989 if markup
and not isinstance(markup, basestring):
992 if hasattr(matchAgainst,
'match'):
994 result = markup
and matchAgainst.search(markup)
995 elif hasattr(matchAgainst,
'__iter__'):
996 result = markup
in matchAgainst
997 elif hasattr(matchAgainst,
'items'):
998 result = markup.has_key(matchAgainst)
999 elif matchAgainst
and isinstance(markup, basestring):
1000 if isinstance(markup, unicode):
1001 matchAgainst =
unicode(matchAgainst)
1003 matchAgainst =
str(matchAgainst)
1006 result = matchAgainst == markup
1010 """A ResultSet is just a list that keeps track of the SoupStrainer
1019 """Turns a list of maps, lists, or scalars into a single map.
1020 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1021 NESTING_RESET_TAGS maps out of lists and partial maps."""
1023 for portion
in args:
1024 if hasattr(portion,
'items'):
1026 for k,v
in portion.items():
1028 elif hasattr(portion,
'__iter__'):
1034 built[portion] = default
1041 """This class contains the basic parser and search code. It defines
1042 a parser that knows nothing about tag behavior except for the
1045 You can't close a tag without closing all the tags it encloses.
1046 That is, "<foo><bar></foo>" actually means
1047 "<foo><bar></bar></foo>".
1049 [Another possible explanation is "<foo><bar /></foo>", but since
1050 this class defines no SELF_CLOSING_TAGS, it will never use that
1053 This class is useful for parsing XML or made-up markup languages,
1054 or when BeautifulSoup makes an assumption counter to what you were
1057 SELF_CLOSING_TAGS = {}
1059 RESET_NESTING_TAGS = {}
1061 PRESERVE_WHITESPACE_TAGS = []
1063 MARKUP_MASSAGE = [(re.compile(
'(<[^<>]*)/>'),
1064 lambda x: x.group(1) +
' />'),
1065 (re.compile(
'<!\s+([^<>]*)>'),
1066 lambda x:
'<!' + x.group(1) +
'>')
1069 ROOT_TAG_NAME =
u'[document]'
1071 HTML_ENTITIES =
"html"
1072 XML_ENTITIES =
"xml"
1073 XHTML_ENTITIES =
"xhtml"
1075 ALL_ENTITIES = XHTML_ENTITIES
1081 STRIP_ASCII_SPACES = { 9:
None, 10:
None, 12:
None, 13:
None, 32:
None, }
1083 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1084 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1085 convertEntities=None, selfClosingTags=None, isHTML=False):
1086 """The Soup object is initialized as the 'root tag', and the
1087 provided markup (which can be a string or a file-like object)
1088 is fed into the underlying parser.
1090 sgmllib will process most bad HTML, and the BeautifulSoup
1091 class has some tricks for dealing with some HTML that kills
1092 sgmllib, but Beautiful Soup can nonetheless choke or lose data
1093 if your data uses self-closing tags or declarations
1096 By default, Beautiful Soup uses regexes to sanitize input,
1097 avoiding the vast majority of these problems. If the problems
1098 don't apply to you, pass in False for markupMassage, and
1099 you'll get better performance.
1101 The default parser massage techniques fix the two most common
1102 instances of invalid HTML that choke sgmllib:
1104 <br/> (No space between name of closing tag and tag close)
1105 <! --Comment--> (Extraneous whitespace in declaration)
1107 You can pass in a custom list of (RE object, replace method)
1108 tuples to get Beautiful Soup to scrub your input the way you
1140 SGMLParser.__init__(self)
1142 if hasattr(markup,
'read'):
1143 markup = markup.read()
1147 self.
_feed(isHTML=isHTML)
1153 """This method fixes a bug in Python's SGMLParser."""
1158 if not 0 <= n <= 127 :
1160 return self.convert_codepoint(n)
1162 def _feed(self, inDocumentEncoding=None, isHTML=False):
1165 if isinstance(markup, unicode):
1166 if not hasattr(self,
'originalEncoding'):
1169 dammit = UnicodeDammit\
1172 markup = dammit.unicode
1180 markup = fix.sub(m, markup)
1189 SGMLParser.feed(self, markup)
1196 """This method routes method call requests to either the SGMLParser
1197 superclass or the Tag superclass, depending on the method name."""
1200 if methodName.startswith(
'start_')
or methodName.startswith(
'end_') \
1201 or methodName.startswith(
'do_'):
1202 return SGMLParser.__getattr__(self, methodName)
1203 elif not methodName.startswith(
'__'):
1204 return Tag.__getattr__(self, methodName)
1206 raise AttributeError
1209 """Returns true iff the given string is the name of a
1210 self-closing tag according to this parser."""
1217 SGMLParser.reset(self)
1239 def endData(self, containerClass=NavigableString):
1245 if '\n' in currentData:
1254 o = containerClass(currentData)
1263 """Pops the tag stack up to and including the most recent
1264 instance of the given tag. If inclusivePop is false, pops the tag
1265 stack up to but *not* including the most recent instance of
1272 mostRecentTag =
None
1277 if not inclusivePop:
1278 numPops = numPops - 1
1280 for i
in range(0, numPops):
1281 mostRecentTag = self.
popTag()
1282 return mostRecentTag
1286 """We need to pop up to the previous tag of this type, unless
1287 one of this tag's nesting reset triggers comes between this
1288 tag and the previous tag of this type, OR unless this tag is a
1289 generic nesting trigger and another generic nesting trigger
1290 comes between this tag and the previous tag of this type.
1293 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1294 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1295 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1297 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1298 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1299 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1303 isNestable = nestingResetTriggers !=
None
1309 if (
not p
or p.name == name)
and not isNestable:
1314 if (nestingResetTriggers
is not None
1315 and p.name
in nestingResetTriggers) \
1316 or (nestingResetTriggers
is None and isResetNesting
1335 attrs =
''.
join([
' %s="%s"' % (x, y)
for x, y
in attrs])
1377 """Adds a certain piece of text to the tree as a NavigableString
1384 """Handle a processing instruction as a ProcessingInstruction
1385 object, possibly one with a %SOUP-ENCODING% slot into which an
1386 encoding will be plugged later."""
1387 if text[:3] ==
"xml":
1388 text =
u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1392 "Handle comments as Comment objects."
1396 "Handle character references as data."
1398 data = unichr(
int(ref))
1400 data =
'&#%s;' % ref
1404 """Handle entity references as data, possibly converting known
1405 HTML and/or XML entity references to the corresponding Unicode
1410 data = unichr(name2codepoint[ref])
1435 data =
"&%s" % ref
1447 "Handle DOCTYPEs and the like as Declaration objects."
1451 """Treat a bogus SGML declaration as raw data. Treat a CDATA
1452 declaration as a CData object."""
1454 if self.rawdata[i:i+9] ==
'<![CDATA[':
1455 k = self.rawdata.
find(
']]>', i)
1457 k = len(self.rawdata)
1458 data = self.rawdata[i+9:k]
1463 j = SGMLParser.parse_declaration(self, i)
1464 except SGMLParseError:
1465 toHandle = self.rawdata[i:]
1467 j = i + len(toHandle)
1472 """This parser knows the following facts about HTML:
1474 * Some tags have no closing tag and should be interpreted as being
1475 closed as soon as they are encountered.
1477 * The text inside some tags (ie. 'script') may contain tags which
1478 are not really part of the document and which should be parsed
1479 as text, not tags. If you want to parse the text as tags, you can
1480 always fetch it and parse it explicitly.
1482 * Tag nesting rules:
1484 Most tags can't be nested at all. For instance, the occurrence of
1485 a <p> tag should implicitly close the previous <p> tag.
1488 should be transformed into:
1489 <p>Para1</p><p>Para2
1491 Some tags can be nested arbitrarily. For instance, the occurrence
1492 of a <blockquote> tag should _not_ implicitly close the previous
1495 Alice said: <blockquote>Bob said: <blockquote>Blah
1496 should NOT be transformed into:
1497 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1499 Some tags can be nested, but the nesting is reset by the
1500 interposition of other tags. For instance, a <tr> tag should
1501 implicitly close the previous <tr> tag within the same <table>,
1502 but not close a <tr> tag in another table.
1504 <table><tr>Blah<tr>Blah
1505 should be transformed into:
1506 <table><tr>Blah</tr><tr>Blah
1508 <tr>Blah<table><tr>Blah
1509 should NOT be transformed into
1510 <tr>Blah<table></tr><tr>Blah
1512 Differing assumptions about tag nesting rules are a major source
1513 of problems with the BeautifulSoup class. If BeautifulSoup is not
1514 treating as nestable a tag your page author treats as nestable,
1515 try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1516 BeautifulStoneSoup before writing your own subclass."""
1519 if not kwargs.has_key(
'smartQuotesTo'):
1521 kwargs[
'isHTML'] =
True
1522 BeautifulStoneSoup.__init__(self, *args, **kwargs)
1525 (
'br' ,
'hr',
'input',
'img',
'meta',
1526 'spacer',
'link',
'frame',
'base',
'col'))
1528 PRESERVE_WHITESPACE_TAGS = set([
'pre',
'textarea'])
1530 QUOTE_TAGS = {
'script' :
None,
'textarea' :
None}
1535 NESTABLE_INLINE_TAGS = (
'span',
'font',
'q',
'object',
'bdo',
'sub',
'sup',
1541 NESTABLE_BLOCK_TAGS = (
'blockquote',
'div',
'fieldset',
'ins',
'del')
1544 NESTABLE_LIST_TAGS = {
'ol' : [],
1546 'li' : [
'ul',
'ol'],
1552 NESTABLE_TABLE_TAGS = {
'table' : [],
1553 'tr' : [
'table',
'tbody',
'tfoot',
'thead'],
1556 'thead' : [
'table'],
1557 'tbody' : [
'table'],
1558 'tfoot' : [
'table'],
1561 NON_NESTABLE_BLOCK_TAGS = (
'address',
'form',
'p',
'pre')
1565 RESET_NESTING_TAGS =
buildTagMap(
None, NESTABLE_BLOCK_TAGS,
'noscript',
1566 NON_NESTABLE_BLOCK_TAGS,
1568 NESTABLE_TABLE_TAGS)
1570 NESTABLE_TAGS =
buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1571 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1574 CHARSET_RE = re.compile(
"((^|;)\s*charset=)([^;]*)", re.M)
1577 """Beautiful Soup can detect a charset included in a META tag,
1578 try to convert the document to that charset, and re-parse the
1579 document from the beginning."""
1582 contentTypeIndex =
None
1583 tagNeedsEncodingSubstitution =
False
1585 for i
in range(0, len(attrs)):
1586 key, value = attrs[i]
1588 if key ==
'http-equiv':
1590 elif key ==
'content':
1592 contentTypeIndex = i
1594 if httpEquiv
and contentType:
1605 return match.group(1) +
"%SOUP-ENCODING%"
1606 newAttr = self.
CHARSET_RE.sub(rewrite, contentType)
1607 attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1609 tagNeedsEncodingSubstitution =
True
1613 newCharset = match.group(3)
1620 if tag
and tagNeedsEncodingSubstitution:
1621 tag.containsSubstitutions =
True
1628 """The BeautifulSoup class is oriented towards skipping over
1629 common HTML errors like unclosed tags. However, sometimes it makes
1630 errors of its own. For instance, consider this fragment:
1632 <b>Foo<b>Bar</b></b>
1634 This is perfectly valid (if bizarre) HTML. However, the
1635 BeautifulSoup class will implicitly close the first b tag when it
1636 encounters the second 'b'. It will think the author wrote
1637 "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1638 there's no real-world reason to bold something that's already
1639 bold. When it encounters '</b></b>' it will close two more 'b'
1640 tags, for a grand total of three tags closed instead of two. This
1641 can throw off the rest of your document structure. The same is
1642 true of a number of other tags, listed below.
1644 It's much more common for someone to forget to close a 'b' tag
1645 than to actually use nested 'b' tags, and the BeautifulSoup class
1646 handles the common case. This class handles the not-so-common
1647 case: where you can't believe someone wrote what they did, but
1648 it's valid HTML and BeautifulSoup screwed up by assuming it
1651 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1652 (
'em',
'big',
'i',
'small',
'tt',
'abbr',
'acronym',
'strong',
1653 'cite',
'code',
'dfn',
'kbd',
'samp',
'strong',
'var',
'b',
1656 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = (
'noscript',)
1659 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1660 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1663 """The MinimalSoup class is for parsing HTML that contains
1664 pathologically bad markup. It makes no assumptions about tag
1665 nesting, but it does know which tags are self-closing, that
1666 <script> tags contain Javascript and should not be parsed, that
1667 META tags may contain encoding information, and so on.
1669 This also makes it better for subclassing than BeautifulStoneSoup
1670 or BeautifulSoup."""
1676 """This class will push a tag with only a single string child into
1677 the tag's parent as an attribute. The attribute's name is the tag
1678 name, and the value is the string child. An example should give
1679 the flavor of the change:
1681 <foo><bar>baz</bar></foo>
1683 <foo bar="baz"><bar>baz</bar></foo>
1685 You can then access fooTag['bar'] instead of fooTag.barTag.string.
1687 This is, of course, useful for scraping structures that tend to
1688 use subelements instead of attributes, such as SOAP messages. Note
1689 that it modifies its input, so don't print the modified version
1692 I'm not sure how many people really want to use this class; let me
1693 know if you do. Mainly I like the name."""
1699 parent._getAttrMap()
1700 if (isinstance(tag, Tag)
and len(tag.contents) == 1
and
1701 isinstance(tag.contents[0], NavigableString)
and
1702 not parent.attrMap.has_key(tag.name)):
1703 parent[tag.name] = tag.contents[0]
1704 BeautifulStoneSoup.popTag(self)
1748 import cjkcodecs.aliases
1757 """A class for detecting the encoding of a *ML document and
1758 converting it to a Unicode string. If the source encoding is
1759 windows-1252, can replace MS smart quotes with their HTML or XML
1766 CHARSET_ALIASES = {
"macintosh" :
"mac-roman",
1767 "x-sjis" :
"shift-jis" }
1770 smartQuotesTo='xml', isHTML=False):
1772 self.
markup, documentEncoding, sniffedEncoding = \
1776 if markup ==
'' or isinstance(markup, unicode):
1782 for proposedEncoding
in overrideEncodings:
1786 for proposedEncoding
in (documentEncoding, sniffedEncoding):
1791 if not u
and chardet
and not isinstance(self.
markup, unicode):
1796 for proposed_encoding
in (
"utf-8",
"windows-1252"):
1804 """Changes a MS smart quote character to an XML or HTML
1807 if isinstance(sub, tuple):
1809 sub =
'&#x%s;' % sub[1]
1811 sub =
'&%s;' % sub[0]
1826 markup = re.compile(
"([\x80-\x9f])").sub \
1835 except Exception, e:
1843 '''Given a string and its encoding, decodes the string into Unicode.
1844 %encoding is a string recognized by encodings.aliases'''
1847 if (len(data) >= 4)
and (data[:2] ==
'\xfe\xff') \
1848 and (data[2:4] !=
'\x00\x00'):
1849 encoding =
'utf-16be'
1851 elif (len(data) >= 4)
and (data[:2] ==
'\xff\xfe') \
1852 and (data[2:4] !=
'\x00\x00'):
1853 encoding =
'utf-16le'
1855 elif data[:3] ==
'\xef\xbb\xbf':
1858 elif data[:4] ==
'\x00\x00\xfe\xff':
1859 encoding =
'utf-32be'
1861 elif data[:4] ==
'\xff\xfe\x00\x00':
1862 encoding =
'utf-32le'
1864 newdata =
unicode(data, encoding)
1868 """Given a document, tries to detect its XML encoding."""
1869 xml_encoding = sniffed_xml_encoding =
None
1871 if xml_data[:4] ==
'\x4c\x6f\xa7\x94':
1874 elif xml_data[:4] ==
'\x00\x3c\x00\x3f':
1876 sniffed_xml_encoding =
'utf-16be'
1878 elif (len(xml_data) >= 4)
and (xml_data[:2] ==
'\xfe\xff') \
1879 and (xml_data[2:4] !=
'\x00\x00'):
1881 sniffed_xml_encoding =
'utf-16be'
1882 xml_data =
unicode(xml_data[2:],
'utf-16be').
encode(
'utf-8')
1883 elif xml_data[:4] ==
'\x3c\x00\x3f\x00':
1885 sniffed_xml_encoding =
'utf-16le'
1887 elif (len(xml_data) >= 4)
and (xml_data[:2] ==
'\xff\xfe')
and \
1888 (xml_data[2:4] !=
'\x00\x00'):
1890 sniffed_xml_encoding =
'utf-16le'
1891 xml_data =
unicode(xml_data[2:],
'utf-16le').
encode(
'utf-8')
1892 elif xml_data[:4] ==
'\x00\x00\x00\x3c':
1894 sniffed_xml_encoding =
'utf-32be'
1896 elif xml_data[:4] ==
'\x3c\x00\x00\x00':
1898 sniffed_xml_encoding =
'utf-32le'
1900 elif xml_data[:4] ==
'\x00\x00\xfe\xff':
1902 sniffed_xml_encoding =
'utf-32be'
1903 xml_data =
unicode(xml_data[4:],
'utf-32be').
encode(
'utf-8')
1904 elif xml_data[:4] ==
'\xff\xfe\x00\x00':
1906 sniffed_xml_encoding =
'utf-32le'
1907 xml_data =
unicode(xml_data[4:],
'utf-32le').
encode(
'utf-8')
1908 elif xml_data[:3] ==
'\xef\xbb\xbf':
1910 sniffed_xml_encoding =
'utf-8'
1913 sniffed_xml_encoding =
'ascii'
1916 xml_encoding_match =
None
1917 xml_encoding_match = re.compile(
1918 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').
match(xml_data)
1919 if not xml_encoding_match
and isHTML:
1920 regexp = re.compile(
'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1921 xml_encoding_match = regexp.search(xml_data)
1922 if xml_encoding_match
is not None:
1923 xml_encoding = xml_encoding_match.groups()[0].lower()
1926 if sniffed_xml_encoding
and \
1927 (xml_encoding
in (
'iso-10646-ucs-2',
'ucs-2',
'csunicode',
1928 'iso-10646-ucs-4',
'ucs-4',
'csucs4',
1929 'utf-16',
'utf-32',
'utf_16',
'utf_32',
1931 xml_encoding = sniffed_xml_encoding
1932 return xml_data, xml_encoding, sniffed_xml_encoding
1937 or (charset
and self.
_codec(charset.replace(
"-",
""))) \
1938 or (charset
and self.
_codec(charset.replace(
"-",
"_"))) \
1942 if not charset:
return charset
1945 codecs.lookup(charset)
1947 except (LookupError, ValueError):
1951 EBCDIC_TO_ASCII_MAP =
None
1954 if not c.EBCDIC_TO_ASCII_MAP:
1955 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1956 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1957 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1958 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1959 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1960 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1961 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1962 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1963 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1964 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1965 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1966 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1967 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1968 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1969 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1970 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1971 250,251,252,253,254,255)
1973 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1975 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1977 MS_CHARS = {
'\x80' : (
'euro',
'20AC'),
1979 '\x82' : (
'sbquo',
'201A'),
1980 '\x83' : (
'fnof',
'192'),
1981 '\x84' : (
'bdquo',
'201E'),
1982 '\x85' : (
'hellip',
'2026'),
1983 '\x86' : (
'dagger',
'2020'),
1984 '\x87' : (
'Dagger',
'2021'),
1985 '\x88' : (
'circ',
'2C6'),
1986 '\x89' : (
'permil',
'2030'),
1987 '\x8A' : (
'Scaron',
'160'),
1988 '\x8B' : (
'lsaquo',
'2039'),
1989 '\x8C' : (
'OElig',
'152'),
1991 '\x8E' : (
'#x17D',
'17D'),
1994 '\x91' : (
'lsquo',
'2018'),
1995 '\x92' : (
'rsquo',
'2019'),
1996 '\x93' : (
'ldquo',
'201C'),
1997 '\x94' : (
'rdquo',
'201D'),
1998 '\x95' : (
'bull',
'2022'),
1999 '\x96' : (
'ndash',
'2013'),
2000 '\x97' : (
'mdash',
'2014'),
2001 '\x98' : (
'tilde',
'2DC'),
2002 '\x99' : (
'trade',
'2122'),
2003 '\x9a' : (
'scaron',
'161'),
2004 '\x9b' : (
'rsaquo',
'203A'),
2005 '\x9c' : (
'oelig',
'153'),
2007 '\x9e' : (
'#x17E',
'17E'),
2008 '\x9f' : (
'Yuml',
''),}
2014 if __name__ ==
'__main__':
2017 print soup.prettify()