3 "The Screen-Scraper's Friend" 4 http://www.crummy.com/software/BeautifulSoup/ 6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a 7 tree representation. It provides methods and Pythonic idioms that make 8 it easy to navigate, search, and modify the tree. 10 A well-formed XML/HTML document yields a well-formed data 11 structure. An ill-formed XML/HTML document yields a correspondingly 12 ill-formed data structure. If your document is only locally 13 well-formed, you can use this library to find and process the 14 well-formed part of it. 16 Beautiful Soup works with Python 2.2 and up. It has no external 17 dependencies, but you'll have more success at converting data to UTF-8 18 if you also install these three packages: 20 * chardet, for auto-detecting character encodings 21 http://chardet.feedparser.org/ 22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported 24 http://cjkpython.i18n.org/ 26 Beautiful Soup defines classes for two main parsing strategies: 28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 29 language that kind of looks like XML. 31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 32 or invalid. This class has web browser-like heuristics for 33 obtaining a sensible parse tree in the face of common HTML errors. 35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 36 the encoding of an HTML or XML document, and converting it to 37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 
39 For more than you ever wanted to know about Beautiful Soup, see the 41 http://www.crummy.com/software/BeautifulSoup/documentation.html 43 Here, have some legalese: 45 Copyright (c) 2004-2010, Leonard Richardson 49 Redistribution and use in source and binary forms, with or without 50 modification, are permitted provided that the following conditions are 53 * Redistributions of source code must retain the above copyright 54 notice, this list of conditions and the following disclaimer. 56 * Redistributions in binary form must reproduce the above 57 copyright notice, this list of conditions and the following 58 disclaimer in the documentation and/or other materials provided 59 with the distribution. 61 * Neither the name of the Beautiful Soup Consortium and All 62 Night Kosher Bakery nor the names of its contributors may be 63 used to endorse or promote products derived from this software 64 without specific prior written permission. 66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 79 from __future__
import generators
80 from __future__
import print_function
82 __author__ =
"Leonard Richardson (leonardr@segfault.org)" 84 __copyright__ =
"Copyright (c) 2004-2012 Leonard Richardson" 85 __license__ =
"New-style BSD" 87 from sgmllib
import SGMLParser, SGMLParseError
94 from htmlentitydefs
import name2codepoint
100 from sets
import Set
as set
103 sgmllib.tagfind = re.compile(
'[a-zA-Z][-_.:a-zA-Z0-9]*')
104 markupbase._declname_match = re.compile(
r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
106 DEFAULT_OUTPUT_ENCODING =
"utf-8" 109 """Build a RE to match the given CSS class.""" 110 return re.compile(
r"(^|.*\s)%s($|\s)" % str)
115 """Contains the navigational information for some part of the page 116 (either a tag or a piece of text)""" 119 "Cheap function to invert a hash." 121 for k,v
in h.items():
125 XML_ENTITIES_TO_SPECIAL_CHARS = {
"apos" :
"'",
131 XML_SPECIAL_CHARS_TO_ENTITIES =
_invert(XML_ENTITIES_TO_SPECIAL_CHARS)
133 def setup(self, parent=None, previous=None):
134 """Sets up the initial relations between this element and 141 if self.
parent and self.parent.contents:
143 self.previousSibling.nextSibling = self
147 myIndex = self.parent.index(self)
148 if hasattr(replaceWith,
"parent")\
149 and replaceWith.parent
is self.
parent:
151 index = replaceWith.parent.index(replaceWith)
152 if index
and index < myIndex:
156 myIndex = myIndex - 1
158 oldParent.insert(myIndex, replaceWith)
162 myIndex = self.parent.index(self)
164 reversedChildren =
list(self.contents)
165 reversedChildren.reverse()
166 for child
in reversedChildren:
167 myParent.insert(myIndex, child)
170 """Destructively rips this element out of the tree.""" 173 del self.parent.contents[self.parent.index(self)]
181 nextElement = lastChild.next
184 self.previous.next = nextElement
186 nextElement.previous = self.
previous 188 lastChild.next =
None 192 self.previousSibling.nextSibling = self.
nextSibling 199 "Finds the last element beneath this object to be parsed." 201 while hasattr(lastChild,
'contents')
and lastChild.contents:
202 lastChild = lastChild.contents[-1]
206 if isinstance(newChild, str) \
207 and not isinstance(newChild, NavigableString):
210 position =
min(position, len(self.contents))
211 if hasattr(newChild,
'parent')
and newChild.parent
is not None:
214 if newChild.parent
is self:
215 index = self.index(newChild)
221 position = position - 1
224 newChild.parent = self
227 newChild.previousSibling =
None 228 newChild.previous = self
230 previousChild = self.contents[position-1]
231 newChild.previousSibling = previousChild
232 newChild.previousSibling.nextSibling = newChild
233 newChild.previous = previousChild._lastRecursiveChild()
234 if newChild.previous:
235 newChild.previous.next = newChild
237 newChildsLastElement = newChild._lastRecursiveChild()
239 if position >= len(self.contents):
240 newChild.nextSibling =
None 243 parentsNextSibling =
None 244 while not parentsNextSibling:
245 parentsNextSibling = parent.nextSibling
246 parent = parent.parent
249 if parentsNextSibling:
250 newChildsLastElement.next = parentsNextSibling
252 newChildsLastElement.next =
None 254 nextChild = self.contents[position]
255 newChild.nextSibling = nextChild
256 if newChild.nextSibling:
257 newChild.nextSibling.previousSibling = newChild
258 newChildsLastElement.next = nextChild
260 if newChildsLastElement.next:
261 newChildsLastElement.next.previous = newChildsLastElement
262 self.contents.insert(position, newChild)
265 """Appends the given tag to the contents of this tag.""" 266 self.
insert(len(self.contents), tag)
268 def findNext(self, name=None, attrs={}, text=None, **kwargs):
269 """Returns the first item that matches the given criteria and 270 appears after this Tag in the document.""" 273 def findAllNext(self, name=None, attrs={}, text=None, limit=None,
275 """Returns all items that match the given criteria and appear 276 after this Tag in the document.""" 281 """Returns the closest sibling to this Tag that matches the 282 given criteria and appears after this Tag in the document.""" 288 """Returns the siblings of this Tag that match the given 289 criteria and appear after this Tag in the document.""" 290 return self.
_findAll(name, attrs, text, limit,
292 fetchNextSiblings = findNextSiblings
295 """Returns the first item that matches the given criteria and 296 appears before this Tag in the document.""" 301 """Returns all items that match the given criteria and appear 302 before this Tag in the document.""" 305 fetchPrevious = findAllPrevious
308 """Returns the closest sibling to this Tag that matches the 309 given criteria and appears before this Tag in the document.""" 314 limit=
None, **kwargs):
315 """Returns the siblings of this Tag that match the given 316 criteria and appear before this Tag in the document.""" 317 return self.
_findAll(name, attrs, text, limit,
319 fetchPreviousSiblings = findPreviousSiblings
322 """Returns the closest parent of this Tag that matches the given 333 """Returns the parents of this Tag that match the given 338 fetchParents = findParents
342 def _findOne(self, method, name, attrs, text, **kwargs):
344 l =
method(name, attrs, text, 1, **kwargs)
349 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
350 "Iterates over a generator looking for things that match." 352 if isinstance(name, SoupStrainer):
355 elif text
is None and not limit
and not attrs
and not kwargs:
358 return [element
for element
in generator()
359 if isinstance(element, Tag)]
361 elif isinstance(name, str):
362 return [element
for element
in generator()
363 if isinstance(element, Tag)
and 364 element.name == name]
375 except StopIteration:
378 found = strainer.search(i)
380 results.append(found)
381 if limit
and len(results) >= limit:
408 i = i.previousSibling
419 encoding = encoding
or "utf-8" 420 return str.replace(
"%SOUP-ENCODING%", encoding)
423 """Encodes an object to a string in some encoding, or to Unicode. 425 if isinstance(s, unicode):
427 s = s.encode(encoding)
428 elif isinstance(s, str):
430 s = s.encode(encoding)
440 BARE_AMPERSAND_OR_BRACKET = re.compile(
"([<>]|" 441 +
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 445 """Used with a regular expression to substitute the 446 appropriate XML entity for an XML special character.""" 447 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] +
";" 453 """Create a new NavigableString. 455 When unpickling a NavigableString, this method is called with 456 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 457 passed in to the superclass's __new__ or the superclass won't know 458 how to handle non-ASCII characters. 460 if isinstance(value, unicode):
461 return unicode.__new__(cls, value)
462 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
465 return (NavigableString.__str__(self),)
468 """text.string gives you text. This is for backwards 469 compatibility for Navigable*String, but for CData* it lets you 470 get the string without the CData wrapper.""" 474 raise AttributeError(
"'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
477 return str(self).
decode(DEFAULT_OUTPUT_ENCODING)
479 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
481 data = self.BARE_AMPERSAND_OR_BRACKET.sub(self.
_sub_entity, self)
483 return data.encode(encoding)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this string wrapped in CDATA section delimiters.

    The text itself is encoded by NavigableString.__str__; this
    method only adds the surrounding markup.
    """
    inner = NavigableString.__str__(self, encoding)
    return "<![CDATA[%s]]>" % inner
493 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
495 if "%SOUP-ENCODING%" in output:
497 return "<?%s?>" % self.
toEncoding(output, encoding)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this comment inside HTML/XML comment delimiters.

    Encoding of the comment text is delegated to
    NavigableString.__str__.
    """
    body = NavigableString.__str__(self, encoding)
    return "<!--%s-->" % body
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this declaration inside <!...> delimiters.

    Encoding of the declaration text is delegated to
    NavigableString.__str__.
    """
    body = NavigableString.__str__(self, encoding)
    return "<!%s>" % body
509 """Represents a found HTML tag with its attributes and contents.""" 512 """Used in a call to re.sub to replace HTML, XML, and numeric 513 entities with the appropriate Unicode characters. If HTML 514 entities are being converted, any unrecognized entities are 518 return unichr(name2codepoint[x])
519 elif x
in self.XML_ENTITIES_TO_SPECIAL_CHARS:
521 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
524 elif len(x) > 0
and x[0] ==
'#':
526 if len(x) > 1
and x[1] ==
'x':
527 return unichr(
int(x[2:], 16))
529 return unichr(
int(x[1:]))
532 return u'&%s;' % x
536 def __init__(self, parser, name, attrs=None, parent=None,
547 elif isinstance(attrs, dict):
548 attrs = attrs.items()
551 self.
setup(parent, previous)
559 convert =
lambda k_val: (k_val[0],
560 re.sub(
"&(#\d+|#x[0-9a-fA-F]+|\w+);",
567 and isinstance(self.
contents[0], NavigableString)):
571 """Replace the contents of the tag with a string""" 575 string = property(getString, setString)
583 while current
is not stopNode:
584 if isinstance(current, NavigableString):
585 strings.append(current.strip())
586 current = current.next
587 return separator.join(strings)
589 text = property(getText)
591 def get(self, key, default=None):
592 """Returns the value of the 'key' attribute for the tag, or 593 the value given for 'default' if it doesn't have that 598 """Extract all children.""" 603 for i, child
in enumerate(self.
contents):
606 raise ValueError(
"Tag.index: element not in tag")
612 """tag[key] returns the value of the 'key' attribute for the tag, 613 and throws an exception if it's not there.""" 617 "Iterating over a tag iterates over its contents." 621 "The length of a tag is the length of its list of contents." 628 "A tag is non-None even if it has no contents." 632 """Setting tag[key] sets the value of the 'key' attribute for the 637 for i
in range(0, len(self.
attrs)):
638 if self.
attrs[i][0] == key:
639 self.
attrs[i] = (key, value)
642 self.attrs.append((key, value))
646 "Deleting tag[key] deletes all 'key' attributes for the tag." 647 for item
in self.
attrs:
649 self.attrs.remove(item)
657 """Calling a tag like a function is the same as calling its 658 findAll() method. Eg. tag('a') returns a list of all the A tags 659 found within this tag.""" 660 return self.
findAll(*args, **kwargs)
664 if len(tag) > 3
and tag.rfind(
'Tag') == len(tag)-3:
665 return self.
find(tag[:-3])
666 elif tag.find(
'__') != 0:
667 return self.
find(tag)
668 raise AttributeError(
"'%s' object has no attribute '%s'" % (self.__class__, tag))
671 """Returns true iff this tag has the same name, the same attributes, 672 and the same contents (recursively) as the given tag. 674 NOTE: right now this will return false if two tags have the 675 same attributes in a different order. Should this be fixed?""" 678 if not hasattr(other,
'name')
or not hasattr(other,
'attrs')
or not hasattr(other,
'contents')
or self.
name != other.name
or self.
attrs != other.attrs
or len(self) != len(other):
680 for i
in range(0, len(self.
contents)):
681 if self.
contents[i] != other.contents[i]:
686 """Returns true iff this tag is not identical to the other tag, 687 as defined in __eq__.""" 688 return not self == other
690 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
691 """Renders this tag as a string.""" 697 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
698 prettyPrint=
False, indentLevel=0):
699 """Returns a string or Unicode representation of this tag and 700 its contents. To get Unicode, pass None for encoding. 702 NOTE: since Python's HTML parser consumes whitespace, this 703 method is not certain to reproduce the whitespace present in 704 the original string.""" 710 for key, val
in self.
attrs:
712 if isinstance(val, str):
736 val = val.replace(
"'",
"&squot;")
742 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self.
_sub_entity, val)
744 attrs.append(fmt % (self.
toEncoding(key, encoding),
751 closeTag =
'</%s>' % encodedName
753 indentTag, indentContents = 0, 0
755 indentTag = indentLevel
756 space = (
' ' * (indentTag-1))
757 indentContents = indentTag + 1
758 contents = self.
renderContents(encoding, prettyPrint, indentContents)
765 attributeString =
' ' +
' '.
join(attrs)
768 s.append(
'<%s%s%s>' % (encodedName, attributeString, close))
772 if prettyPrint
and contents
and contents[-1] !=
"\n":
774 if prettyPrint
and closeTag:
783 """Recursively destroys the contents of this tree.""" 788 while current
is not None:
790 if isinstance(current, Tag):
791 del current.contents[:]
792 current.parent =
None 793 current.previous =
None 794 current.previousSibling =
None 796 current.nextSibling =
None 799 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
800 return self.
__str__(encoding,
True)
803 prettyPrint=
False, indentLevel=0):
804 """Renders the contents of this tag as a string in the given 805 encoding. If encoding is None, returns a Unicode string..""" 809 if isinstance(c, NavigableString):
810 text = c.__str__(encoding)
811 elif isinstance(c, Tag):
812 s.append(c.__str__(encoding, prettyPrint, indentLevel))
813 if text
and prettyPrint:
817 s.append(
" " * (indentLevel-1))
825 def find(self, name=None, attrs={}, recursive=True, text=None,
827 """Return only the first child of this Tag matching the given 830 l = self.
findAll(name, attrs, recursive, text, 1, **kwargs)
836 def findAll(self, name=None, attrs={}, recursive=True, text=None,
837 limit=
None, **kwargs):
838 """Extracts a list of Tag objects that match the given 839 criteria. You can specify the name of the Tag and any 840 attributes you want the Tag to have. 842 The value of a key-value pair in the 'attrs' map can be a 843 string, a list of strings, a regular expression object, or a 844 callable that takes a string and returns whether or not the 845 string matches for some custom definition of 'matches'. The 846 same is true of the tag name.""" 850 return self.
_findAll(name, attrs, text, limit, generator, **kwargs)
851 findChildren = findAll
def fetchText(self, text=None, recursive=True, limit=None):
    """Return the text nodes beneath this tag that match `text`.

    Thin wrapper that forwards its arguments unchanged to findAll,
    restricting the search to text matches only (no tag name given).
    """
    return self.findAll(text=text, recursive=recursive, limit=limit)
861 return self.
find(text=text, recursive=recursive)
866 """Initializes a map representation of this tag's attributes, 867 if not already initialized.""" 868 if not getattr(self,
'attrMap'):
870 for (key, value)
in self.
attrs:
884 while current
is not stopNode:
886 current = current.next
891 """Encapsulates a number of ways of matching a markup element (tag or 894 def __init__(self, name=None, attrs={}, text=None, **kwargs):
896 if isinstance(attrs, str):
917 if isinstance(markupName, Tag):
920 callFunctionWithTagData = callable(self.
name) \
921 and not isinstance(markupName, Tag)
924 or callFunctionWithTagData \
926 or (
not markup
and self.
_matches(markupName, self.
name)):
927 if callFunctionWithTagData:
928 match = self.
name(markupName, markupAttrs)
932 for attr, matchAgainst
in self.attrs.items():
933 if not markupAttrMap:
934 if hasattr(markupAttrs,
'get'):
935 markupAttrMap = markupAttrs
938 for k,v
in markupAttrs:
940 attrValue = markupAttrMap.get(attr)
941 if not self.
_matches(attrValue, matchAgainst):
956 if hasattr(markup,
"__iter__") \
957 and not isinstance(markup, Tag):
958 for element
in markup:
959 if isinstance(element, NavigableString) \
965 elif isinstance(markup, Tag):
969 elif isinstance(markup, NavigableString)
or \
970 isinstance(markup, str):
974 raise Exception(
"I don't know how to match against a %s" \
981 if matchAgainst
is True:
982 result = markup
is not None 983 elif callable(matchAgainst):
984 result = matchAgainst(markup)
988 if isinstance(markup, Tag):
990 if markup
and not isinstance(markup, str):
991 markup = unicode(markup)
993 if hasattr(matchAgainst,
'match'):
995 result = markup
and matchAgainst.search(markup)
996 elif hasattr(matchAgainst,
'__iter__'):
997 result = markup
in matchAgainst
998 elif hasattr(matchAgainst,
'items'):
999 result = matchAgainst
in markup
1000 elif matchAgainst
and isinstance(markup, str):
1001 if isinstance(markup, unicode):
1002 matchAgainst = unicode(matchAgainst)
1004 matchAgainst =
str(matchAgainst)
1007 result = matchAgainst == markup
1011 """A ResultSet is just a list that keeps track of the SoupStrainer 1020 """Turns a list of maps, lists, or scalars into a single map. 1021 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and 1022 NESTING_RESET_TAGS maps out of lists and partial maps.""" 1024 for portion
in args:
1025 if hasattr(portion,
'items'):
1027 for k,v
in portion.items():
1029 elif hasattr(portion,
'__iter__'):
1035 built[portion] = default
1042 """This class contains the basic parser and search code. It defines 1043 a parser that knows nothing about tag behavior except for the 1046 You can't close a tag without closing all the tags it encloses. 1047 That is, "<foo><bar></foo>" actually means 1048 "<foo><bar></bar></foo>". 1050 [Another possible explanation is "<foo><bar /></foo>", but since 1051 this class defines no SELF_CLOSING_TAGS, it will never use that 1054 This class is useful for parsing XML or made-up markup languages, 1055 or when BeautifulSoup makes an assumption counter to what you were 1058 SELF_CLOSING_TAGS = {}
1060 RESET_NESTING_TAGS = {}
1062 PRESERVE_WHITESPACE_TAGS = []
1064 MARKUP_MASSAGE = [(re.compile(
'(<[^<>]*)/>'),
1065 lambda x: x.group(1) +
' />'),
1066 (re.compile(
'<!\s+([^<>]*)>'),
1067 lambda x:
'<!' + x.group(1) +
'>')
1070 ROOT_TAG_NAME =
u'[document]' 1072 HTML_ENTITIES =
"html" 1073 XML_ENTITIES =
"xml" 1074 XHTML_ENTITIES =
"xhtml" 1076 ALL_ENTITIES = XHTML_ENTITIES
1082 STRIP_ASCII_SPACES = { 9:
None, 10:
None, 12:
None, 13:
None, 32:
None, }
1084 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1085 markupMassage=
True, smartQuotesTo=XML_ENTITIES,
1086 convertEntities=
None, selfClosingTags=
None, isHTML=
False):
1087 """The Soup object is initialized as the 'root tag', and the 1088 provided markup (which can be a string or a file-like object) 1089 is fed into the underlying parser. 1091 sgmllib will process most bad HTML, and the BeautifulSoup 1092 class has some tricks for dealing with some HTML that kills 1093 sgmllib, but Beautiful Soup can nonetheless choke or lose data 1094 if your data uses self-closing tags or declarations 1097 By default, Beautiful Soup uses regexes to sanitize input, 1098 avoiding the vast majority of these problems. If the problems 1099 don't apply to you, pass in False for markupMassage, and 1100 you'll get better performance. 1102 The default parser massage techniques fix the two most common 1103 instances of invalid HTML that choke sgmllib: 1105 <br/> (No space between name of closing tag and tag close) 1106 <! --Comment--> (Extraneous whitespace in declaration) 1108 You can pass in a custom list of (RE object, replace method) 1109 tuples to get Beautiful Soup to scrub your input the way you 1123 if convertEntities == self.HTML_ENTITIES:
1127 elif convertEntities == self.XHTML_ENTITIES:
1131 elif convertEntities == self.XML_ENTITIES:
1141 SGMLParser.__init__(self)
1143 if hasattr(markup,
'read'):
1144 markup = markup.read()
1148 self.
_feed(isHTML=isHTML)
1154 """This method fixes a bug in Python's SGMLParser.""" 1159 if not 0 <= n <= 127 :
1161 return self.convert_codepoint(n)
1163 def _feed(self, inDocumentEncoding=None, isHTML=False):
1166 if isinstance(markup, unicode):
1167 if not hasattr(self,
'originalEncoding'):
1170 dammit = UnicodeDammit\
1173 markup = dammit.unicode
1181 markup = fix.sub(m, markup)
1190 SGMLParser.feed(self, markup)
1193 while self.currentTag.name != self.ROOT_TAG_NAME:
1197 """This method routes method call requests to either the SGMLParser 1198 superclass or the Tag superclass, depending on the method name.""" 1201 if methodName.startswith(
'start_')
or methodName.startswith(
'end_') \
1202 or methodName.startswith(
'do_'):
1203 return SGMLParser.__getattr__(self, methodName)
1204 elif not methodName.startswith(
'__'):
1205 return Tag.__getattr__(self, methodName)
1207 raise AttributeError
1210 """Returns true iff the given string is the name of a 1211 self-closing tag according to this parser.""" 1212 return name
in self.SELF_CLOSING_TAGS \
1216 Tag.__init__(self, self, self.ROOT_TAG_NAME)
1218 SGMLParser.reset(self)
1226 tag = self.tagStack.pop()
1236 self.currentTag.contents.append(tag)
1237 self.tagStack.append(tag)
1240 def endData(self, containerClass=NavigableString):
1243 if (currentData.translate(self.STRIP_ASCII_SPACES) ==
'' and 1245 self.PRESERVE_WHITESPACE_TAGS)):
1246 if '\n' in currentData:
1252 (
not self.parseOnlyThese.text
or \
1253 not self.parseOnlyThese.search(currentData)):
1255 o = containerClass(currentData)
1258 self.previous.next = o
1260 self.currentTag.contents.append(o)
1264 """Pops the tag stack up to and including the most recent 1265 instance of the given tag. If inclusivePop is false, pops the tag 1266 stack up to but *not* including the most recent instance of 1269 if name == self.ROOT_TAG_NAME:
1273 mostRecentTag =
None 1274 for i
in range(len(self.
tagStack)-1, 0, -1):
1278 if not inclusivePop:
1279 numPops = numPops - 1
1281 for i
in range(0, numPops):
1282 mostRecentTag = self.
popTag()
1283 return mostRecentTag
1287 """We need to pop up to the previous tag of this type, unless 1288 one of this tag's nesting reset triggers comes between this 1289 tag and the previous tag of this type, OR unless this tag is a 1290 generic nesting trigger and another generic nesting trigger 1291 comes between this tag and the previous tag of this type. 1294 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. 1295 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. 1296 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. 1298 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. 1299 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' 1300 <td><tr><td> *<td>* should pop to 'tr', not the first 'td' 1303 nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1304 isNestable = nestingResetTriggers !=
None 1305 isResetNesting = name
in self.RESET_NESTING_TAGS
1308 for i
in range(len(self.
tagStack)-1, 0, -1):
1310 if (
not p
or p.name == name)
and not isNestable:
1315 if (nestingResetTriggers
is not None 1316 and p.name
in nestingResetTriggers) \
1317 or (nestingResetTriggers
is None and isResetNesting
1318 and p.name
in self.RESET_NESTING_TAGS):
1336 attrs =
''.
join([
' %s="%s"' % (x, y)
for x, y
in attrs])
1345 and (self.parseOnlyThese.text
or not self.parseOnlyThese.searchTag(name, attrs)):
1350 self.previous.next = tag
1355 if name
in self.QUOTE_TAGS:
1357 self.quoteStack.append(name)
1371 self.quoteStack.pop()
1375 self.currentData.append(data)
1378 """Adds a certain piece of text to the tree as a NavigableString 1385 """Handle a processing instruction as a ProcessingInstruction 1386 object, possibly one with a %SOUP-ENCODING% slot into which an 1387 encoding will be plugged later.""" 1388 if text[:3] ==
"xml":
1389 text =
u"xml version='1.0' encoding='%SOUP-ENCODING%'" 1393 "Handle comments as Comment objects." 1397 "Handle character references as data." 1399 data = unichr(
int(ref))
1401 data =
'&#%s;' % ref
1405 """Handle entity references as data, possibly converting known 1406 HTML and/or XML entity references to the corresponding Unicode 1411 data = unichr(name2codepoint[ref])
1416 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1419 not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1436 data =
"&%s" % ref
1448 "Handle DOCTYPEs and the like as Declaration objects." 1452 """Treat a bogus SGML declaration as raw data. Treat a CDATA 1453 declaration as a CData object.""" 1455 if self.rawdata[i:i+9] ==
'<![CDATA[':
1456 k = self.rawdata.find(
']]>', i)
1458 k = len(self.rawdata)
1459 data = self.rawdata[i+9:k]
1464 j = SGMLParser.parse_declaration(self, i)
1465 except SGMLParseError:
1466 toHandle = self.rawdata[i:]
1468 j = i + len(toHandle)
1473 """This parser knows the following facts about HTML: 1475 * Some tags have no closing tag and should be interpreted as being 1476 closed as soon as they are encountered. 1478 * The text inside some tags (ie. 'script') may contain tags which 1479 are not really part of the document and which should be parsed 1480 as text, not tags. If you want to parse the text as tags, you can 1481 always fetch it and parse it explicitly. 1483 * Tag nesting rules: 1485 Most tags can't be nested at all. For instance, the occurance of 1486 a <p> tag should implicitly close the previous <p> tag. 1489 should be transformed into: 1490 <p>Para1</p><p>Para2 1492 Some tags can be nested arbitrarily. For instance, the occurance 1493 of a <blockquote> tag should _not_ implicitly close the previous 1496 Alice said: <blockquote>Bob said: <blockquote>Blah 1497 should NOT be transformed into: 1498 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah 1500 Some tags can be nested, but the nesting is reset by the 1501 interposition of other tags. For instance, a <tr> tag should 1502 implicitly close the previous <tr> tag within the same <table>, 1503 but not close a <tr> tag in another table. 1505 <table><tr>Blah<tr>Blah 1506 should be transformed into: 1507 <table><tr>Blah</tr><tr>Blah 1509 <tr>Blah<table><tr>Blah 1510 should NOT be transformed into 1511 <tr>Blah<table></tr><tr>Blah 1513 Differing assumptions about tag nesting rules are a major source 1514 of problems with the BeautifulSoup class. If BeautifulSoup is not 1515 treating as nestable a tag your page author treats as nestable, 1516 try ICantBelieveItsBeautifulSoup, MinimalSoup, or 1517 BeautifulStoneSoup before writing your own subclass.""" 1520 if 'smartQuotesTo' not in kwargs:
1521 kwargs[
'smartQuotesTo'] = self.HTML_ENTITIES
1522 kwargs[
'isHTML'] =
True 1523 BeautifulStoneSoup.__init__(self, *args, **kwargs)
1526 (
'br' ,
'hr',
'input',
'img',
'meta',
1527 'spacer',
'link',
'frame',
'base',
'col'))
1529 PRESERVE_WHITESPACE_TAGS = set([
'pre',
'textarea'])
1531 QUOTE_TAGS = {
'script' :
None,
'textarea' :
None}
1536 NESTABLE_INLINE_TAGS = (
'span',
'font',
'q',
'object',
'bdo',
'sub',
'sup',
1542 NESTABLE_BLOCK_TAGS = (
'blockquote',
'div',
'fieldset',
'ins',
'del')
1545 NESTABLE_LIST_TAGS = {
'ol' : [],
1547 'li' : [
'ul',
'ol'],
1553 NESTABLE_TABLE_TAGS = {
'table' : [],
1554 'tr' : [
'table',
'tbody',
'tfoot',
'thead'],
1557 'thead' : [
'table'],
1558 'tbody' : [
'table'],
1559 'tfoot' : [
'table'],
1562 NON_NESTABLE_BLOCK_TAGS = (
'address',
'form',
'p',
'pre')
1566 RESET_NESTING_TAGS =
buildTagMap(
None, NESTABLE_BLOCK_TAGS,
'noscript',
1567 NON_NESTABLE_BLOCK_TAGS,
1569 NESTABLE_TABLE_TAGS)
1571 NESTABLE_TAGS =
buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1572 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1575 CHARSET_RE = re.compile(
"((^|;)\s*charset=)([^;]*)", re.M)
1578 """Beautiful Soup can detect a charset included in a META tag, 1579 try to convert the document to that charset, and re-parse the 1580 document from the beginning.""" 1583 contentTypeIndex =
None 1584 tagNeedsEncodingSubstitution =
False 1586 for i
in range(0, len(attrs)):
1587 key, value = attrs[i]
1589 if key ==
'http-equiv':
1591 elif key ==
'content':
1593 contentTypeIndex = i
1595 if httpEquiv
and contentType:
1596 match = self.CHARSET_RE.search(contentType)
1606 return match.group(1) +
"%SOUP-ENCODING%" 1607 newAttr = self.CHARSET_RE.sub(rewrite, contentType)
1608 attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1610 tagNeedsEncodingSubstitution =
True 1614 newCharset = match.group(3)
1621 if tag
and tagNeedsEncodingSubstitution:
1622 tag.containsSubstitutions =
True 1629 """The BeautifulSoup class is oriented towards skipping over 1630 common HTML errors like unclosed tags. However, sometimes it makes 1631 errors of its own. For instance, consider this fragment: 1633 <b>Foo<b>Bar</b></b> 1635 This is perfectly valid (if bizarre) HTML. However, the 1636 BeautifulSoup class will implicitly close the first b tag when it 1637 encounters the second 'b'. It will think the author wrote 1638 "<b>Foo<b>Bar", and didn't close the first 'b' tag, because 1639 there's no real-world reason to bold something that's already 1640 bold. When it encounters '</b></b>' it will close two more 'b' 1641 tags, for a grand total of three tags closed instead of two. This 1642 can throw off the rest of your document structure. The same is 1643 true of a number of other tags, listed below. 1645 It's much more common for someone to forget to close a 'b' tag 1646 than to actually use nested 'b' tags, and the BeautifulSoup class 1647 handles the common case. This class handles the not-co-common 1648 case: where you can't believe someone wrote what they did, but 1649 it's valid HTML and BeautifulSoup screwed up by assuming it 1652 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1653 (
'em',
'big',
'i',
'small',
'tt',
'abbr',
'acronym',
'strong',
1654 'cite',
'code',
'dfn',
'kbd',
'samp',
'strong',
'var',
'b',
1657 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = (
'noscript',)
1659 NESTABLE_TAGS =
buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1660 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1661 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1664 """The MinimalSoup class is for parsing HTML that contains 1665 pathologically bad markup. It makes no assumptions about tag 1666 nesting, but it does know which tags are self-closing, that 1667 <script> tags contain Javascript and should not be parsed, that 1668 META tags may contain encoding information, and so on. 1670 This also makes it better for subclassing than BeautifulStoneSoup 1671 or BeautifulSoup.""" 1677 """This class will push a tag with only a single string child into 1678 the tag's parent as an attribute. The attribute's name is the tag 1679 name, and the value is the string child. An example should give 1680 the flavor of the change: 1682 <foo><bar>baz</bar></foo> 1684 <foo bar="baz"><bar>baz</bar></foo> 1686 You can then access fooTag['bar'] instead of fooTag.barTag.string. 1688 This is, of course, useful for scraping structures that tend to 1689 use subelements instead of attributes, such as SOAP messages. Note 1690 that it modifies its input, so don't print the modified version 1693 I'm not sure how many people really want to use this class; let me 1694 know if you do. Mainly I like the name.""" 1700 parent._getAttrMap()
1701 if (isinstance(tag, Tag)
and len(tag.contents) == 1
and 1702 isinstance(tag.contents[0], NavigableString)
and 1703 tag.name
not in parent.attrMap):
1704 parent[tag.name] = tag.contents[0]
1705 BeautifulStoneSoup.popTag(self)
1749 import cjkcodecs.aliases
1758 """A class for detecting the encoding of a *ML document and 1759 converting it to a Unicode string. If the source encoding is 1760 windows-1252, can replace MS smart quotes with their HTML or XML 1767 CHARSET_ALIASES = {
"macintosh" :
"mac-roman",
1768 "x-sjis" :
"shift-jis" }
1770 def __init__(self, markup, overrideEncodings=[],
1771 smartQuotesTo=
'xml', isHTML=
False):
1773 self.
markup, documentEncoding, sniffedEncoding = \
1777 if markup ==
'' or isinstance(markup, unicode):
1783 for proposedEncoding
in overrideEncodings:
1787 for proposedEncoding
in (documentEncoding, sniffedEncoding):
1792 if not u
and chardet
and not isinstance(self.
markup, unicode):
1797 for proposed_encoding
in (
"utf-8",
"windows-1252"):
1805 """Changes a MS smart quote character to an XML or HTML 1807 sub = self.MS_CHARS.get(orig)
1808 if isinstance(sub, tuple):
1810 sub =
'&#x%s;' % sub[1]
1812 sub =
'&%s;' % sub[0]
1819 self.triedEncodings.append(proposed)
1827 markup = re.compile(
"([\x80-\x9f])").sub \
1836 except Exception
as e:
1844 '''Given a string and its encoding, decodes the string into Unicode. 1845 %encoding is a string recognized by encodings.aliases''' 1848 if (len(data) >= 4)
and (data[:2] ==
'\xfe\xff') \
1849 and (data[2:4] !=
'\x00\x00'):
1850 encoding =
'utf-16be' 1852 elif (len(data) >= 4)
and (data[:2] ==
'\xff\xfe') \
1853 and (data[2:4] !=
'\x00\x00'):
1854 encoding =
'utf-16le' 1856 elif data[:3] ==
'\xef\xbb\xbf':
1859 elif data[:4] ==
'\x00\x00\xfe\xff':
1860 encoding =
'utf-32be' 1862 elif data[:4] ==
'\xff\xfe\x00\x00':
1863 encoding =
'utf-32le' 1865 newdata =
unicode(data, encoding)
1869 """Given a document, tries to detect its XML encoding.""" 1870 xml_encoding = sniffed_xml_encoding =
None 1872 if xml_data[:4] ==
'\x4c\x6f\xa7\x94':
1875 elif xml_data[:4] ==
'\x00\x3c\x00\x3f':
1877 sniffed_xml_encoding =
'utf-16be' 1879 elif (len(xml_data) >= 4)
and (xml_data[:2] ==
'\xfe\xff') \
1880 and (xml_data[2:4] !=
'\x00\x00'):
1882 sniffed_xml_encoding =
'utf-16be' 1883 xml_data =
unicode(xml_data[2:],
'utf-16be').
encode(
'utf-8')
1884 elif xml_data[:4] ==
'\x3c\x00\x3f\x00':
1886 sniffed_xml_encoding =
'utf-16le' 1888 elif (len(xml_data) >= 4)
and (xml_data[:2] ==
'\xff\xfe')
and \
1889 (xml_data[2:4] !=
'\x00\x00'):
1891 sniffed_xml_encoding =
'utf-16le' 1892 xml_data =
unicode(xml_data[2:],
'utf-16le').
encode(
'utf-8')
1893 elif xml_data[:4] ==
'\x00\x00\x00\x3c':
1895 sniffed_xml_encoding =
'utf-32be' 1897 elif xml_data[:4] ==
'\x3c\x00\x00\x00':
1899 sniffed_xml_encoding =
'utf-32le' 1901 elif xml_data[:4] ==
'\x00\x00\xfe\xff':
1903 sniffed_xml_encoding =
'utf-32be' 1904 xml_data =
unicode(xml_data[4:],
'utf-32be').
encode(
'utf-8')
1905 elif xml_data[:4] ==
'\xff\xfe\x00\x00':
1907 sniffed_xml_encoding =
'utf-32le' 1908 xml_data =
unicode(xml_data[4:],
'utf-32le').
encode(
'utf-8')
1909 elif xml_data[:3] ==
'\xef\xbb\xbf':
1911 sniffed_xml_encoding =
'utf-8' 1914 sniffed_xml_encoding =
'ascii' 1917 xml_encoding_match =
None 1918 xml_encoding_match = re.compile(
1919 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').
match(xml_data)
1920 if not xml_encoding_match
and isHTML:
1921 regexp = re.compile(
'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1922 xml_encoding_match = regexp.search(xml_data)
1923 if xml_encoding_match
is not None:
1924 xml_encoding = xml_encoding_match.groups()[0].lower()
1927 if sniffed_xml_encoding
and \
1928 (xml_encoding
in (
'iso-10646-ucs-2',
'ucs-2',
'csunicode',
1929 'iso-10646-ucs-4',
'ucs-4',
'csucs4',
1930 'utf-16',
'utf-32',
'utf_16',
'utf_32',
1932 xml_encoding = sniffed_xml_encoding
1933 return xml_data, xml_encoding, sniffed_xml_encoding
1937 return self.
_codec(self.CHARSET_ALIASES.get(charset, charset)) \
1938 or (charset
and self.
_codec(charset.replace(
"-",
""))) \
1939 or (charset
and self.
_codec(charset.replace(
"-",
"_"))) \
1943 if not charset:
return charset
1946 codecs.lookup(charset)
1948 except (LookupError, ValueError):
1952 EBCDIC_TO_ASCII_MAP =
None 1955 if not c.EBCDIC_TO_ASCII_MAP:
1956 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1957 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1958 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1959 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1960 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1961 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1962 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1963 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1964 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1965 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1966 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1967 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1968 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1969 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1970 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1971 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1972 250,251,252,253,254,255)
1974 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1976 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1978 MS_CHARS = {
'\x80' : (
'euro',
'20AC'),
1980 '\x82' : (
'sbquo',
'201A'),
1981 '\x83' : (
'fnof',
'192'),
1982 '\x84' : (
'bdquo',
'201E'),
1983 '\x85' : (
'hellip',
'2026'),
1984 '\x86' : (
'dagger',
'2020'),
1985 '\x87' : (
'Dagger',
'2021'),
1986 '\x88' : (
'circ',
'2C6'),
1987 '\x89' : (
'permil',
'2030'),
1988 '\x8A' : (
'Scaron',
'160'),
1989 '\x8B' : (
'lsaquo',
'2039'),
1990 '\x8C' : (
'OElig',
'152'),
1992 '\x8E' : (
'#x17D',
'17D'),
1995 '\x91' : (
'lsquo',
'2018'),
1996 '\x92' : (
'rsquo',
'2019'),
1997 '\x93' : (
'ldquo',
'201C'),
1998 '\x94' : (
'rdquo',
'201D'),
1999 '\x95' : (
'bull',
'2022'),
2000 '\x96' : (
'ndash',
'2013'),
2001 '\x97' : (
'mdash',
'2014'),
2002 '\x98' : (
'tilde',
'2DC'),
2003 '\x99' : (
'trade',
'2122'),
2004 '\x9a' : (
'scaron',
'161'),
2005 '\x9b' : (
'rsaquo',
'203A'),
2006 '\x9c' : (
'oelig',
'153'),
2008 '\x9e' : (
'#x17E',
'17E'),
2009 '\x9f' : (
'Yuml',
''),}
2015 if __name__ ==
'__main__':
2018 print(soup.prettify())
def unknown_endtag(self, name)
def fetchText(self, text=None, recursive=True, limit=None)
def get(self, key, default=None)
def _convertFrom(self, proposed)
def _findOne(self, method, name, attrs, text, kwargs)
def findNext(self, name=None, attrs={}, text=None, kwargs)
def findPrevious(self, name=None, attrs={}, text=None, kwargs)
def _toStringSubclass(self, text, subclass)
def toEncoding(self, s, encoding=None)
def handle_pi(self, text)
def _matches(self, markup, matchAgainst)
def _popToTag(self, name, inclusivePop=True)
def _codec(self, charset)
def findPreviousSibling(self, name=None, attrs={}, text=None, kwargs)
def __getattr__(self, methodName)
def unknown_starttag(self, name, attrs, selfClosing=0)
def setString(self, string)
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0)
def handle_decl(self, data)
def getText(self, separator=u"")
S & print(S &os, JobReport::InputFile const &f)
def isSelfClosingTag(self, name)
def _toUnicode(self, data, encoding)
def parse_declaration(self, i)
def _subMSChar(self, orig)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def _smartPop(self, name)
def __init__(self, args, kwargs)
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING)
def __init__(self, parser, name, attrs=None, parent=None, previous=None)
def recursiveChildGenerator(self)
def findParents(self, name=None, attrs={}, limit=None, kwargs)
double intersection(double r12)
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def searchTag(self, markupName=None, markupAttrs={})
def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, kwargs)
def _lastRecursiveChild(self)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0)
def __getattr__(self, tag)
bool decode(bool &, std::string const &)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def buildTagMap(default, args)
def nextSiblingGenerator(self)
def findParent(self, name=None, attrs={}, kwargs)
def _ebcdic_to_ascii(self, s)
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, kwargs)
def _findAll(self, name, attrs, text, limit, generator, kwargs)
def find_codec(self, charset)
def handle_comment(self, text)
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, kwargs)
def __call__(self, args, kwargs)
def handle_data(self, data)
def findNextSibling(self, name=None, attrs={}, text=None, kwargs)
def __getitem__(self, key)
def firstText(self, text=None, recursive=True)
escapeUnrecognizedEntities
def start_meta(self, attrs)
def previousGenerator(self)
def parentGenerator(self)
def setup(self, parent=None, previous=None)
def __init__(self, source)
def __getattr__(self, attr)
static std::string join(char **cmd)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def findAllNext(self, name=None, attrs={}, text=None, limit=None, kwargs)
def __contains__(self, x)
def replaceWithChildren(self)
def _feed(self, inDocumentEncoding=None, isHTML=False)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING)
def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False)
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, kwargs)
def insert(self, position, newChild)
def find(self, name=None, attrs={}, recursive=True, text=None, kwargs)
def replaceWith(self, replaceWith)
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, convertEntities=None, selfClosingTags=None, isHTML=False)
def previousSiblingGenerator(self)
def __init__(self, name=None, attrs={}, text=None, kwargs)
def handle_entityref(self, ref)
def convert_charref(self, name)
def _detectEncoding(self, xml_data, isHTML=False)
std::pair< typename Association::data_type::first_type, double > match(Reference key, Association association, bool bestMatchByMaxValue)
Generic matching function.
def __setitem__(self, key, value)
def handle_charref(self, ref)
def __delitem__(self, key)
def _match_css_class(str)
def _convertEntities(self, match)
def substituteEncoding(self, str, encoding=None)
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run
def endData(self, containerClass=NavigableString)