Public Member Functions | |
def | __getattr__ (self, methodName) |
def | __init__ (self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, convertEntities=None, selfClosingTags=None, isHTML=False) |
def | convert_charref (self, name) |
def | endData (self, containerClass=NavigableString) |
def | handle_charref (self, ref) |
def | handle_comment (self, text) |
def | handle_data (self, data) |
def | handle_decl (self, data) |
def | handle_entityref (self, ref) |
def | handle_pi (self, text) |
def | isSelfClosingTag (self, name) |
def | parse_declaration (self, i) |
def | popTag (self) |
def | pushTag (self, tag) |
def | reset (self) |
def | unknown_endtag (self, name) |
def | unknown_starttag (self, name, attrs, selfClosing=0) |
Public Member Functions inherited from BeautifulSoup.Tag | |
def | __call__ (self, args, kwargs) |
def | __contains__ (self, x) |
def | __delitem__ (self, key) |
def | __eq__ (self, other) |
def | __getattr__ (self, tag) |
def | __getitem__ (self, key) |
def | __init__ (self, parser, name, attrs=None, parent=None, previous=None) |
def | __iter__ (self) |
def | __len__ (self) |
def | __ne__ (self, other) |
def | __nonzero__ (self) |
def | __repr__ (self, encoding=DEFAULT_OUTPUT_ENCODING) |
def | __setitem__ (self, key, value) |
def | __str__ (self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0) |
def | __unicode__ (self) |
def | childGenerator (self) |
def | clear (self) |
def | decompose (self) |
def | fetchText (self, text=None, recursive=True, limit=None) |
def | find (self, name=None, attrs={}, recursive=True, text=None, kwargs) |
def | findAll (self, name=None, attrs={}, recursive=True, text=None, limit=None, kwargs) |
def | firstText (self, text=None, recursive=True) |
def | get (self, key, default=None) |
def | getString (self) |
def | getText (self, separator=u"") |
def | has_key (self, key) |
def | index (self, element) |
def | prettify (self, encoding=DEFAULT_OUTPUT_ENCODING) |
def | recursiveChildGenerator (self) |
def | renderContents (self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0) |
def | setString (self, string) |
Public Member Functions inherited from BeautifulSoup.PageElement | |
def | append (self, tag) |
def | extract (self) |
def | findAllNext (self, name=None, attrs={}, text=None, limit=None, kwargs) |
def | findAllPrevious (self, name=None, attrs={}, text=None, limit=None, kwargs) |
def | findNext (self, name=None, attrs={}, text=None, kwargs) |
def | findNextSibling (self, name=None, attrs={}, text=None, kwargs) |
def | findNextSiblings (self, name=None, attrs={}, text=None, limit=None, kwargs) |
def | findParent (self, name=None, attrs={}, kwargs) |
def | findParents (self, name=None, attrs={}, limit=None, kwargs) |
def | findPrevious (self, name=None, attrs={}, text=None, kwargs) |
def | findPreviousSibling (self, name=None, attrs={}, text=None, kwargs) |
def | findPreviousSiblings (self, name=None, attrs={}, text=None, limit=None, kwargs) |
def | insert (self, position, newChild) |
def | nextGenerator (self) |
def | nextSiblingGenerator (self) |
def | parentGenerator (self) |
def | previousGenerator (self) |
def | previousSiblingGenerator (self) |
def | replaceWith (self, replaceWith) |
def | replaceWithChildren (self) |
def | setup (self, parent=None, previous=None) |
def | substituteEncoding (self, str, encoding=None) |
def | toEncoding (self, s, encoding=None) |
Public Attributes | |
convertEntities | |
convertHTMLEntities | |
convertXMLEntities | |
currentData | |
currentTag | |
declaredHTMLEncoding | |
escapeUnrecognizedEntities | |
fromEncoding | |
hidden | |
instanceSelfClosingTags | |
literal | |
markup | |
markupMassage | |
originalEncoding | |
parseOnlyThese | |
previous | |
quoteStack | |
smartQuotesTo | |
tagStack | |
Public Attributes inherited from BeautifulSoup.Tag | |
attrMap | |
attrs | |
containsSubstitutions | |
contents | |
convertHTMLEntities | |
convertXMLEntities | |
escapeUnrecognizedEntities | |
hidden | |
isSelfClosing | |
name | |
parserClass | |
Public Attributes inherited from BeautifulSoup.PageElement | |
next | |
nextSibling | |
parent | |
previous | |
previousSibling | |
Private Member Functions | |
def | _feed (self, inDocumentEncoding=None, isHTML=False) |
def | _popToTag (self, name, inclusivePop=True) |
def | _smartPop (self, name) |
def | _toStringSubclass (self, text, subclass) |
Additional Inherited Members | |
Properties inherited from BeautifulSoup.Tag | |
string = property(getString, setString) | |
text = property(getText) | |
This class contains the basic parser and search code. It defines a parser that knows nothing about tag behavior except for the following: You can't close a tag without closing all the tags it encloses. That is, "<foo><bar></foo>" actually means "<foo><bar></bar></foo>". [Another possible explanation is "<foo><bar /></foo>", but since this class defines no SELF_CLOSING_TAGS, it will never use that explanation.] This class is useful for parsing XML or made-up markup languages, or when BeautifulSoup makes an assumption counter to what you were expecting.
Definition at line 1040 of file BeautifulSoup.py.
def BeautifulSoup.BeautifulStoneSoup.__init__ | ( | self, | |
markup = "" , |
|||
parseOnlyThese = None , |
|||
fromEncoding = None , |
|||
markupMassage = True , |
|||
smartQuotesTo = XML_ENTITIES , |
|||
convertEntities = None , |
|||
selfClosingTags = None , |
|||
isHTML = False |
|||
) |
The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser. sgmllib will process most bad HTML, and the BeautifulSoup class has some tricks for dealing with some HTML that kills sgmllib, but Beautiful Soup can nonetheless choke or lose data if your data uses self-closing tags or declarations incorrectly. By default, Beautiful Soup uses regexes to sanitize input, avoiding the vast majority of these problems. If the problems don't apply to you, pass in False for markupMassage, and you'll get better performance. The default parser massage techniques fix the two most common instances of invalid HTML that choke sgmllib: (1) <br/> (no space between name of closing tag and tag close); (2) <! --Comment--> (extraneous whitespace in declaration). You can pass in a custom list of (RE object, replace method) tuples to get Beautiful Soup to scrub your input the way you want.
Definition at line 1086 of file BeautifulSoup.py.
def BeautifulSoup.BeautifulStoneSoup.__getattr__ | ( | self, | |
methodName | |||
) |
This method routes method call requests to either the SGMLParser superclass or the Tag superclass, depending on the method name.
Definition at line 1196 of file BeautifulSoup.py.
Referenced by VarParsing.VarParsing.setType().
|
private |
Definition at line 1163 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.markup.
Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag stack up to but *not* including the most recent instance of the given tag.
Definition at line 1263 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.popTag(), and BeautifulSoup.BeautifulStoneSoup.tagStack.
Referenced by BeautifulSoup.BeautifulStoneSoup._smartPop(), and BeautifulSoup.BeautifulStoneSoup.unknown_endtag().
|
private |
We need to pop up to the previous tag of this type, unless one of this tag's nesting reset triggers comes between this tag and the previous tag of this type, OR unless this tag is a generic nesting trigger and another generic nesting trigger comes between this tag and the previous tag of this type. Examples: <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'. <td><tr><td> *<td>* should pop to 'tr', not the first 'td'.
Definition at line 1285 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup._popToTag(), and BeautifulSoup.BeautifulStoneSoup.tagStack.
Referenced by BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
|
private |
Adds a certain piece of text to the tree as a NavigableString subclass.
Definition at line 1377 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.endData(), and BeautifulSoup.BeautifulStoneSoup.handle_data().
Referenced by BeautifulSoup.BeautifulStoneSoup.handle_comment(), BeautifulSoup.BeautifulStoneSoup.handle_decl(), BeautifulSoup.BeautifulStoneSoup.handle_pi(), and BeautifulSoup.BeautifulStoneSoup.parse_declaration().
def BeautifulSoup.BeautifulStoneSoup.convert_charref | ( | self, | |
name | |||
) |
This method fixes a bug in Python's SGMLParser.
Definition at line 1153 of file BeautifulSoup.py.
References createfilelist.int.
def BeautifulSoup.BeautifulStoneSoup.endData | ( | self, | |
containerClass = NavigableString |
|||
) |
Definition at line 1240 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.currentData, BeautifulSoup.BeautifulStoneSoup.currentTag, reco::helper::VirtualJetProducerHelper.intersection(), join(), BeautifulSoup.BeautifulStoneSoup.parseOnlyThese, BeautifulSoup.PageElement.previous, and BeautifulSoup.BeautifulStoneSoup.tagStack.
Referenced by BeautifulSoup.BeautifulStoneSoup._toStringSubclass(), BeautifulSoup.BeautifulStoneSoup.unknown_endtag(), and BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
def BeautifulSoup.BeautifulStoneSoup.handle_charref | ( | self, | |
ref | |||
) |
Definition at line 1396 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.convertEntities, BeautifulSoup.BeautifulStoneSoup.handle_data(), and createfilelist.int.
def BeautifulSoup.BeautifulStoneSoup.handle_comment | ( | self, | |
text | |||
) |
Definition at line 1392 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup._toStringSubclass().
def BeautifulSoup.BeautifulStoneSoup.handle_data | ( | self, | |
data | |||
) |
Definition at line 1374 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup._toStringSubclass(), BeautifulSoup.BeautifulStoneSoup.handle_charref(), BeautifulSoup.BeautifulStoneSoup.handle_entityref(), BeautifulSoup.BeautifulStoneSoup.parse_declaration(), BeautifulSoup.BeautifulStoneSoup.unknown_endtag(), and BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
def BeautifulSoup.BeautifulStoneSoup.handle_decl | ( | self, | |
data | |||
) |
Definition at line 1447 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup._toStringSubclass().
def BeautifulSoup.BeautifulStoneSoup.handle_entityref | ( | self, | |
ref | |||
) |
Handle entity references as data, possibly converting known HTML and/or XML entity references to the corresponding Unicode characters.
Definition at line 1404 of file BeautifulSoup.py.
References BeautifulSoup.Tag.convertHTMLEntities, BeautifulSoup.Tag.convertXMLEntities, and BeautifulSoup.BeautifulStoneSoup.handle_data().
def BeautifulSoup.BeautifulStoneSoup.handle_pi | ( | self, | |
text | |||
) |
Handle a processing instruction as a ProcessingInstruction object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.
Definition at line 1384 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup._toStringSubclass().
def BeautifulSoup.BeautifulStoneSoup.isSelfClosingTag | ( | self, | |
name | |||
) |
Returns true iff the given string is the name of a self-closing tag according to this parser.
Definition at line 1209 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.instanceSelfClosingTags.
Referenced by BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
def BeautifulSoup.BeautifulStoneSoup.parse_declaration | ( | self, | |
i | |||
) |
Treat a bogus SGML declaration as raw data. Treat a CDATA declaration as a CData object.
Definition at line 1451 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup._toStringSubclass(), BeautifulSoup.BeautifulStoneSoup.handle_data(), and DQMNet::Object.rawdata.
def BeautifulSoup.BeautifulStoneSoup.popTag | ( | self | ) |
Definition at line 1225 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.currentTag, and BeautifulSoup.BeautifulStoneSoup.tagStack.
Referenced by BeautifulSoup.BeautifulStoneSoup._popToTag(), and BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
def BeautifulSoup.BeautifulStoneSoup.pushTag | ( | self, | |
tag | |||
) |
Definition at line 1233 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup.currentTag, and BeautifulSoup.BeautifulStoneSoup.tagStack.
Referenced by BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
def BeautifulSoup.BeautifulStoneSoup.reset | ( | self | ) |
Definition at line 1215 of file BeautifulSoup.py.
def BeautifulSoup.BeautifulStoneSoup.unknown_endtag | ( | self, | |
name | |||
) |
Definition at line 1361 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup._popToTag(), BeautifulSoup.BeautifulStoneSoup.endData(), BeautifulSoup.BeautifulStoneSoup.handle_data(), BeautifulSoup.BeautifulStoneSoup.literal, and BeautifulSoup.BeautifulStoneSoup.quoteStack.
def BeautifulSoup.BeautifulStoneSoup.unknown_starttag | ( | self, | |
name, | |||
attrs, | |||
selfClosing = 0 |
|||
) |
Definition at line 1331 of file BeautifulSoup.py.
References BeautifulSoup.BeautifulStoneSoup._smartPop(), BeautifulSoup.BeautifulStoneSoup.currentTag, BeautifulSoup.BeautifulStoneSoup.endData(), BeautifulSoup.BeautifulStoneSoup.handle_data(), BeautifulSoup.BeautifulStoneSoup.isSelfClosingTag(), join(), BeautifulSoup.BeautifulStoneSoup.parseOnlyThese, BeautifulSoup.BeautifulStoneSoup.popTag(), BeautifulSoup.PageElement.previous, BeautifulSoup.BeautifulStoneSoup.pushTag(), BeautifulSoup.BeautifulStoneSoup.quoteStack, and BeautifulSoup.BeautifulStoneSoup.tagStack.
BeautifulSoup.BeautifulStoneSoup.convertEntities |
Definition at line 1115 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup.handle_charref().
BeautifulSoup.BeautifulStoneSoup.convertHTMLEntities |
Definition at line 1125 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.convertXMLEntities |
Definition at line 1124 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.currentData |
Definition at line 1219 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup.endData().
BeautifulSoup.BeautifulStoneSoup.currentTag |
Definition at line 1220 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup.endData(), BeautifulSoup.BeautifulStoneSoup.popTag(), BeautifulSoup.BeautifulStoneSoup.pushTag(), and BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
BeautifulSoup.BeautifulStoneSoup.declaredHTMLEncoding |
Definition at line 1175 of file BeautifulSoup.py.
Referenced by BeautifulSoup.UnicodeDammit._detectEncoding(), and BeautifulSoup.BeautifulSoup.start_meta().
BeautifulSoup.BeautifulStoneSoup.escapeUnrecognizedEntities |
Definition at line 1126 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.fromEncoding |
Definition at line 1113 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.hidden |
Definition at line 1217 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.instanceSelfClosingTags |
Definition at line 1140 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup.isSelfClosingTag().
BeautifulSoup.BeautifulStoneSoup.literal |
Definition at line 1358 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup.unknown_endtag().
BeautifulSoup.BeautifulStoneSoup.markup |
Definition at line 1145 of file BeautifulSoup.py.
Referenced by BeautifulSoup.UnicodeDammit._convertFrom(), and BeautifulSoup.BeautifulStoneSoup._feed().
BeautifulSoup.BeautifulStoneSoup.markupMassage |
Definition at line 1146 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.originalEncoding |
Definition at line 1168 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.parseOnlyThese |
Definition at line 1112 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup.endData(), and BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
BeautifulSoup.BeautifulStoneSoup.previous |
Definition at line 1259 of file BeautifulSoup.py.
BeautifulSoup.BeautifulStoneSoup.quoteStack |
Definition at line 1222 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup.unknown_endtag(), and BeautifulSoup.BeautifulStoneSoup.unknown_starttag().
BeautifulSoup.BeautifulStoneSoup.smartQuotesTo |
Definition at line 1114 of file BeautifulSoup.py.
Referenced by BeautifulSoup.UnicodeDammit._convertFrom(), and BeautifulSoup.UnicodeDammit._subMSChar().
BeautifulSoup.BeautifulStoneSoup.tagStack |
Definition at line 1221 of file BeautifulSoup.py.
Referenced by BeautifulSoup.BeautifulStoneSoup._popToTag(), BeautifulSoup.BeautifulStoneSoup._smartPop(), BeautifulSoup.BeautifulStoneSoup.endData(), BeautifulSoup.BeautifulStoneSoup.popTag(), BeautifulSoup.BeautifulSOAP.popTag(), BeautifulSoup.BeautifulStoneSoup.pushTag(), and BeautifulSoup.BeautifulStoneSoup.unknown_starttag().