Inherits HTMLParser::HTMLParser.
Public Member Functions | |
def | __init__ |
def | __init__ |
def | handle_charref |
def | handle_charref |
def | handle_comment |
def | handle_comment |
def | handle_data |
def | handle_data |
def | handle_decl |
def | handle_decl |
def | handle_endtag |
def | handle_endtag |
def | handle_entityref |
def | handle_entityref |
def | handle_pi |
def | handle_pi |
def | handle_starttag |
def | handle_starttag |
def | parse_declaration |
def | parse_declaration |
Public Attributes | |
soup | |
Private Member Functions | |
def | _toStringSubclass |
def | _toStringSubclass |
Definition at line 1005 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::__init__ | ( | self, | |
soup | |||
) |
Definition at line 1007 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::__init__ | ( | self, | |
soup | |||
) |
Definition at line 1007 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::_toStringSubclass | ( | self, | |
text, | |||
subclass | |||
) | [private] |
Adds a certain piece of text to the tree as a NavigableString subclass.
Definition at line 1025 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::_toStringSubclass | ( | self, | |
text, | |||
subclass | |||
) | [private] |
Adds a certain piece of text to the tree as a NavigableString subclass.
Definition at line 1025 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_charref | ( | self, | |
ref | |||
) |
Definition at line 1044 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_charref | ( | self, | |
ref | |||
) |
Definition at line 1044 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_comment | ( | self, | |
text | |||
) |
Definition at line 1040 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_comment | ( | self, | |
text | |||
) |
Definition at line 1040 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_data | ( | self, | |
content | |||
) |
Definition at line 1022 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_data | ( | self, | |
content | |||
) |
Definition at line 1022 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_decl | ( | self, | |
data | |||
) |
Definition at line 1095 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_decl | ( | self, | |
data | |||
) |
Definition at line 1095 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_endtag | ( | self, | |
name | |||
) |
Definition at line 1019 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_endtag | ( | self, | |
name | |||
) |
Definition at line 1019 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_entityref | ( | self, | |
ref | |||
) |
Handle entity references as data, possibly converting known HTML and/or XML entity references to the corresponding Unicode characters.
Definition at line 1052 of file BeautifulSoup.py.
01053 :
01054         """Handle entity references as data, possibly converting known
01055         HTML and/or XML entity references to the corresponding Unicode
01056         characters."""
01057         data = None
01058         if self.soup.convertHTMLEntities:
01059             try:
01060                 data = unichr(name2codepoint[ref])
01061             except KeyError:
01062                 pass
01063
01064         if not data and self.soup.convertXMLEntities:
01065             data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
01066
01067         if not data and self.soup.convertHTMLEntities and \
01068             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
01069             # TODO: We've got a problem here. We're told this is
01070             # an entity reference, but it's not an XML entity
01071             # reference or an HTML entity reference. Nonetheless,
01072             # the logical thing to do is to pass it through as an
01073             # unrecognized entity reference.
01074             #
01075             # Except: when the input is "&carol;" this function
01076             # will be called with input "carol". When the input is
01077             # "AT&T", this function will be called with input
01078             # "T". We have no way of knowing whether a semicolon
01079             # was present originally, so we don't know whether
01080             # this is an unknown entity or just a misplaced
01081             # ampersand.
01082             #
01083             # The more common case is a misplaced ampersand, so I
01084             # escape the ampersand and omit the trailing semicolon.
01085             data = "&%s" % ref
01086         if not data:
01087             # This case is different from the one above, because we
01088             # haven't already gone through a supposedly comprehensive
01089             # mapping of entities to Unicode characters. We might not
01090             # have gone through any mapping at all. So the chances are
01091             # very high that this is a real entity, and not a
01092             # misplaced ampersand.
01093             data = "&%s;" % ref
01094         self.handle_data(data)
def BeautifulSoup::HTMLParserBuilder::handle_entityref | ( | self, | |
ref | |||
) |
Handle entity references as data, possibly converting known HTML and/or XML entity references to the corresponding Unicode characters.
Definition at line 1052 of file BeautifulSoup.py.
01053 :
01054         """Handle entity references as data, possibly converting known
01055         HTML and/or XML entity references to the corresponding Unicode
01056         characters."""
01057         data = None
01058         if self.soup.convertHTMLEntities:
01059             try:
01060                 data = unichr(name2codepoint[ref])
01061             except KeyError:
01062                 pass
01063
01064         if not data and self.soup.convertXMLEntities:
01065             data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
01066
01067         if not data and self.soup.convertHTMLEntities and \
01068             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
01069             # TODO: We've got a problem here. We're told this is
01070             # an entity reference, but it's not an XML entity
01071             # reference or an HTML entity reference. Nonetheless,
01072             # the logical thing to do is to pass it through as an
01073             # unrecognized entity reference.
01074             #
01075             # Except: when the input is "&carol;" this function
01076             # will be called with input "carol". When the input is
01077             # "AT&T", this function will be called with input
01078             # "T". We have no way of knowing whether a semicolon
01079             # was present originally, so we don't know whether
01080             # this is an unknown entity or just a misplaced
01081             # ampersand.
01082             #
01083             # The more common case is a misplaced ampersand, so I
01084             # escape the ampersand and omit the trailing semicolon.
01085             data = "&%s" % ref
01086         if not data:
01087             # This case is different from the one above, because we
01088             # haven't already gone through a supposedly comprehensive
01089             # mapping of entities to Unicode characters. We might not
01090             # have gone through any mapping at all. So the chances are
01091             # very high that this is a real entity, and not a
01092             # misplaced ampersand.
01093             data = "&%s;" % ref
01094         self.handle_data(data)
def BeautifulSoup::HTMLParserBuilder::handle_pi | ( | self, | |
text | |||
) |
Handle a processing instruction as a ProcessingInstruction object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.
Definition at line 1032 of file BeautifulSoup.py.
01033 :
01034         """Handle a processing instruction as a ProcessingInstruction
01035         object, possibly one with a %SOUP-ENCODING% slot into which an
01036         encoding will be plugged later."""
01037         if text[:3] == "xml":
01038             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
01039         self._toStringSubclass(text, ProcessingInstruction)
def BeautifulSoup::HTMLParserBuilder::handle_pi | ( | self, | |
text | |||
) |
Handle a processing instruction as a ProcessingInstruction object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.
Definition at line 1032 of file BeautifulSoup.py.
01033 :
01034         """Handle a processing instruction as a ProcessingInstruction
01035         object, possibly one with a %SOUP-ENCODING% slot into which an
01036         encoding will be plugged later."""
01037         if text[:3] == "xml":
01038             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
01039         self._toStringSubclass(text, ProcessingInstruction)
def BeautifulSoup::HTMLParserBuilder::handle_starttag | ( | self, | |
name, | |||
attrs | |||
) |
Definition at line 1013 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::handle_starttag | ( | self, | |
name, | |||
attrs | |||
) |
Definition at line 1013 of file BeautifulSoup.py.
def BeautifulSoup::HTMLParserBuilder::parse_declaration | ( | self, | |
i | |||
) |
Treat a bogus SGML declaration as raw data. Treat a CDATA declaration as a CData object.
Definition at line 1099 of file BeautifulSoup.py.
01100 :
01101         """Treat a bogus SGML declaration as raw data. Treat a CDATA
01102         declaration as a CData object."""
01103         j = None
01104         if self.rawdata[i:i+9] == '<![CDATA[':
01105             k = self.rawdata.find(']]>', i)
01106             if k == -1:
01107                 k = len(self.rawdata)
01108             data = self.rawdata[i+9:k]
01109             j = k+3
01110             self._toStringSubclass(data, CData)
01111         else:
01112             try:
01113                 j = HTMLParser.parse_declaration(self, i)
01114             except HTMLParseError:
01115                 toHandle = self.rawdata[i:]
01116                 self.handle_data(toHandle)
01117                 j = i + len(toHandle)
01118         return j
01119
def BeautifulSoup::HTMLParserBuilder::parse_declaration | ( | self, | |
i | |||
) |
Treat a bogus SGML declaration as raw data. Treat a CDATA declaration as a CData object.
Definition at line 1099 of file BeautifulSoup.py.
01100 :
01101         """Treat a bogus SGML declaration as raw data. Treat a CDATA
01102         declaration as a CData object."""
01103         j = None
01104         if self.rawdata[i:i+9] == '<![CDATA[':
01105             k = self.rawdata.find(']]>', i)
01106             if k == -1:
01107                 k = len(self.rawdata)
01108             data = self.rawdata[i+9:k]
01109             j = k+3
01110             self._toStringSubclass(data, CData)
01111         else:
01112             try:
01113                 j = HTMLParser.parse_declaration(self, i)
01114             except HTMLParseError:
01115                 toHandle = self.rawdata[i:]
01116                 self.handle_data(toHandle)
01117                 j = i + len(toHandle)
01118         return j
01119
Definition at line 1007 of file BeautifulSoup.py.