Inherits HTMLParser::HTMLParser.

Public Member Functions
def	__init__
def	handle_charref
def	handle_comment
def	handle_data
def	handle_decl
def	handle_endtag
def	handle_entityref
def	handle_pi
def	handle_starttag
def	parse_declaration
Public Attributes
	soup
Private Member Functions
def	_toStringSubclass

Detailed Description

Definition at line 1005 of file BeautifulSoup.py.

Constructor & Destructor Documentation

def BeautifulSoup::HTMLParserBuilder::__init__	(	self,
		soup
	)

Definition at line 1007 of file BeautifulSoup.py.

01008                             :
01009         HTMLParser.__init__(self)
01010         self.soup = soup

def BeautifulSoup::HTMLParserBuilder::__init__	(	self,
		soup
	)

Definition at line 1007 of file BeautifulSoup.py.

01008                             :
01009         HTMLParser.__init__(self)
01010         self.soup = soup

Member Function Documentation

def BeautifulSoup::HTMLParserBuilder::_toStringSubclass	(	self,
		text,
		subclass
	)	`[private]`

Adds a certain piece of text to the tree as a NavigableString
subclass.

Definition at line 1025 of file BeautifulSoup.py.

01026                                                :
01027         """Adds a certain piece of text to the tree as a NavigableString
01028         subclass."""
01029         self.soup.endData()
01030         self.handle_data(text)
01031         self.soup.endData(subclass)

def BeautifulSoup::HTMLParserBuilder::_toStringSubclass	(	self,
		text,
		subclass
	)	`[private]`

Adds a certain piece of text to the tree as a NavigableString
subclass.

Definition at line 1025 of file BeautifulSoup.py.

01026                                                :
01027         """Adds a certain piece of text to the tree as a NavigableString
01028         subclass."""
01029         self.soup.endData()
01030         self.handle_data(text)
01031         self.soup.endData(subclass)

def BeautifulSoup::HTMLParserBuilder::handle_charref	(	self,
		ref
	)

Definition at line 1044 of file BeautifulSoup.py.

01045                                  :
01046         "Handle character references as data."
01047         if self.soup.convertEntities:
01048             data = unichr(int(ref))
01049         else:
01050             data = '&#%s;' % ref
01051         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_charref	(	self,
		ref
	)

Definition at line 1044 of file BeautifulSoup.py.

01045                                  :
01046         "Handle character references as data."
01047         if self.soup.convertEntities:
01048             data = unichr(int(ref))
01049         else:
01050             data = '&#%s;' % ref
01051         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_comment	(	self,
		text
	)

Definition at line 1040 of file BeautifulSoup.py.

01041                                   :
01042         "Handle comments as Comment objects."
01043         self._toStringSubclass(text, Comment)

def BeautifulSoup::HTMLParserBuilder::handle_comment	(	self,
		text
	)

Definition at line 1040 of file BeautifulSoup.py.

01041                                   :
01042         "Handle comments as Comment objects."
01043         self._toStringSubclass(text, Comment)

def BeautifulSoup::HTMLParserBuilder::handle_data	(	self,
		content
	)

Definition at line 1022 of file BeautifulSoup.py.

01023                                   :
01024         self.soup.handle_data(content)

def BeautifulSoup::HTMLParserBuilder::handle_data	(	self,
		content
	)

Definition at line 1022 of file BeautifulSoup.py.

01023                                   :
01024         self.soup.handle_data(content)

def BeautifulSoup::HTMLParserBuilder::handle_decl	(	self,
		data
	)

Definition at line 1095 of file BeautifulSoup.py.

01096                                :
01097         "Handle DOCTYPEs and the like as Declaration objects."
01098         self._toStringSubclass(data, Declaration)

def BeautifulSoup::HTMLParserBuilder::handle_decl	(	self,
		data
	)

Definition at line 1095 of file BeautifulSoup.py.

01096                                :
01097         "Handle DOCTYPEs and the like as Declaration objects."
01098         self._toStringSubclass(data, Declaration)

def BeautifulSoup::HTMLParserBuilder::handle_endtag	(	self,
		name
	)

Definition at line 1019 of file BeautifulSoup.py.

01020                                  :
01021         self.soup.unknown_endtag(name)

def BeautifulSoup::HTMLParserBuilder::handle_endtag	(	self,
		name
	)

Definition at line 1019 of file BeautifulSoup.py.

01020                                  :
01021         self.soup.unknown_endtag(name)

def BeautifulSoup::HTMLParserBuilder::handle_entityref	(	self,
		ref
	)

Handle entity references as data, possibly converting known
HTML and/or XML entity references to the corresponding Unicode
characters.

Definition at line 1052 of file BeautifulSoup.py.

01053                                    :
01054         """Handle entity references as data, possibly converting known
01055         HTML and/or XML entity references to the corresponding Unicode
01056         characters."""
01057         data = None
01058         if self.soup.convertHTMLEntities:
01059             try:
01060                 data = unichr(name2codepoint[ref])
01061             except KeyError:
01062                 pass
01063 
01064         if not data and self.soup.convertXMLEntities:
01065                 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
01066 
01067         if not data and self.soup.convertHTMLEntities and \
01068             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
01069                 # TODO: We've got a problem here. We're told this is
01070                 # an entity reference, but it's not an XML entity
01071                 # reference or an HTML entity reference. Nonetheless,
01072                 # the logical thing to do is to pass it through as an
01073                 # unrecognized entity reference.
01074                 #
01075                 # Except: when the input is "&carol;" this function
01076                 # will be called with input "carol". When the input is
01077                 # "AT&T", this function will be called with input
01078                 # "T". We have no way of knowing whether a semicolon
01079                 # was present originally, so we don't know whether
01080                 # this is an unknown entity or just a misplaced
01081                 # ampersand.
01082                 #
01083                 # The more common case is a misplaced ampersand, so I
01084                 # escape the ampersand and omit the trailing semicolon.
01085                 data = "&amp;%s" % ref
01086         if not data:
01087             # This case is different from the one above, because we
01088             # haven't already gone through a supposedly comprehensive
01089             # mapping of entities to Unicode characters. We might not
01090             # have gone through any mapping at all. So the chances are
01091             # very high that this is a real entity, and not a
01092             # misplaced ampersand.
01093             data = "&%s;" % ref
01094         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_entityref	(	self,
		ref
	)

Handle entity references as data, possibly converting known
HTML and/or XML entity references to the corresponding Unicode
characters.

Definition at line 1052 of file BeautifulSoup.py.

01053                                    :
01054         """Handle entity references as data, possibly converting known
01055         HTML and/or XML entity references to the corresponding Unicode
01056         characters."""
01057         data = None
01058         if self.soup.convertHTMLEntities:
01059             try:
01060                 data = unichr(name2codepoint[ref])
01061             except KeyError:
01062                 pass
01063 
01064         if not data and self.soup.convertXMLEntities:
01065                 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
01066 
01067         if not data and self.soup.convertHTMLEntities and \
01068             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
01069                 # TODO: We've got a problem here. We're told this is
01070                 # an entity reference, but it's not an XML entity
01071                 # reference or an HTML entity reference. Nonetheless,
01072                 # the logical thing to do is to pass it through as an
01073                 # unrecognized entity reference.
01074                 #
01075                 # Except: when the input is "&carol;" this function
01076                 # will be called with input "carol". When the input is
01077                 # "AT&T", this function will be called with input
01078                 # "T". We have no way of knowing whether a semicolon
01079                 # was present originally, so we don't know whether
01080                 # this is an unknown entity or just a misplaced
01081                 # ampersand.
01082                 #
01083                 # The more common case is a misplaced ampersand, so I
01084                 # escape the ampersand and omit the trailing semicolon.
01085                 data = "&amp;%s" % ref
01086         if not data:
01087             # This case is different from the one above, because we
01088             # haven't already gone through a supposedly comprehensive
01089             # mapping of entities to Unicode characters. We might not
01090             # have gone through any mapping at all. So the chances are
01091             # very high that this is a real entity, and not a
01092             # misplaced ampersand.
01093             data = "&%s;" % ref
01094         self.handle_data(data)

def BeautifulSoup::HTMLParserBuilder::handle_pi	(	self,
		text
	)

Handle a processing instruction as a ProcessingInstruction
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later.

Definition at line 1032 of file BeautifulSoup.py.

01033                              :
01034         """Handle a processing instruction as a ProcessingInstruction
01035         object, possibly one with a %SOUP-ENCODING% slot into which an
01036         encoding will be plugged later."""
01037         if text[:3] == "xml":
01038             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
01039         self._toStringSubclass(text, ProcessingInstruction)

def BeautifulSoup::HTMLParserBuilder::handle_pi	(	self,
		text
	)

Handle a processing instruction as a ProcessingInstruction
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later.

Definition at line 1032 of file BeautifulSoup.py.

01033                              :
01034         """Handle a processing instruction as a ProcessingInstruction
01035         object, possibly one with a %SOUP-ENCODING% slot into which an
01036         encoding will be plugged later."""
01037         if text[:3] == "xml":
01038             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
01039         self._toStringSubclass(text, ProcessingInstruction)

def BeautifulSoup::HTMLParserBuilder::handle_starttag	(	self,
		name,
		attrs
	)

Definition at line 1013 of file BeautifulSoup.py.

01014                                           :
01015         if name == 'meta':
01016             self.soup.extractCharsetFromMeta(attrs)
01017         else:
01018             self.soup.unknown_starttag(name, attrs)

def BeautifulSoup::HTMLParserBuilder::handle_starttag	(	self,
		name,
		attrs
	)

Definition at line 1013 of file BeautifulSoup.py.

01014                                           :
01015         if name == 'meta':
01016             self.soup.extractCharsetFromMeta(attrs)
01017         else:
01018             self.soup.unknown_starttag(name, attrs)

def BeautifulSoup::HTMLParserBuilder::parse_declaration	(	self,
		i
	)

Treat a bogus SGML declaration as raw data. Treat a CDATA
declaration as a CData object.

Definition at line 1099 of file BeautifulSoup.py.

01100                                   :
01101         """Treat a bogus SGML declaration as raw data. Treat a CDATA
01102         declaration as a CData object."""
01103         j = None
01104         if self.rawdata[i:i+9] == '<![CDATA[':
01105              k = self.rawdata.find(']]>', i)
01106              if k == -1:
01107                  k = len(self.rawdata)
01108              data = self.rawdata[i+9:k]
01109              j = k+3
01110              self._toStringSubclass(data, CData)
01111         else:
01112             try:
01113                 j = HTMLParser.parse_declaration(self, i)
01114             except HTMLParseError:
01115                 toHandle = self.rawdata[i:]
01116                 self.handle_data(toHandle)
01117                 j = i + len(toHandle)
01118         return j
01119

def BeautifulSoup::HTMLParserBuilder::parse_declaration	(	self,
		i
	)

Treat a bogus SGML declaration as raw data. Treat a CDATA
declaration as a CData object.

Definition at line 1099 of file BeautifulSoup.py.

01100                                   :
01101         """Treat a bogus SGML declaration as raw data. Treat a CDATA
01102         declaration as a CData object."""
01103         j = None
01104         if self.rawdata[i:i+9] == '<![CDATA[':
01105              k = self.rawdata.find(']]>', i)
01106              if k == -1:
01107                  k = len(self.rawdata)
01108              data = self.rawdata[i+9:k]
01109              j = k+3
01110              self._toStringSubclass(data, CData)
01111         else:
01112             try:
01113                 j = HTMLParser.parse_declaration(self, i)
01114             except HTMLParseError:
01115                 toHandle = self.rawdata[i:]
01116                 self.handle_data(toHandle)
01117                 j = i + len(toHandle)
01118         return j
01119

Member Data Documentation

BeautifulSoup::HTMLParserBuilder::soup

Definition at line 1007 of file BeautifulSoup.py.

BeautifulSoup::HTMLParserBuilder Class Reference

Public Member Functions

Public Attributes

Private Member Functions

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation

Member Data Documentation