Public Member Functions | |
def | __init__ |
def | __init__ |
def | find_codec |
def | find_codec |
Public Attributes | |
declaredHTMLEncoding | |
markup | |
originalEncoding | |
smartQuotesTo | |
triedEncodings | |
unicode | |
Static Public Attributes | |
dictionary | CHARSET_ALIASES |
EBCDIC_TO_ASCII_MAP = None | |
dictionary | MS_CHARS |
Private Member Functions | |
def | _codec |
def | _codec |
def | _convertFrom |
def | _convertFrom |
def | _detectEncoding |
def | _detectEncoding |
def | _ebcdic_to_ascii |
def | _ebcdic_to_ascii |
def | _subMSChar |
def | _subMSChar |
def | _toUnicode |
def | _toUnicode |
A class for detecting the encoding of a *ML (HTML or XML) document and converting it to a Unicode string. If the source encoding is windows-1252, it can also replace MS smart quotes with their HTML or XML equivalents.
Definition at line 1734 of file BeautifulSoup.py.
def BeautifulSoup::UnicodeDammit::__init__ | ( | self, | |
markup, | |||
overrideEncodings = [] , |
|||
smartQuotesTo = 'xml' , |
|||
isHTML = False |
|||
) |
Definition at line 1747 of file BeautifulSoup.py.
def __init__(self, markup, overrideEncodings=[],
             smartQuotesTo='xml', isHTML=False):
    """Convert `markup` to Unicode, trying candidate encodings in order.

    Candidates are tried in this order, stopping at the first one for
    which `_convertFrom` returns a truthy value:

    1. each entry of `overrideEncodings`;
    2. the encoding declared in the document, then the sniffed one
       (both as reported by `_detectEncoding`);
    3. whatever `chardet` guesses, when `chardet` is available;
    4. "utf-8", then "windows-1252".

    The outcome is stored in `self.unicode`; `self.originalEncoding` is
    reset to None when nothing worked. `overrideEncodings` is only
    iterated here, never modified.
    """
    self.declaredHTMLEncoding = None
    detected = self._detectEncoding(markup, isHTML)
    self.markup, documentEncoding, sniffedEncoding = detected
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []

    # Empty or already-Unicode input needs no decoding at all.
    if markup == '' or isinstance(markup, unicode):
        self.originalEncoding = None
        self.unicode = unicode(markup)
        return

    result = None

    # Caller-supplied encodings first, then what the document itself says.
    for candidates in (overrideEncodings,
                       (documentEncoding, sniffedEncoding)):
        for candidate in candidates:
            result = self._convertFrom(candidate)
            if result:
                break
        if result:
            break

    # Still nothing: ask the auto-detection library, if we have one.
    if not result and chardet and not isinstance(self.markup, unicode):
        guess = chardet.detect(self.markup)
        result = self._convertFrom(guess['encoding'])

    # Last resort.
    if not result:
        for fallback in ("utf-8", "windows-1252"):
            result = self._convertFrom(fallback)
            if result:
                break

    self.unicode = result
    if not result:
        self.originalEncoding = None
def BeautifulSoup::UnicodeDammit::__init__ | ( | self, | |
markup, | |||
overrideEncodings = [] , |
|||
smartQuotesTo = 'xml' , |
|||
isHTML = False |
|||
) |
Definition at line 1747 of file BeautifulSoup.py.
def __init__(self, markup, overrideEncodings=[],
             smartQuotesTo='xml', isHTML=False):
    """Convert `markup` to Unicode, trying candidate encodings in order.

    Candidates are tried in this order, stopping at the first one for
    which `_convertFrom` returns a truthy value:

    1. each entry of `overrideEncodings`;
    2. the encoding declared in the document, then the sniffed one
       (both as reported by `_detectEncoding`);
    3. whatever `chardet` guesses, when `chardet` is available;
    4. "utf-8", then "windows-1252".

    The outcome is stored in `self.unicode`; `self.originalEncoding` is
    reset to None when nothing worked. `overrideEncodings` is only
    iterated here, never modified.
    """
    self.declaredHTMLEncoding = None
    detected = self._detectEncoding(markup, isHTML)
    self.markup, documentEncoding, sniffedEncoding = detected
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []

    # Empty or already-Unicode input needs no decoding at all.
    if markup == '' or isinstance(markup, unicode):
        self.originalEncoding = None
        self.unicode = unicode(markup)
        return

    result = None

    # Caller-supplied encodings first, then what the document itself says.
    for candidates in (overrideEncodings,
                       (documentEncoding, sniffedEncoding)):
        for candidate in candidates:
            result = self._convertFrom(candidate)
            if result:
                break
        if result:
            break

    # Still nothing: ask the auto-detection library, if we have one.
    if not result and chardet and not isinstance(self.markup, unicode):
        guess = chardet.detect(self.markup)
        result = self._convertFrom(guess['encoding'])

    # Last resort.
    if not result:
        for fallback in ("utf-8", "windows-1252"):
            result = self._convertFrom(fallback)
            if result:
                break

    self.unicode = result
    if not result:
        self.originalEncoding = None
def BeautifulSoup::UnicodeDammit::_codec | ( | self, | |
charset | |||
) | [private] |
Definition at line 1924 of file BeautifulSoup.py.
def BeautifulSoup::UnicodeDammit::_codec | ( | self, | |
charset | |||
) | [private] |
Definition at line 1924 of file BeautifulSoup.py.
def BeautifulSoup::UnicodeDammit::_convertFrom | ( | self, | |
proposed | |||
) | [private] |
Definition at line 1795 of file BeautifulSoup.py.
def _convertFrom(self, proposed):
    """Try to decode `self.markup` using the `proposed` encoding.

    `proposed` is first normalised through `self.find_codec`. Returns
    None without attempting anything when that yields a false value or
    an encoding already listed in `self.triedEncodings`; otherwise the
    encoding is recorded there before the attempt.

    On success `self.markup` is replaced by the decoded text,
    `self.originalEncoding` is set to the encoding used, and the decoded
    text is returned. On any decoding failure None is returned and
    `self.markup` is left untouched.
    """
    proposed = self.find_codec(proposed)
    if not proposed or proposed in self.triedEncodings:
        return None
    self.triedEncodings.append(proposed)
    markup = self.markup

    # Convert smart quotes to entities if coming from an encoding
    # that might have them.
    if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                   "iso-8859-1",
                                                   "iso-8859-2"):
        smart_quotes = re.compile("([\x80-\x9f])")
        markup = smart_quotes.sub(self._subMSChar, markup)

    # Only the decode itself is expected to fail; a failure simply means
    # "this encoding was wrong", so the caller moves on to the next one.
    try:
        decoded = self._toUnicode(markup, proposed)
    except Exception:
        return None

    self.markup = decoded
    self.originalEncoding = proposed
    return self.markup
def BeautifulSoup::UnicodeDammit::_convertFrom | ( | self, | |
proposed | |||
) | [private] |
Definition at line 1795 of file BeautifulSoup.py.
def _convertFrom(self, proposed):
    """Try to decode `self.markup` using the `proposed` encoding.

    `proposed` is first normalised through `self.find_codec`. Returns
    None without attempting anything when that yields a false value or
    an encoding already listed in `self.triedEncodings`; otherwise the
    encoding is recorded there before the attempt.

    On success `self.markup` is replaced by the decoded text,
    `self.originalEncoding` is set to the encoding used, and the decoded
    text is returned. On any decoding failure None is returned and
    `self.markup` is left untouched.
    """
    proposed = self.find_codec(proposed)
    if not proposed or proposed in self.triedEncodings:
        return None
    self.triedEncodings.append(proposed)
    markup = self.markup

    # Convert smart quotes to entities if coming from an encoding
    # that might have them.
    if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                   "iso-8859-1",
                                                   "iso-8859-2"):
        smart_quotes = re.compile("([\x80-\x9f])")
        markup = smart_quotes.sub(self._subMSChar, markup)

    # Only the decode itself is expected to fail; a failure simply means
    # "this encoding was wrong", so the caller moves on to the next one.
    try:
        decoded = self._toUnicode(markup, proposed)
    except Exception:
        return None

    self.markup = decoded
    self.originalEncoding = proposed
    return self.markup
def BeautifulSoup::UnicodeDammit::_detectEncoding | ( | self, | |
xml_data, | |||
isHTML = False |
|||
) | [private] |
Given a document, tries to detect its XML encoding.
Definition at line 1848 of file BeautifulSoup.py.
def _detectEncoding(self, xml_data, isHTML=False):
    """Given a document, tries to detect its XML encoding.

    The first bytes of `xml_data` are compared against known signatures
    (EBCDIC, UTF-16/UTF-32 with and without a byte order mark, UTF-8
    BOM). When one matches, the data is re-encoded as UTF-8 (or
    translated to ASCII for EBCDIC) so the declaration can be searched.
    Then the XML declaration -- and, when `isHTML` is true, a
    <meta ... charset=...> tag -- is searched for a declared encoding.

    Returns a tuple `(xml_data, xml_encoding, sniffed_xml_encoding)`:
    the possibly re-encoded data, the declared encoding (lower-cased, or
    None) and the encoding inferred from the leading bytes (or None).
    When `isHTML` is true and a declaration is found, it is also stored
    in `self.declaredHTMLEncoding`.
    """
    xml_encoding = sniffed_xml_encoding = None
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            sniffed_xml_encoding = 'ascii'
    except Exception:
        # Sniffing is best effort: if re-encoding fails we carry on with
        # whatever was established so far. Unlike a bare `except:`, this
        # lets KeyboardInterrupt and SystemExit propagate.
        pass

    # Raw strings keep the regex backslashes literal; the pattern text is
    # the same as before.
    xml_encoding_re = r'''^<\?.*encoding=['"](.*?)['"].*\?>'''.encode()
    xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
    if not xml_encoding_match and isHTML:
        meta_re = r'''<\s*meta[^>]+charset=([^>]*?)[;'">]'''.encode()
        regexp = re.compile(meta_re, re.I)
        xml_encoding_match = regexp.search(xml_data)
    if xml_encoding_match is not None:
        xml_encoding = xml_encoding_match.groups()[0].decode(
            'ascii').lower()
        if isHTML:
            self.declaredHTMLEncoding = xml_encoding
        # A declared multi-byte family name is less specific than what
        # the leading bytes told us, so prefer the sniffed encoding.
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    return xml_data, xml_encoding, sniffed_xml_encoding
def BeautifulSoup::UnicodeDammit::_detectEncoding | ( | self, | |
xml_data, | |||
isHTML = False |
|||
) | [private] |
Given a document, tries to detect its XML encoding.
Definition at line 1848 of file BeautifulSoup.py.
def _detectEncoding(self, xml_data, isHTML=False):
    """Given a document, tries to detect its XML encoding.

    The first bytes of `xml_data` are compared against known signatures
    (EBCDIC, UTF-16/UTF-32 with and without a byte order mark, UTF-8
    BOM). When one matches, the data is re-encoded as UTF-8 (or
    translated to ASCII for EBCDIC) so the declaration can be searched.
    Then the XML declaration -- and, when `isHTML` is true, a
    <meta ... charset=...> tag -- is searched for a declared encoding.

    Returns a tuple `(xml_data, xml_encoding, sniffed_xml_encoding)`:
    the possibly re-encoded data, the declared encoding (lower-cased, or
    None) and the encoding inferred from the leading bytes (or None).
    When `isHTML` is true and a declaration is found, it is also stored
    in `self.declaredHTMLEncoding`.
    """
    xml_encoding = sniffed_xml_encoding = None
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            sniffed_xml_encoding = 'ascii'
    except Exception:
        # Sniffing is best effort: if re-encoding fails we carry on with
        # whatever was established so far. Unlike a bare `except:`, this
        # lets KeyboardInterrupt and SystemExit propagate.
        pass

    # Raw strings keep the regex backslashes literal; the pattern text is
    # the same as before.
    xml_encoding_re = r'''^<\?.*encoding=['"](.*?)['"].*\?>'''.encode()
    xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
    if not xml_encoding_match and isHTML:
        meta_re = r'''<\s*meta[^>]+charset=([^>]*?)[;'">]'''.encode()
        regexp = re.compile(meta_re, re.I)
        xml_encoding_match = regexp.search(xml_data)
    if xml_encoding_match is not None:
        xml_encoding = xml_encoding_match.groups()[0].decode(
            'ascii').lower()
        if isHTML:
            self.declaredHTMLEncoding = xml_encoding
        # A declared multi-byte family name is less specific than what
        # the leading bytes told us, so prefer the sniffed encoding.
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    return xml_data, xml_encoding, sniffed_xml_encoding
def BeautifulSoup::UnicodeDammit::_ebcdic_to_ascii | ( | self, | |
s | |||
) | [private] |
Definition at line 1935 of file BeautifulSoup.py.
def _ebcdic_to_ascii(self, s):
    """Translate the string `s` character-by-character through the
    class-level EBCDIC-to-ASCII table and return the result.

    The 256-entry table is built on first use and cached on the class as
    `EBCDIC_TO_ASCII_MAP`, so it is shared by all instances.
    """
    c = self.__class__
    if not c.EBCDIC_TO_ASCII_MAP:
        # emap[i] is the output character code for input character code i.
        emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                250,251,252,253,254,255)
        # A translation table whose source is the identity sequence
        # 0..255 is just the target string itself, so build it directly
        # rather than going through string.maketrans.
        c.EBCDIC_TO_ASCII_MAP = ''.join(map(chr, emap))
    return s.translate(c.EBCDIC_TO_ASCII_MAP)
def BeautifulSoup::UnicodeDammit::_ebcdic_to_ascii | ( | self, | |
s | |||
) | [private] |
Definition at line 1935 of file BeautifulSoup.py.
def _ebcdic_to_ascii(self, s):
    """Translate the string `s` character-by-character through the
    class-level EBCDIC-to-ASCII table and return the result.

    The 256-entry table is built on first use and cached on the class as
    `EBCDIC_TO_ASCII_MAP`, so it is shared by all instances.
    """
    c = self.__class__
    if not c.EBCDIC_TO_ASCII_MAP:
        # emap[i] is the output character code for input character code i.
        emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                250,251,252,253,254,255)
        # A translation table whose source is the identity sequence
        # 0..255 is just the target string itself, so build it directly
        # rather than going through string.maketrans.
        c.EBCDIC_TO_ASCII_MAP = ''.join(map(chr, emap))
    return s.translate(c.EBCDIC_TO_ASCII_MAP)
def BeautifulSoup::UnicodeDammit::_subMSChar | ( | self, | |
match | |||
) | [private] |
Changes a MS smart quote character to an XML or HTML entity.
Definition at line 1781 of file BeautifulSoup.py.
def _subMSChar(self, match):
    """Changes a MS smart quote character to an XML or HTML
    entity.

    `match` is a regex match whose first group is the character to
    replace. The replacement is looked up in `self.MS_CHARS`: a tuple
    entry is read as (entity name, hex code point) and rendered as a
    numeric reference when `self.smartQuotesTo` is 'xml', otherwise as a
    named entity; any other entry is used as the replacement text
    directly.
    """
    orig = match.group(1)
    sub = self.MS_CHARS.get(orig)
    # isinstance (rather than an exact type comparison) also accepts
    # tuple subclasses and does not depend on the `types` module.
    if isinstance(sub, tuple):
        if self.smartQuotesTo == 'xml':
            sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
        else:
            sub = '&'.encode() + sub[0].encode() + ';'.encode()
    else:
        sub = sub.encode()
    return sub
def BeautifulSoup::UnicodeDammit::_subMSChar | ( | self, | |
match | |||
) | [private] |
Changes a MS smart quote character to an XML or HTML entity.
Definition at line 1781 of file BeautifulSoup.py.
def _subMSChar(self, match):
    """Changes a MS smart quote character to an XML or HTML
    entity.

    `match` is a regex match whose first group is the character to
    replace. The replacement is looked up in `self.MS_CHARS`: a tuple
    entry is read as (entity name, hex code point) and rendered as a
    numeric reference when `self.smartQuotesTo` is 'xml', otherwise as a
    named entity; any other entry is used as the replacement text
    directly.
    """
    orig = match.group(1)
    sub = self.MS_CHARS.get(orig)
    # isinstance (rather than an exact type comparison) also accepts
    # tuple subclasses and does not depend on the `types` module.
    if isinstance(sub, tuple):
        if self.smartQuotesTo == 'xml':
            sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
        else:
            sub = '&'.encode() + sub[0].encode() + ';'.encode()
    else:
        sub = sub.encode()
    return sub
def BeautifulSoup::UnicodeDammit::_toUnicode | ( | self, | |
data, | |||
encoding | |||
) | [private] |
Given a string and its encoding, decodes the string into Unicode. `encoding` is a string recognized by encodings.aliases.
Definition at line 1823 of file BeautifulSoup.py.
def _toUnicode(self, data, encoding):
    '''Decode the string `data` into Unicode and return it.

    `encoding` is a string recognized by encodings.aliases. If `data`
    starts with a byte order mark, the mark is removed and the encoding
    it implies is used instead of `encoding`.'''
    # A two-byte UTF-16 mark only counts when the following two bytes are
    # not both zero; otherwise the data may carry a four-byte UTF-32 mark.
    utf16_possible = (len(data) >= 4) and (data[2:4] != '\x00\x00')

    if utf16_possible and data[:2] == '\xfe\xff':
        encoding, bom_length = 'utf-16be', 2
    elif utf16_possible and data[:2] == '\xff\xfe':
        encoding, bom_length = 'utf-16le', 2
    elif data[:3] == '\xef\xbb\xbf':
        encoding, bom_length = 'utf-8', 3
    elif data[:4] == '\x00\x00\xfe\xff':
        encoding, bom_length = 'utf-32be', 4
    elif data[:4] == '\xff\xfe\x00\x00':
        encoding, bom_length = 'utf-32le', 4
    else:
        bom_length = 0

    # strip Byte Order Mark (if present)
    if bom_length:
        data = data[bom_length:]
    return unicode(data, encoding)
def BeautifulSoup::UnicodeDammit::_toUnicode | ( | self, | |
data, | |||
encoding | |||
) | [private] |
Given a string and its encoding, decodes the string into Unicode. `encoding` is a string recognized by encodings.aliases.
Definition at line 1823 of file BeautifulSoup.py.
def _toUnicode(self, data, encoding):
    '''Decode the string `data` into Unicode and return it.

    `encoding` is a string recognized by encodings.aliases. If `data`
    starts with a byte order mark, the mark is removed and the encoding
    it implies is used instead of `encoding`.'''
    # A two-byte UTF-16 mark only counts when the following two bytes are
    # not both zero; otherwise the data may carry a four-byte UTF-32 mark.
    utf16_possible = (len(data) >= 4) and (data[2:4] != '\x00\x00')

    if utf16_possible and data[:2] == '\xfe\xff':
        encoding, bom_length = 'utf-16be', 2
    elif utf16_possible and data[:2] == '\xff\xfe':
        encoding, bom_length = 'utf-16le', 2
    elif data[:3] == '\xef\xbb\xbf':
        encoding, bom_length = 'utf-8', 3
    elif data[:4] == '\x00\x00\xfe\xff':
        encoding, bom_length = 'utf-32be', 4
    elif data[:4] == '\xff\xfe\x00\x00':
        encoding, bom_length = 'utf-32le', 4
    else:
        bom_length = 0

    # strip Byte Order Mark (if present)
    if bom_length:
        data = data[bom_length:]
    return unicode(data, encoding)
def BeautifulSoup::UnicodeDammit::find_codec | ( | self, | |
charset | |||
) |
Definition at line 1918 of file BeautifulSoup.py.
def BeautifulSoup::UnicodeDammit::find_codec | ( | self, | |
charset | |||
) |
Definition at line 1918 of file BeautifulSoup.py.
dictionary BeautifulSoup::UnicodeDammit::CHARSET_ALIASES [static] |
{ "macintosh" : "mac-roman", "x-sjis" : "shift-jis" }
Definition at line 1744 of file BeautifulSoup.py.
Definition at line 1747 of file BeautifulSoup.py.
BeautifulSoup::UnicodeDammit::EBCDIC_TO_ASCII_MAP = None [static] |
Definition at line 1934 of file BeautifulSoup.py.
Definition at line 1795 of file BeautifulSoup.py.
dictionary BeautifulSoup::UnicodeDammit::MS_CHARS [static] |
Definition at line 1960 of file BeautifulSoup.py.
Definition at line 1747 of file BeautifulSoup.py.
Definition at line 1747 of file BeautifulSoup.py.
Definition at line 1747 of file BeautifulSoup.py.
Definition at line 1747 of file BeautifulSoup.py.