######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
"""
Module containing the UniversalDetector detector class, which is the primary
class a user of ``chardet`` should use.

:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
"""


import codecs
import logging
import re

from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .mbcsgroupprober import MBCSGroupProber
from .sbcsgroupprober import SBCSGroupProber


class UniversalDetector(object):
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # A prober's confidence must be strictly greater than this for ``close``
    # to report its charset.
    MINIMUM_THRESHOLD = 0.20
    # Any byte with the high bit set, i.e. not 7-bit ASCII.
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
    # ESC, or the two-byte sequence "~{".
    ESC_DETECTOR = re.compile(b'(\033|~{)')
    # Bytes in the 0x80-0x9F range; seeing one makes ``close`` report the
    # Windows code page name instead of the matching ISO-8859 name.
    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
    # Lower-cased ISO-8859 charset name -> Windows code page name.
    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
                   'iso-8859-2': 'Windows-1250',
                   'iso-8859-5': 'Windows-1251',
                   'iso-8859-6': 'Windows-1256',
                   'iso-8859-7': 'Windows-1253',
                   'iso-8859-8': 'Windows-1255',
                   'iso-8859-9': 'Windows-1254',
                   'iso-8859-13': 'Windows-1257'}

    # (BOM prefixes, encoding name) pairs, checked in order by
    # ``_detect_bom``.  The four-byte BOMs must come before UTF-16 because
    # the UTF-16 BOMs are prefixes of some of them.
    _BOM_ENCODINGS = (
        # EF BB BF  UTF-8 with BOM
        ((codecs.BOM_UTF8,), "UTF-8-SIG"),
        # FF FE 00 00  UTF-32, little-endian BOM
        # 00 00 FE FF  UTF-32, big-endian BOM
        ((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE), "UTF-32"),
        # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
        ((b'\xFE\xFF\x00\x00',), "X-ISO-10646-UCS-4-3412"),
        # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
        ((b'\x00\x00\xFF\xFE',), "X-ISO-10646-UCS-4-2143"),
        # FF FE  UTF-16, little endian BOM
        # FE FF  UTF-16, big endian BOM
        ((codecs.BOM_LE, codecs.BOM_BE), "UTF-16"),
    )

    def __init__(self, lang_filter=LanguageFilter.ALL):
        """
        Create a detector in its initial state.

        :param lang_filter: ``LanguageFilter`` value handed to the escape
                            and multi-byte probers when they are created; the
                            single-byte group prober is only used when it
                            includes ``LanguageFilter.NON_CJK``.
        """
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
        self.done = None
        self._got_data = None
        self._input_state = None
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = None
        # reset() gives every per-document attribute above its real value.
        self.reset()

    def reset(self):
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b''
        # Probers are created lazily by feed(); reset whichever exist so far.
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    @classmethod
    def _detect_bom(cls, byte_str):
        """
        Return the encoding name for the BOM ``byte_str`` starts with, or
        ``None`` if it does not start with any known BOM.
        """
        for boms, encoding in cls._BOM_ENCODINGS:
            if byte_str.startswith(boms):
                return encoding
        return None

    def feed(self, byte_str):
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.

        :param byte_str: The next chunk of the document; anything that is
                         not already a ``bytearray`` is converted with
                         ``bytearray(byte_str)``.
        """
        if self.done:
            return

        # Explicit length test (rather than truthiness) so that any sized
        # input is handled the same way before it is converted below.
        if len(byte_str) == 0:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            bom_encoding = self._detect_bom(byte_str)
            if bom_encoding is not None:
                self.result = {'encoding': bom_encoding,
                               'confidence': 1.0,
                               'language': ''}

            self._got_data = True
            if self.result['encoding'] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            # Prepend the last byte of the previous chunk so that a "~{"
            # split across two chunks is still found.
            elif self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            esc_prober = self._esc_charset_prober
            if esc_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding': esc_prober.charset_name,
                               'confidence': esc_prober.get_confidence(),
                               'language': esc_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
                                   'confidence': prober.get_confidence(),
                                   'language': prober.language}
                    self.done = True
                    break
            # Remembered for close(), which uses it to pick the Windows
            # name over the ISO-8859 one.
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug('no data received!')

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {'encoding': 'ascii',
                           'confidence': 1.0,
                           'language': ''}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            max_prober_confidence = 0.0
            max_prober = None
            # Strict ">" keeps the first prober on ties and never selects a
            # prober whose confidence is 0.
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                lower_charset_name = charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if (lower_charset_name.startswith('iso-8859') and
                        self._has_win_bytes):
                    charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
                                                        charset_name)
                self.result = {'encoding': charset_name,
                               'confidence': confidence,
                               'language': max_prober.language}

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if (self.logger.getEffectiveLevel() <= logging.DEBUG and
                self.result['encoding'] is None):
            self.logger.debug('no probers hit minimum threshold')
            for group_prober in self._charset_probers:
                if not group_prober:
                    continue
                if isinstance(group_prober, CharSetGroupProber):
                    for prober in group_prober.probers:
                        self.logger.debug('%s %s confidence = %s',
                                          prober.charset_name,
                                          prober.language,
                                          prober.get_confidence())
                else:
                    self.logger.debug('%s %s confidence = %s',
                                      group_prober.charset_name,
                                      group_prober.language,
                                      group_prober.get_confidence())
        return self.result