######################## BEGIN LICENSE BLOCK ########################
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################


from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION


__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']


def _as_bytearray(byte_str):
    """
    Validate the input of the detection functions and normalise it.

    A ``bytearray`` is returned as-is (no copy is made); a ``bytes`` object
    is copied into a new ``bytearray``.

    :param byte_str:     The byte sequence to validate.
    :type byte_str:      ``bytes`` or ``bytearray``
    :returns:            ``byte_str`` as a ``bytearray``.
    :raises TypeError:   If ``byte_str`` is neither ``bytes`` nor
                         ``bytearray``.
    """
    if isinstance(byte_str, bytearray):
        return byte_str
    if isinstance(byte_str, bytes):
        return bytearray(byte_str)
    raise TypeError('Expected object of type bytes or bytearray, got: '
                    '{}'.format(type(byte_str)))


def detect(byte_str):
    """
    Detect the encoding of the given byte string.

    :param byte_str:     The byte sequence to examine.
    :type byte_str:      ``bytes`` or ``bytearray``
    :returns:            Whatever ``UniversalDetector.close()`` returns after
                         the detector has been fed ``byte_str``.
    :raises TypeError:   If ``byte_str`` is neither ``bytes`` nor
                         ``bytearray``.
    """
    # Validate before building the detector so bad input fails fast.
    byte_str = _as_bytearray(byte_str)
    detector = UniversalDetector()
    detector.feed(byte_str)
    return detector.close()


def detect_all(byte_str):
    """
    Detect all the possible encodings of the given byte string.

    :param byte_str:     The byte sequence to examine.
    :type byte_str:      ``bytes`` or ``bytearray``
    :returns:            A list of result dicts. When the detector's input
                         state is ``InputState.HIGH_BYTE`` and at least one
                         prober exceeds ``MINIMUM_THRESHOLD``, the list holds
                         one ``{'encoding', 'confidence', 'language'}`` dict
                         per such prober, ordered by descending confidence.
                         Otherwise it is the single-element list
                         ``[detector.result]``.
    :raises TypeError:   If ``byte_str`` is neither ``bytes`` nor
                         ``bytearray``.
    """
    byte_str = _as_bytearray(byte_str)

    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()

    # NOTE: this reads several non-public attributes of the detector
    # (``_input_state``, ``_charset_probers``, ``_has_win_bytes``).
    if detector._input_state == InputState.HIGH_BYTE:
        results = []
        for prober in detector._charset_probers:
            # Query the confidence once and reuse it for both the threshold
            # test and the reported value.
            confidence = prober.get_confidence()
            if confidence > detector.MINIMUM_THRESHOLD:
                charset_name = prober.charset_name
                lower_charset_name = charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if (lower_charset_name.startswith('iso-8859')
                        and detector._has_win_bytes):
                    charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
                                                            charset_name)
                results.append({
                    'encoding': charset_name,
                    'confidence': confidence,
                    'language': prober.language,
                })
        if results:
            # ``reverse=True`` keeps the sort stable, so probers with equal
            # confidence stay in their original relative order.
            return sorted(results,
                          key=lambda result: result['confidence'],
                          reverse=True)

    # No prober cleared the threshold (or the input state was not
    # HIGH_BYTE): fall back to the detector's own single result.
    return [detector.result]