comparison env/lib/python3.7/site-packages/chardet/universaldetector.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################
"""
Module containing the UniversalDetector detector class, which is the primary
class a user of ``chardet`` should use.

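For one-shot detection of a complete byte string, the module-level
``chardet.detect`` function wraps this class. A minimal sketch:

.. code::

    import chardet

    # Returns a dict with 'encoding', 'confidence', and 'language' keys
    print(chardet.detect(b'\xe4\xbd\xa0\xe5\xa5\xbd'))
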
:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
"""


import codecs
import logging
import re

from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .mbcsgroupprober import MBCSGroupProber
from .sbcsgroupprober import SBCSGroupProber


class UniversalDetector(object):
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
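    # ESC (0x1B) starts the escape sequences used by the ISO-2022 encodings;
    # '~{' is the shift sequence used by HZ-GB-2312.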
    ESC_DETECTOR = re.compile(b'(\033|~{)')
    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
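    # Bytes 0x80-0x9F are unassigned C1 controls in the ISO-8859 encodings
    # but printable characters in the corresponding Windows code pages, so
    # seeing them suggests the Windows superset encoding.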
    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
                   'iso-8859-2': 'Windows-1250',
                   'iso-8859-5': 'Windows-1251',
                   'iso-8859-6': 'Windows-1256',
                   'iso-8859-7': 'Windows-1253',
                   'iso-8859-8': 'Windows-1255',
                   'iso-8859-9': 'Windows-1254',
                   'iso-8859-13': 'Windows-1257'}

    def __init__(self, lang_filter=LanguageFilter.ALL):
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
        self.done = None
        self._got_data = None
        self._input_state = None
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = None
        self.reset()

    def reset(self):
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
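
        For example, to reuse one detector across several documents (a
        minimal sketch; ``paths`` is a hypothetical list of file names):

        .. code::

            u = UniversalDetector()
            for path in paths:
                u.reset()
                with open(path, 'rb') as f:
                    for line in f:
                        u.feed(line)
                        if u.done:
                            break
                u.close()
                print(path, u.result)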
99 """
100 self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
101 self.done = False
102 self._got_data = False
103 self._has_win_bytes = False
104 self._input_state = InputState.PURE_ASCII
105 self._last_char = b''
106 if self._esc_charset_prober:
107 self._esc_charset_prober.reset()
108 for prober in self._charset_probers:
109 prober.reset()
110
111 def feed(self, byte_str):
112 """
113 Takes a chunk of a document and feeds it through all of the relevant
114 charset probers.
115
116 After calling ``feed``, you can check the value of the ``done``
117 attribute to see if you need to continue feeding the
118 ``UniversalDetector`` more data, or if it has made a prediction
119 (in the ``result`` attribute).
120
121 .. note::
122 You should always call ``close`` when you're done feeding in your
123 document if ``done`` is not already ``True``.
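
        For example, to feed a stream in fixed-size chunks and stop reading
        as soon as the detector is confident (a minimal sketch; ``stream``
        is a hypothetical binary file object):

        .. code::

            u = UniversalDetector()
            for chunk in iter(lambda: stream.read(4096), b''):
                u.feed(chunk)
                if u.done:
                    break
            u.close()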
124 """
125 if self.done:
126 return
127
128 if not len(byte_str):
129 return
130
131 if not isinstance(byte_str, bytearray):
132 byte_str = bytearray(byte_str)
133
134 # First check for known BOMs, since these are guaranteed to be correct
135 if not self._got_data:
136 # If the data starts with BOM, we know it is UTF
137 if byte_str.startswith(codecs.BOM_UTF8):
138 # EF BB BF UTF-8 with BOM
139 self.result = {'encoding': "UTF-8-SIG",
140 'confidence': 1.0,
141 'language': ''}
142 elif byte_str.startswith((codecs.BOM_UTF32_LE,
143 codecs.BOM_UTF32_BE)):
144 # FF FE 00 00 UTF-32, little-endian BOM
145 # 00 00 FE FF UTF-32, big-endian BOM
146 self.result = {'encoding': "UTF-32",
147 'confidence': 1.0,
148 'language': ''}
149 elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
150 # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
151 self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
152 'confidence': 1.0,
153 'language': ''}
154 elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
155 # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
156 self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
157 'confidence': 1.0,
158 'language': ''}
159 elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
160 # FF FE UTF-16, little endian BOM
161 # FE FF UTF-16, big endian BOM
162 self.result = {'encoding': "UTF-16",
163 'confidence': 1.0,
164 'language': ''}
165
166 self._got_data = True
167 if self.result['encoding'] is not None:
168 self.done = True
169 return
170
        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif self._input_state == InputState.PURE_ASCII and \
                    self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding':
                               self._esc_charset_prober.charset_name,
                               'confidence':
                               self._esc_charset_prober.get_confidence(),
                               'language':
                               self._esc_charset_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
                                   'confidence': prober.get_confidence(),
                                   'language': prober.language}
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns: The ``result`` attribute, a ``dict`` with the keys
            `encoding`, `confidence`, and `language`.
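
        For example (a minimal sketch; ``some_bytes`` is assumed to hold
        the document under analysis):

        .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            result = u.close()  # same dict as u.result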
227 """
228 # Don't bother with checks if we're already done
229 if self.done:
230 return self.result
231 self.done = True
232
233 if not self._got_data:
234 self.logger.debug('no data received!')
235
236 # Default to ASCII if it is all we've seen so far
237 elif self._input_state == InputState.PURE_ASCII:
238 self.result = {'encoding': 'ascii',
239 'confidence': 1.0,
240 'language': ''}
241
242 # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
243 elif self._input_state == InputState.HIGH_BYTE:
244 prober_confidence = None
245 max_prober_confidence = 0.0
246 max_prober = None
247 for prober in self._charset_probers:
248 if not prober:
249 continue
250 prober_confidence = prober.get_confidence()
251 if prober_confidence > max_prober_confidence:
252 max_prober_confidence = prober_confidence
253 max_prober = prober
254 if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
255 charset_name = max_prober.charset_name
256 lower_charset_name = max_prober.charset_name.lower()
257 confidence = max_prober.get_confidence()
258 # Use Windows encoding name instead of ISO-8859 if we saw any
259 # extra Windows-specific bytes
260 if lower_charset_name.startswith('iso-8859'):
261 if self._has_win_bytes:
262 charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
263 charset_name)
264 self.result = {'encoding': charset_name,
265 'confidence': confidence,
266 'language': max_prober.language}
267
268 # Log all prober confidences if none met MINIMUM_THRESHOLD
269 if self.logger.getEffectiveLevel() == logging.DEBUG:
270 if self.result['encoding'] is None:
271 self.logger.debug('no probers hit minimum threshold')
272 for group_prober in self._charset_probers:
273 if not group_prober:
274 continue
275 if isinstance(group_prober, CharSetGroupProber):
276 for prober in group_prober.probers:
277 self.logger.debug('%s %s confidence = %s',
278 prober.charset_name,
279 prober.language,
280 prober.get_confidence())
                    else:
                        # Standalone prober: log the group_prober itself, not
                        # a leftover inner-loop variable
                        self.logger.debug('%s %s confidence = %s',
                                          group_prober.charset_name,
                                          group_prober.language,
                                          group_prober.get_confidence())
        return self.result