Mercurial > repos > guerler > hhblits
comparison lib/python3.8/site-packages/pip/_vendor/chardet/sbcharsetprober.py @ 0:9e54283cc701 draft
"planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
author | guerler |
---|---|
date | Mon, 27 Jul 2020 03:47:31 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9e54283cc701 |
---|---|
1 ######################## BEGIN LICENSE BLOCK ######################## | |
2 # The Original Code is Mozilla Universal charset detector code. | |
3 # | |
4 # The Initial Developer of the Original Code is | |
5 # Netscape Communications Corporation. | |
6 # Portions created by the Initial Developer are Copyright (C) 2001 | |
7 # the Initial Developer. All Rights Reserved. | |
8 # | |
9 # Contributor(s): | |
10 # Mark Pilgrim - port to Python | |
11 # Shy Shalom - original C code | |
12 # | |
13 # This library is free software; you can redistribute it and/or | |
14 # modify it under the terms of the GNU Lesser General Public | |
15 # License as published by the Free Software Foundation; either | |
16 # version 2.1 of the License, or (at your option) any later version. | |
17 # | |
18 # This library is distributed in the hope that it will be useful, | |
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 # Lesser General Public License for more details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public | |
24 # License along with this library; if not, write to the Free Software | |
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
26 # 02110-1301 USA | |
27 ######################### END LICENSE BLOCK ######################### | |
28 | |
29 from .charsetprober import CharSetProber | |
30 from .enums import CharacterCategory, ProbingState, SequenceLikelihood | |
31 | |
32 | |
class SingleByteCharSetProber(CharSetProber):
    """Prober that scores input against one single-byte charset model.

    Each input byte is mapped to a frequency "order" through the model's
    ``char_to_order_map``.  Pairs of consecutive bytes whose orders both fall
    below ``SAMPLE_SIZE`` are looked up in the model's flat
    ``precedence_matrix`` and tallied per likelihood category; the share of
    ``SequenceLikelihood.POSITIVE`` pairs drives the confidence value.

    The model is a mapping from which this class reads the keys
    ``charset_name``, ``language`` (optional), ``keep_english_letter``,
    ``char_to_order_map``, ``precedence_matrix`` and
    ``typical_positive_ratio``.
    """

    # Orders below this value take part in the pair statistics.
    SAMPLE_SIZE = 64
    # Pair count that must be exceeded before feed() may shortcut the state.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    # Confidence above this switches the prober to FOUND_IT ...
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # ... and confidence below this switches it to NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Store the model and options, then reset all counters.

        :param model: mapping describing the charset (see class docstring).
        :param reversed: when true, every pair is looked up in the
            precedence matrix with its two orders swapped.
        :param name_prober: optional auxiliary prober; when truthy, its
            ``charset_name`` and ``language`` are reported instead of the
            model's.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        # Declared here so the attributes exist; reset() assigns real values.
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Reset the base prober and clear every statistic."""
        super(SingleByteCharSetProber, self).reset()
        # Order of the previous character; 255 is >= SAMPLE_SIZE, so the
        # first character fed after a reset never completes a pair.
        self._last_order = 255
        # One counter per sequence-likelihood category.
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the name prober if one is set, else the model."""
        if self._name_prober:
            return self._name_prober.charset_name
        return self._model['charset_name']

    @property
    def language(self):
        """Language from the name prober if set, else the model (or None)."""
        if self._name_prober:
            return self._name_prober.language
        return self._model.get('language')

    def feed(self, byte_str):
        """Update the statistics with ``byte_str`` and return the state.

        Unless the model sets ``keep_english_letter``, the input is first
        passed through the inherited ``filter_international_words`` helper.
        Empty input (before or after filtering) leaves everything untouched.

        :param byte_str: sequence whose items are valid keys/indices for
            the model's ``char_to_order_map``.
        :returns: the prober's current ``state``.
        """
        settings = self._model
        if not settings['keep_english_letter']:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        char_to_order_map = settings['char_to_order_map']
        sample_size = self.SAMPLE_SIZE
        for char in byte_str:
            # XXX: Order is in range 1-64, so one would think we want 0-63
            #      here, but that leads to 27 more test failures than before.
            order = char_to_order_map[char]
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
            #      to make it closer to the original intent. The only
            #      difference is whether or not we count digits and control
            #      characters for _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < sample_size:
                self._freq_char += 1
                if self._last_order < sample_size:
                    self._total_seqs += 1
                    if self._reversed:
                        # reverse the order of the letters in the lookup
                        row, col = order, self._last_order
                    else:
                        row, col = self._last_order, order
                    # The matrix is stored flat, SAMPLE_SIZE entries per row.
                    offset = (row * sample_size) + col
                    likelihood = settings['precedence_matrix'][offset]
                    self._seq_counters[likelihood] += 1
            self._last_order = order

        charset_name = settings['charset_name']
        if (self.state == ProbingState.DETECTING and
                self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD):
            # Enough pairs seen: decide early if the evidence is one-sided.
            confidence = self.get_confidence()
            if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, we have a winner',
                                  charset_name, confidence)
                self._state = ProbingState.FOUND_IT
            elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, below negative '
                                  'shortcut threshhold %s', charset_name,
                                  confidence,
                                  self.NEGATIVE_SHORTCUT_THRESHOLD)
                self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the confidence that the input matches this model.

        With no pairs counted yet the result is 0.01.  Otherwise it is the
        positive-pair ratio divided by the model's
        ``typical_positive_ratio`` and scaled by the share of counted
        characters that fell inside the sampling range; values of 1.0 or
        more are reported as 0.99.
        """
        if self._total_seqs <= 0:
            return 0.01

        positive_seqs = self._seq_counters[SequenceLikelihood.POSITIVE]
        confidence = ((1.0 * positive_seqs) /
                      self._total_seqs /
                      self._model['typical_positive_ratio'])
        confidence = confidence * self._freq_char / self._total_char
        if confidence >= 1.0:
            confidence = 0.99
        return confidence