Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/chardet/sbcharsetprober.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 ######################## BEGIN LICENSE BLOCK ######################## | |
2 # The Original Code is Mozilla Universal charset detector code. | |
3 # | |
4 # The Initial Developer of the Original Code is | |
5 # Netscape Communications Corporation. | |
6 # Portions created by the Initial Developer are Copyright (C) 2001 | |
7 # the Initial Developer. All Rights Reserved. | |
8 # | |
9 # Contributor(s): | |
10 # Mark Pilgrim - port to Python | |
11 # Shy Shalom - original C code | |
12 # | |
13 # This library is free software; you can redistribute it and/or | |
14 # modify it under the terms of the GNU Lesser General Public | |
15 # License as published by the Free Software Foundation; either | |
16 # version 2.1 of the License, or (at your option) any later version. | |
17 # | |
18 # This library is distributed in the hope that it will be useful, | |
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 # Lesser General Public License for more details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public | |
24 # License along with this library; if not, write to the Free Software | |
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
26 # 02110-1301 USA | |
27 ######################### END LICENSE BLOCK ######################### | |
28 | |
29 from collections import namedtuple | |
30 | |
31 from .charsetprober import CharSetProber | |
32 from .enums import CharacterCategory, ProbingState, SequenceLikelihood | |
33 | |
34 | |
# Immutable record bundling everything a single-byte prober needs to know
# about one charset/language combination.
SingleByteCharSetModel = namedtuple(
    'SingleByteCharSetModel',
    (
        # Name reported by the prober's ``charset_name`` property.
        'charset_name',
        # Language reported by the prober's ``language`` property.
        'language',
        # Mapping queried with ``.get(byte, default)`` to obtain a byte's
        # "order".
        'char_to_order_map',
        # Two-level table indexed by a pair of orders; each entry is a
        # likelihood category used as an index into the sequence counters.
        'language_model',
        # Divisor applied to the positive-sequence ratio when computing
        # confidence.
        'typical_positive_ratio',
        # When false, input is passed through filter_international_words
        # before being analysed.
        'keep_ascii_letters',
        # NOTE(review): not read by the prober code in this module;
        # presumably the set of letters of the language -- confirm.
        'alphabet',
    ),
)
43 | |
44 | |
class SingleByteCharSetProber(CharSetProber):
    """Prober that scores input bytes against one single-byte charset model.

    Every byte is translated to an "order" through the model's
    ``char_to_order_map``.  Whenever two consecutive orders are both below
    ``SAMPLE_SIZE`` the pair is looked up in the model's ``language_model``
    and tallied per likelihood category.  ``get_confidence`` converts those
    tallies into a score, and ``feed`` uses that score to short-circuit the
    probing state once enough pairs have been seen.
    """

    # Orders below this value take part in the pair (sequence) statistics.
    SAMPLE_SIZE = 64
    # Number of counted pairs that must be exceeded before feed() attempts
    # a positive/negative shortcut.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    # Confidence above which feed() declares FOUND_IT.
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # Confidence below which feed() declares NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Create a prober for ``model``.

        :param model: object exposing the ``SingleByteCharSetModel`` fields
            (``char_to_order_map``, ``language_model``, ...).
        :param reversed: when true, each order pair is swapped before it is
            looked up in the language model.
        :param name_prober: optional auxiliary prober; when truthy, the
            ``charset_name`` and ``language`` properties defer to it.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        self._reversed = reversed
        self._name_prober = name_prober
        # Declare the running statistics here; reset() assigns their real
        # starting values.
        self._last_order = self._seq_counters = None
        self._total_seqs = self._total_char = self._freq_char = None
        self.reset()

    def reset(self):
        """Return the prober to its initial state with all statistics cleared."""
        super(SingleByteCharSetProber, self).reset()
        # Order of the previously seen character.  255 is not below
        # SAMPLE_SIZE, so the first character fed never completes a pair.
        self._last_order = 255
        category_count = SequenceLikelihood.get_num_categories()
        # One tally per sequence-likelihood category.
        self._seq_counters = [0] * category_count
        # Pairs counted / characters counted / characters whose order fell
        # inside the sampling range.
        self._total_seqs = self._total_char = self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the auxiliary name prober if set, else the model."""
        source = self._name_prober or self._model
        return source.charset_name

    @property
    def language(self):
        """Language from the auxiliary name prober if set, else the model."""
        source = self._name_prober or self._model
        return source.language

    def feed(self, byte_str):
        """Update the statistics with ``byte_str`` and return the probing state.

        :param byte_str: sequence of byte values to analyse; each element is
            used as a key into the model's ``char_to_order_map``.
        :return: the value of ``self.state`` after processing.
        """
        model = self._model
        # TODO: Make filter_international_words keep things in self.alphabet
        if not model.keep_ascii_letters:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        order_of = model.char_to_order_map
        pair_table = model.language_model
        sample_size = self.SAMPLE_SIZE
        for byte in byte_str:
            current = order_of.get(byte, CharacterCategory.UNDEFINED)
            # XXX: The bound used here was once SYMBOL_CAT_ORDER (250), but
            #      CharacterCategory.SYMBOL is really 253, so CONTROL is
            #      used as the closest match to the original intent.  The
            #      only effect is whether digits and control characters
            #      contribute to _total_char.
            if current < CharacterCategory.CONTROL:
                self._total_char += 1
            # TODO: Follow uchardet's lead and discount confidence for
            #       frequent control characters.
            #       See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
            if current < sample_size:
                self._freq_char += 1
                previous = self._last_order
                if previous < sample_size:
                    self._total_seqs += 1
                    # A reversed model is indexed with the pair swapped.
                    if self._reversed:
                        row, column = current, previous
                    else:
                        row, column = previous, current
                    self._seq_counters[pair_table[row][column]] += 1
            self._last_order = current

        model_name = model.charset_name
        # Only a prober that is still detecting may shortcut, and only once
        # enough pairs have been tallied.
        if self.state != ProbingState.DETECTING:
            return self.state
        if self._total_seqs <= self.SB_ENOUGH_REL_THRESHOLD:
            return self.state

        confidence = self.get_confidence()
        if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
            self.logger.debug('%s confidence = %s, we have a winner',
                              model_name, confidence)
            self._state = ProbingState.FOUND_IT
        elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
            self.logger.debug('%s confidence = %s, below negative shortcut '
                              'threshhold %s',
                              model_name, confidence,
                              self.NEGATIVE_SHORTCUT_THRESHOLD)
            self._state = ProbingState.NOT_ME
        return self.state

    def get_confidence(self):
        """Return the current confidence score for this model.

        The score is 0.01 while no pair has been counted.  Otherwise it is
        the share of POSITIVE pairs, divided by the model's
        ``typical_positive_ratio`` and scaled by the fraction of counted
        characters that fell inside the sampling range; results of 1.0 or
        more are reported as 0.99.
        """
        if self._total_seqs <= 0:
            return 0.01
        positive_pairs = self._seq_counters[SequenceLikelihood.POSITIVE]
        score = ((1.0 * positive_pairs) /
                 self._total_seqs / self._model.typical_positive_ratio)
        score = score * self._freq_char / self._total_char
        return 0.99 if score >= 1.0 else score