Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/chardet/sbcharsetprober.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac |
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 ######################## BEGIN LICENSE BLOCK ######################## | |
| 2 # The Original Code is Mozilla Universal charset detector code. | |
| 3 # | |
| 4 # The Initial Developer of the Original Code is | |
| 5 # Netscape Communications Corporation. | |
| 6 # Portions created by the Initial Developer are Copyright (C) 2001 | |
| 7 # the Initial Developer. All Rights Reserved. | |
| 8 # | |
| 9 # Contributor(s): | |
| 10 # Mark Pilgrim - port to Python | |
| 11 # Shy Shalom - original C code | |
| 12 # | |
| 13 # This library is free software; you can redistribute it and/or | |
| 14 # modify it under the terms of the GNU Lesser General Public | |
| 15 # License as published by the Free Software Foundation; either | |
| 16 # version 2.1 of the License, or (at your option) any later version. | |
| 17 # | |
| 18 # This library is distributed in the hope that it will be useful, | |
| 19 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 21 # Lesser General Public License for more details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public | |
| 24 # License along with this library; if not, write to the Free Software | |
| 25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
| 26 # 02110-1301 USA | |
| 27 ######################### END LICENSE BLOCK ######################### | |
| 28 | |
| 29 from collections import namedtuple | |
| 30 | |
| 31 from .charsetprober import CharSetProber | |
| 32 from .enums import CharacterCategory, ProbingState, SequenceLikelihood | |
| 33 | |
| 34 | |
# Immutable record bundling everything a single-byte prober needs to know
# about one charset/language pair.  Field usage, as read by
# SingleByteCharSetProber in this module:
#   charset_name           - name reported when no auxiliary name prober is set
#   language               - language reported when no name prober is set
#   char_to_order_map      - queried with ``.get(byte, default)`` to turn a
#                            byte value into a frequency "order"
#   language_model         - indexed as ``language_model[a][b]`` with two
#                            orders to obtain a sequence-likelihood category
#   typical_positive_ratio - divisor applied when computing confidence
#   keep_ascii_letters     - when falsy, input is first passed through
#                            ``filter_international_words``
#   alphabet               - not read by the prober in this module
SingleByteCharSetModel = namedtuple(
    'SingleByteCharSetModel',
    [
        'charset_name',
        'language',
        'char_to_order_map',
        'language_model',
        'typical_positive_ratio',
        'keep_ascii_letters',
        'alphabet',
    ],
)
| 43 | |
| 44 | |
class SingleByteCharSetProber(CharSetProber):
    """Prober that scores a byte stream against one single-byte charset model.

    Each input byte is mapped to an "order" through the model's
    ``char_to_order_map``.  Pairs of consecutive orders that both fall below
    ``SAMPLE_SIZE`` are looked up in the model's ``language_model`` and
    tallied per likelihood category; ``get_confidence`` turns those tallies
    into a score.
    """

    # Only orders below this value take part in sequence statistics.
    SAMPLE_SIZE = 64
    # Number of counted sequences required before feed() may short-circuit.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    # Confidence above this moves the prober to FOUND_IT.
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # Confidence below this moves the prober to NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Create a prober for ``model``.

        :param model: object exposing the ``SingleByteCharSetModel`` fields.
        :param reversed: when true, the two orders of every pair are swapped
            before the language-model lookup.  (The name shadows the builtin
            but is kept for caller compatibility.)
        :param name_prober: optional auxiliary prober whose ``charset_name``
            and ``language`` are reported instead of the model's.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        # Counters are declared here and given real values by reset().
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Reset the base prober and clear all sequence statistics."""
        super(SingleByteCharSetProber, self).reset()
        # char order of last character; 255 is >= SAMPLE_SIZE, so the first
        # byte fed never completes a counted pair.
        self._last_order = 255
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the name prober if one is set, else the model's."""
        if self._name_prober:
            return self._name_prober.charset_name
        else:
            return self._model.charset_name

    @property
    def language(self):
        """Language from the name prober if one is set, else the model's."""
        if self._name_prober:
            return self._name_prober.language
        else:
            return self._model.language

    def feed(self, byte_str):
        """Update statistics with ``byte_str`` and return ``self.state``.

        Once more than ``SB_ENOUGH_REL_THRESHOLD`` sequences have been
        counted while the state is ``DETECTING``, the state is switched to
        ``FOUND_IT`` or ``NOT_ME`` if the confidence crosses the positive or
        negative shortcut threshold respectively.
        """
        # TODO: Make filter_international_words keep things in self.alphabet
        if not self._model.keep_ascii_letters:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state
        char_to_order_map = self._model.char_to_order_map
        language_model = self._model.language_model
        for char in byte_str:
            order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
            #      to make it closer to the original intent. The only
            #      difference is whether or not we count digits and control
            #      characters for _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            # TODO: Follow uchardet's lead and discount confidence for frequent
            #       control characters.
            #       See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
            if order < self.SAMPLE_SIZE:
                self._freq_char += 1
                # A pair is counted only when the previous order was also
                # inside the sampling range.
                if self._last_order < self.SAMPLE_SIZE:
                    self._total_seqs += 1
                    if not self._reversed:
                        lm_cat = language_model[self._last_order][order]
                    else:
                        lm_cat = language_model[order][self._last_order]
                    self._seq_counters[lm_cat] += 1
            self._last_order = order

        if self.state == ProbingState.DETECTING:
            if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
                # The model's own name is logged, not the name prober's.
                charset_name = self._model.charset_name
                confidence = self.get_confidence()
                if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, we have a winner',
                                      charset_name, confidence)
                    self._state = ProbingState.FOUND_IT
                elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, below negative '
                                      'shortcut threshold %s', charset_name,
                                      confidence,
                                      self.NEGATIVE_SHORTCUT_THRESHOLD)
                    self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the current confidence score.

        Returns 0.01 when no sequence has been counted yet.  Otherwise the
        share of POSITIVE sequences is divided by the model's
        ``typical_positive_ratio`` and scaled by the fraction of counted
        characters that fell inside the sampling range; results of 1.0 or
        more are reported as 0.99.
        """
        r = 0.01
        if self._total_seqs > 0:
            r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
                 self._total_seqs / self._model.typical_positive_ratio)
            # _total_char is non-zero here: every counted sequence needs an
            # order below SAMPLE_SIZE, and feed() increments _total_char for
            # any order below CharacterCategory.CONTROL (assumed to be
            # greater than SAMPLE_SIZE - confirm against .enums).
            r = r * self._freq_char / self._total_char
            if r >= 1.0:
                r = 0.99
        return r
