Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/chardet/sbcharsetprober.py @ 2:6af9afd405e9 draft
"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
| author | shellac |
|---|---|
| date | Thu, 14 May 2020 14:56:58 -0400 |
| parents | 26e78fe6e8c4 |
| children |
comparison
equal
deleted
inserted
replaced
| 1:75ca89e9b81c | 2:6af9afd405e9 |
|---|---|
| 1 ######################## BEGIN LICENSE BLOCK ######################## | |
| 2 # The Original Code is Mozilla Universal charset detector code. | |
| 3 # | |
| 4 # The Initial Developer of the Original Code is | |
| 5 # Netscape Communications Corporation. | |
| 6 # Portions created by the Initial Developer are Copyright (C) 2001 | |
| 7 # the Initial Developer. All Rights Reserved. | |
| 8 # | |
| 9 # Contributor(s): | |
| 10 # Mark Pilgrim - port to Python | |
| 11 # Shy Shalom - original C code | |
| 12 # | |
| 13 # This library is free software; you can redistribute it and/or | |
| 14 # modify it under the terms of the GNU Lesser General Public | |
| 15 # License as published by the Free Software Foundation; either | |
| 16 # version 2.1 of the License, or (at your option) any later version. | |
| 17 # | |
| 18 # This library is distributed in the hope that it will be useful, | |
| 19 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 21 # Lesser General Public License for more details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public | |
| 24 # License along with this library; if not, write to the Free Software | |
| 25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
| 26 # 02110-1301 USA | |
| 27 ######################### END LICENSE BLOCK ######################### | |
| 28 | |
| 29 from .charsetprober import CharSetProber | |
| 30 from .enums import CharacterCategory, ProbingState, SequenceLikelihood | |
| 31 | |
| 32 | |
class SingleByteCharSetProber(CharSetProber):
    """Charset prober that scores input against a single-byte language model.

    The model is a mapping that this class reads the following keys from:
    ``'charset_name'``, ``'language'`` (optional), ``'keep_english_letter'``,
    ``'char_to_order_map'``, ``'precedence_matrix'`` and
    ``'typical_positive_ratio'``.

    Each fed byte is translated to an "order" through ``char_to_order_map``.
    Pairs of consecutive bytes whose orders are both below ``SAMPLE_SIZE``
    are looked up in the flattened ``precedence_matrix`` and tallied per
    likelihood category; the tallies drive :meth:`get_confidence`.
    """

    # Only orders below this value take part in pair (sequence) statistics.
    SAMPLE_SIZE = 64
    # Number of counted pairs required before feed() may shortcut the state.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    # Confidence above this switches the state to FOUND_IT.
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # Confidence below this switches the state to NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Create a prober for ``model``.

        :param model: mapping describing the charset (see class docstring).
        :param reversed: when true, every pair is looked up in the model
            with its two orders swapped.
        :param name_prober: optional auxiliary prober; when set, it supplies
            ``charset_name`` and ``language`` instead of the model.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        # The counters below receive their real starting values in reset().
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Reset the base prober and clear all accumulated statistics."""
        super(SingleByteCharSetProber, self).reset()
        # char order of last character; 255 is not below SAMPLE_SIZE, so the
        # first byte fed after a reset never completes a pair.
        self._last_order = 255
        # One tally per SequenceLikelihood category.
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the auxiliary prober if set, else from the model."""
        if self._name_prober:
            return self._name_prober.charset_name
        else:
            return self._model['charset_name']

    @property
    def language(self):
        """Language from the auxiliary prober if set, else from the model.

        The model's ``'language'`` entry is optional; ``None`` is returned
        when it is absent.
        """
        if self._name_prober:
            return self._name_prober.language
        else:
            return self._model.get('language')

    def feed(self, byte_str):
        """Update the statistics with ``byte_str`` and return the state.

        Unless the model sets ``'keep_english_letter'``, the input is first
        passed through ``filter_international_words``. Empty input (before
        or after filtering) leaves the statistics untouched.

        :param byte_str: bytes to analyse; iterating it must yield values
            usable as indexes into the model's ``char_to_order_map``.
        :returns: the current probing state.
        """
        if not self._model['keep_english_letter']:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        char_to_order_map = self._model['char_to_order_map']
        sample_size = self.SAMPLE_SIZE
        for byte in byte_str:
            # XXX: Order is in range 1-64, so one would think we want 0-63
            #      here, but that leads to 27 more test failures than before.
            order = char_to_order_map[byte]
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
            #      to make it closer to the original intent. The only
            #      difference is whether or not we count digits and control
            #      characters for _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < sample_size:
                self._freq_char += 1
                # A pair is only counted when both orders are inside the
                # sampling range, which also keeps the flattened index within
                # a SAMPLE_SIZE x SAMPLE_SIZE matrix.
                if self._last_order < sample_size:
                    self._total_seqs += 1
                    if not self._reversed:
                        index = (self._last_order * sample_size) + order
                    else:  # reverse the order of the letters in the lookup
                        index = (order * sample_size) + self._last_order
                    likelihood = self._model['precedence_matrix'][index]
                    self._seq_counters[likelihood] += 1
            self._last_order = order

        charset_name = self._model['charset_name']
        if self.state == ProbingState.DETECTING:
            # Only shortcut once enough pairs have been seen.
            if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
                confidence = self.get_confidence()
                if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, we have a winner',
                                      charset_name, confidence)
                    self._state = ProbingState.FOUND_IT
                elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, below negative '
                                      'shortcut threshold %s', charset_name,
                                      confidence,
                                      self.NEGATIVE_SHORTCUT_THRESHOLD)
                    self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the confidence that the input is in this model's charset.

        With no pairs counted yet the result is 0.01. Otherwise it is the
        share of POSITIVE pairs among all counted pairs, divided by the
        model's ``'typical_positive_ratio'`` and scaled by the fraction of
        counted characters that fell inside the sampling range. Results of
        1.0 or more are reported as 0.99.
        """
        r = 0.01
        if self._total_seqs > 0:
            r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
                 self._total_seqs / self._model['typical_positive_ratio'])
            # _total_char is non-zero here: feed() only counts a pair for a
            # byte it has also counted in _total_char.
            r = r * self._freq_char / self._total_char
            if r >= 1.0:
                r = 0.99
        return r
