Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/chardet/codingstatemachine.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 ######################## BEGIN LICENSE BLOCK ######################## | |
| 2 # The Original Code is mozilla.org code. | |
| 3 # | |
| 4 # The Initial Developer of the Original Code is | |
| 5 # Netscape Communications Corporation. | |
| 6 # Portions created by the Initial Developer are Copyright (C) 1998 | |
| 7 # the Initial Developer. All Rights Reserved. | |
| 8 # | |
| 9 # Contributor(s): | |
| 10 # Mark Pilgrim - port to Python | |
| 11 # | |
| 12 # This library is free software; you can redistribute it and/or | |
| 13 # modify it under the terms of the GNU Lesser General Public | |
| 14 # License as published by the Free Software Foundation; either | |
| 15 # version 2.1 of the License, or (at your option) any later version. | |
| 16 # | |
| 17 # This library is distributed in the hope that it will be useful, | |
| 18 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 20 # Lesser General Public License for more details. | |
| 21 # | |
| 22 # You should have received a copy of the GNU Lesser General Public | |
| 23 # License along with this library; if not, write to the Free Software | |
| 24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
| 25 # 02110-1301 USA | |
| 26 ######################### END LICENSE BLOCK ######################### | |
| 27 | |
| 28 import logging | |
| 29 | |
| 30 from .enums import MachineState | |
| 31 | |
| 32 | |
class CodingStateMachine(object):
    """
    Table-driven state machine that checks a byte stream against one encoding.

    The detector hands every incoming byte, one at a time, to each state
    machine that is still active. A machine derives its next state from the
    state it is currently in together with the byte it was just given. Three
    of the possible states matter to an auto-detector:

    START state: The initial state; it is also re-entered each time a legal
                 byte sequence (i.e. a valid code point) for a character has
                 been recognised.

    ME state: The machine has seen a byte sequence that is specific to the
              charset it was built for, and no other encoding can contain
              that sequence. This leads to an immediate positive answer
              from the detector.

    ERROR state: The machine has seen a byte sequence that is illegal for
                 its encoding. This leads to an immediate negative answer
                 for this encoding, and the detector excludes it from
                 consideration from then on.

    The model passed to the constructor is indexed by the keys
    'class_table', 'char_len_table', 'class_factor', 'state_table', 'name'
    and 'language'.
    """

    def __init__(self, sm):
        """Bind the machine to the model *sm* and put it in its initial state."""
        self.logger = logging.getLogger(__name__)
        self._model = sm
        # Bookkeeping for the character currently being read.
        self._curr_byte_pos = 0
        self._curr_char_len = 0
        # Placeholder only; reset() installs the real initial state.
        self._curr_state = None
        self.reset()

    def reset(self):
        """Return the machine to the START state."""
        self._curr_state = MachineState.START

    def next_state(self, c):
        """
        Consume the byte *c* and return the state the machine moves to.

        *c* is used as an index into the model's 'class_table'.
        """
        model = self._model
        # Every byte is first mapped to its byte class.
        cls_id = model['class_table'][c]
        if self._curr_state == MachineState.START:
            # At START this byte opens a new character, so restart the
            # position counter and look up the length recorded for its class.
            self._curr_byte_pos = 0
            self._curr_char_len = model['char_len_table'][cls_id]
        # 'state_table' is flat: the slot for a (state, byte class) pair is
        # state * class_factor + byte class.
        slot = self._curr_state * model['class_factor'] + cls_id
        following = model['state_table'][slot]
        self._curr_state = following
        self._curr_byte_pos += 1
        return following

    def get_current_charlen(self):
        """Return the character length stored by the last transition from START."""
        return self._curr_char_len

    def get_coding_state_machine(self):
        """Return the model's 'name' entry."""
        return self._model['name']

    @property
    def language(self):
        """The model's 'language' entry."""
        return self._model['language']
