Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/chardet/charsetprober.py @ 2:6af9afd405e9 draft
"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
| author | shellac |
|---|---|
| date | Thu, 14 May 2020 14:56:58 -0400 |
| parents | 26e78fe6e8c4 |
| children |
comparison
equal
deleted
inserted
replaced
| 1:75ca89e9b81c | 2:6af9afd405e9 |
|---|---|
| 1 ######################## BEGIN LICENSE BLOCK ######################## | |
| 2 # The Original Code is Mozilla Universal charset detector code. | |
| 3 # | |
| 4 # The Initial Developer of the Original Code is | |
| 5 # Netscape Communications Corporation. | |
| 6 # Portions created by the Initial Developer are Copyright (C) 2001 | |
| 7 # the Initial Developer. All Rights Reserved. | |
| 8 # | |
| 9 # Contributor(s): | |
| 10 # Mark Pilgrim - port to Python | |
| 11 # Shy Shalom - original C code | |
| 12 # | |
| 13 # This library is free software; you can redistribute it and/or | |
| 14 # modify it under the terms of the GNU Lesser General Public | |
| 15 # License as published by the Free Software Foundation; either | |
| 16 # version 2.1 of the License, or (at your option) any later version. | |
| 17 # | |
| 18 # This library is distributed in the hope that it will be useful, | |
| 19 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 21 # Lesser General Public License for more details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public | |
| 24 # License along with this library; if not, write to the Free Software | |
| 25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
| 26 # 02110-1301 USA | |
| 27 ######################### END LICENSE BLOCK ######################### | |
| 28 | |
| 29 import logging | |
| 30 import re | |
| 31 | |
| 32 from .enums import ProbingState | |
| 33 | |
| 34 | |
class CharSetProber(object):
    """Base class for all charset probers.

    A prober consumes byte buffers via :meth:`feed` and exposes its
    guess through :attr:`charset_name` together with a confidence score
    from :meth:`get_confidence`.  This base implementation detects
    nothing; subclasses override the relevant hooks.
    """

    # A prober whose confidence exceeds this value may stop probing early.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        """
        :param lang_filter: optional language-filter bitmask restricting
            which languages this prober should consider (semantics defined
            by the caller; presumably a LanguageFilter enum — confirm).
        """
        self._state = None  # becomes a ProbingState after reset()
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Return the prober to its initial DETECTING state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """Name of the detected charset; the base class knows none."""
        return None

    def feed(self, buf):
        """Consume a chunk of bytes.  No-op here; subclasses override."""
        pass

    @property
    def state(self):
        """Current probing state (``None`` before the first ``reset()``)."""
        return self._state

    def get_confidence(self):
        """Confidence in [0.0, 1.0] that ``charset_name`` is correct."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """Collapse every run of ASCII bytes (0x00-0x7F) in ``buf`` into a
        single space, leaving only the high (>= 0x80) bytes for analysis.
        """
        # FIX: the original pattern was b'([\x00-\x7F])+' -- capturing a
        # single byte and repeating the group is pointless; repeating the
        # character class directly is equivalent and avoids per-repetition
        # group bookkeeping.
        buf = re.sub(b'[\x00-\x7F]+', b' ', buf)
        return buf

    @staticmethod
    def filter_international_words(buf):
        r"""
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [\x80-\xFF]
        marker: everything else [^a-zA-Z\x80-\xFF]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.
        """
        # NOTE: the docstring above is now a raw string (FIX).  Previously
        # \x80 and \xFF were interpreted as escape sequences, embedding
        # literal control characters in __doc__.
        filtered = bytearray()

        # This regex matches only words that have at least one international
        # character.  The word may include one trailing marker character.
        words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
                           buf)

        for word in words:
            # Everything but the (possible) trailing marker is kept verbatim.
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with a
            # space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies).
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b'\x80':
                last_char = b' '
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        prev = 0

        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]
            # Check if we're coming out of or entering an HTML tag.
            # NOTE: in_tag is updated BEFORE the keep/drop decision below,
            # which is why the byte run ending at '>' is kept while the run
            # ending at '<' is dropped -- deliberate, do not reorder.
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # If current character is not extended-ASCII and not alphabetic...
            if buf_char < b'\x80' and not buf_char.isalpha():
                # ...and we're not in a tag
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b' ')
                prev = curr + 1

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        return filtered
