Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/chardet/charsetprober.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/chardet/charsetprober.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ -######################## BEGIN LICENSE BLOCK ######################## -# The Original Code is Mozilla Universal charset detector code. -# -# The Initial Developer of the Original Code is -# Netscape Communications Corporation. -# Portions created by the Initial Developer are Copyright (C) 2001 -# the Initial Developer. All Rights Reserved. -# -# Contributor(s): -# Mark Pilgrim - port to Python -# Shy Shalom - original C code -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA -# 02110-1301 USA -######################### END LICENSE BLOCK ######################### - -import logging -import re - -from .enums import ProbingState - - -class CharSetProber(object): - - SHORTCUT_THRESHOLD = 0.95 - - def __init__(self, lang_filter=None): - self._state = None - self.lang_filter = lang_filter - self.logger = logging.getLogger(__name__) - - def reset(self): - self._state = ProbingState.DETECTING - - @property - def charset_name(self): - return None - - def feed(self, buf): - pass - - @property - def state(self): - return self._state - - def get_confidence(self): - return 0.0 - - @staticmethod - def filter_high_byte_only(buf): - buf = re.sub(b'([\x00-\x7F])+', b' ', buf) - return buf - - @staticmethod - def filter_international_words(buf): - """ - We define three types of bytes: - alphabet: english alphabets [a-zA-Z] - international: international characters [\x80-\xFF] - marker: everything else [^a-zA-Z\x80-\xFF] - - The input buffer can be thought to contain a series of words delimited - by markers. This function works to filter all words that contain at - least one international character. All contiguous sequences of markers - are replaced by a single space ascii character. - - This filter applies to all scripts which do not use English characters. - """ - filtered = bytearray() - - # This regex expression filters out only words that have at-least one - # international character. The word may include one marker character at - # the end. - words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', - buf) - - for word in words: - filtered.extend(word[:-1]) - - # If the last character in the word is a marker, replace it with a - # space as markers shouldn't affect our analysis (they are used - # similarly across all languages and may thus have similar - # frequencies). - last_char = word[-1:] - if not last_char.isalpha() and last_char < b'\x80': - last_char = b' ' - filtered.extend(last_char) - - return filtered - - @staticmethod - def filter_with_english_letters(buf): - """ - Returns a copy of ``buf`` that retains only the sequences of English - alphabet and high byte characters that are not between <> characters. - Also retains English alphabet and high byte characters immediately - before occurrences of >. - - This filter can be applied to all scripts which contain both English - characters and extended ASCII characters, but is currently only used by - ``Latin1Prober``. - """ - filtered = bytearray() - in_tag = False - prev = 0 - - for curr in range(len(buf)): - # Slice here to get bytes instead of an int with Python 3 - buf_char = buf[curr:curr + 1] - # Check if we're coming out of or entering an HTML tag - if buf_char == b'>': - in_tag = False - elif buf_char == b'<': - in_tag = True - - # If current character is not extended-ASCII and not alphabetic... - if buf_char < b'\x80' and not buf_char.isalpha(): - # ...and we're not in a tag - if curr > prev and not in_tag: - # Keep everything after last non-extended-ASCII, - # non-alphabetic character - filtered.extend(buf[prev:curr]) - # Output a space to delimit stretch we kept - filtered.extend(b' ') - prev = curr + 1 - - # If we're not in a tag... - if not in_tag: - # Keep everything after last non-extended-ASCII, non-alphabetic - # character - filtered.extend(buf[prev:]) - - return filtered