Mercurial > repos > guerler > hhblits
comparison lib/python3.8/site-packages/pip/_vendor/chardet/charsetprober.py @ 1:64071f2a4cf0 draft default tip
Deleted selected files
author | guerler |
---|---|
date | Mon, 27 Jul 2020 03:55:49 -0400 |
parents | 9e54283cc701 |
children |
comparison
equal
deleted
inserted
replaced
0:9e54283cc701 | 1:64071f2a4cf0 |
---|---|
1 ######################## BEGIN LICENSE BLOCK ######################## | |
2 # The Original Code is Mozilla Universal charset detector code. | |
3 # | |
4 # The Initial Developer of the Original Code is | |
5 # Netscape Communications Corporation. | |
6 # Portions created by the Initial Developer are Copyright (C) 2001 | |
7 # the Initial Developer. All Rights Reserved. | |
8 # | |
9 # Contributor(s): | |
10 # Mark Pilgrim - port to Python | |
11 # Shy Shalom - original C code | |
12 # | |
13 # This library is free software; you can redistribute it and/or | |
14 # modify it under the terms of the GNU Lesser General Public | |
15 # License as published by the Free Software Foundation; either | |
16 # version 2.1 of the License, or (at your option) any later version. | |
17 # | |
18 # This library is distributed in the hope that it will be useful, | |
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 # Lesser General Public License for more details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public | |
24 # License along with this library; if not, write to the Free Software | |
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
26 # 02110-1301 USA | |
27 ######################### END LICENSE BLOCK ######################### | |
28 | |
29 import logging | |
30 import re | |
31 | |
32 from .enums import ProbingState | |
33 | |
34 | |
class CharSetProber(object):
    r"""Base class for charset probers.

    The probing hooks defined here are deliberate no-ops: ``feed`` does
    nothing, ``charset_name`` is ``None`` and ``get_confidence`` returns
    ``0.0``.  Presumably concrete probers override them (the subclasses are
    not part of this module).  The class also bundles three static
    byte-buffer filters that strip ASCII noise before analysis.
    """

    # Not referenced inside this class; presumably a confidence cut-off
    # consulted by subclasses or callers -- confirm against them.
    SHORTCUT_THRESHOLD = 0.95

    # Patterns are compiled once at class creation instead of being looked
    # up on every filter call.  The pattern bytes are unchanged.
    _ASCII_RUN = re.compile(b'([\x00-\x7F])+')
    _INTERNATIONAL_WORD = re.compile(
        b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?')

    def __init__(self, lang_filter=None):
        """Create a prober.

        ``lang_filter`` is stored as-is on the instance.  The state stays
        ``None`` until ``reset()`` is called.
        """
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Put the prober (back) into the ``DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """Name of the detected charset; the base class has none."""
        return None

    def feed(self, buf):
        """Consume a chunk of bytes; a no-op in the base class."""
        pass

    @property
    def state(self):
        """Current probing state (``None`` before the first ``reset()``)."""
        return self._state

    def get_confidence(self):
        """Return the detection confidence; always ``0.0`` here."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """Collapse every run of ASCII bytes (0x00-0x7F) into one space.

        Bytes in the range 0x80-0xFF are kept untouched.
        """
        return CharSetProber._ASCII_RUN.sub(b' ', buf)

    @staticmethod
    def filter_international_words(buf):
        r"""
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [\x80-\xFF]
        marker: everything else [^a-zA-Z\x80-\xFF]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.

        Returns a ``bytearray`` with the retained words.
        """
        filtered = bytearray()

        # The pattern picks out only words that have at least one
        # international character. A word may include one marker character
        # at the end. Every match is non-empty because the pattern requires
        # at least one high byte.
        words = CharSetProber._INTERNATIONAL_WORD.findall(buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with
            # a space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies). Slicing keeps a bytes object rather than an int.
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b'\x80':
                last_char = b' '
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.

        The result is a ``bytearray`` in which each retained stretch that was
        ended by a delimiter is followed by a single space.
        """
        filtered = bytearray()
        in_tag = False
        # Index just past the most recent delimiter, i.e. where the current
        # candidate stretch of letters / high bytes begins.
        prev = 0

        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]
            # Check if we're coming out of or entering an HTML tag. The flag
            # is updated before the delimiter test below, which is why the
            # stretch directly before a '>' is kept.
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # If current character is not extended-ASCII and not alphabetic...
            if buf_char < b'\x80' and not buf_char.isalpha():
                # ...and we're not in a tag
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b' ')
                prev = curr + 1

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        return filtered