annotate env/lib/python3.7/site-packages/chardet/charsetprober.py @ 4:79f47841a781 draft

"planemo upload commit 2a0fe2cc28b09e101d37293e53e82f61762262ec"
author shellac
date Thu, 14 May 2020 16:47:39 -0400
parents 26e78fe6e8c4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
1 ######################## BEGIN LICENSE BLOCK ########################
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
2 # The Original Code is Mozilla Universal charset detector code.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
3 #
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
4 # The Initial Developer of the Original Code is
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
5 # Netscape Communications Corporation.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
6 # Portions created by the Initial Developer are Copyright (C) 2001
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
7 # the Initial Developer. All Rights Reserved.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
8 #
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
9 # Contributor(s):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
10 # Mark Pilgrim - port to Python
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
11 # Shy Shalom - original C code
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
12 #
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
13 # This library is free software; you can redistribute it and/or
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
14 # modify it under the terms of the GNU Lesser General Public
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
15 # License as published by the Free Software Foundation; either
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
16 # version 2.1 of the License, or (at your option) any later version.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
17 #
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
18 # This library is distributed in the hope that it will be useful,
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
21 # Lesser General Public License for more details.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
22 #
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
24 # License along with this library; if not, write to the Free Software
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
26 # 02110-1301 USA
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
27 ######################### END LICENSE BLOCK #########################
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
28
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
29 import logging
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
30 import re
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
31
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
32 from .enums import ProbingState
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
33
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
34
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
35 class CharSetProber(object):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
36
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
37 SHORTCUT_THRESHOLD = 0.95
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
38
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
39 def __init__(self, lang_filter=None):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
40 self._state = None
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
41 self.lang_filter = lang_filter
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
42 self.logger = logging.getLogger(__name__)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
43
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
44 def reset(self):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
45 self._state = ProbingState.DETECTING
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
46
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
47 @property
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
48 def charset_name(self):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
49 return None
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
50
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
51 def feed(self, buf):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
52 pass
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
53
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
54 @property
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
55 def state(self):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
56 return self._state
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
57
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
58 def get_confidence(self):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
59 return 0.0
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
60
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
61 @staticmethod
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
62 def filter_high_byte_only(buf):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
63 buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
64 return buf
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
65
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
66 @staticmethod
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
67 def filter_international_words(buf):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
68 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
69 We define three types of bytes:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
70 alphabet: english alphabets [a-zA-Z]
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
71 international: international characters [\x80-\xFF]
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
72 marker: everything else [^a-zA-Z\x80-\xFF]
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
73
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
74 The input buffer can be thought to contain a series of words delimited
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
75 by markers. This function works to filter all words that contain at
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
76 least one international character. All contiguous sequences of markers
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
77 are replaced by a single space ascii character.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
78
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
79 This filter applies to all scripts which do not use English characters.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
80 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
81 filtered = bytearray()
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
82
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
83 # This regex expression filters out only words that have at-least one
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
84 # international character. The word may include one marker character at
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
85 # the end.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
86 words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
87 buf)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
88
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
89 for word in words:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
90 filtered.extend(word[:-1])
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
91
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
92 # If the last character in the word is a marker, replace it with a
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
93 # space as markers shouldn't affect our analysis (they are used
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
94 # similarly across all languages and may thus have similar
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
95 # frequencies).
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
96 last_char = word[-1:]
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
97 if not last_char.isalpha() and last_char < b'\x80':
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
98 last_char = b' '
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
99 filtered.extend(last_char)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
100
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
101 return filtered
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
102
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
103 @staticmethod
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
104 def filter_with_english_letters(buf):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
105 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
106 Returns a copy of ``buf`` that retains only the sequences of English
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
107 alphabet and high byte characters that are not between <> characters.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
108 Also retains English alphabet and high byte characters immediately
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
109 before occurrences of >.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
110
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
111 This filter can be applied to all scripts which contain both English
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
112 characters and extended ASCII characters, but is currently only used by
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
113 ``Latin1Prober``.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
114 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
115 filtered = bytearray()
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
116 in_tag = False
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
117 prev = 0
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
118
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
119 for curr in range(len(buf)):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
120 # Slice here to get bytes instead of an int with Python 3
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
121 buf_char = buf[curr:curr + 1]
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
122 # Check if we're coming out of or entering an HTML tag
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
123 if buf_char == b'>':
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
124 in_tag = False
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
125 elif buf_char == b'<':
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
126 in_tag = True
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
127
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
128 # If current character is not extended-ASCII and not alphabetic...
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
129 if buf_char < b'\x80' and not buf_char.isalpha():
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
130 # ...and we're not in a tag
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
131 if curr > prev and not in_tag:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
132 # Keep everything after last non-extended-ASCII,
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
133 # non-alphabetic character
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
134 filtered.extend(buf[prev:curr])
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
135 # Output a space to delimit stretch we kept
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
136 filtered.extend(b' ')
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
137 prev = curr + 1
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
138
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
139 # If we're not in a tag...
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
140 if not in_tag:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
141 # Keep everything after last non-extended-ASCII, non-alphabetic
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
142 # character
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
143 filtered.extend(buf[prev:])
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
144
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
145 return filtered