Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/chardet/metadata/languages.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac |
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 """ | |
| 4 Metadata about languages used by our model training code for our | |
| 5 SingleByteCharSetProbers. Could be used for other things in the future. | |
| 6 | |
| 7 This code is based on the language metadata from the uchardet project. | |
| 8 """ | |
| 9 from __future__ import absolute_import, print_function | |
| 10 | |
| 11 from string import ascii_letters | |
| 12 | |
| 13 | |
| 14 # TODO: Add Ukrainian (KOI8-U) | |
| 15 | |
class Language(object):
    """Metadata about a language useful for training models.

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
                    `True`, you only need to add those not in the ASCII set.
                    Stored de-duplicated and sorted by code point.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
                 alphabet=None, wiki_start_pages=None):
        """Store the metadata and normalize ``alphabet``.

        :raises ValueError: if ``use_ascii`` is false and no (non-empty)
                            ``alphabet`` is supplied.
        """
        super(Language, self).__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            # ASCII letters are always part of the alphabet in this mode; the
            # caller only supplies the extra, non-ASCII characters.
            if alphabet:
                alphabet += ascii_letters
            else:
                alphabet = ascii_letters
        elif not alphabet:
            raise ValueError('Must supply alphabet if use_ascii is False')
        # ``alphabet`` is guaranteed non-empty here (either ASCII letters were
        # added or we raised above), so no ``None`` fallback is needed.
        self.alphabet = ''.join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        public_attrs = ', '.join('{}={!r}'.format(key, value)
                                 for key, value in self.__dict__.items()
                                 if not key.startswith('_'))
        return '{}({})'.format(self.__class__.__name__, public_attrs)
| 58 | |
| 59 | |
# Maps each supported language's English name to its Language metadata
# (ISO code, target charsets, alphabet and Wikipedia crawl start pages).
LANGUAGES = {
    'Arabic': Language(
        name='Arabic',
        iso_code='ar',
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): 'CP864' is listed below even though the comment above
        # says IBM864 is skipped; Python's codec registry treats IBM864 as an
        # alias of cp864 -- confirm which of the two is intended.
        charsets=['ISO-8859-6', 'WINDOWS-1256',
                  'CP720', 'CP864'],
        alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
        wiki_start_pages=[u'الصفحة_الرئيسية']),
    'Belarusian': Language(
        name='Belarusian',
        iso_code='be',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'IBM866', 'MacCyrillic'],
        alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
                  u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
        wiki_start_pages=[u'Галоўная_старонка']),
    'Bulgarian': Language(
        name='Bulgarian',
        iso_code='bg',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'IBM855'],
        alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
                  u'абвгдежзийклмнопрстуфхцчшщъьюя'),
        wiki_start_pages=[u'Начална_страница']),
    'Czech': Language(
        name='Czech',
        # ISO 639-1 language code for Czech is 'cs' ('cz' is the ISO 3166
        # country code for Czechia, not a language code).
        iso_code='cs',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
        wiki_start_pages=[u'Hlavní_strana']),
    'Danish': Language(
        name='Danish',
        iso_code='da',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'æøåÆØÅ',
        wiki_start_pages=[u'Forside']),
    'German': Language(
        name='German',
        iso_code='de',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        alphabet=u'äöüßÄÖÜ',
        wiki_start_pages=[u'Wikipedia:Hauptseite']),
    'Greek': Language(
        name='Greek',
        iso_code='el',
        use_ascii=False,
        charsets=['ISO-8859-7', 'WINDOWS-1253'],
        alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
                  u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
        wiki_start_pages=[u'Πύλη:Κύρια']),
    'English': Language(
        name='English',
        iso_code='en',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Main_Page']),
    'Esperanto': Language(
        name='Esperanto',
        iso_code='eo',
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=['ISO-8859-3'],
        alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
                  u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
        wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
    'Spanish': Language(
        name='Spanish',
        iso_code='es',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
        wiki_start_pages=[u'Wikipedia:Portada']),
    'Estonian': Language(
        name='Estonian',
        iso_code='et',
        use_ascii=False,
        charsets=['ISO-8859-4', 'ISO-8859-13',
                  'WINDOWS-1257'],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
                  u'abdeghijklmnoprstuvõäöü'),
        wiki_start_pages=[u'Esileht']),
    'Finnish': Language(
        name='Finnish',
        iso_code='fi',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÅÄÖŠŽåäöšž',
        wiki_start_pages=[u'Wikipedia:Etusivu']),
    'French': Language(
        name='French',
        iso_code='fr',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
        wiki_start_pages=[u'Wikipédia:Accueil_principal',
                          u'Bœuf (animal)']),
    'Hebrew': Language(
        name='Hebrew',
        iso_code='he',
        use_ascii=False,
        charsets=['ISO-8859-8', 'WINDOWS-1255'],
        alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
        wiki_start_pages=[u'עמוד_ראשי']),
    'Croatian': Language(
        name='Croatian',
        iso_code='hr',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
                  u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stranica']),
    'Hungarian': Language(
        name='Hungarian',
        iso_code='hu',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
                  u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
        wiki_start_pages=[u'Kezdőlap']),
    'Italian': Language(
        name='Italian',
        iso_code='it',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
        wiki_start_pages=[u'Pagina_principale']),
    'Lithuanian': Language(
        name='Lithuanian',
        iso_code='lt',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257',
                  'ISO-8859-4'],
        # Q, W, and X not used at all
        alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
                  u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
        wiki_start_pages=[u'Pagrindinis_puslapis']),
    'Latvian': Language(
        name='Latvian',
        iso_code='lv',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257',
                  'ISO-8859-4'],
        # Q, W, X, Y are only for loanwords
        alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
                  u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
        wiki_start_pages=[u'Sākumlapa']),
    'Macedonian': Language(
        name='Macedonian',
        iso_code='mk',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'MacCyrillic', 'IBM855'],
        alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
                  u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
        wiki_start_pages=[u'Главна_страница']),
    'Dutch': Language(
        name='Dutch',
        iso_code='nl',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Hoofdpagina']),
    'Polish': Language(
        name='Polish',
        iso_code='pl',
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
                  u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
        wiki_start_pages=[u'Wikipedia:Strona_główna']),
    'Portuguese': Language(
        name='Portuguese',
        iso_code='pt',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
        wiki_start_pages=[u'Wikipédia:Página_principal']),
    'Romanian': Language(
        name='Romanian',
        iso_code='ro',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'ăâîșțĂÂÎȘȚ',
        wiki_start_pages=[u'Pagina_principală']),
    'Russian': Language(
        name='Russian',
        iso_code='ru',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'KOI8-R', 'MacCyrillic', 'IBM866',
                  'IBM855'],
        alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
                  u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
        wiki_start_pages=[u'Заглавная_страница']),
    'Slovak': Language(
        name='Slovak',
        iso_code='sk',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
        wiki_start_pages=[u'Hlavná_stránka']),
    'Slovene': Language(
        name='Slovene',
        iso_code='sl',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčdefghijklmnoprsštuvzž'
                  u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stran']),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    # NOTE(review): use_ascii is not passed here, so it defaults to True and
    # ASCII letters are merged into this Cyrillic alphabet, unlike the other
    # Cyrillic entries in this table -- confirm this is intended.
    'Serbian': Language(
        name='Serbian',
        iso_code='sr',
        alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
                  u'абвгдђежзијклљмнњопрстћуфхцчџш'),
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'MacCyrillic', 'IBM855'],
        wiki_start_pages=[u'Главна_страна']),
    'Thai': Language(
        name='Thai',
        iso_code='th',
        use_ascii=False,
        charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
        alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
        wiki_start_pages=[u'หน้าหลัก']),
    'Turkish': Language(
        name='Turkish',
        iso_code='tr',
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=['ISO-8859-3', 'ISO-8859-9',
                  'WINDOWS-1254'],
        alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
                  u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
        wiki_start_pages=[u'Ana_Sayfa']),
    'Vietnamese': Language(
        name='Vietnamese',
        iso_code='vi',
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=['WINDOWS-1258'],
        alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
                  u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
        wiki_start_pages=[u'Chữ_Quốc_ngữ']),
}
