annotate env/lib/python3.7/site-packages/chardet/cli/chardetect.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
1 #!/usr/bin/env python
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
2 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
3 Script which takes one or more file paths and reports on their detected
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
4 encodings
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
5
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
6 Example::
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
7
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
8 % chardetect somefile someotherfile
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
9 somefile: windows-1252 with confidence 0.5
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
10 someotherfile: ascii with confidence 1.0
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
11
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
12 If no paths are provided, it takes its input from stdin.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
13
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
14 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
15
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
16 from __future__ import absolute_import, print_function, unicode_literals
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
17
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
18 import argparse
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
19 import sys
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
20
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
21 from chardet import __version__
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
22 from chardet.compat import PY2
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
23 from chardet.universaldetector import UniversalDetector
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
24
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
25
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
26 def description_of(lines, name='stdin'):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
27 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
28 Return a string describing the probable encoding of a file or
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
29 list of strings.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
30
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
31 :param lines: The lines to get the encoding of.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
32 :type lines: Iterable of bytes
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
33 :param name: Name of file or collection of lines
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
34 :type name: str
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
35 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
36 u = UniversalDetector()
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
37 for line in lines:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
38 line = bytearray(line)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
39 u.feed(line)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
40 # shortcut out of the loop to save reading further - particularly useful if we read a BOM.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
41 if u.done:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
42 break
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
43 u.close()
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
44 result = u.result
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
45 if PY2:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
46 name = name.decode(sys.getfilesystemencoding(), 'ignore')
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
47 if result['encoding']:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
48 return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
49 result['confidence'])
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
50 else:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
51 return '{0}: no result'.format(name)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
52
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
53
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
54 def main(argv=None):
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
55 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
56 Handles command line arguments and gets things started.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
57
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
58 :param argv: List of arguments, as if specified on the command-line.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
59 If None, ``sys.argv[1:]`` is used instead.
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
60 :type argv: list of str
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
61 """
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
62 # Get command line arguments
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
63 parser = argparse.ArgumentParser(
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
64 description="Takes one or more file paths and reports their detected \
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
65 encodings")
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
66 parser.add_argument('input',
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
67 help='File whose encoding we would like to determine. \
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
68 (default: stdin)',
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
69 type=argparse.FileType('rb'), nargs='*',
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
70 default=[sys.stdin if PY2 else sys.stdin.buffer])
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
71 parser.add_argument('--version', action='version',
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
72 version='%(prog)s {0}'.format(__version__))
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
73 args = parser.parse_args(argv)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
74
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
75 for f in args.input:
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
76 if f.isatty():
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
77 print("You are running chardetect interactively. Press " +
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
78 "CTRL-D twice at the start of a blank line to signal the " +
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
79 "end of your input. If you want help, run chardetect " +
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
80 "--help\n", file=sys.stderr)
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
81 print(description_of(f, f.name))
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
82
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
83
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
84 if __name__ == '__main__':
26e78fe6e8c4 "planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff changeset
85 main()