0
|
1 # Filename: uniq.py
|
|
2 # Author: Ian N. Schenck
|
|
3 # Version: 19/12/2005
|
|
4 #
|
|
5 # This script accepts an input file, an output file, a column
|
|
6 # delimiter, and a list of columns. The script then grabs unique
|
|
7 # lines based on the columns, and returns those records with a count
|
|
8 # of occurrences of each unique column, inserted before the columns.
|
|
9 #
|
|
10 # This executes the command pipeline:
|
|
11 # cut -f $fields | sort | uniq -c
|
|
12 #
|
|
13 # -i Input file
|
|
14 # -o Output file
|
|
15 # -d Delimiter
|
|
16 # -c Column list (Comma Separated)
|
|
17
|
|
18 import sys
|
|
19 import re
|
|
20 import string
|
|
21 import commands
|
|
22
|
|
23 # This function is exceedingly useful, perhaps package for reuse?
|
|
def getopts(argv):
    """Collect "-flag value" pairs from a list of command-line arguments.

    Every argument that starts with '-' is taken as an option name, and the
    argument immediately after it as that option's value (even when that
    value itself starts with '-').  Arguments that are not option names are
    skipped.  When an option is repeated, the last value wins.

    argv -- sequence of argument strings, e.g. sys.argv[1:]

    Returns a dict mapping each option name (dash included) to its value.

    Raises IndexError when the final argument is an option name with no
    value after it; main() catches this to print the usage text.
    """
    opts = {}
    pos = 0
    while pos < len(argv):
        arg = argv[pos]
        # startswith() rather than arg[0]: an empty-string argument is just
        # another non-option word and must not raise IndexError, which the
        # caller would misread as "option is missing its value".
        if arg.startswith('-'):
            opts[arg] = argv[pos + 1]
            pos += 2
        else:
            pos += 1
    return opts
|
|
33
|
|
def main():
    """Validate the command-line options, then run the cut/sort/uniq pipeline.

    Options are read from sys.argv via getopts():
        -i  input file
        -o  output file
        -d  delimiter code (C, D, U, P, Dt, Sp; any other value, including
            T, adds no -d flag so cut uses its own default delimiter)
        -c  column numbers separated by commas

    The pipeline cuts the requested columns from the input file, removes
    spaces, sorts, counts duplicates with `uniq -c`, strips the leading
    padding, turns the remaining spaces into tabs and writes the result to
    the output file.

    Returns:
         0  usage text was shown (an option had no value)
        -1  no output file given
        -2  no input file given
        -3  no delimiter given
        -4  columns missing or malformed
        -5  output filename contains illegal characters
        -6  input filename contains illegal characters
        otherwise the status reported by commands.getstatusoutput().
    """
    # Single-argument print(...) is used throughout: as a Python 2 print
    # statement it emits exactly the same text as the unparenthesised form.
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        # getopts() raises IndexError when an option has no value after it.
        usage_lines = (
            "Usage:",
            " -i Input file",
            " -o Output file",
            " -c Column list (comma seperated)",
            " -d Delimiter:",
            " T Tab",
            " C Comma",
            " D Dash",
            " U Underscore",
            " P Pipe",
            " Dt Dot",
            " Sp Space",
        )
        for line in usage_lines:
            print(line)
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # Everything below is pasted into a shell command line, so each value
    # must match its pattern in full.  \Z (not $) is used because $ also
    # matches just before a trailing newline.
    # Columns: digit runs separated by single commas, optional trailing
    # comma.  The earlier unanchored pattern accepted any string that merely
    # started with a digit, e.g. "1;some-command".
    if not re.match(r"^[0-9]+(,[0-9]+)*,?\Z", columns):
        print("Illegal column specification.")
        return -4
    filename_pattern = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
    if not filename_pattern.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not filename_pattern.match(inputfile):
        print("Illegal input filename.")
        return -6

    columns_for_display = "".join(
        ["c" + col + ", " for col in columns.split(",")])

    # Delimiter code -> character handed to `cut -d`.  Codes not listed here
    # add no -d flag at all.
    delimiter_chars = {
        'C': ',',
        'D': '-',
        'U': '_',
        'P': '|',
        'Dt': '.',
        'Sp': ' ',
    }
    commandline = "cut "
    if delim in delimiter_chars:
        commandline += "-d \"" + delimiter_chars[delim] + "\" "

    commandline += "-f " + columns
    commandline += (
        " " + inputfile
        + " | sed s/\\ //g | sort | uniq -c | sed s/^\\ *// | tr \" \" \"\t\" > "
        + outputfile
    )

    # NOTE(review): the status of a shell pipeline is that of its last
    # stage (tr), so a failing cut may still report 0 here.
    errorcode, output = commands.getstatusoutput(commandline)
    if output:
        # The pipeline's standard output is redirected into outputfile, so
        # any text captured here is diagnostic output; show it rather than
        # silently dropping it.
        sys.stderr.write(output + "\n")

    print("Count of unique values in " + columns_for_display)
    return errorcode
|
|
115
|
|
if __name__ == "__main__":
    # main() returns a status code (0, -1 .. -6, or the pipeline status);
    # hand it to the shell instead of discarding it and always exiting 0.
    sys.exit(main())
|