annotate tools/filters/uniq.py @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 # Filename: uniq.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 # Author: Ian N. Schenck
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 # Version: 19/12/2005
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 # This script accepts an input file, an output file, a column
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 # delimiter, and a list of columns. The script then grabs unique
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 # lines based on the columns, and returns those records with a count
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 # of occurrences of each unique column, inserted before the columns.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 # This executes the command pipeline:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 # cut -f $fields | sort | uniq -c
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 # -i Input file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 # -o Output file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 # -d Delimiter
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 # -c Column list (Comma Separated)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 import re
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 import string
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 import commands
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 # This function is exceedingly useful, perhaps package for reuse?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def getopts(argv):
    """Collect ``-flag value`` pairs from an argument list into a dict.

    Any argument that starts with ``-`` is treated as a flag and the
    argument immediately after it is taken as its value, even if that
    value itself starts with ``-``. Arguments that are not flags, including
    empty strings, are skipped. If a flag occurs more than once, the last
    value wins.

    Args:
        argv: sequence of argument strings (the program name already
            removed). The sequence is not modified.

    Returns:
        A dict mapping each flag (with its leading dash) to its value.

    Raises:
        IndexError: if the last argument is a flag with no value after it.
            main() catches this to print the usage text.
    """
    opts = {}
    index = 0
    count = len(argv)
    while index < count:
        arg = argv[index]
        # startswith() is safe for "", whereas arg[0] would raise IndexError
        # and be mistaken for a "flag without value" usage error.
        if arg.startswith('-'):
            # Deliberately unguarded: a missing value raises IndexError.
            opts[arg] = argv[index + 1]
            index += 2
        else:
            index += 1
    return opts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def main():
    """Count unique values of the selected columns with a shell pipeline.

    Options are read from ``sys.argv``: ``-i`` input file, ``-o`` output
    file, ``-d`` delimiter code and ``-c`` comma-separated column numbers.
    After validation a ``cut | sed | sort | uniq -c | sed | tr`` pipeline is
    assembled and executed through the shell, redirecting its result to the
    output file. A one-line summary is printed afterwards.

    Returns:
        0 if the usage text was printed because a flag had no value;
        -1, -2, -3 or -4 if the output file, input file, delimiter or
        column list is missing; -4, -5 or -6 if the column list, output
        filename or input filename fails validation; otherwise the status
        returned by ``commands.getstatusoutput`` for the pipeline.
    """
    # NOTE: every print below passes one parenthesised string, which emits
    # the same text as the Python 2 print statement this script targets.
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        # getopts raises IndexError when a flag is not followed by a value.
        usage_lines = (
            "Usage:",
            " -i Input file",
            " -o Output file",
            " -c Column list (comma seperated)",
            " -d Delimiter:",
            " T Tab",
            " C Comma",
            " D Dash",
            " U Underscore",
            " P Pipe",
            " Dt Dot",
            " Sp Space",
        )
        for line in usage_lines:
            print(line)
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    # The literal string 'None' is also treated as "no columns given".
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # SECURITY: the values below are pasted into a shell command line, so
    # each pattern must cover the WHOLE string. re.match() only anchors the
    # start, hence the explicit \Z (which, unlike $, also rejects a trailing
    # newline). Without it a value such as "1;some-command" would pass.
    file_regex = re.compile(r"[A-Za-z0-9./\-_]+\Z")
    column_regex = re.compile(r"[0-9]+(,[0-9]+)*\Z")

    if not column_regex.match(columns):
        print("Illegal column specification.")
        return -4
    if not file_regex.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not file_regex.match(inputfile):
        print("Illegal input filename.")
        return -6

    # Human-readable column names, e.g. "c1, c2, " (the trailing separator
    # is kept so the printed summary is unchanged).
    columns_for_display = "".join("c" + col + ", " for col in columns.split(","))

    # Delimiter codes mapped to the character handed to `cut -d`. Any other
    # code (including 'T') adds no -d option, so cut uses its default, tab.
    delimiter_chars = {
        'C': ',',
        'D': '-',
        'U': '_',
        'P': '|',
        'Dt': '.',
        'Sp': ' ',
    }

    commandline = "cut "
    if delim in delimiter_chars:
        commandline += "-d \"" + delimiter_chars[delim] + "\" "
    commandline += "-f " + columns
    # Strip spaces, sort, count duplicates, drop the padding uniq puts in
    # front of each count, then turn the count/value separator into a tab.
    commandline += (" " + inputfile +
                    " | sed s/\\ //g | sort | uniq -c | sed s/^\\ *//"
                    " | tr \" \" \"\t\" > " + outputfile)
    errorcode, _ = commands.getstatusoutput(commandline)

    print("Count of unique values in " + columns_for_display)
    return errorcode
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
if __name__ == "__main__":
    # main() returns 0 when it succeeds or prints usage and a non-zero value
    # otherwise. Propagate that to the caller instead of discarding it. The
    # values can be negative (validation codes), so collapse them to a plain
    # 0/1 exit status rather than passing them through unchanged.
    sys.exit(1 if main() else 0)