annotate tools/stats/gsummary.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import sys, re, tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 from rpy import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Older py compatibility: fall back to sets.Set when the interpreter has no
# builtin ``set``.  Only NameError signals a missing builtin; a bare
# ``except`` would also hide unrelated failures such as KeyboardInterrupt.
try:
    set()
except NameError:
    from sets import Set as set
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def stop_err( msg ):
    """Report a fatal error and terminate the script.

    Writes ``msg`` unchanged (no newline is appended) to standard error and
    raises SystemExit with a non-zero status, so callers that inspect the
    exit code see a failure rather than a success.

    msg -- text of the error message to emit.
    """
    sys.stderr.write( msg )
    # A bare sys.exit() would report status 0 (success) after an error.
    sys.exit( 1 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def S3_METHODS( all="key" ):
    """Return the R names and operators that an expression may contain.

    all -- selector; only the value "key" (the default) is recognized.

    Returns a dict with two entries when ``all == "key"``:
      'Math' -- list of permitted R function names
      'Ops'  -- list of permitted operator / punctuation tokens
    Returns None for any other selector value.
    """
    Group_Math = [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
                   "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
                   "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
                   "cumsum", "cumprod", "cummax", "cummin", "c" ]
    Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
    # Compare by value: ``is`` tests object identity, which only matched
    # when the interpreter happened to intern both strings.
    if all == "key":
        return { 'Math' : Group_Math, 'Ops' : Group_Ops }
    return None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def main():
    """Compute summary statistics of a column expression with R.

    Command line: python gsummary.py input_file output_file expression

    input_file  -- tab-delimited text; blank lines and lines starting with
                   '#' are ignored.
    output_file -- receives a '#'-prefixed heading row and one row holding
                   sum, mean, stdev and the 0/25/50/75/100% quantiles.
    expression  -- R expression over columns written as c1, c2, ...
                   (1-based), restricted to the function names and operators
                   returned by S3_METHODS().

    Lines whose referenced columns are missing or not parseable as floats
    are skipped; a note about them is printed to standard output.  Every
    failure is reported through stop_err(), which terminates the script.

    r, RException, set_default_mode, NO_CONVERSION and BASIC_CONVERSION are
    not defined in this file; presumably they come from ``from rpy import *``.
    """
    if len( sys.argv ) < 4:
        stop_err( 'Usage: python gsummary.py input_file output_file expression' )
    datafile, outfile_name, expression = sys.argv[1:4]

    allowed = S3_METHODS()
    math_allowed = allowed[ 'Math' ]
    ops_allowed = allowed[ 'Ops' ]

    # Check for invalid expressions.
    # NOTE(review): the expression is later evaluated by R, so this whitelist
    # is the only barrier between user input and the interpreter.
    for word in re.findall( r'[a-zA-Z]+', expression ):
        if word not in math_allowed:
            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" %( expression, word ) )
    symbols = set()
    # Every letter run was validated above, so uppercase letters are excluded
    # here as well; otherwise the 'C' of the whitelisted "gammaCody" would be
    # rejected as an unknown operator.
    for symbol in re.findall( r'[^a-zA-Z0-9\s]+', expression ):
        if symbol not in ops_allowed:
            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
        symbols.add( symbol )
    if len( symbols ) == 1 and ',' in symbols:
        # User may have entered a comma-separated list of columns
        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )

    # Find all column references in the expression.  References are kept
    # exactly as written (and only once each) so that the R header names
    # match the names that are looked up in the data frame below.
    col_names = []
    for col_name in re.findall( r'c[0-9]+', expression ):
        if col_name not in col_names:
            col_names.append( col_name )
    if not col_names:
        stop_err( "Invalid expression '%s': no column references (such as c1) were found" % expression )
    cols = []
    for col_name in col_names:
        index = int( col_name[1:] ) - 1
        if index < 0:
            # c0 would become index -1 and silently read the last column
            stop_err( "Invalid column '%s': column numbers start at 1" % col_name )
        cols.append( index )

    tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
    try:
        # Write the R header row to the temporary file
        tmp_file.write( "%s\n" % "\t".join( col_names ) )
        skipped_lines = 0
        first_invalid_line = 0
        valid_lines = 0
        in_file = open( datafile )
        try:
            for i, line in enumerate( in_file ):
                line = line.rstrip( '\r\n' )
                if not line or line.startswith( '#' ):
                    continue
                fields = line.split( '\t' )
                try:
                    # Every referenced column must exist and be numeric
                    for col in cols:
                        float( fields[ col ] )
                except ( ValueError, IndexError ):
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                    continue
                # Write the R data row to the temporary file
                tmp_file.write( "%s\n" % "\t".join( [ fields[ col ] for col in cols ] ) )
                valid_lines += 1
        finally:
            in_file.close()
        tmp_file.flush()

        # Count usable rows directly: comparing skipped lines with the total
        # line count misfires when the file has comment or blank lines, or
        # is empty.
        if not valid_lines:
            stop_err( "Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements." )

        # summary function and return labels
        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
        headings_str = "\t".join( headings )

        set_default_mode( NO_CONVERSION )
        r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )

        outfile = open( outfile_name, 'w' )
        try:
            # Expose each referenced column to R under its own name
            for col_name in col_names:
                r.assign( col_name, r[ "$" ]( r_data_frame, col_name ) )
            try:
                summary = summary_func( r( expression ) )
            except RException:
                # sys.exc_info() keeps this parseable by old and new Pythons
                stop_err( "Computation resulted in the following error: %s" % str( sys.exc_info()[1] ) )
            summary = summary.as_py( BASIC_CONVERSION )
            outfile.write( "#%s\n" % headings_str )
            outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary[ k ] ) for k in headings ] ) )
        finally:
            outfile.close()
    finally:
        # Closing the NamedTemporaryFile also removes it from disk
        tmp_file.close()

    if skipped_lines:
        sys.stdout.write( "Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements.\n" % ( skipped_lines, first_invalid_line ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()