Mercurial > repos > xuebing > sharplabtool
comparison tools/regVariation/best_regression_subsets.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 from galaxy import eggs | |
4 | |
5 import sys, string | |
6 from rpy import * | |
7 import numpy | |
8 | |
9 def stop_err(msg): | |
10 sys.stderr.write(msg) | |
11 sys.exit() | |
12 | |
13 infile = sys.argv[1] | |
14 y_col = int(sys.argv[2])-1 | |
15 x_cols = sys.argv[3].split(',') | |
16 outfile = sys.argv[4] | |
17 outfile2 = sys.argv[5] | |
18 print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) | |
19 fout = open(outfile,'w') | |
20 | |
21 for i, line in enumerate( file ( infile )): | |
22 line = line.rstrip('\r\n') | |
23 if len( line )>0 and not line.startswith( '#' ): | |
24 elems = line.split( '\t' ) | |
25 break | |
26 if i == 30: | |
27 break # Hopefully we'll never get here... | |
28 | |
29 if len( elems )<1: | |
30 stop_err( "The data in your input dataset is either missing or not formatted properly." ) | |
31 | |
32 y_vals = [] | |
33 x_vals = [] | |
34 | |
35 for k,col in enumerate(x_cols): | |
36 x_cols[k] = int(col)-1 | |
37 x_vals.append([]) | |
38 | |
39 NA = 'NA' | |
40 for ind,line in enumerate( file( infile )): | |
41 if line and not line.startswith( '#' ): | |
42 try: | |
43 fields = line.split("\t") | |
44 try: | |
45 yval = float(fields[y_col]) | |
46 except Exception, ey: | |
47 yval = r('NA') | |
48 y_vals.append(yval) | |
49 for k,col in enumerate(x_cols): | |
50 try: | |
51 xval = float(fields[col]) | |
52 except Exception, ex: | |
53 xval = r('NA') | |
54 x_vals[k].append(xval) | |
55 except: | |
56 pass | |
57 | |
58 response_term = "" | |
59 | |
60 x_vals1 = numpy.asarray(x_vals).transpose() | |
61 | |
62 dat= r.list(x=array(x_vals1), y=y_vals) | |
63 | |
64 r.library("leaps") | |
65 | |
66 set_default_mode(NO_CONVERSION) | |
67 try: | |
68 leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat)) | |
69 except RException, rex: | |
70 stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") | |
71 set_default_mode(BASIC_CONVERSION) | |
72 | |
73 summary = r.summary(leaps) | |
74 tot = len(x_vals) | |
75 pattern = "[" | |
76 for i in range(tot): | |
77 pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' ' | |
78 pattern = pattern.strip() + ']' | |
79 print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern) | |
80 for ind,item in enumerate(summary['outmat']): | |
81 print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind]) | |
82 | |
83 | |
84 r.pdf( outfile2, 8, 8 ) | |
85 r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion") | |
86 r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion") | |
87 r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion") | |
88 r.plot(leaps, scale="bic", main="Best subsets using bic Criterion") | |
89 | |
90 r.dev_off() |