annotate linear_regression.py @ 0:cf431604ec3e draft default tip

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 10:52:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
2
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
3 import sys
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
4 from rpy import *
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
5 import numpy
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
6
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
7 def stop_err(msg):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
8 sys.stderr.write(msg)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
9 sys.exit()
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
10
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
11 infile = sys.argv[1]
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
12 y_col = int(sys.argv[2])-1
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
13 x_cols = sys.argv[3].split(',')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
14 outfile = sys.argv[4]
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
15 outfile2 = sys.argv[5]
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
16
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
17 print "Predictor columns: %s; Response column: %d" % ( x_cols, y_col+1 )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
18 fout = open(outfile,'w')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
19 elems = []
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
20 for i, line in enumerate( file ( infile )):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
21 line = line.rstrip('\r\n')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
22 if len( line )>0 and not line.startswith( '#' ):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
23 elems = line.split( '\t' )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
24 break
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
25 if i == 30:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
26 break # Hopefully we'll never get here...
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
27
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
28 if len( elems )<1:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
29 stop_err( "The data in your input dataset is either missing or not formatted properly." )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
30
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
31 y_vals = []
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
32 x_vals = []
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
33
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
34 for k, col in enumerate(x_cols):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
35 x_cols[k] = int(col)-1
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
36 x_vals.append([])
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
37
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
38 NA = 'NA'
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
39 for ind, line in enumerate( file( infile )):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
40 if line and not line.startswith( '#' ):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
41 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
42 fields = line.split("\t")
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
43 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
44 yval = float(fields[y_col])
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
45 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
46 yval = r('NA')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
47 y_vals.append(yval)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
48 for k, col in enumerate(x_cols):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
49 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
50 xval = float(fields[col])
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
51 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
52 xval = r('NA')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
53 x_vals[k].append(xval)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
54 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
55 pass
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
56
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
57 x_vals1 = numpy.asarray(x_vals).transpose()
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
58
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
59 dat = r.list(x=array(x_vals1), y=y_vals)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
60
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
61 set_default_mode(NO_CONVERSION)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
62 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
63 linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
64 except RException, rex:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
65 stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
66 set_default_mode(BASIC_CONVERSION)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
67
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
68 coeffs = linear_model.as_py()['coefficients']
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
69 yintercept = coeffs['(Intercept)']
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
70 summary = r.summary(linear_model)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
71
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
72 co = summary.get('coefficients', 'NA')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
73 """
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
74 if len(co) != len(x_vals)+1:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
75 stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
76 """
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
77
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
78 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
79 yintercept = r.round(float(yintercept), digits=10)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
80 pvaly = r.round(float(co[0][3]), digits=10)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
81 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
82 pass
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
83
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
84 print >> fout, "Y-intercept\t%s" % (yintercept)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
85 print >> fout, "p-value (Y-intercept)\t%s" % (pvaly)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
86
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
87 if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
88 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
89 slope = r.round(float(coeffs['x']), digits=10)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
90 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
91 slope = 'NA'
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
92 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
93 pval = r.round(float(co[1][3]), digits=10)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
94 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
95 pval = 'NA'
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
96 print >> fout, "Slope (c%d)\t%s" % ( x_cols[0]+1, slope )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
97 print >> fout, "p-value (c%d)\t%s" % ( x_cols[0]+1, pval )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
98 else: #Multiple regression case with >1 predictors
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
99 ind = 1
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
100 while ind < len(coeffs.keys()):
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
101 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
102 slope = r.round(float(coeffs['x'+str(ind)]), digits=10)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
103 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
104 slope = 'NA'
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
105 print >> fout, "Slope (c%d)\t%s" % ( x_cols[ind-1]+1, slope )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
106 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
107 pval = r.round(float(co[ind][3]), digits=10)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
108 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
109 pval = 'NA'
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
110 print >> fout, "p-value (c%d)\t%s" % ( x_cols[ind-1]+1, pval )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
111 ind += 1
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
112
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
113 rsq = summary.get('r.squared','NA')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
114 adjrsq = summary.get('adj.r.squared','NA')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
115 fstat = summary.get('fstatistic','NA')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
116 sigma = summary.get('sigma','NA')
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
117
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
118 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
119 rsq = r.round(float(rsq), digits=5)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
120 adjrsq = r.round(float(adjrsq), digits=5)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
121 fval = r.round(fstat['value'], digits=5)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
122 fstat['value'] = str(fval)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
123 sigma = r.round(float(sigma), digits=10)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
124 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
125 pass
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
126
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
127 print >> fout, "R-squared\t%s" % (rsq)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
128 print >> fout, "Adjusted R-squared\t%s" % (adjrsq)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
129 print >> fout, "F-statistic\t%s" % (fstat)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
130 print >> fout, "Sigma\t%s" % (sigma)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
131
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
132 r.pdf( outfile2, 8, 8 )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
133 if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
134 sub_title = "Slope = %s; Y-int = %s" % ( slope, yintercept )
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
135 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
136 r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
137 r.abline(a=yintercept, b=slope, col="red")
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
138 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
139 pass
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
140 else:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
141 r.pairs(dat, main="Scatterplot Matrix", col="blue")
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
142 try:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
143 r.plot(linear_model)
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
144 except:
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
145 pass
cf431604ec3e Imported from capsule None
devteam
parents:
diff changeset
146 r.dev_off()