Mercurial > repos > md-anderson-bioinformatics > heat_map_creation
comparison mda_heatmap_gen.py @ 32:16593e40c2cd draft
Version 2.0.5
author | insilico-bob |
---|---|
date | Thu, 20 Jul 2017 15:31:06 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
31:e01b833f5d43 | 32:16593e40c2cd |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 # python shell program to validate ng-chm heat map input matrix file and covariate file formats before calling java shell -- bob brown | |
4 | |
5 import subprocess #you must import subprocess so that python can talk to the command line | |
6 import sys | |
7 import os | |
8 import re | |
9 #import config | |
10 import traceback | |
11 #import commons | |
12 | |
13 #ConfigVals = config.Config("../rppaConf.txt") | |
14 | |
def main():
    """Validate the heat map input matrix and covariate files named in sys.argv.

    Command-line layout, as read by this function:
      * ``sys.argv[3]`` is the path of the input matrix.
      * Covariates are (label, file path, type) triplets.  The type argument
        starts with ``row_`` or ``column_``; candidate type positions are
        scanned backwards from ``len(sys.argv) - 2`` in steps of three.  If
        none is found the triplets are assumed to start at index 17.

    Exits with status 3 when the matrix or any covariate file fails validation.
    Any other unexpected exception is reported on stderr and the function
    returns normally.
    """
    try:
        print('\nStarting Heat Map file validation ......')

        error = False
        endCovarParam = len(sys.argv) - 2   # last possible position of a covariate type argument
        startCovarParam = 17                # default position of the first covariate label
        inMatrix = sys.argv[3]

        # Walk backwards over the candidate "type" positions; the lowest one that
        # looks like row_* / column_* marks the first triplet (label is 2 before it).
        for i in range(endCovarParam, 15, -3):
            arg = sys.argv[i]
            if len(arg) > 6 and (arg.startswith('row_') or arg.startswith('column_')):
                startCovarParam = i - 2

        errorInMatrix, inMatrixRowLabels, inMatrixColLabels = ValidateHMInputMatrix(inMatrix)

        # Only show first/last labels when there are some; indexing an empty list
        # would raise IndexError and hide the real validation error.
        if inMatrixRowLabels and inMatrixColLabels:
            print("\nFirst & last Row labels  %s %s  and Columns  %s %s  number Rows=  %d  number Columns=  %d"
                  % (inMatrixRowLabels[0], inMatrixRowLabels[-1],
                     inMatrixColLabels[0], inMatrixColLabels[-1],
                     len(inMatrixRowLabels), len(inMatrixColLabels)))

        if len(inMatrixRowLabels) < 5 or len(inMatrixColLabels) < 5:
            errorInMatrix = True
            print('\n----ERROR Input matrix has too few columns and rows need to ignore validating covariate files for now')

        elif not errorInMatrix:
            print("\n++++ SUCCESS the Input Matrix looks good\n\n")

            # NOTE(review): the original carried a todo to verify this bound with the
            # advanced tool, which passes one extra trailing parameter.
            i = startCovarParam
            while i < (len(sys.argv) - 2):
                covarLabel = sys.argv[i].replace(' ', '')
                covarFN = sys.argv[i + 1].replace(' ', '')
                row_col_cat_contin = sys.argv[i + 2].replace(' ', '')
                i += 3

                print("\nSTART Validating covariate file with label=  " + covarLabel
                      + "  and type=  " + row_col_cat_contin)

                # Accumulate: a failure in an earlier covariate file must not be
                # erased by a later file that validates cleanly.
                if ValidateHMCorvarFile(covarLabel, covarFN, row_col_cat_contin,
                                        inMatrixRowLabels, inMatrixColLabels):
                    error = True

        if error or errorInMatrix:
            print("\n---ERROR issues found in input or covariate files\n ")
            sys.stderr.write("\nERROR issues found in input or covariate files see errors in Standard Output\n\n ")
            sys.exit(3)  # SystemExit is not an Exception subclass, so it passes the handler below

        print("\n FINISHED -- Validation of the Input Matrix and Covariate files (if any)\n\n")

    except Exception as err:
        sys.stderr.write('ERROR: %s\n' % str(err))

    return
94 | |
95 #+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- | |
96 | |
def ValidateHMInputMatrix(inputMatrixPath):
    """Check the structure of a tab-delimited heat map input matrix file.

    Checks performed:
      * Row 1 (column headers): every field after the first must contain at
        least one letter, otherwise an error is recorded.  Fewer than three
        fields only prints a warning.
      * Row 2: its field count must equal row 1's count or exceed it by one.
        When the counts are equal and the first header cell is blank, that
        blank cell is dropped from the column labels.
      * Rows 3+: same field-count rule against row 1; a row label without any
        letter only produces a warning.

    Args:
        inputMatrixPath: path of the matrix file.  All spaces are removed from
            it before opening (original behaviour, kept as is).

    Returns:
        Tuple ``(error, inMatrixRowLabels, inMatrixColLabels)``.  ``error`` is
        True when the file could not be read or a check failed; the label
        lists are empty when the file could not be opened.
    """
    error = True
    # Created before the try so the return statement is valid even if open() fails.
    inMatrixRowLabels = []
    inMatrixColLabels = []
    has_letter = re.compile('[abcdefghijklmnopqrstuvwxyz]')
    # 'U' (universal newlines) is required on Python 2 but rejected by Python 3.11+,
    # where universal newlines are already the text-mode default.
    open_mode = 'rU' if sys.version_info[0] < 3 else 'r'

    try:
        inputMatrixPath = inputMatrixPath.replace(' ', '')

        with open(inputMatrixPath, open_mode) as inMatrixFH:
            error = False
            lenRow1 = 0
            lenAllRows = 0

            for countRow, rawRow in enumerate(inMatrixFH, 1):
                eachRow = rawRow.replace('\n', '').split('\t')

                if countRow == 1:
                    print('Input Matrix start 1 to 10=  ' + str(eachRow[:10]) + ' \n')
                    lenRow1 = len(eachRow)
                    inMatrixColLabels = eachRow
                    for j in range(1, lenRow1):
                        if has_letter.search(eachRow[j].lower()) is None:
                            msg = ("\n--+-+- ERROR Column Headers at position " + str(j + 1)
                                   + " value appears to be non-alphanumeric --" + str(eachRow[j]) + "--")
                            print(msg)
                            sys.stderr.write(msg)
                            error = True

                    if lenRow1 < 3:  # likely a covariate file rather than a matrix; warning only
                        print("----WARNING Input number of columns=  " + str(lenRow1)
                              + "  is too few likely input matrix is really a covariate file")

                elif countRow == 2:
                    lenAllRows = len(eachRow)
                    if lenAllRows in (lenRow1, lenRow1 + 1):
                        print("Validating Input matrix, number of Labeled Columns =  " + str(lenAllRows))
                        inMatrixRowLabels.append(eachRow[0])
                        if lenAllRows == lenRow1 and inMatrixColLabels[0] == '':
                            inMatrixColLabels.pop(0)  # remove blank first header cell
                    else:
                        msg = ("\n--ERROR Input matrix number columns= " + str(lenRow1)
                               + " in first row and the second row= " + str(lenAllRows) + " mismatch ")
                        print(msg)
                        sys.stderr.write(msg)
                        error = True
                        # Ad-hoc attribute on the sys module; nothing in this file reads it.
                        sys.err = 6

                elif lenRow1 != len(eachRow) and lenRow1 + 1 != len(eachRow):
                    msg = ("\n--ERROR Input Row " + str(countRow) + " number of columns= " + str(len(eachRow))
                           + " is a length mismatch with row 2 length " + str(lenAllRows))
                    print(msg)
                    sys.stderr.write(msg)
                    error = True
                    sys.err = 7

                else:
                    inMatrixRowLabels.append(eachRow[0])
                    if has_letter.search(eachRow[0].lower()) is None:
                        # Report the row label itself (field 0), not a stale header index.
                        print("-+-+- WARNING Row Label at row " + str(countRow)
                              + " value appears to be non-alphanumeric --" + str(eachRow[0]))
                        sys.stderr.write("\n--+-+- WARNING Row Label at row " + str(countRow)
                                         + " value appears to be non-alphanumeric " + str(eachRow[0]))

            # A trailing tab on the header line leaves an empty last label; drop it.
            if len(inMatrixColLabels) > 0:
                if inMatrixColLabels[-1] == '' or inMatrixColLabels[-1] == '\n':
                    inMatrixColLabels.pop()

    except Exception:
        sys.stderr.write(str(traceback.format_exc()))
        error = True

    return error, inMatrixRowLabels, inMatrixColLabels
190 | |
191 #+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- | |
192 | |
def ValidateHMCorvarFile(covarLabel, covariateFilePath, row_col_cat_contin, inMatrixRowLabels, inMatrixColLabels):
    """Check the structure of a tab-delimited covariate file.

    Checks performed per row:
      * Any row after the first with fewer than two tab-separated fields is an
        error (a one-field first row is tolerated).
      * A label (field 0) without any letter produces a warning only.
      * When ``row_col_cat_contin`` ends in ``'uous'`` (continuous) and no error
        has been recorded so far, a value (field 1) containing none of the
        characters ``-+.0-9eE`` produces a warning only.  This is a loose
        "number-ish" test, not a full numeric parse.

    Args:
        covarLabel: display label of the covariate (not used by the checks).
        covariateFilePath: path of the covariate file.
        row_col_cat_contin: covariate type string, e.g. ``column_continuous``.
        inMatrixRowLabels, inMatrixColLabels: labels of the input matrix.
            NOTE(review): accepted but not used -- matching covariate labels
            and lengths against the matrix is not implemented here.

    Returns:
        True when the file could not be read or a row failed validation,
        otherwise False.
    """
    error = True
    has_letter = re.compile('[abcdefghijklmnopqrstuvwxyz]')
    # '-' is placed first so it is a literal; written as '+-.' it formed a
    # character range that also matched ','.
    numberish = re.compile('[-+.0123456789eE]')
    # 'U' is required for universal newlines on Python 2 but rejected by Python 3.11+.
    open_mode = 'rU' if sys.version_info[0] < 3 else 'r'

    try:
        is_continuous = row_col_cat_contin[-4:] == 'uous'

        with open(covariateFilePath, open_mode) as covFH:
            error = False

            for countRow, rawRow in enumerate(covFH, 1):
                eachRow = rawRow.replace('\n', '').split('\t')

                if len(eachRow) < 2:
                    if countRow > 1:
                        print("----ERROR Input Row " + str(countRow) + " does not have a label and/or value ")
                        sys.stderr.write("----ERROR Input Row " + str(countRow) + " does not have a label/or and value")
                        error = True
                        # Ad-hoc attribute on the sys module; nothing in this file reads it.
                        sys.err = 8
                    # No value field to inspect on this row.
                    continue

                if has_letter.search(eachRow[0].lower()) is None:
                    print("\n-+-+- WARNING Covariate Label at row " + str(countRow)
                          + " value appears to be non-alphanumeric -- " + str(eachRow[0]) + " --")
                    sys.stderr.write("\n--+-+- WARNING Row Headers at row " + str(countRow)
                                     + " value appears to be non-alphanumeric --" + str(eachRow[0]) + "--")

                if not error and is_continuous and numberish.search(eachRow[1]) is None:
                    msg = ("\n-+-+-WARNING Input Row " + str(countRow)
                           + " covariance continuous value appears to be non-numeric --" + str(eachRow[1]) + "--")
                    print(msg)
                    sys.stderr.write(msg)

    except Exception:
        sys.stderr.write(str(traceback.format_exc()))
        # A file that cannot be opened or fully read has not been validated.
        error = True

    return error
245 | |
246 | |
# Run the validation only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
249 | |
250 |