Mercurial > repos > md-anderson-bioinformatics > heat_map_creation
diff mda_heatmap_gen.py @ 32:16593e40c2cd draft
Version 2.0.5
author | insilico-bob |
---|---|
date | Thu, 20 Jul 2017 15:31:06 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mda_heatmap_gen.py Thu Jul 20 15:31:06 2017 -0400 @@ -0,0 +1,250 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# python shell program to validate ng-chm heat map input matrix file and covariate file formats before calling java shell -- bob brown + +import subprocess #you must import subprocess so that python can talk to the command line +import sys +import os +import re +#import config +import traceback +#import commons + +#ConfigVals = config.Config("../rppaConf.txt") + +def main(): + + try: + print '\nStarting Heat Map file validation ......' + #print "\nheat map sys args len and values = ",len(sys.argv), str(sys.argv) #, '++',argvals + + + error= False + endCovarParam= len(sys.argv)-2 # IF any ending of loc for covar triplet info + startCovarParam= 17 # beginning loc for covar triplet info + inMatrix= sys.argv[3] + + for i in range( endCovarParam, 15, -3): + if len(sys.argv[i]) > 6: + if sys.argv[i][0:4].find('row_') == 0 or sys.argv[i][0:7].find('column_') == 0: # 0 is match start position + startCovarParam= i-2 + #print "\nHeat map arg 3 and start covariate index on = " ,str(sys.argv[3]),' - ', startCovarParam, ' covar name= ',str(sys.argv[startCovarParam:]) + #else: print '\nCovariate param row or column not found at i', i, str(sys.argv[i]) + + #test inMatrix= "/Users/bobbrown/Desktop/NGCHM-Galaxy-Test-Files/400x400firstRowShift.txt" + #test covarFN= '/Users/bobbrown/Desktop/400x400-column-covariate-continuous-TestingErrors.txt' + #test row_col_cat_contin= 'column_continuous' + #test row_col_cat_contin= 'column_categorical' + #test covarLabel = 'bob test' + #test numCovariates= 1 + + errorInMatrix,inMatrixRowLabels,inMatrixColLabels= ValidateHMInputMatrix(inMatrix) # verify input matrix + + print "\nFirst & last Row labels ", inMatrixRowLabels[0],inMatrixRowLabels[-1]," and Columns ", inMatrixColLabels[0],inMatrixColLabels[-1], " number Rows= ",len(inMatrixRowLabels)," number Columns= ",len(inMatrixColLabels) + + # continue reviewing covariates to catch any errors in any of the input info + if len(inMatrixRowLabels) < 5 or len(inMatrixColLabels) < 5: + errorInMatrix = True + print '\n----ERROR Input matrix has too few columns and rows need to ignore validating covariate files for now' + + elif not errorInMatrix: + print "\n++++ SUCCESS the Input Matrix looks good\n\n" + + i= startCovarParam + while i < (len(sys.argv)-2): # todo verify this works with advances tool is one other 0->n param after this + covarLabel= sys.argv[i] + covarLabel= covarLabel.replace(' ','') + covarFN= sys.argv[i+1] + covarFN= covarFN.replace(' ','') + row_col_cat_contin= sys.argv[i+2] + row_col_cat_contin= row_col_cat_contin.replace(' ','') + i +=3 + + print "\nSTART Validating covariate file with label= ", covarLabel, " and type= ",row_col_cat_contin + + error= ValidateHMCorvarFile(covarLabel, covarFN, row_col_cat_contin,inMatrixRowLabels,inMatrixColLabels) # check covariate files + + if error or errorInMatrix: + print"\n---ERROR issues found in input or covariate files\n " + sys.stderr.write( "\nERROR issues found in input or covariate files see errors in Standard Output\n\n ") + sys.exit(3) + + + print"\n FINISHED -- Validation of the Input Matrix and Covariate files (if any)\n\n" + + #print" next running the clustered heat map generator \n",str(sys.argv[11])+"/heatmap.sh "+ str(sys.argv[1:]) + # p = subprocess.Popen([str(sys.argv[1])+"/heatmap.sh "+ argvals], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + #p = subprocess.Popen([str(sys.argv[11])+"/heatmap.sh "+ str(sys.argv[1:])], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + #retval = p.wait() + #print ' Cluster and Viewer returned\n' + #for line in p.stdout.readlines(): + # print line + +# else: +# sys.stderr.write("\nERROR -- The Heat Map Generator encountered the above errors with the input file(s)\n\n") +# sys.exit(3) # this will error it out :) +# except: +# sys.stderr.write(str(traceback.format_exc())) +# sys.exit(3) # this will error it out :) + except Exception, err: + sys.stderr.write('ERROR: %sn' % str(err)) + + return + +#+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- + +def ValidateHMInputMatrix(inputMatrixPath): # This sub routine ensures that the slide design input by the user matches a slide design on record + + try: + error= True + + inputMatrixPath= inputMatrixPath.replace(' ','') + + inMatrixFH= open( inputMatrixPath, 'rU') + + #print "\nInput matrix path and name ", inputMatrixPath,"\n" + error= False + + countRow= 0 + lenRow1= 0 + lenAllRows= 0 + inMatrixRowLabels= [] + inMatrixColLabels= [] + + for rawRow in inMatrixFH: + countRow +=1 + + rawRow= rawRow.replace('\n','') + eachRow= rawRow.split('\t') + if countRow < 2: print 'Input Matrix start 1 to 10= ',eachRow[:10], '\n' + + if countRow == 1: + lenRow1= len(eachRow) + inMatrixColLabels= eachRow + for j in range(1,lenRow1): + tmp= re.search('[abcdefghijklmnopqrstuvwxyz]',eachRow[j].lower()) + try: + if tmp.group(0) == '': # if doesn't exist then error + tmp= tmp + except Exception as e: + print("\n--+-+- ERROR Column Headers at position "+str(j+1)+" value appears to be non-alphanumeric --"+str(eachRow[j])+"--") + sys.stderr.write("\n--+-+- ERROR Column Headers at position "+str(j+1)+" value appears to be non-alphanumeric --"+str(eachRow[j])+"--") + error= True + + if lenRow1 < 3: # likely is covariate file not input matrix + print"----WARNING Input number of columns= " , str(lenRow1)," is too few likely input matrix is really a covariate file" + SystemError ("----WARNING Input number of columns= " + str(lenRow1)+" is too few likely input matrix is really a covariate file") + #error= True + #sys.err= 2 + elif countRow == 2: + lenAllRows= len(eachRow) + if (lenAllRows == lenRow1) or (lenAllRows == lenRow1+1): #or (lenAllRows- lenRow1 == 0 or 1): + print"Validating Input matrix, number of Labeled Columns = ", str(lenAllRows) + inMatrixRowLabels.append(eachRow[0]) + +# allow other error to occur first +# tmp= re.search('[abcdefghijklmnopqrstuvwxyz]',eachRow[0].lower()) +# try: +# if tmp.group(0) == '': # if doesn't exist then error +# tmp= tmp +# except Exception as e: +# print("\n--+-+- WARNING Row Label at row 2 value appears to be non-alphanumeric --"+str(eachRow[j])+"--") +# sys.stderr.write("\n--+-+- WARNING Row Label at row 2 value appears to be non-alphanumeric --"+str(eachRow[j])+"--") +# #error= True + if (lenAllRows == lenRow1) and (inMatrixColLabels[0]==''): inMatrixColLabels.pop(0) #remove blank first cell + + else: + print( "\n--ERROR Input matrix number columns= "+str(lenRow1)+" in first row and the second row= "+str(lenAllRows)+" mismatch ") + sys.stderr.write( "\n--ERROR Input matrix number columns= "+str(lenRow1)+" in first row and the second row= "+str(lenAllRows)+" mismatch ") + error= True + sys.err= 6 + elif (lenRow1 != len(eachRow) and lenRow1+1 != len(eachRow)): + print ("\n--ERROR Input Row "+ str(countRow)+" number of columns= "+str(len(eachRow))+" is a length mismatch with row 2 length "+str( lenAllRows)) + sys.stderr.write ("\n--ERROR Input Row "+ str(countRow)+" number of columns= "+str(len(eachRow))+" is a length mismatch with row 2 length "+str( lenAllRows)) + error= True + sys.err= 7 + else: + inMatrixRowLabels.append(eachRow[0]) + tmp= re.search('[abcdefghijklmnopqrstuvwxyz]',eachRow[0].lower()) + try: + if tmp.group(0) == '': # if doesn't exist then error + tmp= tmp + except Exception as e: + print"-+-+- WARNING Row Label at row "+str(countRow)+" value appears to be non-alphanumeric --"+str(eachRow[j]) + sys.stderr.write("\n--+-+- WARNING Row Label at row "+str(countRow)+" value appears to be non-alphanumeric "+str(eachRow[j])) + + + if len(inMatrixColLabels) > 0: + if (inMatrixColLabels[-1] =='') or (inMatrixColLabels[-1] =='\n'): inMatrixColLabels.pop() + + inMatrixFH.close() + + #print error, lenAllRows, len(eachRow), eachRow[0] + except: + #inMatrixFH.close() + sys.stderr.write(str(traceback.format_exc())) + error= True + + return error,inMatrixRowLabels,inMatrixColLabels + + #+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- + +def ValidateHMCorvarFile(covarLabel, covariateFilePath, row_col_cat_contin, inMatrixRowLabels,inMatrixColLabels): # This sub routine ensures that the slide design input by the user matches a slide design on record + +# verify +# 1 That covar file labels match the col or row labels 1 to 1 +# 2 That if a continuous covar file that the 2nd field is not all text hard to tell if '-' or 'e exponent' +# 3 That the length of the covar file matches the row or col length of the input matrix + + error= True + try: + + covFH= open( covariateFilePath, 'rU') + countRow= 0 + + error= False + + for rawRow in covFH: + countRow +=1 + rawRow= rawRow.replace('\n','') + eachRow= rawRow.split('\t') + if countRow== 0: print "\nCovariance file info - label ",str(covarLabel)," row/col categorical or continous",row_col_cat_contin," first row ",str(eachrow) + + if len(eachRow) < 2 and countRow > 1: + print("----ERROR Input Row "+str(countRow)+" does not have a label and/or value ") + sys.stderr.write("----ERROR Input Row "+str(countRow)+" does not have a label/or and value") + error= True + sys.err= 8 + #return error + elif len(eachRow) > 1: + tmp= re.search('[abcdefghijklmnopqrstuvwxyz]',eachRow[0].lower()) + try: + if tmp.group(0) == '': # if doesn't exist then error + tmp= tmp + except Exception as e: + print"\n-+-+- WARNING Covariate Label at row "+str(countRow)+" value appears to be non-alphanumeric --", eachRow[0],"--" + sys.stderr.write("\n--+-+- WARNING Row Headers at row "+str(countRow)+" value appears to be non-alphanumeric --"+str(eachRow[0])+"--") + + if not error: + if row_col_cat_contin[-4:] == 'uous': # verify continuous is number-ish + tmp= re.search('[+-.0123456789eE]',eachRow[1]) + try: + if tmp.group(0) == '': + tmp= tmp + except Exception as e: + print("\n-+-+-WARNING Input Row "+str(countRow)+" covariance continuous value appears to be non-numeric --"+ str(eachRow[1])+"--") + sys.stderr.write("\n-+-+-WARNING Input Row "+str(countRow)+" covariance continuous value appears to be non-numeric --"+ str(eachRow[1])+"--") + #error= True + except: + sys.stderr.write(str(traceback.format_exc())) + + covFH.close() + + return error + + +if __name__ == "__main__": + main() + +