# HG changeset patch
# User insilico-bob
# Date 1543346440 18000
# Node ID f1bcd79cd923a76ce97dcc843ad28f1c8835283e
# Parent 7f12c81e20833740569fbd555d2a83e35f38496a
Uploaded
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Filters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Filters.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,694 @@
+'''
+Created on Jun 7, 2017 updated Feb2018
+
+@author: rbrown and cjacoby
+'''
+
+import sys, traceback, argparse
+import numpy as np
+from Matrix_Validate_import import reader, Labeler
+import math
+#import matplotlib.pyplot as plt
+
+#Define argparse Function
+def get_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('input_file_txt', help='tab delimited text file input matrix(include .txt in name)')
+ parser.add_argument('choice',type=str, help='Variance Filter Method (Variance or Range)')
+ parser.add_argument('thresh', help='Thershold for Variance Filtering')
+ parser.add_argument('axes', help='Axes to Filter on (Either Row or Column')
+ parser.add_argument('output_file_txt', help='tab delimited text file output name (include .txt in name)')
+ args = parser.parse_args()
+ return args
+
+def Range_Filter_Row(matrix,thresh,row_header_list,column_header_list):
+ #Create Null Set of Filtered Row(Populated Later)
+ deletes = []
+ minVal = +9999999
+ maxVal = -99999
+ #Loop to Determine Which Rows have sub-Threshold Range
+ for i in range(0,len(matrix)):
+ temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::])
+
+ if temp_range < minVal: minVal = temp_range
+ elif temp_range > maxVal: maxVal = temp_range
+
+ if temp_range <= float(thresh):
+ deletes = np.append(deletes,[i],0)
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,0)
+ filter_rows = np.delete(row_header_list,deletes,0)
+ filter_cols = column_header_list
+ return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
+
+def Range_Filter_Col(matrix,thresh,row_header_list,column_header_list):
+ #Create Null Set of Filtered Row(Populated Later)
+ deletes = []
+ minVal = +9999999
+ maxVal = -99999
+ #Loop to Determine Which Rows have sub-Threshold Variance
+ for i in range(0,len(matrix[0])):
+
+ temp_range = np.max([row[i] for row in matrix]) - np.min([row[i] for row in matrix])
+
+ if temp_range < minVal: minVal = temp_range
+ elif temp_range > maxVal: maxVal = temp_range
+
+ #print(temp_stdev)
+ if temp_range <= float(thresh):
+ deletes = np.append(deletes,[i],0)
+ print(deletes)
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,1)
+ filter_rows = row_header_list
+ filter_cols = np.delete(column_header_list,deletes,0)
+ #np.savetxt('testtest.txt',matrix,delimiter='\t')
+
+ return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
+
+#Define Function Which Deletes Sub-Threshold Rows
+def Variance_Percent_Filter_row(matrix,cutoff,row_header_list,column_header_list, create_plot= False):
+# if create a plot then DO NOT remove DATA only print diagram of variance ranges !!!
+
+# temp_stdev = np.var(matrix[i][1::])
+ #cutoff is the percentile rank of the variance values
+ cutoff= int(cutoff)/100.0
+ if cutoff > 0.99 or cutoff < .01:
+ sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
+ sys.exit(-8)
+
+ deletes = []
+ varianceDict = {}
+ minVal = +9999999
+ maxVal = -99999
+
+ #Loop to Determine Which Rows have sub-Threshold Variance
+ for i in range(len(matrix)):
+ vector = []
+ for p in range(len(matrix[0])):
+ if not math.isnan(matrix[i][p]):
+ vector.append(matrix[i][p])
+
+ #temp_stdev = np.var(matrix[:,i])
+ if len(vector) > 1:
+ temp_stdev = np.var(vector)
+ else:
+ temp_stdev = 0.0
+
+ if temp_stdev < minVal:
+ minVal = temp_stdev
+ elif temp_stdev > maxVal:
+ maxVal = temp_stdev
+
+ if temp_stdev not in varianceDict:
+ varianceDict[temp_stdev] = [i]
+ else:
+ tmp= varianceDict[temp_stdev]
+ tmp.append(i)
+ varianceDict[temp_stdev] = tmp
+
+
+ #calc how many rows to remove
+ lowerLimit = int(cutoff*len(matrix) +1)
+ limit = False
+ cnt = 0
+
+ for key in sorted(varianceDict.items()):
+ #rows = varianceDict[key]
+ rows= key[1]
+ cnt += len(rows)
+ if cnt < lowerLimit: #remove rows below percentile cutoff
+ for j in rows:
+ deletes = np.append(deletes,[j],0)
+ #print(deletes)
+ else:
+ limit = True
+
+ print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff row = "+str(lowerLimit)+" of "+str(len(matrix))+" rows")
+
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,0)
+ filter_rows = np.delete(row_header_list,deletes,0)
+ filter_cols = column_header_list
+ #np.savetxt('testtest.txt',matrix,delimiter='\t')
+
+ """
+ if create_plot:
+ numBins = 10
+ binWidth = 1
+ binCat = []
+ binData = []
+ counted = False
+ incrmnt= (maxVal-minVal)/(numBins-1)
+ current_bin_max = minVal + incrmnt/2
+ cnt = 0
+ for key, val in sorted(varianceDict.items()):
+ if key < current_bin_max:
+ cnt += len(val) # add all rows having that variance value
+ counted = False
+ else:
+ binData.append(cnt)
+ cnt= len(val)
+ binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0)))
+ current_bin_max += incrmnt
+ counted = True
+
+ if not counted:
+ binData.append(cnt)
+ binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0)))
+
+ tot = sum(binData)
+ bins = []
+ for j in range(numBins):
+ bins.append(j*binWidth)
+
+ #ttps://pythonspot.com/matplotlib-bar-chart/
+ y_pos = np.arange(numBins)
+ plt.xticks(y_pos, binCat)
+ plt.title("Distribution of Variance Values by Row")
+ plt.ylabel('Variance Bin Totals')
+ plt.xlabel('Variance Value Bins')
+ #plt.legend()
+ plt.bar(y_pos, binData, align='center', alpha=0.5)
+
+ fig, ax = plt.subplots(num=1, figsize=(8,3))
+
+ plt.show()
+ """
+
+
+
+ return matrix,filter_rows,filter_cols ,len(deletes), minVal,maxVal
+
+def Variance_Percent_Filter_col(matrix,cutoff,row_header_list,column_header_list, create_plot=False):
+ #cutoff is the percentile rank of the variance values
+ cutoff= int(cutoff)/100.0
+ if cutoff > 0.99 or cutoff < .01:
+ sys.stderr.write( "ERROR illegal cutoff value= "+str(cutoff*100)+" allowed values 1 to 99")
+ sys.exit(-8)
+
+ deletes = []
+ varianceDict = {}
+ minVal = +9999999
+ maxVal = -99999
+ lenCol = len(matrix[0])
+
+ #Loop to Determine Which Rows have sub-Threshold Variance
+ for i in range(lenCol):
+ vector = []
+ for p in range(len(matrix)):
+ if not math.isnan(matrix[p][i]):
+ vector.append(matrix[p][i])
+
+ #temp_stdev = np.var(matrix[:,i])
+ if len(vector) > 1:
+ temp_stdev = np.var(vector)
+ else:
+ temp_stdev = 0.0
+
+ if temp_stdev < minVal:
+ minVal = temp_stdev
+ elif temp_stdev > maxVal:
+ maxVal = temp_stdev
+
+ if temp_stdev not in varianceDict:
+ varianceDict[temp_stdev] = [i]
+ else:
+ tmp= varianceDict[temp_stdev]
+ tmp.append(i)
+ varianceDict[temp_stdev] = tmp
+
+ #print(temp_stdev)
+ #if temp_stdev <= float(cutoff):
+
+ #calc how many rows to remove
+ lowerLimit = int(cutoff*lenCol +1)
+ limit = False
+ cnt = 0
+
+ for key in sorted(varianceDict.items()):
+ #rows = varianceDict[key]
+ cols= key[1]
+ cnt += len(cols)
+ if cnt < lowerLimit: #remove rows below percentile cutoff
+ for j in cols:
+ deletes = np.append(deletes,[j],0)
+ #print(deletes)
+ else:
+ limit = True
+
+ print( "Dataset Lowest Variance= %.2f" % minVal+" Highest Variance= %.2f" % maxVal+" and Percentile cutoff column= "+str(lowerLimit)+" of "+str(lenCol)+" columns")
+
+ matrix = np.delete(matrix,deletes,1)
+ filter_rows = row_header_list
+ filter_cols = np.delete(column_header_list,deletes,0)
+ #np.savetxt('testtest.txt',matrix,delimiter='\t')
+
+ """
+ if create_plot:
+ numBins = 10
+ binWidth = 1
+ binCat = []
+ binData = []
+ counted = False
+ incrmnt= (maxVal-minVal)/(numBins-1)
+ current_bin_max = minVal + incrmnt/2
+ cnt = 0
+ for key, val in sorted(varianceDict.items()):
+ if key < current_bin_max:
+ cnt += len(val) # add all rows having that variance value
+ counted = False
+ else:
+ binData.append(cnt)
+ cnt= len(val)
+ binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0)))
+ current_bin_max += incrmnt
+ counted = True
+
+ if not counted:
+ binData.append(cnt)
+ binCat.append(str("%0.2f" % (current_bin_max - incrmnt/2.0)))
+
+ tot = sum(binData)
+ bins = []
+
+ for j in range(numBins):
+ bins.append(j*binWidth)
+ #https://pythonspot.com/matplotlib-bar-chart/
+ y_pos = np.arange(numBins)
+
+ plt.xticks(y_pos, binCat)
+ plt.title("Distribution of Variance Values by Column")
+ plt.ylabel('Variance Bin Totals')
+ plt.xlabel('Variance Value Bins')
+ #plt.legend()
+ plt.bar(y_pos, binData, align='center', alpha=0.5)
+
+ fig, ax = plt.subplots(num=1, figsize=(8,3))
+ plt.show()
+ """
+
+ return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
+
+def UpperLowerLimit_Filter_Row(upperLower, matrix,cutoff,row_header_list,column_header_list):
+ #Create Null Set of Filtered Row(Populated Later)
+ deletes = []
+ minVal = +9999999
+ maxVal = -99999
+ #Loop to Determine Which Rows have sub-Threshold Range
+ for i in range(0,len(matrix)):
+ removeRow = False
+
+ for j in range(len(matrix[0])):
+ val= matrix[i][j]
+ if not math.isnan(val):
+ if val <= cutoff and upperLower == 'lower':
+ removeRow = True
+ elif val >= cutoff and upperLower == 'upper':
+ removeRow = True
+ else:
+ if val < minVal: minVal = val
+ if val > maxVal: maxVal = val
+
+ #print(temp_stdev)
+ if removeRow:
+ deletes = np.append(deletes,[i],0)
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,0)
+ filter_rows = np.delete(row_header_list,deletes,0)
+ filter_cols = column_header_list
+
+ return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
+
+def UpperLowerLimit_Filter_Col(upperLower,matrix,cutoff,row_header_list,column_header_list):
+ #Create Null Set of Filtered Row(Populated Later)
+ deletes = []
+ minVal = +9999999
+ maxVal = -99999
+ #Loop to Determine Which Rows have sub-Threshold Variance
+
+ for i in range(0,len(matrix[0])):
+ removeRow = False
+
+ for j in range(len(matrix)):
+ val= matrix[j][i]
+ if not math.isnan(val):
+ if val <= cutoff and upperLower == 'lower':
+ removeRow = True
+ elif val >= cutoff and upperLower == 'upper':
+ removeRow = True
+ else:
+ if val < minVal: minVal = val
+ if val > maxVal: maxVal = val
+
+ #print(temp_stdev)
+ if removeRow: deletes = np.append(deletes,[i],0)
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,1)
+ filter_rows = row_header_list
+ filter_cols = np.delete(column_header_list,deletes,0)
+ #np.savetxt('testtest.txt',matrix,delimiter='\t')
+
+ return matrix, filter_rows, filter_cols,len(deletes),minVal,maxVal
+
+#========= remove rows with too many NANs in cells
+def NAN_Filter_Row(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
+
+ try:
+ #Create Null Set of Filtered Row(Populated Later)
+ maxFoundNANs = 0
+ deletes = []
+ #Loop to Determine Which Rows have sub-Threshold Range
+ for i in range(0,len(matrix)):
+ #matches= [s for s in matrix[i][0::] if any(nan == s.upper() for nan in nanList)]
+ #matches= [s for s in matrix[i][:] if s in nanList]
+ matches= []
+ for s in matrix[i]:
+ if str(s) in nanList: matches.append(s)
+
+
+ lenMatches = len(matches)
+ if lenMatches > maxFoundNANs: maxFoundNANs = lenMatches
+
+ if lenMatches >= maxAllowedNANs:
+ deletes = np.append(deletes,[i],0)
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,0)
+ filter_rows = np.delete(row_header_list,deletes,0)
+ filter_cols = column_header_list
+
+ except Exception as err:
+ traceback.print_exc()
+ sys.exit(-4)
+
+ return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs
+
+#========= remove Cols with too many NANs
+
+def NAN_Filter_Column(matrix,nanList,maxAllowedNANs,row_header_list,column_header_list):
+
+ #Create Null Set of Filtered Row(Populated Later)
+ minNumNANs = 0
+ maxFoundNANs = 0
+ deletes = []
+ #Loop to Determine Which Rows have sub-Threshold Variance
+ for i in range(0,len(matrix[0])):
+ matches= []
+ for j in range(len(matrix)):
+ if str(matrix[j][i]) in nanList: matches.append(matrix[j][i])
+
+ lenMatches = len(matches)
+ if lenMatches > maxFoundNANs:
+ maxFoundNANs = lenMatches
+
+ if lenMatches >= maxAllowedNANs:
+ deletes = np.append(deletes,[i],0)
+
+ #Delete cols with too many NANs
+ matrix = np.delete(matrix,deletes,1)
+ filter_rows = row_header_list
+ filter_cols = np.delete(column_header_list,deletes,0)
+ #np.savetxt('testtest.txt',matrix,delimiter='\t')
+ return matrix, filter_rows, filter_cols,len(deletes),maxFoundNANs
+
+
+#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
+def Row_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
+#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
+# cutoff is MAX value used to meant to minimize the impact of one outlier
+
+ deletes = []
+ minVal = +9999999
+ maxVal = -99999
+ #Loop to Determine Which Rows have sub-Threshold Range
+ for i in range(0,len(matrix)):
+ medianRow = np.median(matrix[i])
+ temp = np.median(abs(matrix[i]- medianRow))
+# median (|Xi - Xmedian|) > X => meant to minimize the impact of one outlier
+ if temp < cutoff:
+ deletes = np.append(deletes,[i],0)
+
+ if temp < minVal: minVal = temp
+ if temp > maxVal: maxVal = temp
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,0)
+ filter_rows = np.delete(row_header_list,deletes,0)
+ filter_cols = column_header_list
+ print( "INFO Row MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )
+
+ return matrix, filter_rows, filter_cols,len(deletes),maxVal
+
+#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
+def Col_Value_MAD(matrix,cutoff,row_header_list,column_header_list):
+#MAD Median Absolute Deviation median (|Xi - Xmedian|) > X
+# cutoff is MAX value used to meant to minimize the impact of one outlier
+ deletes = []
+ minVal = +9999999
+ maxVal = -99999
+ #Loop to Determine Which Rows have sub-Threshold Range
+ for i in range(0,len(matrix[0])):
+ matrixCol= []
+ for j in range(len(matrix)):
+ matrixCol.append(matrix[j][i])
+
+ medianCol = np.median(matrixCol)
+ temp = np.median(abs(matrixCol- medianCol))
+# median (|Xi - Xmedian|) > X meant to minimize the impact of one outlier
+ if temp < cutoff:
+ deletes = np.append(deletes,[i],0)
+
+ if temp < minVal: minVal = temp
+ if temp > maxVal: maxVal = temp
+
+ #Delete Rows sub-Threshold Rows
+ matrix = np.delete(matrix,deletes,1)
+ filter_rows = row_header_list
+ filter_cols = np.delete(column_header_list,deletes,0)
+ print( "INFO Column MAD - Matrix min MAD value= "+str(minVal)+" and the max MAD value= "+str(maxVal) )
+
+ return matrix, filter_rows, filter_cols,len(deletes),maxVal
+
+
+# if covariance of the data in two columns exceeds a thresehold remove one row list the rows in a separate output
+# def CoVariance_Percent_Filter_row_col(matrix,thresh,row_header_list,column_header_list):
+# xv= array([8., 9.5, 7.8, 4.2, -7.7, -5.4, 3.2])
+# yv= array([8.9, 2.0, 4.8, -4.2, 2.7, -3.4, -5.9])
+#
+# def cov(x,y):
+# if (len(x) != len(y)
+# [Stop]
+# x.bar = mean(x)
+# y.bar = mean(y)
+# N = len(x)
+# Cov = (sum((x-x.bar)*(y-y.bar))) / (N-1.0)
+# return(Cov)
+
+# #Create Null Set of Filtered Row(Populated Later)
+# deletes = []
+#
+# temp_mean = np.nanmean(matrix[i])
+# temp_stdev = np.nanstd(matrix[i])
+#
+# get stddev of each row the calc xi -xj sq
+#
+# for i in range(0,len(matrix)):
+# temp_range = np.max(matrix[i][0::]) - np.min(matrix[i][0::])
+# if temp_range <= float(thresh):
+# deletes = np.append(deletes,[i],0)
+#
+# #Delete Rows sub-Threshold Rows
+# matrix = np.delete(matrix,deletes,0)
+# filter_rows = np.delete(row_header_list,deletes,0)
+# filter_cols = column_header_list
+# return(matrix,filter_rows,filter_cols)
+#
+# #np.savetxt('testtest.txt',matrix,delimiter='\t')
+# return(matrix,filter_rows,filter_cols)
+#
+
+#Define Function Which Labels Rows/Columns on Output
+#below replace
+# def labeler(matrix,filter_rows,filter_cols,output_file_txt):
+#
+# #Write Data to Specified Text File Output
+# with open(output_file_txt,'w') as f:
+# f.write("")
+# for k in range(0,len(filter_cols)):
+# f.write('\t' + filter_cols[k])
+# f.write('\n')
+# for i in range(0,len(filter_rows)):
+# f.write(filter_rows[i])
+# for j in range(0,len(matrix[0])):
+# f.write('\t' + format(matrix[i][j]))
+# f.write('\n')
+
+
+#Define Main Function
+def main():
+    '''Command-line driver.
+
+    Reads the input matrix, applies the filter selected by args.choice
+    (VariancePercent/VarianceCount, LowerLimit, UpperLimit,
+    MADcount/MADpercent, NANlimit/NANpercent) along args.axes
+    ("Row" or "Column"), writes the labeled result via Labeler, and
+    exits non-zero on invalid arguments or when nothing was filtered.
+    '''
+    try:
+        args = get_args()
+        #sys.stdout.write(str(args)+"\n")
+#
+#
+#
+#
+        # string spellings that count as NAN cells for the NAN filters
+        nanList= ["NAN", "NA", "N/A", "-","?","nan", "na", "n/a"]
+
+        # NOTE: reader returns (matrix, column_headers, row_headers) in
+        # this order - see the old_reader comment below for the change.
+        matrix, column_header_list,row_header_list = reader(args.input_file_txt)
+        #old_reader matrix, row_header_list, column_header_list = reader(args.input_file_txt)
+        threshold = float(args.thresh)
+        if threshold < 0.000001:
+            print('Invalid negative or near-zero threshold chosen = '+str(args.thresh)+" choose positive value")
+            sys.exit(-4)
+
+#VariancePercent
+        if args.choice == "VariancePercent" or args.choice == "VarianceCount": # > percent variance
+
+            if args.axes == "Row":
+                # VarianceCount supplies a row count; convert to a percentile
+                if args.choice == "VarianceCount": threshold= (1-threshold/len(row_header_list))*100.0
+
+                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_row(matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for rows using variance percentile < '+str(args.thresh)+ ' by row. Matrix row minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal)
+                    sys.stderr.write('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
+                    sys.exit(-1)
+                else:
+                    print('\nFiltering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
+            elif args.axes == "Column":
+                if args.choice == "VarianceCount": threshold= (1-threshold/len(column_header_list))*100.0
+                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = Variance_Percent_Filter_col(matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for columns using variance percentile < '+str(args.thresh)+ ' by columns. Matrix columns minimum variance= %.2f' % minVal+' and maximum variance= %.2f' % maxVal)
+                    sys.stderr.write('\nNO Filtering out rows using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
+                    sys.exit(-1)
+                else:
+                    print('\nFiltering out columns using variance percentile < '+str(args.thresh)+ ' removed '+str(delCnt)+' columns')
+            else:
+                # NOTE(review): message prints args.thresh though it reports
+                # an invalid Axes value - probably meant args.axes; confirm.
+                print('Invalid Axes ='+str(args.thresh))
+                sys.exit(-1)
+#LowerLimit
+        elif args.choice == "LowerLimit": #!! todo is NOT lower or upper limit but range of values
+            if args.axes == "Row":
+                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('lower',matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for rows using LowerLimit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
+                    sys.stderr.write('\nNO Filtering out rows using LowerLimit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' rows with Lower Limit < '+str(args.thresh))
+            elif args.axes == "Column":
+                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('lower', matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for columns using Lower Limit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
+                    sys.stderr.write('\nNO Filtering out rows using Lower Limit < '+str(args.thresh)+ ' removed '+str(delCnt)+' rows')
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' columns with Lower Limit < '+str(args.thresh))
+#UpperLimit
+        elif args.choice == "UpperLimit": #!! todo is NOT lower or upper limit but range of values
+            if args.axes == "Row":
+                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Row('upper',matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
+                    sys.stderr.write('\nNO Filtering out rows using Upper Limit < '+str(args.thresh)+ ' by row. Matrix row minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' rows with UpperLimit < '+str(args.thresh))
+            elif args.axes == "Column":
+                matrix, filter_rows, filter_cols,delCnt,minVal,maxVal = UpperLowerLimit_Filter_Col('upper', matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for columns using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
+                    sys.stderr.write('\nFiltering out rows using UpperLimit < '+str(args.thresh)+ ' by columns. Matrix columns minimum range= %.2f' % minVal+' and maximum range= %.2f' % maxVal)
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' columns with UpperLimit < '+str(args.thresh))
+#MADlimit
+        elif args.choice == "MADcount" or args.choice == "MADpercent": #!! is lowerlimit of median absolute deviation medians
+            threshold= threshold   # no-op assignment (left as-is)
+            if args.axes == "Row":
+                # MADpercent scales the threshold by the number of rows
+                if args.choice == "MADpercent": threshold= len(row_header_list)*threshold/100.0
+
+                matrix, filter_rows, filter_cols,delCnt,maxVal = Row_Value_MAD(matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal)
+                    sys.stderr.write('\nFiltering out rows using MAD < '+str(threshold)+ ' by row. Matrix row MAD maximum value= %.2f' % maxVal)
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' rows using MAD maximum value > '+str(threshold))
+            elif args.axes == "Column":
+                if args.choice == "MADpercent": threshold= len(column_header_list)*threshold/100.0
+
+                matrix, filter_rows, filter_cols,delCnt,maxVal = Col_Value_MAD(matrix,threshold,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal)
+                    sys.stderr.write('\nFiltering out columns using MAD < '+str(threshold)+ ' by columns. Matrix columns MAD maximum value= %.2f' % maxVal)
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' columns using MAD maximum value > '+str(threshold))
+#NANlimit
+        elif args.choice == "NANlimit" or args.choice == "NANpercent":
+            maxNANs= int(args.thresh)
+            val= ' '
+            if args.choice == "NANpercent":
+                # convert a percentage of the row count into an absolute count
+                n,m = np.shape(matrix)
+                maxNANs= int(int(args.thresh)*n/100)
+                val= '%'
+            if args.axes == "Row":
+                matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Row(matrix,nanList,maxNANs,row_header_list,column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ))
+                    sys.stderr.write('\nNO Filtering out rows using NAN limit = or > '+str(args.thresh)+val+ ' by row. Matrix row max NAN count is =' + str(maxFoundNANs ))
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' rows using NAN limit = or > '+str(args.thresh)+val)
+            elif args.axes == "Column":
+                matrix, filter_rows, filter_cols,delCnt, maxFoundNANs = NAN_Filter_Column(matrix, nanList, maxNANs, row_header_list, column_header_list)
+                Labeler(matrix,filter_cols,filter_rows,args.output_file_txt)
+                if delCnt < 1:
+                    print('\nNO Filtering occurred for columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs))
+                    sys.stderr.write('\nNO Filtering out columns using NAN limit = or > '+str(args.thresh)+val+ ' by columns. Matrix columns max NAN count is = '+ str(maxFoundNANs))
+                    sys.exit(-1)
+                else:
+                    print('\nFiltered out '+str(delCnt)+' columns using NAN limit = or > '+str(args.thresh)+val )
+
+# elif args.choice == "covariance":
+#     if args.axes == "Row":
+#         matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_row(matrix,args.thresh,row_header_list,column_header_list)
+#         Labeler(matrix,filter_rows,filter_cols,args.output_file_txt)
+#         print('Covariance_Filter on row')
+#     elif args.axes == "Column":
+#         matrix, filter_rows, filter_cols = CoVariance_Percent_Filter_col(matrix,args.thresh,row_header_list,column_header_list)
+#         Labeler(matrix,filter_rows,filter_cols,args.output_file_txt)
+#         print('Covariance_Filter on column')
+            else:
+                print('Invalid Axes = '+str(args.axes))
+                sys.exit(-1)
+        else:
+            print("Invalid Filter Choice = "+str(args.choice))
+            sys.exit(-2)
+
+
+    except Exception as err:
+        # catch-all boundary: report the traceback and exit non-zero
+        traceback.print_exc()
+        sys.exit(-3)
+
+if __name__ == '__main__':
+    # Script entry point; a normal run exits with status 0.
+    main()
+    print("\ndone")
+    sys.exit(0)
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Filters.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Filters.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,58 @@
+
+
+ rows or columns based on specified threshold
+ Matrix_Filters.py '$p_input' '$extra.choice' '$extra.thresh' '$axes' '$output_file'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Manipulation.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Manipulation.sh Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,42 @@
+#echo "1: " $1 # tool directory
+#echo "2: " $2 # manipulation option
+#echo "3: " $3 # input file
+#echo "4: " $4 # output file
+#echo "5: " $5 # choice
+#echo "6: " $6 # thresh
+#echo "7: " $7 # axis
+#echo "8: " $8 # transpose
+#echo "9: " $9 # input2
+#echo "10: " ${10} # offsetvalue
+#echo "11: " ${11} # scalevalue
+#echo "12: " ${12}
+#echo "13: " ${13}
+#echo "14: " ${14}
+#echo "15: " ${15}
+#echo "16: " ${16}
+
+#echo "tool directory is: " $1
+if [ "$2" = "Matrix_Filters" ]; then
+ echo "filter chosen"
+ #python $__tool_directory__/Matrix_Filters.py '$p_input '${manipulation.extra.choice}' '${manipulation.extra.thresh}' '${manipulation.axis}' '$output_file'
+ python $1/Matrix_Filters.py $3 $5 $6 $7 $4
+elif [ "$2" = "Matrix_Multiply" ]; then
+ echo "multiply chosen"
+ #python '$__tool_directory__/Matrix_Multiply.py' '$p_input' '${manipulation.extra.transpose}' '${manipulation.extra.input2}' '${manipulation.extra.choice}' '$output_file'
+ python $1/Matrix_Multiply.py $3 $8 $9 $5 $4
+elif [ "$2" = "Matrix_Statistics" ]; then
+ echo "statistics chosen"
+ #python '$__tool_directory__/Matrix_Statistics.py' '$p_input' '$choice' '$cutoff' '$axis' '$out_file'
+ python $1/Matrix_Statistics.py $3 $5 $6 $7 $4
+elif [ "$2" = "Matrix_Transformations" ]; then
+ echo "transform chosen"
+ #python '$__tool_directory__/Matrix_Transformations.py' '$p_input' '$choice' '$axis' '$scalevalue' '$offsetvalue' '$output_file'
+ python $1/Matrix_Transformations.py $3 $5 $7 ${11} ${10} $4
+elif [ "$2" = "Matrix_Validations" ]; then
+ echo "validations chosen"
+ #python '$__tool_directory__/Matrix_Validations.py' '$p_input' '${manipulation.extra.choice}' '${manipulation.extra.axis}' '$output_file'
+ python $1/Matrix_Validations.py $3 $5 $7 $4
+else
+ echo "no valid choice made"
+fi
+
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Manipulation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Manipulation.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,406 @@
+
+
+ Data manipulation before heatmap creation
+
+
+ $__tool_directory__/Matrix_Manipulation.sh '$__tool_directory__' '${manipulation.option}' '$p_input' '$output_file'
+ '${manipulation.extra.choice}' '${manipulation.extra.thresh}' '${manipulation.extra.axis}'
+ '${manipulation.extra.transpose}' '${manipulation.extra.input2}' '${manipulation.extra.offsetvalue}' '${manipulation.extra.scalevalue}'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Manipulation_Tool_Shed.zip
Binary file Matrix_Manipulation_Tool_Shed.zip has changed
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Multiply.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Multiply.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,173 @@
+'''
+Created on March 6, 2018
+
+@author: Bob Brown based on John Weinstein's algorithm
+'''
+
+import os
+import re
+import shutil
+import traceback
+import sys, traceback, argparse
+import numpy as np
+import warnings
+#import scipy.stats as ss
+from Matrix_Validate_import import reader, Labeler, MatchLabels
+import math
+warnings.filterwarnings('error')
+
+# John Weinsteins algorithm by bob brown https://discover.nci.nih.gov/CorrelateMatrices/help.do
+#http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/
+
def get_args():
    """Parse command-line arguments for the matrix multiply / correlate tool."""
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file1', help='text file input matrix(include .txt in name)')
    parser.add_argument('transpose', type=str, help='transpose matrix 1?')
    parser.add_argument('input_file2', help='text file input matrix(include .txt in name)')
    parser.add_argument('choice', type=str, help='Choose Normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank')
    parser.add_argument('out_fileName', help='text file output matrix(include .txt in name)')
    parsed = parser.parse_args()
    # An empty transpose flag means "no transpose".
    if not parsed.transpose:
        parsed.transpose = 'n'
    return parsed
+
+
def Matrix_Multiply(matrix1, matrix2):
    """Return the matrix product of matrix1 and matrix2.

    Delegates to np.dot; on any failure prints the traceback and exits
    the process with status -5.
    """
    # TODO handle NANs
    try:
        product = np.dot(matrix1, matrix2)
    except Exception:
        traceback.print_exc()
        sys.exit(-5)
    return product
+
+
+#CorrelateMatrices correlation across 2 matrices https://discover.nci.nih.gov/CorrelateMatrices/home.do
def Correlate_Matrices(matrix1, matrix2):
    """Correlate each row of matrix1 with each column of matrix2.

    matrix1 is a-by-n and matrix2 is n-by-b; the result is an a-by-b list of
    lists where cell [i][j] is the Pearson-style correlation (z-scores with
    sample stddev, ddof=1, summed and divided by n-1) of matrix1 row i with
    matrix2 column j. Cells whose row/column stddev is ~0 receive the string
    "divide by 0", so the output can be mixed-type. Exits -1 when the inner
    dimensions disagree.
    """
    #try:
    # Leave both matrices as size axn and bxn and treat a is column and b as row
    #matrix1T = Transpose(matrix1)

#TODO handle NANs
    numRows1,numColumns1= np.shape(matrix1)

    numRows2,numColumns2= np.shape(matrix2)
    matrixOut= []

    if numColumns1 != numRows2:
        print("ERROR number columns Matrix 1 ", str(numColumns1), " not equal number rows for Matrix 2 ",str(numRows2))
        sys.exit(-1)
#TODO need to look for NANs??

    for i in range(numRows1):
        # Statistics for row i of matrix1 (sample stddev, ddof=1).
        vectorM1 = matrix1[i][:]
        meanVec1 = np.nanmean(vectorM1)
        varStdDev1 = np.nanstd(vectorM1, ddof=1)
        lowStdDev1 = False
        #if equals zero
        if abs(varStdDev1) < .000001:
            print("ERROR Variance value almost zero", str(varStdDev1), " for Matrix 1 Row ",str(i+1))
            lowStdDev1= True
        correlationRow= []

        for j in range(numColumns2):
            # Gather column j of matrix2 and compute its statistics the same way.
            vectorM2 = []
            for t in range(numRows2):
                vectorM2.append(matrix2[t][j])
            meanVec2 = np.nanmean(vectorM2)
            varStdDev2 = np.nanstd(vectorM2, ddof=1)
            lowStdDev2= False
            #if equals zero
            if abs(varStdDev2) < .000001:
                print("ERROR Variance value almost zero", str(varStdDev2), " for Matrix 2 Column ",str(j+1))
                lowStdDev2= True

            covarStdDev12= 0

            if not lowStdDev1 and not lowStdDev2:
                #try:
                # Sum of products of z-scores; dividing by n-1 yields Pearson r.
                for pos in range(len(vectorM1)):
                    covarStdDev12 += ((vectorM1[pos]-meanVec1)/varStdDev1)*((vectorM2[pos]-meanVec2)/varStdDev2)
#                bottom= (numColumns1 -1)*(varStdDev1*varStdDev2)
#                correlationRow.append( covarStdDev12/bottom)
                correlationRow.append( covarStdDev12/(numColumns1 -1))
            #except: bad value because of NAN or other
            else:
                correlationRow.append("divide by 0") # cannot calculate correlation var too small

        matrixOut.append(correlationRow)

#    except Exception as err:
#        traceback.print_exc()
#        sys.exit(-6)

    return(matrixOut)
+
+#----------------------------------------------------------------------
def Transpose(in_mat):
    """Return a new list-of-lists transpose of the rectangular matrix in_mat."""
    numRows, numColumns = np.shape(in_mat)  # unpack doubles as a rectangular-shape check
    return [[in_mat[r][c] for r in range(numRows)] for c in range(numColumns)]
+
+
+#----------------------------------------------------------------------
if __name__ == "__main__":
    # Entry point: read two labeled matrices, optionally transpose the first,
    # then multiply or correlate them and write a labeled result matrix.
#TODO address NANs ???
    try:
        args = get_args()
        selection = args.choice

        matrix1, column_labels1, row_labels1 = reader(args.input_file1)  # may be transposed below
        matrix2, column_labels2, row_labels2 = reader(args.input_file2)

        if args.transpose == 'y' or args.input_file1 == args.input_file2:
            matrix1 = Transpose(matrix1)
            print("\n>>>NOTICE Transposed first matrix so matrix 1 columns = Matrix 2 number rows ")
            # Swap the label lists so they match the transposed data.
            row_labels1, column_labels1 = column_labels1, row_labels1

        MatchLabels(column_labels1, row_labels2)  # verify labels and their order match

        if len(column_labels1) != len(row_labels2):
            print("\n>>> ERROR attempting to multiple Matrices of incompatible dimensions ")
            # BUG FIX: original referenced the undefined name 'og_row2' here,
            # raising NameError instead of printing the dimensions.
            print("First Matrix is "+str(len(row_labels1))+" by "+str(len(column_labels1))+" where second Matrix is "+str(len(row_labels2))+" by "+str(len(column_labels2))+"\n")
            print("Matrices must have dimensions AxB and BxC. A can equal C (square matrices)")
            sys.exit(-1)

        if selection == "MatrixMultiply":
            matrixOut = Matrix_Multiply(matrix1, matrix2)
        elif selection == "Corr2Matrices" or selection == "Corr1Matrix":
            matrixOut = Correlate_Matrices(matrix1, matrix2)

        Labeler(matrixOut, column_labels2, row_labels1, args.out_fileName)

        print("Matrix Multiply "+str(len(row_labels1))+" by "+str(len(column_labels1))+" Matrix 1 by "+str(len(row_labels2))+" by "+str(len(column_labels2))+" matrix 2")
        print("Output Matrix dimensions are "+str(len(row_labels1))+" by "+str(len(column_labels2))+"\n")

    except Exception:
        traceback.print_exc()
        sys.exit(-3)

    sys.exit(0)
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Multiply.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Multiply.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,52 @@
+
+
+ one matrix using one or two matrices
+ Matrix_Multiply.py '$extra.input1' 'extra.$transpose' 'extra.$input2' '$extra.choice' '$output_file'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Statistics.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Statistics.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,145 @@
+'''
+Created on Feb2018
+
+@author: bob brown
+'''
+
+import sys, traceback, argparse
+import numpy as np
+from Matrix_Validate_import import reader
+#import matplotlib.pyplot as plt
+from Matrix_Filters import Variance_Percent_Filter_row, Variance_Percent_Filter_col
+
+#Define argparse Function
def get_args():
    """Parse command-line arguments for the matrix statistics tool."""
    parser = argparse.ArgumentParser()
    for name, kwargs in (
            ('input_file_txt', dict(help='tab delimited text file input matrix(include .txt in name)')),
            ('choice', dict(type=str, help='Variance Filter Method (Variance or Range)')),
            ('thresh', dict(help='Thershold for Variance Filtering')),
            ('axes', dict(help='Axes to Filter on (Either Row or Column')),
            ('output_file_txt', dict(help='tab delimited text file output name (include .txt in name)'))):
        parser.add_argument(name, **kwargs)
    return parser.parse_args()
+
+
+#Define Function Which Labels Rows/Columns on Output
def labeler(matrix, filter_rows, filter_cols, output_file_txt):
    """Write matrix as tab-delimited text with row/column labels restored.

    The header line is a blank corner cell followed by the column labels;
    each data line is a row label followed by that row's values.
    """
    with open(output_file_txt, 'w') as out:
        for col in filter_cols:
            out.write('\t' + col)
        out.write('\n')
        for r in range(len(filter_rows)):
            out.write(filter_rows[r])
            for c in range(len(matrix[0])):
                out.write('\t' + format(matrix[r][c]))
            out.write('\n')
+
+
def Histo(matrix):
    """Compute per-column NaN-means and histogram bin bounds; returns ().

    The matplotlib plotting the function was written for is disabled
    (commented out in the original), so only the bin bookkeeping runs.
    """
    numBins = 20
    # Mean of each column, ignoring NaNs.
    data = [np.nanmean([row[c] for row in matrix]) for c in range(len(matrix[0]))]

    # Bin bounds retained from the original (used only by the disabled plot).
    minBin = int(min(data) - 0.5)
    maxBin = int(max(data) + 0.5)
    binWidth = float(maxBin - minBin) / numBins
    bins = []
    return ()
+
+#========== test create variable number output files in Galaxy
def CreateFiles(output_file_info):
    """Scratch test for emitting multiple output files from Galaxy.

    NOTE(review): every iteration reopens the SAME path in 'w' mode, so each
    write overwrites the last and only 'File number = 2' survives -- presumably
    placeholder code; confirm intended behavior before relying on it.
    """
    for i in range(3):
        fd= open( output_file_info, 'w')
        fd.write('File number = '+ str(i)+"\n")
        fd.close()

    return()
+
+#==================
+
+ #Define Main Function
def main():
    """Dispatch on args.choice: histogram, file-creation test, or variance filter.

    Exit codes: -1 invalid axes, -2 invalid choice, -3 unexpected exception.
    """
    try:
        args = get_args()

        matrix, og_cols, og_rows = reader(args.input_file_txt)

        if args.choice == "Histogram":
            Histo(matrix)
        elif args.choice == "CreateFiles":
            # BUG FIX: get_args defines no 'output_file_info' attribute; the
            # original raised AttributeError here. Use the output file argument.
            CreateFiles(args.output_file_txt)
        elif args.choice == "Variance":
            # Filter out rows/columns in the lowest 1% of variance.
            if args.axes == "Row":
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = Variance_Percent_Filter_row(matrix, 1, og_rows, og_cols, True)
                labeler(matrix, filter_rows, filter_cols, args.output_file_txt)
            elif args.axes == "Column":
                matrix, filter_rows, filter_cols, delCnt, minVal, maxVal = Variance_Percent_Filter_col(matrix, 1, og_rows, og_cols, True)
                labeler(matrix, filter_rows, filter_cols, args.output_file_txt)
            else:
                print('Invalid Axes = ' + str(args.axes))
                sys.exit(-1)
        else:
            print("Invalid Filter Choice = " + str(args.choice))
            sys.exit(-2)

    except Exception:
        traceback.print_exc()
        sys.exit(-3)
+
# Script entry point: run main(), report completion, and exit 0.
if __name__ == '__main__':
    main()
    print("\nFini")
    sys.exit(0)
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Statistics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Statistics.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,23 @@
+
+
+ View metadata about Heat Map Matrix
+ Matrix_Statistics.py '$input' '$choice' '$cutoff' '$axes' '$out_file'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Transformations.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Transformations.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,301 @@
+'''
+Created on Jun 6, 2017 updated Feb 2018
+
+@author: cjacoby and Bob Brown
+'''
+import os
+import sys, traceback, argparse
+import numpy as np
+from numpy import size, array
+import warnings
+from Matrix_Validate_import import reader
+#import scipy.stats as ss
+warnings.filterwarnings('error')
+
+#Define argparse Function
def get_args():
    """Parse command-line arguments for the matrix transformations tool."""
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file_txt', help='text file input matrix(include .txt in name)')
    parser.add_argument('choice', type=str, help='Choose normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank')
    parser.add_argument('axes', type=str, help='Choose Axis to normalize On (Row or Column)')
    parser.add_argument('scalevalue', help='optional scaling factor for matrix)')
    parser.add_argument('offsetvalue', help='optional offset for matrix')
    parser.add_argument('output_file_txt', help='text file output matrix(include .txt in name)')
    return parser.parse_args()
+
+
def Zscore_row(matrix):
    """Z-score each row in place: (x - row NaN-mean) / row sample stdev (ddof=1)."""
    width = len(matrix[0])
    for row in matrix:
        mu = np.nanmean(row)
        sigma = np.nanstd(row, ddof=1)
        for c in range(width):
            row[c] = (row[c] - mu) / sigma
    return matrix
+
+#Define Z-Score normalization Function
def Zscore_col(matrix):
    """Z-score each column in place: (x - column NaN-mean) / column sample stdev (ddof=1)."""
    for c in range(len(matrix[0])):
        column = [row[c] for row in matrix]
        mu = np.nanmean(column)
        sigma = np.nanstd(column, ddof=1)
        for row in matrix:
            row[c] = (row[c] - mu) / sigma
    return matrix
+
+
+#Define Mean Centered or Median centered normalization Function
def MeanMedianCenter_row(matrix, type):
    """Center each row in place by subtracting its NaN-mean or NaN-median.

    type: "mean" subtracts the row mean, anything else the row median.
    Returns the same (mutated) matrix.
    """
    for i in range(len(matrix)):
        # BUG FIX: the center was computed over matrix[i][1::], silently
        # dropping the first data column (reader() already strips labels),
        # while the subtraction applied to every column. The column variant
        # uses the full vector, so do the same here.
        if type == "mean":
            center = np.nanmean(matrix[i])
        else:
            center = np.nanmedian(matrix[i])

        for j in range(len(matrix[0])):
            matrix[i][j] = matrix[i][j] - center
    return matrix
+
+
+#Define mean or median
def MeanMedianCenter_col(matrix, type):
    """Center each column in place by its NaN-mean ("mean") or NaN-median (otherwise)."""
    for c in range(len(matrix[0])):
        column = [row[c] for row in matrix]
        center = np.nanmean(column) if type == "mean" else np.nanmedian(column)
        for r in range(len(matrix)):
            matrix[r][c] = matrix[r][c] - center
    return matrix
+
+#Divide by sum of the Row Function
def Divide_By_Sum_row(matrix):
    """Divide each row in place by its sum.

    Rows whose sum is within 1e-4 of zero are left unchanged and reported
    via print. Returns the same (mutated) matrix.
    """
    numRow, numCol = np.shape(matrix)

    for r in range(numRow):
        rowSum = sum(matrix[r][:])

        if abs(rowSum) > .0001:
            for c in range(numCol):
                matrix[r][c] = matrix[r][c] / rowSum
        else:
            print("ERROR Cannot divide by Sum almost zero", str(rowSum), " for Row ", str(r + 1))
    return matrix
+
+
+#Divide by sum of the Column Function
def Divide_By_Sum_col(matrix):
    """Divide each column in place by that column's sum.

    BUG FIX: the original never accumulated the column sum (sumValue stayed 0),
    so it always took the error branch and left the matrix untouched.
    Columns whose sum is within 1e-4 of zero are still skipped with an error
    message. Returns the same (mutated) matrix.
    """
    numRow, numCol = np.shape(matrix)

    for i in range(numCol):
        sumValue = sum(matrix[j][i] for j in range(numRow))

        if abs(sumValue) > .0001:
            for j in range(numRow):
                matrix[j][i] = (matrix[j][i] / sumValue)
        else:
            print("ERROR Cannot divide by Sum almost zero", str(sumValue), " for Column ", str(i + 1))
    return matrix
+
+#scale or add offset to matrix by row
def ScaleOffset_row(matrix, scaleValue, offset):
    """Apply x -> scaleValue*x + offset to every row, in place.

    A scaleValue within 1e-4 of zero is rejected with a message and the
    matrix is returned unchanged.
    """
    if abs(scaleValue) > 0.0001:
        for r in range(len(matrix)):
            matrix[r][:] = [scaleValue * x + offset for x in matrix[r]]
    else:
        print(" Scale facter " + str(scaleValue) + " too small")
    return matrix
+
+#scale or add offset to matrix by column
def ScaleOffset_col(matrix, scaleValue, offset):
    """Apply x -> scaleValue*x + offset to every column, in place.

    BUG FIX: the original wrote through matrix[:][i], which assigns into a
    throwaway shallow copy (and read row i instead of column i), so the matrix
    was never modified -- and it could IndexError on non-square input.
    A scaleValue within 1e-4 of zero is rejected with a message, matching the
    row variant. Returns the same (mutated) matrix.
    """
    if abs(scaleValue) > 0.0001:
        for i in range(len(matrix[0])):
            for j in range(len(matrix)):
                matrix[j][i] = scaleValue * matrix[j][i] + offset
    else:
        print(" Scale facter " + str(scaleValue) + " too small")
    return matrix
+
+#Define Log2 normalization Method
def Convert2Logs(matrix, logValue, offset):
    """In-place log2 ("log2") or log10 (otherwise) transform of matrix[i][j] + offset.

    Warnings are promoted to errors, so a non-positive argument raises
    RuntimeWarning: the first failure prints a message and aborts the rest of
    the conversion, returning the partially transformed matrix.
    """
    import warnings
    warnings.filterwarnings('error')

    log_fn = np.log2 if logValue == "log2" else np.log10
    aborted = False
    for r in range(len(matrix)):
        for c in range(len(matrix[0])):
            try:
                matrix[r][c] = log_fn(matrix[r][c] + offset)
            except RuntimeWarning:
                print(logValue + " normalization Failed: Encountered elements <= 0, which are invalid inputs for a Log normalization")
                aborted = True
                break
        if aborted:
            break
    return matrix
+
+#transpose matrix
def Transpose(in_mat):
    """Return the transpose of in_mat as a new list of lists."""
    numRow, numCol = np.shape(in_mat)  # unpack doubles as a rectangular-shape check
    return [list(column) for column in zip(*in_mat)]
+
+# restores row and column labels in output
def labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write matrix as tab-delimited text, restoring column and row labels.

    Header line: blank corner cell then the column labels; each data line is
    a row label followed by that row's values.
    """
    with open(output_file_txt, 'w') as out:
        for col in og_cols:
            out.write('\t' + str(col))
        out.write('\n')
        for r in range(len(og_rows)):
            out.write(str(og_rows[r]))
            for c in range(len(matrix[0])):
                out.write('\t' + format(matrix[r][c]))
            out.write('\n')
+
+#Define Main Function
def main():
    """Parse CLI args, apply the chosen transformation, write the labeled result.

    Dispatches on args.choice; most choices further branch on args.axes
    ("Row" or "Column"). Exits 1 on any exception.
    """
    try:
        args = get_args()
        scaleValue = float(args.scalevalue)
        offsetValue= float(args.offsetvalue)
        #print(args)
        #sys.stdout.write(str(args)+"\n")

        matrix,og_cols,og_rows = reader(args.input_file_txt)
        if args.choice == "z_score_normalization":
            if args.axes == "Row":
                matrix = Zscore_row(matrix)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("zcore, row")
            elif args.axes == "Column":
                matrix = Zscore_col(matrix)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("zscore, column")
            else:
                print("zscore, invalid axis")
        elif args.choice == "mean_center_normalization":
            if args.axes == "Row":
                matrix = MeanMedianCenter_row(matrix,"mean")
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("mean-center by row")
            elif args.axes == "Column":
                matrix = MeanMedianCenter_col(matrix,"mean")
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("mean-center by column")
            else:
                print("meancenter, invalid axis")
        elif args.choice == "median_center_normalization":
            if args.axes == "Row":
                matrix = MeanMedianCenter_row(matrix,"median")
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("median-center by row")
            elif args.axes == "Column":
                matrix = MeanMedianCenter_col(matrix,"median")
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("median-center by column")
            else:
                print("meancenter, invalid axis")
        elif args.choice == "add_offset":
            if args.axes == "Row":
                #offset = -100 #!!!! TODO REMOVE AND ADD WHEN clause to xml to get value
                # Scale factor 1.0 makes ScaleOffset_* a pure offset.
                matrix = ScaleOffset_row(matrix,1.0,offsetValue)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("offset of "+str(offsetValue)+" by row")
            elif args.axes == "Column":
                matrix = ScaleOffset_col(matrix,1.0,offsetValue)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("offset of "+str(offsetValue)+" by column")
            else:
                print("offset"+str(offsetValue)+" invalid axis -not row or column")
        elif args.choice == "scale":
            if args.axes == "Row":
                #scaleValue = 1000 #!!!! TODO REMOVE AND ADD WHEN clause to xml to get value
                # Offset 0.0 makes ScaleOffset_* a pure scaling.
                matrix = ScaleOffset_row(matrix,scaleValue,0.0)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("scaling "+str(scaleValue)+" by row")
            elif args.axes == "Column":
                matrix = ScaleOffset_col(matrix,scaleValue,0.0)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("scaling "+str(scaleValue)+" by column")
            else:
                print("scaling "+str(scaleValue)+" invalid axis")
        elif args.choice == "transpose":
            matrix = Transpose(matrix) #issue using same matrix?
            labeler(matrix,og_rows,og_cols,args.output_file_txt) #swapped row&col labels
            print("transpose mxn matrix to nxm size")
        elif args.choice == "ln_normalization":
            # NOTE(review): despite the name "ln_normalization" this applies
            # log2 (and "log_normalization" applies log10); there is no
            # natural-log path -- confirm the intended mapping.
            matrix = Convert2Logs(matrix,"log2",offsetValue)
            labeler(matrix,og_cols,og_rows,args.output_file_txt)
            print("log2 plus "+str(offsetValue)+" normalization for all values")
        elif args.choice == "log_normalization":
            matrix = Convert2Logs(matrix,"log10",offsetValue)
            labeler(matrix,og_cols,og_rows,args.output_file_txt)
            print("log10 normalization for all values")
        elif args.choice == "rank":
            # NOTE(review): Rankdata_ByRow / Rankdata_ByColumn are not defined
            # in this file, so this branch raises NameError (caught below) --
            # presumably a missing import; verify.
            if args.axes == "Row":
                matrix = Rankdata_ByRow(matrix)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("performed rank normalization by row")
            elif args.axes == "Column":
                matrix = Rankdata_ByColumn(matrix)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("performed rank normalization by column")
            else:
                print("rank, invalid axis")
        elif args.choice == "divide_by_sum":
            if args.axes == "Row":
                matrix = Divide_By_Sum_row(matrix)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("performed divide row N values by row N's sum")
            elif args.axes == "Column":
                matrix = Divide_By_Sum_col(matrix)
                labeler(matrix,og_cols,og_rows,args.output_file_txt)
                print("performed divide column N values by column N's sum")
            else:
                print("divide_by_sum, invalid axis")

        else:
            print("Invalid normalization Choice")

    except Exception as err:
        traceback.print_exc()
        sys.exit(1)
+
+
# Script entry point.
if __name__ == '__main__':
    main()
    print("Done")
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Transformations.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Transformations.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,106 @@
+
+
+ by Rows, Columns, All by method
+ Matrix_Transformations.py '$p_input' '$extra.choice' '$extra.axes' '$extra.scalevalue' '$extra.offsetvalue' '$output_file'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Validate_import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Validate_import.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,179 @@
+'''
+Created on Jun 7, 2017 modified Feb2018
+
+@author: cjacoby and Bob Brown
+'''
+
+import sys, traceback, argparse
+import numpy as np
+import os
+#import matplotlib.pyplot as plt
+#import matplotlib.pyplot as plt; plt.rcdefaults()
+
+# Define the Reading Function Which Pulls the Data from a .txt file
def reader(input_file_txt, create_plot= False):
    """Read a tab-delimited labeled matrix from input_file_txt.

    The first line supplies column labels (minus the corner cell); the first
    cell of every other line is that row's label. All remaining cells are
    parsed as float; recognized NAN spellings ("", " ", "NAN", "NA", "N/A",
    "-", "?", case-insensitive) become np.nan, and any other non-numeric cell
    is reported to stderr as invalid. Exits -1 on ragged rows or if any
    invalid cell was seen. Returns (matrix, column_labels, row_labels).

    NOTE(review): create_plot=True references plt, but the matplotlib import
    at the top of this file is commented out, so that path raises NameError
    -- confirm before enabling.
    """
    #Read Matrix, Preserving String Values for Headers first row and first column (both minus first cell)
    #Read Matrix, Converting all values to Float for Data Processing

    # "rU" (universal newlines) is deprecated in Python 3; it behaves like "r".
    f = open(input_file_txt, "rU")

    #print( 'Valid NAN identifiers are: empty cells, cells with blanks,"NA","N/A","-", and "?"')

    column_labels = []
    row_labels = []
    matrix = []
    firstLine= True

    line = f.readline()

# "NA","N/A","-","?","NAN","NaN","Na","na","n/a","null",EMPTY/Null, SPACE (blank char)

    nanList = ["", " ","NAN", "NA", "N/A", "-","?"]
    # Per-spelling occurrence counts; "Text" is filled in after the read loop.
    binCatDict = {"":0, " ":0, "Text":0, "NA":0, "-":0,"NAN":0, "N/A":0,"?":0}
    row = 0
    nanCnt = 0
    nonNumCnt = 0

    while line:
        line = line.strip("\n")
        line = line.split('\t')

        row += 1

        if firstLine:
            # Header row: remember its width to validate later rows.
            lengthRow = len(line)
            column_labels = line[1:]
        else:
            if lengthRow != len(line):
                # print("\nERROR matrix row lengths unequal for row 0 and row "+str(row)+"\n" )
                sys.exit(-1)

            temp = []
#            column= 0
            row_labels.append(str(line[0]))

            #for item in line[1:]: use enumerate
            for column, item in enumerate(line[1:],1):
#                column += 1
                try:
                    temp.append(float(item))
                except ValueError:
                    # Non-float cell: store NaN, then classify the spelling.
                    temp.append(np.nan)
                    itemUC= item.upper()

                    if itemUC in nanList:
                        nanCnt += 1
                        binCatDict[itemUC]= binCatDict[itemUC]+1
                        # print( 'Legit nans= ',str(item))
                    else:
                        if nonNumCnt == 0: sys.stderr.write("Start List of up to first 50 Invalid cell values \n")
                        nonNumCnt +=1
                        if nonNumCnt < 50: sys.stderr.write("At row_column= "+str(row)+"_"+str(column)+' invalid data cell value '+ item+"\n")

            matrix.append(temp)

        line = f.readline()
        firstLine= False

    #sys.stdout.write("\n\n")
    f.close()
    binCatDict["Text"]= nonNumCnt

# plot results of NAN counts above

    binCat = ["null", "blank", 'hyphen', '?','NA','N/A' ,'NAN', 'text']
    orderDict= {0:"", 1:"", 2:'-', 3:'?',4:'NA', 5:'N/A' ,6:'NAN', 7:'Text'}
#TODO verify dict orde for data
    #print("> key value =",key, str(value))

    if create_plot:
        numBins = len(binCat)
        binWidth = 1
        bins = []
        binData = []

        for key in sorted(orderDict):
            value= binCatDict[orderDict[key]] # place items on chart in order and with data value for item
            # Zero counts get a tiny positive value so the bar is visible.
            if value < 1:
                binData.append(value+0.01)
            else:
                binData.append(value)

        #"""
        for j in range(numBins):
            bins.append(j*binWidth)
        #ttps://pythonspot.com/matplotlib-bar-chart/
        y_pos = np.arange(numBins)
        plt.yticks(y_pos, binCat)
        plt.title("Distribution of NAN types (UPPER & lower & MiXeD case combined)")
        plt.ylabel('NAN Types')
        plt.xlabel('Occurrences')
        #plt.legend()
        plt.barh(y_pos, binData, align='center', alpha=0.5)

        fig, ax = plt.subplots(num=1, figsize=(8,3))
        ax.set_title("Data Cell Counts of Not A Number (NAN) Types")
        #ax.bar(center,bins, align='center', width=width)
        #ax.bar(center, hist, align='center', width=width)
        #ax.set_xticks(bins)
        # fig.savefig("/Users/bobbrown/Desktop/Matrix-tools-Test-output/NAN-plot.png")

        # fig, ax = plt.subplots(num=1, figsize=(8,3))
        # fig.savefig("/Users/bobbrown/Desktop/Matrix-tools-Test-output/hist-out.png")

        plt.show()
        #"""

#after plot error?
    x,y=np.shape(matrix)
    if nanCnt > 0: print("WARNING -- Found "+str(nanCnt)+" Valid Non-numbers. Their percent of total matrix data cell values = "+str((100*nanCnt)/(x*y))+"% ")
    # Any unrecognized non-numeric cell is fatal.
    if nonNumCnt > 0: sys.exit(-1)
    #print ("reader output:")
    #print (matrix)
    #print (column_labels)
    #print(row_labels)
    return matrix,column_labels,row_labels
+
+#----------------------------------------------------------------------
+# Verify Matrix A column_labels match Matrix B row_labels in name and order for A*B
def MatchLabels(column_labels, row_labels):
    """Verify matrix-A column labels match matrix-B row labels in name and order.

    Writes diagnostics to stderr and exits with status -11 on any mismatch
    (length or per-position, at most 20 positions reported); returns None
    when the labels agree.
    BUG FIX: the original called the nonexistent sys.err(), which raised
    AttributeError instead of reporting the problem.
    """
    if len(column_labels) != len(row_labels):
        sys.stderr.write("ERROR 1st matrix column count "+str(len(column_labels))+" not equal 2nd Matrix number row count "+str(len(row_labels))+"\n")
        sys.exit(-11)

    cnt = 0
    for k in range(len(column_labels)):
        if column_labels[k] != row_labels[k] and cnt < 20:
            cnt += 1
            sys.stderr.write("ERROR At column & row position "+str(k)+" Matrix 1 column value "+str(column_labels[k])+" not equal 2nd Matrix row value "+str(row_labels[k])+"\n")

    if cnt > 0:
        sys.exit(-11)
+#----------------------------------------------------------------------
+# restores row and column labels in output
def Labeler(matrix, column_labels, row_labels, output_file_txt):
    """Write matrix to output_file_txt as tab-delimited text with labels restored.

    The header line holds the column labels after a blank corner cell; each
    data line is a row label followed by that row's values. Iterates
    len(matrix) rows (not len(row_labels)), matching the original.
    """
    with open(output_file_txt, 'w') as out:
        for col in column_labels:
            out.write('\t' + str(col))
        out.write('\n')
        for r in range(len(matrix)):
            out.write(str(row_labels[r]))
            for c in range(len(matrix[0])):
                out.write('\t' + format(matrix[r][c]))
            out.write('\n')
+
+
+#----------------------------------------------------------------------
if __name__ == '__main__':
    # Smoke test: validate the matrix file given as argv[1].
    input_file_txt = str(sys.argv[1])

    matrix,column_labels,row_labels = reader(input_file_txt)
    print("Done")
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Validations.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Validations.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,180 @@
+'''
+Created on Jun 7, 2017 modified Feb2018
+
+@author: Bob Brown and cjacoby
+'''
+
+import sys, traceback, argparse
+import numpy as np
+import os
+from Matrix_Validate_import import reader, Labeler
+
+#Define The Four Arguments Used in the Program
def get_args():
    """Parse command-line arguments for the matrix validation tool."""
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file_txt', help='tab delimited text file input matrix(include .txt in name)')
    parser.add_argument('replacement', type=str, help='Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"')
    parser.add_argument('axes', type=str, help='Choose Axes to Normalize On (Either "Row" or "Column"')
    parser.add_argument('output_file_txt' ,help='tab delimited text file output name (include .txt in name)')
    return parser.parse_args()
+
+
+#Define Function to Replace Null Values with Row Mean
def nan_replacer_mean_rows(matrix):
    """Replace each NaN in place with its row's NaN-mean.

    Returns (matrix, nonNumCnt, nanCnt): nonNumCnt is always 0 here and
    nanCnt counts the cells replaced.
    """
    nonNumCnt = 0
    nanCnt = 0  # valid NANs are "NA","N/A","-","?"

    width = len(matrix[0])
    for row in matrix:
        row_mean = np.nanmean(row)
        for c in range(width):
            if np.isnan(row[c]):
                row[c] = row_mean
                nanCnt += 1
    return matrix, nonNumCnt, nanCnt
+
+#Define Function to Replace Null Values with Column Mean
def nan_replacer_mean_columns(matrix):
    """Replace each NaN in place with its column's NaN-mean.

    Returns (matrix, nonNumCnt, nanCnt): nonNumCnt is always 0 here and
    nanCnt counts the cells replaced.
    """
    nonNumCnt = 0
    nanCnt = 0  # valid NANs are "NA","N/A","-","?"

    for c in range(len(matrix[0])):
        col_mean = np.nanmean([row[c] for row in matrix])
        for r in range(len(matrix)):
            if np.isnan(matrix[r][c]):
                matrix[r][c] = col_mean
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
+
+#Define Function to Replace Null Values with Zero (axis orientation is irrelevant)
def nan_replacer_zero(matrix):
    """Replace every NaN in the matrix with 0, in place.

    Returns (matrix, nonNumCnt, nanCnt): nonNumCnt is always 0 here and
    nanCnt is the number of cells zeroed.
    BUG FIX: the original declared nanCnt but never incremented it, so callers
    always saw 0 replacements no matter how many NaNs were zeroed.
    """
    nonNumCnt = 0
    nanCnt = 0  # valid NANs are "NA","N/A","-","?"

    for i in range(len(matrix)):
        for j in range(len(matrix[0])):
            if np.isnan(matrix[i][j]):
                matrix[i][j] = 0
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
+
+#Define Function to Re-Label Output Matrix
+#!!!! not needed no output matrix from Validate tool
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    """Deprecated: write a fixed pointer message instead of a labeled matrix.

    Kept because the Validate tool produces no output matrix of its own;
    the matrix/label parameters are accepted but ignored.
    """
    with open(output_file_txt, 'w') as out:
        out.write("Use original input file for further processing\n")
+
#Main Function
def _report_nan_counts(nonNumCnt, nanCnt, og_cols, og_rows):
    """Print the ERROR/WARNING summary for a NaN-replacement run and exit.

    Exits -1 when unknown non-number strings were found, 0 otherwise.
    (Extracted from main(): the same block was duplicated three times with
    inconsistent leading newlines; messages are now uniform.)
    """
    total = 1.0 * len(og_cols) * len(og_rows)
    if nonNumCnt > 0:
        msg = ('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. '
               'Total and percent unknown strings found = ' + str(nonNumCnt)
               + ', %.2f' % (100.0 * nonNumCnt / total) + '%')
        print(msg)
        sys.stderr.write(msg)
        if nanCnt > 0:
            print('\nWARNING Matrix has ' + str(nanCnt) + ' that is %.2f' % (100.0 * nanCnt / total) + '% known NAN identifiers')
        sys.exit(-1)
    else:
        if nanCnt > 0:
            print('\nWARNING Matrix has ' + str(nanCnt) + ' that is %.2f' % (100.0 * nanCnt / total) + '% known NAN identifiers')
        else:
            print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
        sys.exit(0)


def main():
    """Read the input matrix, replace NaNs per args.replacement/args.axes,
    write the relabeled matrix, and report counts (exiting via the helper)."""
    args = get_args()

    matrix, og_cols, og_rows = reader(args.input_file_txt)

    if args.replacement == "Mean":
        if args.axes == "Row":
            matrix, nonNumCnt, nanCnt = nan_replacer_mean_rows(matrix)
        elif args.axes == "Column":
            matrix, nonNumCnt, nanCnt = nan_replacer_mean_columns(matrix)
        else:
            # Invalid axis: report but fall through without exiting,
            # matching the original control flow.
            print('Mean, but given Invalid Axis= ' + str(args.axes))
            sys.stderr.write('Mean, but given Invalid Axis= ' + str(args.axes))
            return
        Labeler(matrix, og_cols, og_rows, args.output_file_txt)
        _report_nan_counts(nonNumCnt, nanCnt, og_cols, og_rows)
    elif args.replacement == "Zero":
        matrix, nonNumCnt, nanCnt = nan_replacer_zero(matrix)
        Labeler(matrix, og_cols, og_rows, args.output_file_txt)
        _report_nan_counts(nonNumCnt, nanCnt, og_cols, og_rows)
    else:
        # BUG FIX: this branch is reached for an unknown replacement method,
        # not an invalid axis -- the old message said "Invalid Axis".
        print('Invalid replacement method= ' + str(args.replacement))
        sys.stderr.write('Invalid replacement method= ' + str(args.replacement))
        sys.exit(-2)
+
+
# Script entry point.  main() terminates via sys.exit() on its normal paths,
# so "done" only prints on paths that fall through without exiting
# (e.g. Mean with an invalid axis).
if __name__ == '__main__':
    main()
    print("done")
diff -r 7f12c81e2083 -r f1bcd79cd923 Matrix_Validations.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Validations.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,40 @@
+
+
+ Locate and identify non-numbers
+ Matrix_Validations.py '$p_input' 'Zero' 'Row' '$output_file'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Split_ExcelTabs_IntoFiles.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Split_ExcelTabs_IntoFiles.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,290 @@
+import sys
+import os
+#import MySQLdb
+#import config
+import subprocess
+import re
+import shutil
+import traceback
+#import xlsxwriter
+import xlrd
+
+#http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/
+
def File_From_Tab(infileName, outfileName, tabName, tabNumber):
    """Extract one worksheet from an Excel workbook and write it as a TSV file.

    infileName  -- path of the .xls/.xlsx workbook to read
    outfileName -- output path prefix; the tab's name plus ".tsv" is appended
    tabName     -- worksheet name to extract; "" means select by tabNumber
    tabNumber   -- 1-based worksheet index (only used when tabName is "")

    The first row fixes the column count (trailing Empty/Blank cells are
    trimmed); every row is then written with exactly that many columns.
    Returns the list of all tab names in the workbook.
    Exits -1 on a bad tab name/number or a first row with < 5 usable columns.
    """
    book = xlrd.open_workbook(infileName)
    tabList = book.sheet_names()

    # Validate the tab number when no tab name was supplied.
    if tabName == "" and (tabNumber < 1 or tabNumber > len(tabList)):
        sys.stderr.write("\n>>>ERROR illegal tab number "+str(tabNumber)+" input when no tab name was specified\n")
        sys.stderr.write("\n>>>Allowed tab numbers, or tab names, for this file with "+str(len(tabList))+" total tabs are:")
        for i in range(len(tabList)):
            sys.stderr.write("\n>>> tab number "+str(i+1)+" is named "+str(tabList[i]))
        sys.exit(-1)

    if tabName != "":  # a tab name overrides any tab number
        found = False
        i = 0
        while (i < len(tabList)) and not found:
            i += 1
            if tabName == str(tabList[i-1]):
                tabNumber = i
                found = True
        if not found:
            # BUG FIX: was 'sys.stderr(...)' -- sys.stderr is a file object,
            # not callable, so the original raised TypeError here instead of
            # printing the intended message.
            sys.stderr.write("\n>>> ERROR -- Input Tab name "+tabName+" was not found\n")
            sys.exit(-1)

    worksheet = book.sheet_by_index(tabNumber-1)

    outFile = open(outfileName+str(tabList[tabNumber-1])+".tsv", 'w')

    num_rows = worksheet.nrows - 1
    curr_row = -1
    while curr_row < num_rows:
        curr_row += 1
        row = worksheet.row(curr_row)

        if curr_row == 0:
            # Determine the matrix width from the first row by trimming
            # trailing Empty/Blank cells, scanning right-to-left.
            # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
            endOfLine = False
            allRowNumCols = len(row)
            i = len(row)-1
            # BUG FIX: guard 'i >= 0' -- with an all-blank first row the old
            # loop decremented i below zero and indexed with negative values.
            while i >= 0 and not endOfLine:
                cell_type = worksheet.cell_type(curr_row, i)
                if cell_type == 0 or cell_type == 6:
                    allRowNumCols -= 1
                    i -= 1
                else:
                    endOfLine = True

            if allRowNumCols < 5:
                sys.stderr.write("\nERROR First row number of columns= "+str(allRowNumCols)+" is too short, so all rows will be ignored\n")
                sys.exit(-1)
            else:  # was 'elif curr_row == 0', always true inside this branch
                sys.stdout.write("\nALL Rows must all have the same number of columns as the First row's number columns = "+ str(allRowNumCols) +"\n")

        # Emit exactly allRowNumCols tab-separated cell values for this row.
        temp = ''
        rowLen = 0
        while rowLen < allRowNumCols:  # dead 'and not endOfLine' test removed
            temp += str(worksheet.cell_value(curr_row, rowLen))+"\t"
            rowLen += 1

        temp = temp[:-1]+"\n"
        outFile.write(temp)  # TODO check if rows are all same length

    sys.stdout.write("File created with "+str(curr_row)+" rows and "+str(allRowNumCols)+" columns\n")
    # BUG FIX: close the output file so buffered rows are flushed to disk.
    outFile.close()

    return tabList
+
+
+#======================
+# from RPPA callInSilicoReportWriter.py
+# def write_xlsx_for_report(directory_for_reports, report_name, report_id, dict_cf2_values):
+#
+#
+# error_write_xlsx = ""
+# error_occurred = 0
+#
+# try:
+# path_to_dir_when_writing = os.path.join(directory_for_reports, report_name)
+# header_path = os.path.join(directory_for_reports, report_name, "header.csv")
+# raw_log_2_path = os.path.join(directory_for_reports, report_name, "RawLog2.csv")
+# norm_linear_path = os.path.join(directory_for_reports, report_name, "NormLinear.csv")
+# norm_log_2_path = os.path.join(directory_for_reports, report_name, "NormLog2.csv")
+# norm_log_2_median_centered_path = os.path.join(directory_for_reports, report_name, "NormLog2_MedianCentered.csv")
+#
+# # put the cf2 values in the NormLinear file
+# error_put_cf2_in_normLinear = write_new_normLinear_csv_file_with_cf2_values(path_to_dir_when_writing, norm_linear_path, dict_cf2_values)
+#
+#
+# excel_workBook = xlsxwriter.Workbook(os.path.join(directory_for_reports, report_name,report_name + ".xlsx"), {'strings_to_numbers': True})
+#
+# rawLog2_worksheet = excel_workBook.add_worksheet("RawLog2")
+# error_rawLog2 = construct_worksheet_for_xlsx(rawLog2_worksheet, header_path, "RawLog2", raw_log_2_path)
+#
+# norm_linear_worksheet = excel_workBook.add_worksheet("NormLinear")
+# error_norm_linear = construct_worksheet_for_xlsx(norm_linear_worksheet, header_path, "NormLinear", norm_linear_path)
+#
+# norm_log_2_worksheet = excel_workBook.add_worksheet("NormLog2")
+# error_norm_log_2 = construct_worksheet_for_xlsx(norm_log_2_worksheet, header_path, "NormLog2", norm_log_2_path)
+#
+# norm_log_2_median_centered_worksheet = excel_workBook.add_worksheet("NormLog2_MedianCentered")
+# error_norm_log_2_median_centered = construct_worksheet_for_xlsx(norm_log_2_median_centered_worksheet, header_path, "Median-Centered", norm_log_2_median_centered_path)
+#
+# errors_array = [error_put_cf2_in_normLinear, error_rawLog2, error_norm_linear, error_norm_log_2, error_norm_log_2_median_centered]
+# for error in errors_array:
+# if error != "":
+# error_write_xlsx = error_write_xlsx + error
+# error_occurred = 1
+# if error_occurred == 1:
+# error_write_xlsx + "\nThe excel workbook for the report "+report_name+" was not written successfully.\n\n"
+#
+# excel_workBook.close()
+# except Exception, e:
+# error_occurred = 1
+# error_write_xlsx += str(repr(e)) + "\n\n"
+# error_write_xlsx + "\nThe excel workbook for the report "+report_name+" was not written successfully.\n\n"
+# try:
+# excel_workBook.close()
+# except Exception, f:
+# sys.stderr.write("An unforeseen problem has occurred in write_xlsx_for_report()\n")
+# sys.stderr.write(str(repr(f)) + "\n\n")
+#
+#
+# return error_occurred, error_write_xlsx
+#
+#
+# def write_new_normLinear_csv_file_with_cf2_values(path_to_dir, norm_linear_path, dict_cf2_values):
+# errors = ""
+# try:
+# titles = {}
+# new_lines_normLinear_with_cf2 = []
+# # read old norm linear file
+# rf_normLinear = open(norm_linear_path, 'rU')
+# line_num = 0
+# for line in rf_normLinear:
+# line = strip_new_line_from_right_side(line)
+# toks = line.split(",")
+# line_num += 1
+# if line_num == 1:
+# line += "1,CF2"
+# new_lines_normLinear_with_cf2.append(line)
+# titles = toks
+# continue
+# pos_rf = int(toks[titles.index('Order')])
+# line += "," + str(dict_cf2_values[pos_rf])
+# new_lines_normLinear_with_cf2.append(line)
+# rf_normLinear.close()
+# # rename the old normLinear file
+# os.rename(norm_linear_path, os.path.join(path_to_dir, 'before_cf2_NormLinear.csv'))
+#
+# # write new normLinear with cf2
+# wf_new_normLinear = open(norm_linear_path, 'w')
+# for line_writing in new_lines_normLinear_with_cf2:
+# wf_new_normLinear.write(line_writing + "\n")
+# wf_new_normLinear.close()
+# except Exception, err_write_normLinear_with_cf2_values:
+# errors = str(repr(err_write_normLinear_with_cf2_values))
+#
+# return errors
+#
+#
+# # This function constructs the worksheet for each tab in the excel file for a report
+# # It puts these things in this order:
+# # 1. Title of the tab
+# # 2. Header for the tab
+# # 3. Content of the tab
+# def construct_worksheet_for_xlsx(worksheet, header_path, title_top_of_tab, tab_input_path):
+#
+# reload(sys)
+# sys.setdefaultencoding('utf8')
+# errors = ""
+#
+# try:
+# # Write the title at the top of the tab
+# worksheet.write(0,0,title_top_of_tab)
+#
+# # Variable to keep track of the rows
+# row_num = 1
+#
+# # Write the header stuff
+# header_file = open(header_path, 'rU')
+# for head_line in header_file:
+# head_line = strip_new_line_from_right_side(head_line)
+# head_toks = head_line.split(",")
+# col_num = 0
+# for tok in head_toks:
+# worksheet.write(row_num, col_num, tok)
+# col_num += 1
+# row_num += 1
+#
+# # Write the content stuff
+# tab_input_file = open(tab_input_path, 'rU')
+# for tab_line in tab_input_file:
+# tab_line = strip_new_line_from_right_side(tab_line)
+# tab_toks = tab_line.split(",")
+# col_num = 0
+# for tok in tab_toks:
+# tok = tok.decode('iso-8859-1').encode('utf-8')
+# worksheet.write(row_num, col_num, tok)
+# col_num += 1
+# row_num += 1
+#
+# header_file.close()
+# tab_input_file.close()
+# except Exception, e:
+# errors = errors + "\n\nAn error occurred while constructing the "+title_top_of_tab+" tab for the excel file.\n"
+# errors = errors + "The error was :\n\t" + str(e) + "\n\n"
+# try:
+# header_file.close()
+# tab_input_file.close()
+# except NameError:
+# x = 5
+#
# NOTE(review): stray 'return errors' removed -- it was left over from the
# commented-out construct_worksheet_for_xlsx() above.  A 'return' at module
# level is a SyntaxError, so the whole script failed to even load, and
# 'errors' is undefined in this scope anyway.
# return errors
+
#----------------------------------------------------------------------
if __name__ == "__main__":

    # argv layout: 1=input workbook, 2=tab name ("" to select by number),
    #              3=tab number (1-based), 4=output file prefix
    if len(sys.argv) > 4:
        # BUG FIX: the arguments were wrapped in literal double-quote
        # characters ('"'+arg+'"'), which made open() look for a file whose
        # name contains quotes and made the tabName == '' test impossible
        # to satisfy (it was '""' at minimum).
        infileName = sys.argv[1]
        tabName = sys.argv[2]
        tabNumber = 0
        if tabName == '':
            tabNumber = int(sys.argv[3])
        outfileName = sys.argv[4]  # TODO Later multiple outputs one per tab

        # BUG FIX: sys.stdout.write() takes exactly one string argument;
        # the original passed three, print-style, raising TypeError.
        sys.stdout.write("\nInput parameters " + str(sys.argv[1:4]) + "\n")

        status = File_From_Tab(infileName, outfileName, tabName, tabNumber)

    sys.exit(0)
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Split_ExcelTabs_IntoFiles.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Split_ExcelTabs_IntoFiles.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,35 @@
+
+
+ into separate tab delimited files
+ Split_ExcelTabs_IntoFiles.py '$input' '' '$extra.tabnumber' '$output_file'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 TestOutFile.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TestOutFile.txt Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,1 @@
+output from input= TestInFile
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Test_input_into_file.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Test_input_into_file.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+#Created on July 23, 2018
+
+# @author: Bob Brown
+
+import sys
+import os
+
def main():
    """Developer scratch harness (not a real Galaxy tool).

    As currently written it only deletes *.csv / *.tsv files from a
    hard-coded desktop directory and exits; everything after the first
    sys.exit(0) is unreachable experimental code kept for reference.
    """

    # Grab the inputs from the Galaxy xml interface and write to a file that is passed to the program
    # Not each of the parameters as separate command line variables.
#    ab_gene_name_for_header={}
#    ab_rrid_for_header={}
    # NOTE(review): hard-coded developer path -- fails on any other machine.
    dir= "/Users/bobbrown/Desktop/junk/"
    accepted_extensions = ["csv", "tsv"]
    filenames = [fn for fn in os.listdir(dir) if fn.split(".")[-1] in accepted_extensions]
    for f in filenames:
        print("filename= "+f)
        os.remove(dir+f)

    sys.exit(0)

    # ----- UNREACHABLE below: the sys.exit(0) above always terminates -----
    # Prototype: rewrite an "AB,value" line as "AB|gene|rrid<TAB>value".
    ab_gene_name_for_header={'abc':'geneName'}
    ab_rrid_for_header={'abc':'rrid123'}
    line= 'abc,123\n'

    pos= line.find(",")
    ABname= line[0:pos]
    ABnewName= ABname+ "|"+ab_gene_name_for_header[ABname]+"|"+ab_rrid_for_header[ABname]
    line= ABnewName+line[pos:]
    line= line.replace(',','\t')
    sys.exit(0)
#    try:
    print(' \n starting Test program read params from file stored in tools dir. Arguments=')
    print(str(sys.argv[1:])+'\n')

    # Disabled experiment: copy each line of an input file to an output
    # file, numbered "param N= ...".
    if False:
        infileName = sys.argv[1]
#        directory = sys.argv[2]
        directory = '/Users/bobbrown/Desktop/'
        outfileName = sys.argv[3]  #use later
#        outfile = sys.argv[6]

        #sys.stdout.write

#        ifile= open(directory+"/"+infileName,'rU')
        ifile= open(directory+infileName,'rU')
        ofile= open(directory+outfileName,'w')
#        ofile= open('/Users/bobbrown/Desktop/TestOutFileVarParams.txt','w')

        cnt= 0
#        for param in range(2,len(sys.argv)):
#            cnt +=1
#            ofile.write("param "+str(cnt)+"= "+param+"\n")


        for param in ifile:
            cnt +=1
            ofile.write("param "+str(cnt)+"= "+param)

        ifile.close()

        ofile.close()

        print('Fini -- rows read = '+str(cnt)+'\n')

#    except :
#        print('Error>>> ')

    return
+##
+##
+
# Script entry point; the commented sys.exit(0) is redundant because main()
# exits via sys.exit(0) on its active path.
if __name__ == '__main__': main()
    #sys.exit(0)
\ No newline at end of file
diff -r 7f12c81e2083 -r f1bcd79cd923 Test_input_into_file.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Test_input_into_file.xml Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,61 @@
+
+
+ Read xml params into a file
+
+Test-input-into-file.py $inputfile "$__tool_dir__" $outputfile
+ #for $op in $test_param1
+ ${op.discrete_fields1.Text_Fields1}
+ #end for
+ "$EndofVarParam1"
+ "$Covariate_Type"
+ "$EndofVarParam2"
+ #for $op in $test_param3
+ ${op.discrete_fields3.Text_Fields3}
+ #end for
+ "$EndofVarParam3"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 7f12c81e2083 -r f1bcd79cd923 bar_chart_plot.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bar_chart_plot.py Tue Nov 27 14:20:40 2018 -0500
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+"""
+histogram_gnuplot.py
+a generic histogram builder based on gnuplot backend
+
+ data_file - tab delimited file with data
+ xtic_column - column containing labels for x ticks [integer, 0 means no ticks]
+ column_list - comma separated list of columns to plot
+ title - title for the entire histogram
+ ylabel - y axis label
+ yrange_min - minimal value at the y axis (integer)
+ yrange_max - maximal value at the y axis (integer)
+ to set yrange to autoscaling assign 0 to yrange_min and yrange_max
+ graph_file - file to write histogram image to
+ img_size - as X,Y pair in pixels (e.g., 800,600 or 600,800 etc.)
+
+
+ This tool requires gnuplot and gnuplot.py
+
+anton nekrutenko | anton@bx.psu.edu
+"""
+
+import string
+import sys
+import tempfile
+
+import Gnuplot
+import Gnuplot.funcutils
+
+assert sys.version_info[:2] >= (2, 4)
+
+
def stop_err(msg):
    """Write *msg* to stderr and abort the script with a failure status."""
    sys.stderr.write(msg)
    # BUG FIX: bare sys.exit() exits with status 0 (success); an error
    # helper must signal failure to the caller (Galaxy checks exit codes).
    sys.exit(1)
+
+
def main(tmpFileName):
    """Filter the input data into *tmpFileName*, then drive gnuplot (via the
    Gnuplot.py backend) to render a clustered-histogram PNG.

    argv layout: data_file, xtic_column, column_list, title, ylabel,
    ymin, ymax, graph_file, img_size (see the module docstring).
    """
    skipped_lines_count = 0   # rows rejected as comments/blank/non-numeric
    skipped_lines_index = []  # 0-based line numbers of the rejected rows
    # gnuplot reads its data from this temp file.
    # NOTE(review): gf is never explicitly closed/flushed before plotting;
    # presumably the OS buffer is flushed in time -- worth confirming.
    gf = open(tmpFileName, 'w')

    # Parse the command line; any failure (missing arg, bad int) aborts.
    try:
        in_file = open(sys.argv[1], 'r')
        xtic = int(sys.argv[2])
        col_list = string.split(sys.argv[3], ",")  # Python-2 idiom for sys.argv[3].split(",")
        title = 'set title "' + sys.argv[4] + '"'
        ylabel = 'set ylabel "' + sys.argv[5] + '"'
        ymin = sys.argv[6]
        ymax = sys.argv[7]
        img_file = sys.argv[8]
        img_size = sys.argv[9]
    except:
        stop_err("Check arguments\n")

    # The first column id must parse as an integer, otherwise none was set.
    try:
        int(col_list[0])
    except:
        stop_err('You forgot to set columns for plotting\n')

    # Copy the selected columns of each valid data line into the temp file.
    # NOTE(review): if the input file is empty, 'i' is never bound and the
    # 'skipped_lines_count < i' test below raises NameError -- confirm
    # whether upstream guarantees a non-empty file.
    for i, line in enumerate(in_file):
        valid = True
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            row = []
            try:
                fields = line.split('\t')
                for col in col_list:
                    row.append(str(float(fields[int(col) - 1])))  # column ids are 1-based
            except:
                valid = False
                skipped_lines_count += 1
                skipped_lines_index.append(i)
        else:
            # Blank line or '#' comment.
            valid = False
            skipped_lines_count += 1
            skipped_lines_index.append(i)

        # Final field: the xtic label column (1-based) or the row number.
        if valid and xtic > 0:
            row.append(fields[xtic - 1])
        elif valid and xtic == 0:
            row.append(str(i))

        if valid:
            gf.write('\t'.join(row))
            gf.write('\n')

    # Plot only if at least one line survived filtering.
    if skipped_lines_count < i:
        # Prepare 'using' clause of plot statement
        g_plot_command = ' '

        # Set the first column
        if xtic > 0:
            g_plot_command = "'%s' using 1:xticlabels(%s) ti 'Column %s', " % (tmpFileName, str(len(row)), col_list[0])
        else:
            g_plot_command = "'%s' using 1 ti 'Column %s', " % (tmpFileName, col_list[0])

        # Set subsequent columns
        for i in range(1, len(col_list)):
            g_plot_command += "'%s' using %s t 'Column %s', " % (tmpFileName, str(i + 1), col_list[i])

        g_plot_command = g_plot_command.rstrip(', ')

        yrange = 'set yrange [' + ymin + ":" + ymax + ']'

        # Configure and run gnuplot; any backend failure aborts with one message.
        try:
            g = Gnuplot.Gnuplot()
            g('reset')
            g('set boxwidth 0.9 absolute')
            g('set style fill solid 1.00 border -1')
            g('set style histogram clustered gap 5 title offset character 0, 0, 0')
            g('set xtics border in scale 1,0.5 nomirror rotate by 90 offset character 0, 0, 0')
            g('set key invert reverse Left outside')
            if xtic == 0:
                g('unset xtics')
            g(title)
            g(ylabel)
            g_term = 'set terminal png tiny size ' + img_size
            g(g_term)
            g_out = 'set output "' + img_file + '"'
            if ymin != ymax:
                g(yrange)  # equal ymin/ymax means autoscale: yrange left unset
            g(g_out)
            g('set style data histograms')
            g.plot(g_plot_command)
        except:
            stop_err("Gnuplot error: Data cannot be plotted")
    else:
        sys.stderr.write('Column(s) %s of your dataset do not contain valid numeric data' % sys.argv[3])

    if skipped_lines_count > 0:
        sys.stdout.write('\nWARNING. You dataset contain(s) %d invalid lines starting with line #%d. These lines were skipped while building the graph.\n' % (skipped_lines_count, skipped_lines_index[0] + 1))
+
+
if __name__ == "__main__":
    # The tempfile initialization is here because while inside the main() it seems to create a condition
    # when the file is removed before gnuplot has a chance of accessing it
    # (NamedTemporaryFile deletes the file when the object is garbage-collected,
    # so the reference must outlive the main() call).
    gp_data_file = tempfile.NamedTemporaryFile('w')
    Gnuplot.gp.GnuplotOpts.default_term = 'png'
    main(gp_data_file.name)