Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
comparison Matrix_Validate_import.py @ 1:f1bcd79cd923 draft default tip
Uploaded
| author | insilico-bob |
|---|---|
| date | Tue, 27 Nov 2018 14:20:40 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:7f12c81e2083 | 1:f1bcd79cd923 |
|---|---|
| 1 ''' | |
| 2 Created on Jun 7, 2017 modified Feb2018 | |
| 3 | |
| 4 @author: cjacoby and Bob Brown | |
| 5 ''' | |
| 6 | |
| 7 import sys, traceback, argparse | |
| 8 import numpy as np | |
| 9 import os | |
| 10 #import matplotlib.pyplot as plt | |
| 11 #import matplotlib.pyplot as plt; plt.rcdefaults() | |
| 12 | |
| 13 # Define the Reading Function Which Pulls the Data from a .txt file | |
def _plot_nan_counts(bin_labels, bin_data):
    """Show a horizontal bar chart with one bar per missing-value category.

    bin_labels -- display names of the categories, in plotting order.
    bin_data   -- bar lengths, in the same order as bin_labels.
    """
    # Imported lazily so that validating a matrix without plotting does not
    # require matplotlib to be installed.
    import matplotlib.pyplot as plt

    y_pos = np.arange(len(bin_labels))
    plt.yticks(y_pos, bin_labels)
    plt.title("Distribution of NAN types (UPPER & lower & MiXeD case combined)")
    plt.ylabel('NAN Types')
    plt.xlabel('Occurrences')
    plt.barh(y_pos, bin_data, align='center', alpha=0.5)

    # NOTE(review): the call order below is kept from the original code; the
    # subplots() call comes after the bars are drawn -- confirm the rendered
    # figure is what is wanted.
    fig, ax = plt.subplots(num=1, figsize=(8,3))
    ax.set_title("Data Cell Counts of Not A Number (NAN) Types")

    plt.show()


def reader(input_file_txt, create_plot=False):
    """Read a tab-delimited labelled matrix and validate its data cells.

    The first line supplies the column labels (its first cell is ignored).
    Every following line supplies a row label in its first cell followed by
    the data cells.  Each data cell is converted with float(); a cell that
    cannot be converted is stored as NaN.

    A non-numeric cell whose upper-cased text is one of
    "", " ", "NAN", "NA", "N/A", "-", "?" counts as a valid missing value,
    as does a cell that float() itself parses to NaN (e.g. "NaN").  Any other
    non-numeric cell is invalid; the first 50 of them are listed on stderr.

    input_file_txt -- path of the matrix file to read.
    create_plot    -- when true, show a bar chart of the missing-value counts
                      (needs matplotlib).

    Returns the tuple (matrix, column_labels, row_labels) where matrix is a
    list of rows, each a list of floats.

    Exits the process with status -1 when a row's cell count differs from the
    header line's, or when at least one invalid cell was found.
    """
    nan_tokens = ("", " ", "NAN", "NA", "N/A", "-", "?")
    max_reported = 50  # invalid cells listed individually on stderr

    # Occurrences per missing-value spelling; "Text" holds the invalid count.
    bin_counts = {"":0, " ":0, "Text":0, "NA":0, "-":0, "NAN":0, "N/A":0, "?":0}

    column_labels = []
    row_labels = []
    matrix = []
    expected_len = None
    nan_cnt = 0
    non_num_cnt = 0

    # "r" replaces the old "rU" mode, which Python 3.11 rejects; text mode
    # already translates all newline styles.  The with-block also closes the
    # file when sys.exit() is raised part-way through.
    with open(input_file_txt, "r") as f:
        for row, raw_line in enumerate(f, 1):
            cells = raw_line.strip("\n").split('\t')

            if row == 1:
                expected_len = len(cells)
                column_labels = cells[1:]
                continue

            if len(cells) != expected_len:
                sys.stderr.write("ERROR matrix row lengths unequal for header row and row "+str(row)+"\n")
                sys.exit(-1)

            row_labels.append(str(cells[0]))
            values = []

            for column, item in enumerate(cells[1:], 1):
                try:
                    value = float(item)
                except ValueError:
                    value = np.nan
                    token = item.upper()

                    if token in nan_tokens:
                        nan_cnt += 1
                        bin_counts[token] += 1
                    else:
                        if non_num_cnt == 0:
                            sys.stderr.write("Start List of up to first 50 Invalid cell values \n")
                        non_num_cnt += 1
                        if non_num_cnt <= max_reported:
                            sys.stderr.write("At row_column= "+str(row)+"_"+str(column)+' invalid data cell value '+ item+"\n")
                else:
                    # float() accepts spellings such as "NaN" directly, so
                    # they never reach the except branch; count them here.
                    if np.isnan(value):
                        nan_cnt += 1
                        bin_counts["NAN"] += 1

                values.append(value)

            matrix.append(values)

    bin_counts["Text"] = non_num_cnt

    if create_plot:
        bin_labels = ["null", "blank", 'hyphen', '?', 'NA', 'N/A', 'NAN', 'text']
        # Keys of bin_counts in the same order as bin_labels.
        bin_keys = ["", " ", '-', '?', 'NA', 'N/A', 'NAN', 'Text']
        # Empty categories get a sliver of length so their bar stays visible.
        bin_data = [bin_counts[key] if bin_counts[key] >= 1 else bin_counts[key] + 0.01
                    for key in bin_keys]
        _plot_nan_counts(bin_labels, bin_data)

    # Counting cells directly also works for a file with no data rows.
    total_cells = sum(len(values) for values in matrix)
    if nan_cnt > 0:
        print("WARNING -- Found "+str(nan_cnt)+" Valid Non-numbers. Their percent of total matrix data cell values = "+str((100*nan_cnt)/total_cells)+"% ")
    if non_num_cnt > 0:
        sys.exit(-1)

    return matrix, column_labels, row_labels
| 136 | |
| 137 #---------------------------------------------------------------------- | |
| 138 # Verify Matrix A column_labels match Matrix B row_labels in name and order for A*B | |
def MatchLabels(column_labels,row_labels):
    """Check that matrix A's column labels equal matrix B's row labels.

    The two label lists must have the same length and hold equal labels at
    every position (same names, same order).

    column_labels -- column labels of the first matrix.
    row_labels    -- row labels of the second matrix.

    Returns None when the labels match.  Otherwise an error is written to
    stderr (at most 20 positional mismatches are listed) and the process
    exits with status -11.
    """
    if len(column_labels) != len(row_labels):
        # The original called the non-existent sys.err(); report on stderr.
        sys.stderr.write("ERROR 1st matrix column count "+str(len(column_labels))+" not equal 2nd Matrix number row count "+str(len(row_labels))+"\n" )
        sys.exit(-11)

    mismatches = 0
    for k, (col_label, row_label) in enumerate(zip(column_labels, row_labels)):
        if col_label != row_label:
            mismatches += 1
            sys.stderr.write("ERROR At column & row position "+str(k)+" Matrix 1 column value "+str(col_label)+" not equal 2nd Matrix row value "+str(row_label)+"\n" )
            if mismatches >= 20:
                break  # enough detail reported; the result is already a failure

    if mismatches > 0:
        sys.exit(-11)
| 152 #---------------------------------------------------------------------- | |
| 153 # restores row and column labels in output | |
def Labeler(matrix,column_labels,row_labels,output_file_txt):
    """Write a matrix with its row and column labels as tab-delimited text.

    The first output line is an empty corner cell followed by the column
    labels.  Each following line is a row label followed by that row's
    values, each converted with format().

    matrix          -- list of rows, each a sequence of values.
    column_labels   -- labels written on the header line.
    row_labels      -- one label per matrix row; extra labels are ignored.
    output_file_txt -- path of the file to create or overwrite.

    Raises IndexError when there are fewer row labels than matrix rows.
    """
    # Checked before opening the file so a bad call does not clobber an
    # existing output file.
    if len(row_labels) < len(matrix):
        raise IndexError("fewer row labels ("+str(len(row_labels))+") than matrix rows ("+str(len(matrix))+")")

    lines = ["".join('\t' + str(label) for label in column_labels) + '\n']
    for label, values in zip(row_labels, matrix):
        # Every cell of the row is written, rather than assuming each row is
        # as wide as row 0.
        lines.append(str(label) + "".join('\t' + format(value) for value in values) + '\n')

    with open(output_file_txt,'w') as f:
        f.writelines(lines)
| 171 | |
| 172 | |
| 173 #---------------------------------------------------------------------- | |
if __name__ == '__main__':
    # Script entry point: validate the matrix file named on the command line.
    # reader() itself exits with a non-zero status when the file is invalid.
    if len(sys.argv) < 2:
        # Passing a string makes sys.exit print it on stderr and exit with 1.
        sys.exit("Usage: " + str(sys.argv[0]) + " <input_matrix_file.txt>")

    input_file_txt = str(sys.argv[1])

    matrix,column_labels,row_labels = reader(input_file_txt)
    print("Done")
| 179 |
