Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
comparison Matrix_Validations.py @ 1:f1bcd79cd923 draft default tip
Uploaded
| author | insilico-bob |
|---|---|
| date | Tue, 27 Nov 2018 14:20:40 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:7f12c81e2083 | 1:f1bcd79cd923 |
|---|---|
| 1 ''' | |
| 2 Created on Jun 7, 2017 modified Feb2018 | |
| 3 | |
| 4 @author: Bob Brown and cjacoby | |
| 5 ''' | |
| 6 | |
| 7 import sys, traceback, argparse | |
| 8 import numpy as np | |
| 9 import os | |
| 10 from Matrix_Validate_import import reader, Labeler | |
| 11 | |
| 12 #Define The Four Arguments Used in the Program | |
def get_args(argv=None):
    """Parse the four positional command-line arguments of the tool.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. The default ``None`` keeps the normal
            command-line behaviour.

    Returns:
        argparse.Namespace with the attributes ``input_file_txt``,
        ``replacement``, ``axes`` and ``output_file_txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_file_txt',
        help='tab delimited text file input matrix(include .txt in name)')
    parser.add_argument(
        'replacement', type=str,
        help='Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"')
    parser.add_argument(
        'axes', type=str,
        help='Choose Axes to Normalize On (Either "Row" or "Column")')
    parser.add_argument(
        'output_file_txt',
        help='tab delimited text file output name (include .txt in name)')
    # No ``choices=`` restriction on purpose: any string parses here, so the
    # accepted values are not enforced by the parser itself.
    return parser.parse_args(argv)
| 21 | |
| 22 | |
| 23 #Define Function to Replace Null Values with Row Mean | |
def nan_replacer_mean_rows(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its row, in place.

    Args:
        matrix: Indexable sequence of rows (list of lists or 2-D numpy
            array) holding numeric values; it is modified in place.

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)``: the same matrix object, the
        count of non-numeric cells (always 0 here, kept for the callers'
        three-value unpacking) and the number of NaN cells found.
    """
    nonNumCnt = 0
    nanCnt = 0

    for row in matrix:
        nan_positions = [j for j, value in enumerate(row) if np.isnan(value)]
        if not nan_positions:
            # Nothing to fill in, so the row mean is not needed.
            continue

        if len(nan_positions) == len(row):
            # A row with no valid value has no mean. np.nanmean would return
            # NaN and emit a RuntimeWarning; use NaN directly instead.
            row_mean = np.nan
        else:
            row_mean = np.nanmean(row)

        for j in nan_positions:
            row[j] = row_mean
        nanCnt += len(nan_positions)

    return matrix, nonNumCnt, nanCnt
| 38 | |
| 39 #Define Function to Replace Null Values with Column Mean | |
def nan_replacer_mean_columns(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its column, in place.

    Args:
        matrix: Indexable sequence of equally long rows (list of lists or
            2-D numpy array) holding numeric values; modified in place.

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)``: the same matrix object, the
        count of non-numeric cells (always 0 here, kept for the callers'
        three-value unpacking) and the number of NaN cells found.
    """
    nonNumCnt = 0
    nanCnt = 0

    # An empty matrix has no first row to measure; treat it as zero columns.
    column_count = len(matrix[0]) if len(matrix) else 0

    for c in range(column_count):
        column = [row[c] for row in matrix]
        nan_rows = [r for r, value in enumerate(column) if np.isnan(value)]
        if not nan_rows:
            # Nothing to fill in, so the column mean is not needed.
            continue

        if len(nan_rows) == len(column):
            # A column with no valid value has no mean. np.nanmean would
            # return NaN and emit a RuntimeWarning; use NaN directly instead.
            column_mean = np.nan
        else:
            column_mean = np.nanmean(column)

        for r in nan_rows:
            matrix[r][c] = column_mean
        nanCnt += len(nan_rows)

    return matrix, nonNumCnt, nanCnt
| 56 | |
| 57 #Define Function to Replace Null Values with Zero (axis orientation is irrelevant) | |
def nan_replacer_zero(matrix):
    """Replace every NaN cell with 0, in place, and count the replacements.

    Args:
        matrix: Indexable sequence of rows (list of lists or 2-D numpy
            array) holding numeric values; it is modified in place.

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)``: the same matrix object, the
        count of non-numeric cells (always 0 here, kept for the callers'
        three-value unpacking) and the number of NaN cells replaced.
    """
    nonNumCnt = 0
    nanCnt = 0

    for row in matrix:
        for j, value in enumerate(row):
            if np.isnan(value):
                row[j] = 0
                # Count each replacement so callers can report how many NaNs
                # were present, as the mean-based replacers do.
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
| 71 | |
| 72 #Define Function to Re-Label Output Matrix | |
| 73 #!!!! not needed no output matrix from Validate tool | |
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write a one-line placeholder notice to ``output_file_txt``.

    The matrix and its labels are accepted but not written; only the notice
    telling the user to keep using the original input file is emitted.

    Args:
        matrix: Unused.
        og_cols: Unused.
        og_rows: Unused.
        output_file_txt: Path of the text file to create or overwrite.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(output_file_txt, 'w') as f:
        f.write("Use original input file for further processing\n")
| 79 # f.write("") | |
| 80 # for k in range(0,len(og_cols)): | |
| 81 # f.write('\t' + str(og_cols[k])) | |
| 82 # f.write('\n') | |
| 83 # for i in range(0,len(og_rows)): | |
| 84 # f.write(og_rows[i]) | |
| 85 # for j in range(0,len(matrix[0])): | |
| 86 # f.write('\t' + format(matrix[i][j])) | |
| 87 # f.write('\n') | |
| 88 | |
| 89 #Main Function | |
def _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows):
    """Print the validation summary and terminate the process.

    Percentages are relative to ``len(og_cols) * len(og_rows)``.

    Exit status is -1 when non-numeric cells were counted, otherwise 0.
    """
    cell_total = 1.0 * len(og_cols) * len(og_rows)

    nan_warning = None
    if nanCnt > 0:
        nan_warning = ('\nWARNING Matrix has ' + str(nanCnt)
                       + ' that is %.2f' % (100.0 * nanCnt / cell_total)
                       + '% known NAN identifiers')

    if nonNumCnt > 0:
        error = ('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. '
                 'Total and percent unknown strings found = ' + str(nonNumCnt)
                 + ', %.2f' % (100.0 * nonNumCnt / cell_total) + '%')
        # The error goes to both streams; the NaN warning only to stdout.
        print(error)
        sys.stderr.write(error)
        if nan_warning is not None:
            print(nan_warning)
        sys.exit(-1)

    if nan_warning is not None:
        print(nan_warning)
    else:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
    sys.exit(0)


def _fail_invalid_option(message):
    """Report an unusable command-line option on stdout and stderr, then exit -2."""
    print(message)
    sys.stderr.write(message)
    sys.exit(-2)


def main():
    """Replace missing values in the input matrix, write it out and report.

    Reads the matrix with ``reader``, fills NaN cells according to the
    ``replacement`` ("Mean" or "Zero") and ``axes`` ("Row" or "Column",
    only consulted for "Mean") arguments, passes the result to ``Labeler``
    and prints a summary.

    This function always ends the process through ``sys.exit``:
        0  -- no non-numeric cells were counted,
        -1 -- non-numeric cells were counted,
        -2 -- ``replacement`` or ``axes`` has an unsupported value.
    """
    args = get_args()

    # presumably returns the numeric matrix plus its column and row labels;
    # reader is imported from Matrix_Validate_import -- verify there.
    matrix, og_cols, og_rows = reader(args.input_file_txt)

    # NOTE(review): an earlier TODO marked the replacement logic below as
    # "turned off for now"; it is active in this revision -- confirm intent.
    if args.replacement == "Mean":
        if args.axes == "Row":
            matrix, nonNumCnt, nanCnt = nan_replacer_mean_rows(matrix)
        elif args.axes == "Column":
            matrix, nonNumCnt, nanCnt = nan_replacer_mean_columns(matrix)
        else:
            _fail_invalid_option('Mean, but given Invalid Axis= ' + str(args.axes))
    elif args.replacement == "Zero":
        # Axis orientation is irrelevant when filling with zero.
        matrix, nonNumCnt, nanCnt = nan_replacer_zero(matrix)
    else:
        # The offending value here is the replacement choice, not the axis.
        _fail_invalid_option('Invalid Replacement= ' + str(args.replacement))

    # Labeler is called before the summary so it runs whichever status is
    # returned; presumably it writes the labelled matrix to the output file --
    # verify in Matrix_Validate_import.
    Labeler(matrix, og_cols, og_rows, args.output_file_txt)
    _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows)
| 176 | |
| 177 | |
if __name__ == "__main__":
    # Script entry point: run the tool only when executed directly, not on import.
    main()
    # NOTE(review): assumed to belong inside the guard; the original
    # indentation is not visible -- confirm.
    print("done")
