Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
comparison Matrix_Validations.py @ 1:f1bcd79cd923 draft default tip
Uploaded
author | insilico-bob |
---|---|
date | Tue, 27 Nov 2018 14:20:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:7f12c81e2083 | 1:f1bcd79cd923 |
---|---|
1 ''' | |
2 Created on Jun 7, 2017 modified Feb2018 | |
3 | |
4 @author: Bob Brown and cjacoby | |
5 ''' | |
6 | |
7 import sys, traceback, argparse | |
8 import numpy as np | |
9 import os | |
10 from Matrix_Validate_import import reader, Labeler | |
11 | |
12 #Define The Four Arguments Used in the Program | |
def get_args():
    """Parse the four positional command-line arguments of the tool.

    Returns:
        argparse.Namespace with the string attributes ``input_file_txt``,
        ``replacement``, ``axes`` and ``output_file_txt``, in that
        positional order.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'input_file_txt',
        help='tab delimited text file input matrix(include .txt in name)',
    )
    cli.add_argument(
        'replacement',
        type=str,
        help='Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"',
    )
    cli.add_argument(
        'axes',
        type=str,
        help='Choose Axes to Normalize On (Either "Row" or "Column"',
    )
    cli.add_argument(
        'output_file_txt',
        help='tab delimited text file output name (include .txt in name)',
    )
    # The values are not validated here; main() checks them.
    return cli.parse_args()
21 | |
22 | |
23 #Define Function to Replace Null Values with Row Mean | |
def nan_replacer_mean_rows(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its row, in place.

    Args:
        matrix: 2-D numeric data indexable as ``matrix[i][j]`` whose rows
            support item assignment (e.g. nested lists or a numpy array).

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)``: the same (mutated) matrix
        object, the count of non-numeric cells (always 0 here) and the
        number of NaN cells that were replaced.
    """
    # Never changed in this function; kept so every replacer returns the
    # same 3-tuple shape that main() unpacks.
    nonNumCnt = 0
    # NOTE(review): the original comment listed "NA", "N/A", "-" and "?" as
    # the valid NaN identifiers -- presumably mapped to NaN by the reader.
    nanCnt = 0

    for row in matrix:
        # Taken once, before any cell is overwritten, so the replacements
        # themselves cannot skew the mean.
        row_mean = np.nanmean(row)
        for j, value in enumerate(row):
            if np.isnan(value):
                row[j] = row_mean
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
38 | |
39 #Define Function to Replace Null Values with Column Mean | |
def nan_replacer_mean_columns(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its column, in place.

    Args:
        matrix: 2-D numeric data indexable as ``matrix[i][j]`` whose rows
            support item assignment (e.g. nested lists or a numpy array).
            The column count is taken from the first row.

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)``: the same (mutated) matrix
        object, the count of non-numeric cells (always 0 here) and the
        number of NaN cells that were replaced.
    """
    # Never changed in this function; kept so every replacer returns the
    # same 3-tuple shape that main() unpacks.
    nonNumCnt = 0
    nanCnt = 0

    # An empty matrix has no first row to measure; return it untouched
    # instead of failing on matrix[0].
    if len(matrix) == 0:
        return matrix, nonNumCnt, nanCnt

    for j in range(len(matrix[0])):
        # Snapshot of the column, taken before any replacement is written.
        column = [row[j] for row in matrix]
        column_mean = np.nanmean(column)
        for row, value in zip(matrix, column):
            if np.isnan(value):
                row[j] = column_mean
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
56 | |
57 #Define Function to Replace Null Values with Zero (axis orientation is irrelevant) | |
def nan_replacer_zero(matrix):
    """Replace every NaN cell with 0, in place.

    Args:
        matrix: 2-D numeric data indexable as ``matrix[i][j]`` whose rows
            support item assignment (e.g. nested lists or a numpy array).

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)``: the same (mutated) matrix
        object, the count of non-numeric cells (always 0 here) and the
        number of NaN cells that were replaced.
    """
    # Never changed in this function; kept so every replacer returns the
    # same 3-tuple shape that main() unpacks.
    nonNumCnt = 0
    nanCnt = 0

    for row in matrix:
        for j, value in enumerate(row):
            if np.isnan(value):
                row[j] = 0
                # Count each replacement so the caller can report how many
                # NaN cells were found.
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
71 | |
72 #Define Function to Re-Label Output Matrix | |
73 # NOTE: no longer needed -- the Validate tool produces no output matrix | |
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write a one-line placeholder notice to ``output_file_txt``.

    Args:
        matrix: unused; kept for signature compatibility.
        og_cols: unused; kept for signature compatibility.
        og_rows: unused; kept for signature compatibility.
        output_file_txt: path of the text file to create or overwrite.

    Returns:
        None.
    """
    # This function does not write the matrix itself, only the notice; an
    # earlier labelled-matrix writer here was already disabled.
    # The with-block closes the file, so no explicit close() is needed.
    with open(output_file_txt, 'w') as f:
        f.write("Use original input file for further processing\n")
88 | |
89 #Main Function | |
def _percent_of_cells(count, og_cols, og_rows):
    """Return ``count`` as a percentage of ``len(og_cols) * len(og_rows)`` cells."""
    total = len(og_cols) * len(og_rows)
    # Guard against a zero-sized label set rather than dividing by zero.
    return 100.0 * count / total if total else 0.0


def _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows):
    """Print the validation summary, then end the process.

    Exits with -1 when non-numeric cells were counted, otherwise with 0.
    """
    nan_warning = ('\nWARNING Matrix has ' + str(nanCnt) + ' that is %.2f'
                   % _percent_of_cells(nanCnt, og_cols, og_rows)
                   + '% known NAN identifiers')

    if nonNumCnt > 0:
        error = ('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '
                 + str(nonNumCnt) + ', %.2f'
                 % _percent_of_cells(nonNumCnt, og_cols, og_rows) + '%')
        # The error goes to both streams.
        print(error)
        sys.stderr.write(error)
        if nanCnt > 0:
            print(nan_warning)
        sys.exit(-1)

    if nanCnt > 0:
        print(nan_warning)
    else:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
    sys.exit(0)


def _fail_invalid_argument(message):
    """Report a bad command-line value on stdout and stderr, then exit with -2."""
    print(message)
    sys.stderr.write(message)
    sys.exit(-2)


#Main Function
def main():
    """Read the input matrix, replace its NaN cells, write it out and report.

    The replacement strategy is chosen from the ``replacement`` ("Mean" or
    "Zero") and, for "Mean", the ``axes`` ("Row" or "Column") arguments.

    This function does not return on any path: it exits with 0 on success,
    -1 when non-numeric cells were counted, and -2 when ``replacement`` or
    ``axes`` is not a recognised value.
    """
    args = get_args()

    matrix, og_cols, og_rows = reader(args.input_file_txt)

    # NOTE(review): an earlier TODO here marked the replacement logic below
    # as "TURNED OFF FOR NOW" if MDA decides to use it, yet it is live code
    # -- confirm the intended state.
    if args.replacement == "Mean":
        if args.axes == "Row":
            replacer = nan_replacer_mean_rows
        elif args.axes == "Column":
            replacer = nan_replacer_mean_columns
        else:
            _fail_invalid_argument('Mean, but given Invalid Axis= ' + str(args.axes))
    elif args.replacement == "Zero":
        # The axis is irrelevant for zero replacement.
        replacer = nan_replacer_zero
    else:
        _fail_invalid_argument('Invalid Replacement= ' + str(args.replacement)
                               + ' (expected "Mean" or "Zero")')

    matrix, nonNumCnt, nanCnt = replacer(matrix)
    Labeler(matrix, og_cols, og_rows, args.output_file_txt)
    _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows)
176 | |
177 | |
if __name__ == "__main__":
    # Run the tool only when executed as a script, not when imported.
    main()
    # Only reached if main() returns instead of ending via sys.exit().
    print("done")