1
|
1 '''
|
|
2 Created on Jun 7, 2017 modified Feb2018
|
|
3
|
|
4 @author: Bob Brown and cjacoby
|
|
5 '''
|
|
6
|
|
7 import sys, traceback, argparse
|
|
8 import numpy as np
|
|
9 import os
|
|
10 from Matrix_Validate_import import reader, Labeler
|
|
11
|
|
12 #Define The Four Arguments Used in the Program
|
|
def get_args():
    """Parse the four positional command-line arguments of the tool.

    Returns:
        argparse.Namespace with the attributes ``input_file_txt``,
        ``replacement``, ``axes`` and ``output_file_txt`` (all strings),
        in that positional order.
    """
    # (name, add_argument keyword options), listed in positional order.
    positional_specs = (
        ('input_file_txt',
         {'help': 'tab delimited text file input matrix(include .txt in name)'}),
        ('replacement',
         {'type': str,
          'help': 'Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"'}),
        ('axes',
         {'type': str,
          'help': 'Choose Axes to Normalize On (Either "Row" or "Column"'}),
        ('output_file_txt',
         {'help': 'tab delimited text file output name (include .txt in name)'}),
    )

    cli = argparse.ArgumentParser()
    for arg_name, options in positional_specs:
        cli.add_argument(arg_name, **options)
    return cli.parse_args()
|
|
21
|
|
22
|
|
23 #Define Function to Replace Null Values with Row Mean
|
|
def nan_replacer_mean_rows(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its row.

    The matrix is modified in place and also returned.

    Args:
        matrix: indexable collection of mutable numeric rows
            (presumably rectangular -- the width of the first row is used
            for every row; confirm against the reader that produces it).

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is always 0
        (this function does not detect non-numeric cells) and ``nanCnt`` is
        the number of cells that were overwritten.
    """
    non_numeric_total = 0
    replaced_total = 0

    for row in matrix:
        # Mean is taken once per row, before any cell of that row is changed.
        row_mean = np.nanmean(row)
        for col_idx in range(len(matrix[0])):
            if np.isnan(row[col_idx]):
                row[col_idx] = row_mean
                replaced_total += 1

    return matrix, non_numeric_total, replaced_total
|
|
38
|
|
39 #Define Function to Replace Null Values with Column Mean
|
|
def nan_replacer_mean_columns(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its column.

    The matrix is modified in place and also returned.

    Args:
        matrix: indexable collection of mutable numeric rows
            (presumably rectangular -- the width of the first row defines
            the number of columns; confirm against the reader).

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is always 0
        (this function does not detect non-numeric cells) and ``nanCnt`` is
        the number of cells that were overwritten.
    """
    non_num_cnt = 0
    nan_cnt = 0

    # An empty matrix has no first row to measure; return it untouched
    # instead of failing on matrix[0]. len() is used rather than truthiness
    # so that numpy arrays are accepted as well.
    if len(matrix) == 0:
        return matrix, non_num_cnt, nan_cnt

    for col_idx in range(len(matrix[0])):
        # Column mean is computed before any cell of the column is replaced.
        col_mean = np.nanmean([row[col_idx] for row in matrix])
        for row in matrix:
            if np.isnan(row[col_idx]):
                row[col_idx] = col_mean
                nan_cnt += 1

    return matrix, non_num_cnt, nan_cnt
|
|
56
|
|
57 #Define Function to Replace Null Values with Zero (axis orientation is irrelevant)
|
|
def nan_replacer_zero(matrix):
    """Replace every NaN cell with 0 (axis orientation is irrelevant).

    The matrix is modified in place and also returned.

    Args:
        matrix: indexable collection of mutable numeric rows
            (presumably rectangular -- the width of the first row is used
            for every row; confirm against the reader).

    Returns:
        Tuple ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is always 0
        (this function does not detect non-numeric cells) and ``nanCnt`` is
        the number of cells that were set to 0.
    """
    non_num_cnt = 0
    nan_cnt = 0

    for row in matrix:
        for col_idx in range(len(matrix[0])):
            if np.isnan(row[col_idx]):
                row[col_idx] = 0
                # Count the replacement so the caller's NaN report is
                # accurate, as the mean-based replacers already do.
                nan_cnt += 1

    return matrix, non_num_cnt, nan_cnt
|
|
71
|
|
72 #Define Function to Re-Label Output Matrix
|
|
73 #!!!! not needed no output matrix from Validate tool
|
|
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write a one-line placeholder notice to ``output_file_txt``.

    Legacy writer: ``matrix``, ``og_cols`` and ``og_rows`` are accepted but
    not used; the code that once wrote the labelled matrix is disabled, so
    only the fixed notice line is written.

    Args:
        matrix: unused.
        og_cols: unused.
        og_rows: unused.
        output_file_txt: path of the text file to create or overwrite.
    """
    notice = "Use original input file for further processing\n"
    # The context manager closes the file; no explicit close() is needed.
    with open(output_file_txt, 'w') as out_handle:
        out_handle.write(notice)
|
|
88
|
|
89 #Main Function
|
|
def _report_and_exit(non_num_cnt, nan_cnt, cell_cnt, error_lead='\n'):
    """Print the validation summary for a processed matrix and exit.

    Exits with status -1 when non-numeric cells were counted, otherwise 0.

    Args:
        non_num_cnt: number of non-numeric, non-NaN cells reported.
        nan_cnt: number of NaN cells that were replaced.
        cell_cnt: total number of data cells, used for the percentages.
        error_lead: text placed in front of the messages printed on the
            error path ('' or a newline).
    """
    def nan_warning(lead):
        # Built lazily so the division only happens when nan_cnt > 0.
        return (lead + 'WARNING Matrix has ' + str(nan_cnt)
                + ' that is %.2f' % (100.0 * nan_cnt / (1.0 * cell_cnt))
                + '% known NAN identifiers')

    if non_num_cnt > 0:
        error_msg = (error_lead
                     + 'ERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '
                     + str(non_num_cnt)
                     + ', %.2f' % (100.0 * non_num_cnt / (1.0 * cell_cnt))
                     + '%')
        # Reported on both streams, as before.
        print(error_msg)
        sys.stderr.write(error_msg)
        if nan_cnt > 0:
            print(nan_warning(error_lead))
        sys.exit(-1)

    if nan_cnt > 0:
        print(nan_warning('\n'))
    else:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
    sys.exit(0)


def main():
    """Read the input matrix, replace NaN cells as requested and report.

    Flow: parse arguments, load the matrix with ``reader``, pick the NaN
    replacer from ``replacement`` ("Mean" or "Zero") and, for "Mean",
    ``axes`` ("Row" or "Column"), pass the result to ``Labeler`` together
    with the output path, then print a summary and exit.

    Exit status: 0 when no non-numeric cells were counted, -1 when some
    were, -2 for an unrecognised ``replacement`` or ``axes`` value.
    """
    args = get_args()

    matrix, og_cols, og_rows = reader(args.input_file_txt)

    # NOTE(review): the original carried a TODO saying the step below was
    # "turned off for now", yet the code was active -- confirm intent.
    if args.replacement == "Mean":
        if args.axes == "Row":
            # The row-mean branch historically printed its error-path
            # messages without a leading newline; kept for compatibility.
            replacer, error_lead = nan_replacer_mean_rows, ''
        elif args.axes == "Column":
            replacer, error_lead = nan_replacer_mean_columns, '\n'
        else:
            bad_axis_msg = 'Mean, but given Invalid Axis= ' + str(args.axes)
            print(bad_axis_msg)
            sys.stderr.write(bad_axis_msg)
            # Fail explicitly instead of falling out of the branch.
            sys.exit(-2)
    elif args.replacement == "Zero":
        replacer, error_lead = nan_replacer_zero, '\n'
    else:
        # This branch is reached for an unknown replacement, so name the
        # replacement (the old text blamed the axis under "zero").
        bad_replacement_msg = 'Invalid Replacement= ' + str(args.replacement)
        print(bad_replacement_msg)
        sys.stderr.write(bad_replacement_msg)
        sys.exit(-2)

    matrix, nonNumCnt, nanCnt = replacer(matrix)
    Labeler(matrix, og_cols, og_rows, args.output_file_txt)
    _report_and_exit(nonNumCnt, nanCnt, len(og_cols) * len(og_rows), error_lead)
|
|
176
|
|
177
|
|
# Script entry point: run the tool only when executed directly.
if __name__ == "__main__":
    main()
    print("done")
|