comparison Matrix_Validations.py @ 1:f1bcd79cd923 draft default tip

Uploaded
author insilico-bob
date Tue, 27 Nov 2018 14:20:40 -0500
parents
children
comparison
equal deleted inserted replaced
0:7f12c81e2083 1:f1bcd79cd923
1 '''
2 Created on Jun 7, 2017 modified Feb2018
3
4 @author: Bob Brown and cjacoby
5 '''
6
7 import sys, traceback, argparse
8 import numpy as np
9 import os
10 from Matrix_Validate_import import reader, Labeler
11
12 #Define The Four Arguments Used in the Program
def get_args():
    """Parse the four positional command-line arguments of the tool.

    Returns:
        argparse.Namespace: carries ``input_file_txt``, ``replacement``,
        ``axes`` and ``output_file_txt``. The values are not validated
        here; the parser accepts any string for each of them.
    """
    # (argument name, keyword options) listed in command-line order.
    positional_specs = (
        ('input_file_txt',
         {'help': 'tab delimited text file input matrix(include .txt in name)'}),
        ('replacement',
         {'type': str,
          'help': 'Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"'}),
        ('axes',
         {'type': str,
          'help': 'Choose Axes to Normalize On (Either "Row" or "Column"'}),
        ('output_file_txt',
         {'help': 'tab delimited text file output name (include .txt in name)'}),
    )

    cli = argparse.ArgumentParser()
    for arg_name, options in positional_specs:
        cli.add_argument(arg_name, **options)
    return cli.parse_args()
21
22
23 #Define Function to Replace Null Values with Row Mean
def nan_replacer_mean_rows(matrix):
    """Replace each NaN cell with the NaN-ignoring mean of its row, in place.

    Args:
        matrix: 2-D indexable collection of numeric values; cells are
            overwritten through ``matrix[i][j]``-style assignment.

    Returns:
        tuple: ``(matrix, nonNumCnt, nanCnt)`` where ``matrix`` is the same
        object that was passed in, ``nonNumCnt`` is always 0 (nothing in this
        function increments it) and ``nanCnt`` is the number of cells that
        were replaced.
    """
    non_numeric_total = 0  # reported to the caller but never incremented here
    replaced_total = 0

    for row in matrix:
        # The mean is computed once per row, before any cell is overwritten.
        row_mean = np.nanmean(row)
        # The column count is taken from the first row for every row.
        for col_idx in range(len(matrix[0])):
            if np.isnan(row[col_idx]):
                row[col_idx] = row_mean
                replaced_total += 1

    return matrix, non_numeric_total, replaced_total
38
39 #Define Function to Replace Null Values with Column Mean
def nan_replacer_mean_columns(matrix):
    """Replace each NaN cell with the NaN-ignoring mean of its column, in place.

    Args:
        matrix: 2-D indexable collection of numeric values. The width is read
            from ``matrix[0]``, so at least one row is required.

    Returns:
        tuple: ``(matrix, nonNumCnt, nanCnt)`` where ``matrix`` is the same
        object that was passed in, ``nonNumCnt`` is always 0 (nothing in this
        function increments it) and ``nanCnt`` is the number of cells that
        were replaced.
    """
    non_numeric_total = 0  # reported to the caller but never incremented here
    replaced_total = 0

    for col_idx in range(len(matrix[0])):
        # Snapshot the column first so the mean reflects the values as they
        # were before this column is touched.
        column_values = [row[col_idx] for row in matrix]
        column_mean = np.nanmean(column_values)
        for row in matrix:
            if np.isnan(row[col_idx]):
                row[col_idx] = column_mean
                replaced_total += 1

    return matrix, non_numeric_total, replaced_total
56
57 #Define Function to Replace Null Values with Zero (axis orientation is irrelevant)
def nan_replacer_zero(matrix):
    """Replace every NaN cell with 0, in place, and count the replacements.

    The axis choice is irrelevant for this replacement, so there is a single
    function rather than a row/column pair.

    Args:
        matrix: 2-D indexable collection of numeric values; cells are
            overwritten through ``matrix[i][j]``-style assignment.

    Returns:
        tuple: ``(matrix, nonNumCnt, nanCnt)`` where ``matrix`` is the same
        object that was passed in, ``nonNumCnt`` is always 0 (nothing in this
        function increments it) and ``nanCnt`` is the number of NaN cells
        that were set to 0.
    """
    nonNumCnt = 0  # reported to the caller but never incremented here
    nanCnt = 0

    for row in matrix:
        # The column count is taken from the first row for every row.
        for j in range(len(matrix[0])):
            if np.isnan(row[j]):
                row[j] = 0
                # Count the replacement, as the mean-based replacers do, so
                # the caller's NaN warning reflects what was actually changed.
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt
71
72 #Define Function to Re-Label Output Matrix
73 #!!!! not needed no output matrix from Validate tool
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write a one-line placeholder notice to ``output_file_txt``.

    ``matrix``, ``og_cols`` and ``og_rows`` are accepted but not used: the
    earlier body that wrote a header row of column labels followed by one
    labelled line per matrix row is disabled, so only the notice is written.

    Args:
        matrix: unused.
        og_cols: unused.
        og_rows: unused.
        output_file_txt: path of the text file to create or overwrite.
    """
    notice = "Use original input file for further processing\n"
    # Leaving the with-block closes the handle.
    with open(output_file_txt, 'w') as out_handle:
        out_handle.write(notice)
88
89 #Main Function
def _print_nan_warning(nanCnt, cell_total, lead):
    """Print the WARNING line giving the NaN count and its share of all cells."""
    print(lead + 'WARNING Matrix has ' + str(nanCnt) + ' that is %.2f' % (100.0 * nanCnt / cell_total) + '% known NAN identifiers')


def _report_and_exit(nonNumCnt, nanCnt, cell_total, error_lead='\n'):
    """Print the run summary and exit: 0 when no non-numeric cells, -1 otherwise."""
    if nonNumCnt > 0:
        error_msg = (error_lead + 'ERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = ' + str(nonNumCnt) + ', %.2f' % (100.0 * nonNumCnt / cell_total) + '%')
        # The error goes to both streams, as before.
        print(error_msg)
        sys.stderr.write(error_msg)
        if nanCnt > 0:
            _print_nan_warning(nanCnt, cell_total, error_lead)
        sys.exit(-1)

    if nanCnt > 0:
        _print_nan_warning(nanCnt, cell_total, '\n')
    else:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
    sys.exit(0)


def _fail_usage(message):
    """Report a bad command-line choice on stdout and stderr, then exit with -2."""
    print(message)
    sys.stderr.write(message)
    sys.exit(-2)


def main():
    """Run the tool: read the matrix, replace NaNs, write it out, report, exit.

    Steps:
      1. Parse the command line with ``get_args``.
      2. Load ``(matrix, og_cols, og_rows)`` with ``reader``.
      3. Pick the NaN replacer from ``replacement`` ("Mean" or "Zero") and,
         for "Mean", from ``axes`` ("Row" or "Column").
      4. Pass the result to ``Labeler`` together with the output path
         (presumably it writes the labelled matrix -- confirm in
         Matrix_Validate_import).
      5. Print a summary and exit.

    Exit codes:
      0   no non-numeric cells were reported (NaNs may have been replaced).
      -1  the replacer reported non-numeric cells.
      -2  ``replacement`` is not "Mean"/"Zero", or ``replacement`` is "Mean"
          and ``axes`` is not "Row"/"Column".

    NOTE(review): the nesting of the original branches was reconstructed from
    a view that had lost its indentation; confirm the exit behaviour against
    the repository copy.
    NOTE(review): an earlier validate-only path (print the counts, write a
    placeholder output file) was commented out above this logic and marked
    with a TODO; the replacement path below is the active one.
    """
    args = get_args()
    matrix, og_cols, og_rows = reader(args.input_file_txt)

    # error_lead keeps the historical output byte-for-byte: the Mean/Row path
    # printed its error-path messages without a leading newline, while the
    # other paths printed them with one.
    if args.replacement == "Mean":
        if args.axes == "Row":
            replacer, error_lead = nan_replacer_mean_rows, ''
        elif args.axes == "Column":
            replacer, error_lead = nan_replacer_mean_columns, '\n'
        else:
            # Fail with a non-zero status instead of finishing quietly with
            # no output matrix written.
            _fail_usage('Mean, but given Invalid Axis= ' + str(args.axes))
    elif args.replacement == "Zero":
        # The axis is irrelevant when replacing with zero, so it is not checked.
        replacer, error_lead = nan_replacer_zero, '\n'
    else:
        # This branch is reached for an unknown replacement, so report the
        # replacement value rather than the axis.
        _fail_usage('Invalid Replacement= ' + str(args.replacement) + ' (expected "Mean" or "Zero")')

    matrix, nonNumCnt, nanCnt = replacer(matrix)
    Labeler(matrix, og_cols, og_rows, args.output_file_txt)

    # Total number of data cells, as a float so the percentages divide cleanly.
    cell_total = 1.0 * len(og_cols) * len(og_rows)
    _report_and_exit(nonNumCnt, nanCnt, cell_total, error_lead)
176
177
# Script entry point: run the tool only when this file is executed directly.
# NOTE(review): indentation was lost in this view; both statements are assumed
# to sit inside the guard -- confirm against the repository copy.
if __name__ == '__main__':
    main()
    # Only reached when main() returns normally; the paths in main() that
    # call sys.exit() end the process before this line runs.
    print("done")