Mercurial > repos > md-anderson-bioinformatics > matrix_manipulation
diff Matrix_Validations.py @ 1:f1bcd79cd923 draft default tip
Uploaded
author | insilico-bob |
---|---|
date | Tue, 27 Nov 2018 14:20:40 -0500 |
parents | |
children |
line wrap: on
line diff
'''
Created on Jun 7, 2017 modified Feb2018

@author: Bob Brown and cjacoby

Replace missing (NaN) values in a tab-delimited matrix, write the result with
its labels, and report how many values were replaced.
'''

import argparse
import os
import sys
import traceback

import numpy as np

from Matrix_Validate_import import reader, Labeler


#Define The Four Arguments Used in the Program
def get_args():
    '''Parse and return the four positional command-line arguments.

    Returns an argparse.Namespace with the attributes input_file_txt,
    replacement, axes and output_file_txt (all strings).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file_txt', help='tab delimited text file input matrix(include .txt in name)')
    parser.add_argument('replacement', type=str, help='Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"')
    parser.add_argument('axes', type=str, help='Choose Axes to Normalize On (Either "Row" or "Column"')
    parser.add_argument('output_file_txt' ,help='tab delimited text file output name (include .txt in name)')
    return parser.parse_args()


#Define Function to Replace Null Values with Row Mean
def nan_replacer_mean_rows(matrix):
    '''Replace every NaN cell with the NaN-ignoring mean of its row, in place.

    matrix -- indexable sequence of mutable numeric rows.

    Returns (matrix, nonNumCnt, nanCnt): the same matrix object, a count of
    non-numeric cells (always 0, nothing here detects them) and the number of
    NaN cells that were replaced.
    '''
    nonNumCnt = 0
    nanCnt = 0
    # NOTE(review): the original author notes that valid NaN identifiers are
    # "NA","N/A","-","?" -- presumably converted to NaN by reader(); confirm.

    for row in matrix:
        # The mean is taken once, before any cell of the row is overwritten.
        row_mean = np.nanmean(row)
        for j, value in enumerate(row):
            if np.isnan(value):
                row[j] = row_mean
                nanCnt += 1
    return matrix, nonNumCnt, nanCnt


#Define Function to Replace Null Values with Column Mean
def nan_replacer_mean_columns(matrix):
    '''Replace every NaN cell with the NaN-ignoring mean of its column, in place.

    matrix -- indexable sequence of mutable numeric rows; the column count is
              taken from the first row.

    Returns (matrix, nonNumCnt, nanCnt) with the same meaning as in
    nan_replacer_mean_rows. An empty matrix is returned unchanged.
    '''
    nonNumCnt = 0
    nanCnt = 0

    # Guard: matrix[0] below would raise IndexError on an empty matrix.
    if len(matrix) == 0:
        return matrix, nonNumCnt, nanCnt

    for col_idx in range(len(matrix[0])):
        column = [row[col_idx] for row in matrix]
        col_mean = np.nanmean(column)
        for row in matrix:
            if np.isnan(row[col_idx]):
                row[col_idx] = col_mean
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt


#Define Function to Replace Null Values with Zero (axis orientation is irrelevant)
def nan_replacer_zero(matrix):
    '''Replace every NaN cell with 0, in place.

    Returns (matrix, nonNumCnt, nanCnt) with the same meaning as in
    nan_replacer_mean_rows.
    '''
    nonNumCnt = 0
    nanCnt = 0

    for row in matrix:
        for j, value in enumerate(row):
            if np.isnan(value):
                row[j] = 0
                # Count the replacement so the caller can warn about it, as
                # the two mean-based replacers already do.
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt


#Define Function to Re-Label Output Matrix
#!!!! not needed no output matrix from Validate tool
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    '''Write a one-line placeholder message to output_file_txt.

    matrix, og_cols and og_rows are accepted but not used. Not called from
    main(), which uses the imported Labeler instead.
    '''
    # The with-statement closes the file; no explicit close() is needed.
    with open(output_file_txt,'w') as f:
        f.write("Use original input file for further processing\n")


def _percent_of_cells(count, og_cols, og_rows):
    '''Return count as a percentage of len(og_cols)*len(og_rows), formatted to two decimals.'''
    return '%.2f' % (100.0*count/(1.0*len(og_cols)*len(og_rows)))


def _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows):
    '''Print the validation summary and terminate the process.

    Exits with status -1 when nonNumCnt > 0 (the error is also written to
    stderr), otherwise with status 0 after printing either the NaN warning
    or the all-clear message.
    '''
    warning = None
    if nanCnt > 0:
        warning = ('\nWARNING Matrix has '+str(nanCnt)+' that is '
                   + _percent_of_cells(nanCnt, og_cols, og_rows)
                   + '% known NAN identifiers')

    if nonNumCnt > 0:
        error = ('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. Total and percent unknown strings found = '
                 + str(nonNumCnt) + ', '
                 + _percent_of_cells(nonNumCnt, og_cols, og_rows) + '%')
        print(error)
        sys.stderr.write(error)
        if warning is not None:
            print(warning)
        sys.exit(-1)

    if warning is not None:
        print(warning)
    else:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
    sys.exit(0)


def _fail_invalid_argument(message):
    '''Print message to stdout and stderr, then exit with status -2.'''
    print(message)
    sys.stderr.write(message)
    sys.exit(-2)


#Main Function
def main():
    '''Read the input matrix, replace NaNs as requested, write it and report.

    The replacement strategy is chosen from args.replacement ("Mean" or
    "Zero") and, for "Mean", args.axes ("Row" or "Column"). Every path ends
    in sys.exit(): 0 on success, -1 when non-numeric cells are reported and
    -2 for an invalid replacement or axis argument.
    '''
    args = get_args()

    # NOTE(review): reader() is assumed to return the numeric matrix plus its
    # column and row labels -- confirm against Matrix_Validate_import.
    matrix, og_cols, og_rows = reader(args.input_file_txt)

    # TODO !!!!! original author's note: "Below if MDA decides to use it
    # TURNED OFF FOR NOW" -- the replacement code below is nevertheless active.
    if args.replacement == "Mean":
        if args.axes == "Row":
            replacer = nan_replacer_mean_rows
        elif args.axes == "Column":
            replacer = nan_replacer_mean_columns
        else:
            _fail_invalid_argument('Mean, but given Invalid Axis= '+str(args.axes))
    elif args.replacement == "Zero":
        replacer = nan_replacer_zero
    else:
        # The replacement argument, not the axis, is what is invalid here.
        _fail_invalid_argument('Invalid Replacement= '+str(args.replacement))

    matrix, nonNumCnt, nanCnt = replacer(matrix)
    # The output is written before reporting, so it exists even when the
    # report ends in a non-zero exit.
    Labeler(matrix, og_cols, og_rows, args.output_file_txt)
    _report_and_exit(nonNumCnt, nanCnt, og_cols, og_rows)


if __name__ == '__main__':
    main()
    # Only reached if main() returns without calling sys.exit().
    print("done")