view Matrix_Validations.py @ 1:f1bcd79cd923 draft default tip

Uploaded
author insilico-bob
date Tue, 27 Nov 2018 14:20:40 -0500
parents
children
line wrap: on
line source

'''
Created on Jun 7, 2017 modified Feb2018

@author: Bob Brown and cjacoby
'''
 
import sys, traceback, argparse
import numpy as np
import os
from Matrix_Validate_import import reader, Labeler

#Define The Four Arguments Used in the Program
def get_args():
    """Parse the four positional command-line arguments.

    Returns the ``argparse.Namespace`` with attributes ``input_file_txt``,
    ``replacement``, ``axes`` and ``output_file_txt`` (all strings).
    """
    # (name, keyword options) for each positional, in command-line order.
    positional_specs = (
        ('input_file_txt',
         {'help': 'tab delimited text file input matrix(include .txt in name)'}),
        ('replacement',
         {'type': str,
          'help': 'Choose Replacement for Missing Value. Valid Choices are strings: "Mean" or "Zero"'}),
        ('axes',
         {'type': str,
          'help': 'Choose Axes to Normalize On (Either "Row" or "Column"'}),
        ('output_file_txt',
         {'help': 'tab delimited text file output name (include .txt in name)'}),
    )

    cli = argparse.ArgumentParser()
    for arg_name, options in positional_specs:
        cli.add_argument(arg_name, **options)
    return cli.parse_args()


#Define Function to Replace Null Values with Row Mean
def nan_replacer_mean_rows(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its row.

    The matrix is modified in place and also returned.

    Returns a tuple ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is
    always 0 (this function never counts non-numeric cells) and ``nanCnt``
    is the number of cells that were replaced.
    """
    nonNumCnt = 0
    nanCnt = 0

    for current_row in matrix:
        # np.nanmean skips NaN entries when averaging the row.
        row_mean = np.nanmean(current_row)
        # The column bound is taken from the first row, as for a rectangular matrix.
        for col_idx in range(len(matrix[0])):
            if np.isnan(current_row[col_idx]):
                current_row[col_idx] = row_mean
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt

#Define Function to Replace Null Values with Column Mean
def nan_replacer_mean_columns(matrix):
    """Replace every NaN cell with the NaN-ignoring mean of its column.

    The matrix is modified in place and also returned. An empty matrix is
    returned unchanged instead of raising IndexError on ``matrix[0]``.

    Returns a tuple ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is
    always 0 (this function never counts non-numeric cells) and ``nanCnt``
    is the number of cells that were replaced.
    """
    nonNumCnt = 0
    nanCnt = 0

    # len() rather than truthiness so a numpy array argument is also accepted.
    if len(matrix) == 0:
        return matrix, nonNumCnt, nanCnt

    # The column count is taken from the first row, as for a rectangular matrix.
    for col_idx in range(len(matrix[0])):
        # np.nanmean skips NaN entries when averaging the column.
        col_mean = np.nanmean([row[col_idx] for row in matrix])
        for row in matrix:
            if np.isnan(row[col_idx]):
                row[col_idx] = col_mean
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt

#Define Function to Replace Null Values with Zero (axis orientation is irrelevant)
def nan_replacer_zero(matrix):
    """Replace every NaN cell with 0 (axis orientation is irrelevant).

    The matrix is modified in place and also returned.

    Returns a tuple ``(matrix, nonNumCnt, nanCnt)`` where ``nonNumCnt`` is
    always 0 (this function never counts non-numeric cells) and ``nanCnt``
    is the number of cells that were replaced.
    """
    nonNumCnt = 0
    nanCnt = 0

    for row in matrix:
        # The column bound is taken from the first row, as for a rectangular matrix.
        for col_idx in range(len(matrix[0])):
            if np.isnan(row[col_idx]):
                row[col_idx] = 0
                # Count each replacement so the caller can report how many
                # NaNs were found, as the mean-based replacers do.
                nanCnt += 1

    return matrix, nonNumCnt, nanCnt

#Define Function to Re-Label Output Matrix
#NOTE: no longer needed -- the Validate tool does not produce an output matrix
def OLD_labeler(matrix, og_cols, og_rows, output_file_txt):
    """Write a one-line placeholder message to ``output_file_txt``.

    ``matrix``, ``og_cols`` and ``og_rows`` are accepted but not used; only
    the fixed notice below is written. Any existing file at that path is
    overwritten.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(output_file_txt, 'w') as out_file:
        out_file.write("Use original input file for further processing\n")
#        f.write("")
#         for k in range(0,len(og_cols)):
#                 f.write('\t' + str(og_cols[k]))
#         f.write('\n')
#         for i in range(0,len(og_rows)):
#                 f.write(og_rows[i])
#                 for j in range(0,len(matrix[0])):
#                         f.write('\t' + format(matrix[i][j]))
#                 f.write('\n') 
    
#Main Function
def _fail_invalid_argument(message):
    """Report an invalid command-line choice on stdout and stderr, then exit with -2."""
    print(message)
    sys.stderr.write(message)
    sys.exit(-2)


def _report_counts_and_exit(nonNumCnt, nanCnt, og_cols, og_rows):
    """Print the validation summary and terminate the process.

    Exits with -1 when ``nonNumCnt`` is positive (error also echoed to
    stderr), otherwise with 0. Percentages are relative to
    ``len(og_cols) * len(og_rows)`` and are only computed when a message
    that needs them is actually printed.
    """
    def percent_of_cells(count):
        return 100.0 * count / (1.0 * len(og_cols) * len(og_rows))

    def nan_warning():
        return ('\nWARNING Matrix has %s  that is  %.2f%% known NAN identifiers'
                % (nanCnt, percent_of_cells(nanCnt)))

    if nonNumCnt > 0:
        error_msg = ('\nERROR Matrix has non-numbers that are non-NAN identifiers in matrix. '
                     'Total and percent unknown strings found = %s,  %.2f%%'
                     % (nonNumCnt, percent_of_cells(nonNumCnt)))
        print(error_msg)
        sys.stderr.write(error_msg)
        if nanCnt > 0:
            print(nan_warning())
        sys.exit(-1)

    if nanCnt > 0:
        print(nan_warning())
    else:
        print('\nMatrix is Good-to-Go -- all numbers in matrix. ')
    sys.exit(0)


#Main Function
def main():
    """Read the input matrix, replace NaNs as requested, write it out and report.

    Steps: parse arguments, load the matrix with ``reader``, apply the
    replacer selected by ``replacement`` ("Mean" or "Zero") and, for "Mean",
    ``axes`` ("Row" or "Column"), write the result with ``Labeler``, then
    print a summary.

    This function always terminates the process via ``sys.exit``:
    0 on success, -1 if non-numeric cells were counted, -2 if the
    ``replacement`` or ``axes`` argument is not a recognised value.
    """
    args = get_args()

    matrix, og_cols, og_rows = reader(args.input_file_txt)

    # Select the replacement strategy; an unrecognised choice aborts with -2
    # in every case so the caller never sees a success status for bad input.
    if args.replacement == "Mean":
        if args.axes == "Row":
            replacer = nan_replacer_mean_rows
        elif args.axes == "Column":
            replacer = nan_replacer_mean_columns
        else:
            _fail_invalid_argument('Mean, but given Invalid Axis= ' + str(args.axes))
    elif args.replacement == "Zero":
        replacer = nan_replacer_zero
    else:
        # The offending value here is the replacement choice, not the axis.
        _fail_invalid_argument('Invalid Replacement= ' + str(args.replacement))

    matrix, nonNumCnt, nanCnt = replacer(matrix)
    # The output file is written before the summary is reported.
    Labeler(matrix, og_cols, og_rows, args.output_file_txt)
    _report_counts_and_exit(nonNumCnt, nanCnt, og_cols, og_rows)

       
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()
    # Reached only if main() returns normally instead of calling sys.exit().
    print("done")