| 1 | 1 ''' | 
|  | 2 Created on Jun 7, 2017 modified Feb2018 | 
|  | 3 | 
|  | 4 @author: cjacoby and Bob Brown | 
|  | 5 ''' | 
|  | 6 | 
|  | 7 import sys, traceback, argparse | 
|  | 8 import numpy as np | 
|  | 9 import os | 
|  | 10 #import matplotlib.pyplot as plt | 
|  | 11 #import matplotlib.pyplot as plt; plt.rcdefaults() | 
|  | 12 | 
# Helper for reader(): plot how often each kind of non-number cell occurred
def _plot_nan_counts(binCatDict):
    """Draw a horizontal bar chart of the non-number cell counts.

    binCatDict maps each accepted NaN token ("", " ", "-", "?", "NA", "N/A",
    "NAN") plus the key "Text" (invalid, non-numeric cells) to its count.

    The chart needs matplotlib.pyplot bound to the module-level name ``plt``.
    That import is commented out at the top of this file, so when ``plt`` is
    missing a warning is written to stderr and the plot is skipped rather
    than failing with a NameError.
    """
    try:
        pyplot = plt
    except NameError:
        sys.stderr.write("WARNING -- plot requested but matplotlib.pyplot is not imported as plt; plot skipped\n")
        return

    # (axis label, binCatDict key) in display order.  "blank" must look up
    # the single-space key; it previously shared "" with "null".
    categories = [("null", ""), ("blank", " "), ("hyphen", "-"), ("?", "?"),
                  ("NA", "NA"), ("N/A", "N/A"), ("NAN", "NAN"), ("text", "Text")]
    binCat = [label for label, _ in categories]

    binData = []
    for _, key in categories:
        value = binCatDict[key]
        # counts below 1 are nudged to 0.01 -- presumably so an empty
        # category still shows a sliver of a bar
        binData.append(value + 0.01 if value < 1 else value)

    # see https://pythonspot.com/matplotlib-bar-chart/
    y_pos = np.arange(len(binCat))
    pyplot.yticks(y_pos, binCat)
    pyplot.title("Distribution of NAN types (UPPER & lower & MiXeD case combined)")
    pyplot.ylabel('NAN Types')
    pyplot.xlabel('Occurrences')
    pyplot.barh(y_pos, binData, align='center', alpha=0.5)

    _fig, ax = pyplot.subplots(num=1, figsize=(8,3))
    ax.set_title("Data Cell Counts of Not A Number (NAN) Types")

    pyplot.show()


# Define the Reading Function Which Pulls the Data from a tab-delimited .txt file
def reader(input_file_txt, create_plot= False):
    """Read a labelled, tab-delimited matrix and validate its data cells.

    The first line supplies the column labels (its first cell is ignored).
    Every later line supplies a row label in its first cell followed by the
    data cells.  Each data cell is converted with float(); a cell that does
    not convert is stored as numpy.nan.  Such a cell is "valid" when its
    upper-cased text is one of: empty, a single blank, "NAN", "NA", "N/A",
    "-" or "?".  Any other text is invalid and is reported on stderr (first
    50 only).  Note that float() itself accepts spellings such as "nan" and
    "inf", so those never reach the non-number handling.

    Parameters:
        input_file_txt: path of the tab-delimited text file to read.
        create_plot:    when True, chart the non-number counts
                        (see _plot_nan_counts).

    Returns:
        (matrix, column_labels, row_labels) where matrix is a list of rows,
        each a list of floats, and the labels are lists of strings.  A file
        with no data rows yields an empty matrix and empty row_labels.

    Raises:
        SystemExit(-1) when a data row has a different number of cells than
        the header row, or when at least one invalid (non-numeric, non-NaN)
        cell was found.
    """
    nanList    = ["", " ","NAN", "NA", "N/A", "-","?"]
    binCatDict = {"":0, " ":0, "Text":0, "NA":0, "-":0,"NAN":0, "N/A":0,"?":0}

    column_labels = []
    row_labels    = []
    matrix        = []
    lengthRow     = None
    nanCnt        = 0
    nonNumCnt     = 0

    # Plain "r" already gives universal-newline handling in Python 3; the
    # old "rU" flag is rejected by Python 3.11+.  The with-block also closes
    # the file when sys.exit() is raised part-way through.
    with open(input_file_txt, "r") as f:
        for row, raw_line in enumerate(f, 1):
            cells = raw_line.strip("\n").split('\t')

            if row == 1:
                # header line: remember its width, keep all but the corner cell
                lengthRow     = len(cells)
                column_labels = cells[1:]
                continue

            if lengthRow != len(cells):
                sys.stderr.write("ERROR matrix row lengths unequal for header row and row "+str(row)+"\n")
                sys.exit(-1)

            row_labels.append(str(cells[0]))
            temp = []

            for column, item in enumerate(cells[1:], 1):
                try:
                    temp.append(float(item))
                except ValueError:
                    temp.append(np.nan)
                    itemUC = item.upper()

                    if itemUC in nanList:
                        nanCnt += 1
                        binCatDict[itemUC] += 1
                    else:
                        if nonNumCnt == 0:
                            sys.stderr.write("Start List of up to first 50 Invalid cell values \n")
                        nonNumCnt += 1
                        if nonNumCnt <= 50:
                            sys.stderr.write("At row_column= "+str(row)+"_"+str(column)+' invalid data cell value '+ item+"\n")

            matrix.append(temp)

    binCatDict["Text"] = nonNumCnt

    if create_plot:
        _plot_nan_counts(binCatDict)

    # Every accepted row has len(column_labels) data cells, so this is the
    # total cell count; unlike unpacking np.shape(matrix) it also works for
    # an empty matrix.
    totalCells = len(matrix) * len(column_labels)
    if nanCnt > 0:
        print("WARNING -- Found "+str(nanCnt)+" Valid Non-numbers. Their percent of total matrix data cell values = "+str((100*nanCnt)/totalCells)+"% ")
    if nonNumCnt > 0:
        sys.exit(-1)

    return matrix,column_labels,row_labels
|  | 136 | 
|  | 137 #---------------------------------------------------------------------- | 
# Verify Matrix A column_labels match Matrix B row_labels in name and order for A*B
def MatchLabels(column_labels,row_labels):
    """Check that two label sequences agree in length, content and order.

    Parameters:
        column_labels: column labels of the first matrix.
        row_labels:    row labels of the second matrix.

    Returns:
        None when both sequences have the same length and equal labels at
        every position.

    Raises:
        SystemExit(-11) when the lengths differ or any position holds
        different labels.  The reason is written to stderr first; at most
        the first 20 differing positions are listed.
    """
    if len(column_labels) != len(row_labels):
        # sys has no err() function; report on stderr and stop
        sys.stderr.write("ERROR 1st matrix column count "+str(len(column_labels))+" not equal 2nd Matrix number row count "+str(len(row_labels))+"\n" )
        sys.exit(-11)

    cnt = 0
    for k, (col_label, row_label) in enumerate(zip(column_labels, row_labels)):
        if col_label != row_label:
            cnt += 1
            sys.stderr.write("ERROR At column & row position "+str(k)+" Matrix 1 column value "+str(col_label)+" not equal 2nd Matrix row value "+str(row_label)+"\n" )
            if cnt >= 20:
                # enough evidence; avoid flooding stderr
                break

    if cnt > 0:
        sys.exit(-11)
|  | 152 #---------------------------------------------------------------------- | 
# restores row and column labels in output
def Labeler(matrix,column_labels,row_labels,output_file_txt):
    """Write a matrix with its labels to a tab-delimited text file.

    The first output line is a tab followed by the tab-separated column
    labels (so the corner cell is empty).  Each following line is a row
    label followed by that row's values, one tab before each value.  The
    number of values written per row is the length of the first matrix row.

    Parameters:
        matrix:          indexable rows of indexable values.
        column_labels:   labels written on the header line.
        row_labels:      labels indexed by row position.
        output_file_txt: path of the file to create or overwrite.
    """
    with open(output_file_txt,'w') as out:
        # header line: every label is preceded by a tab
        for label in column_labels:
            out.write('\t' + str(label))
        out.write('\n')

        row_count = len(matrix)
        # width is taken from the first row, as for every row below
        width = len(matrix[0]) if row_count > 0 else 0

        row_index = 0
        while row_index < row_count:
            out.write(str(row_labels[row_index]))
            current = matrix[row_index]
            for col_index in range(width):
                out.write('\t' + format(current[col_index]))
            out.write('\n')
            row_index += 1
|  | 171 | 
|  | 172 | 
|  | 173 #---------------------------------------------------------------------- | 
if __name__ == '__main__':
    # Command-line entry point: read and validate the matrix file named by
    # the first argument.  reader() terminates the process itself when the
    # file is invalid, so reaching "Done" means the file passed.
    if len(sys.argv) < 2:
        # without this check a missing argument surfaces as an IndexError traceback
        sys.stderr.write("Usage: " + str(sys.argv[0]) + " <input_file_txt>\n")
        sys.exit(-1)

    input_file_txt = str(sys.argv[1])

    matrix,column_labels,row_labels = reader(input_file_txt)
    print("Done")
|  | 179 |