annotate Matrix_Validate_import.py @ 1:f1bcd79cd923 draft default tip

Uploaded
author insilico-bob
date Tue, 27 Nov 2018 14:20:40 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
1 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
2 Created on Jun 7, 2017 modified Feb2018
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
3
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
4 @author: cjacoby and Bob Brown
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
5 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
6
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
7 import sys, traceback, argparse
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
8 import numpy as np
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
9 import os
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
10 #import matplotlib.pyplot as plt
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
11 #import matplotlib.pyplot as plt; plt.rcdefaults()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
12
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
13 # Define the Reading Function Which Pulls the Data from a .txt file
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def reader(input_file_txt, create_plot=False):
    """Read a tab-delimited, labelled matrix from a text file and validate it.

    The first line supplies the column labels (its first cell is skipped).
    Every later line supplies a row label in its first cell followed by the
    data cells. Data cells are converted to float; cells holding one of the
    recognised missing-value markers (empty, single blank, "NAN", "NA",
    "N/A", "-", "?", compared case-insensitively) become ``np.nan``.

    Args:
        input_file_txt: path of the tab-delimited input file.
        create_plot: when true, show a bar chart of the marker counts
            (needs matplotlib, imported only in that case).

    Returns:
        A tuple ``(matrix, column_labels, row_labels)`` where ``matrix`` is a
        list of rows, each a list of floats.

    Raises:
        SystemExit: with code -1 when a row's cell count differs from the
            header row's, or when any data cell is neither numeric nor a
            recognised missing-value marker.
    """
    # Recognised missing-value markers, matched against the upper-cased cell.
    nan_markers = ("", " ", "NAN", "NA", "N/A", "-", "?")
    marker_counts = dict.fromkeys(nan_markers, 0)
    max_reported = 50  # cap on the invalid cells echoed to stderr

    column_labels = []
    row_labels = []
    matrix = []
    nan_count = 0
    invalid_count = 0
    expected_len = None

    # Plain "r" already gives universal newlines on Python 3; the former "rU"
    # mode raises ValueError on Python 3.11+. The with-block also closes the
    # file on the sys.exit paths below.
    with open(input_file_txt, "r") as f:
        for row, raw_line in enumerate(f, 1):
            cells = raw_line.strip("\n").split("\t")

            if row == 1:
                expected_len = len(cells)
                column_labels = cells[1:]
                continue

            if len(cells) != expected_len:
                sys.stderr.write(
                    "ERROR matrix row " + str(row) + " has " + str(len(cells))
                    + " cells but the header row has " + str(expected_len) + "\n")
                sys.exit(-1)

            row_labels.append(str(cells[0]))
            values = []
            for column, item in enumerate(cells[1:], 1):
                marker = item.upper()
                if marker in marker_counts:
                    # Tested before float(): float("NAN") succeeds, so that
                    # marker would otherwise never be counted.
                    values.append(np.nan)
                    nan_count += 1
                    marker_counts[marker] += 1
                    continue
                try:
                    values.append(float(item))
                except ValueError:
                    values.append(np.nan)
                    if invalid_count == 0:
                        sys.stderr.write("Start List of up to first 50 Invalid cell values \n")
                    invalid_count += 1
                    if invalid_count <= max_reported:
                        sys.stderr.write("At row_column= "+str(row)+"_"+str(column)+' invalid data cell value '+ item+"\n")
            matrix.append(values)

    marker_counts["Text"] = invalid_count

    if create_plot:
        _plot_nan_counts(marker_counts)

    # Row lengths were checked against the header above, so this is rows*cols.
    # It is only used when nan_count > 0, which implies at least one cell.
    total_cells = sum(len(values) for values in matrix)
    if nan_count > 0:
        print("WARNING -- Found "+str(nan_count)+" Valid Non-numbers. Their percent of total matrix data cell values = "+str((100*nan_count)/total_cells)+"% ")
    if invalid_count > 0:
        sys.exit(-1)
    return matrix, column_labels, row_labels


def _plot_nan_counts(marker_counts):
    """Show a horizontal bar chart of the missing-value marker counts."""
    # Imported lazily so the module still loads where matplotlib is absent.
    import matplotlib.pyplot as plt

    bin_labels = ["null", "blank", "hyphen", "?", "NA", "N/A", "NAN", "text"]
    # Keys of marker_counts in chart order; "blank" is the single-space marker.
    bin_keys = ["", " ", "-", "?", "NA", "N/A", "NAN", "Text"]

    bin_data = []
    for key in bin_keys:
        count = marker_counts[key]
        # Presumably so that empty categories still draw a visible sliver.
        bin_data.append(count + 0.01 if count < 1 else count)

    # Based on https://pythonspot.com/matplotlib-bar-chart/
    y_pos = np.arange(len(bin_labels))
    plt.yticks(y_pos, bin_labels)
    plt.title("Distribution of NAN types (UPPER & lower & MiXeD case combined)")
    plt.ylabel('NAN Types')
    plt.xlabel('Occurrences')
    plt.barh(y_pos, bin_data, align='center', alpha=0.5)

    # NOTE(review): subplots() is called after the bars are drawn, as in the
    # original code -- confirm the resulting layout is the intended one.
    fig, ax = plt.subplots(num=1, figsize=(8,3))
    ax.set_title("Data Cell Counts of Not A Number (NAN) Types")

    plt.show()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
136
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
137 #----------------------------------------------------------------------
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
138 # Verify Matrix A column_labels match Matrix B row_labels in name and order for A*B
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def MatchLabels(column_labels,row_labels):
    """Check that matrix A's column labels equal matrix B's row labels, in order.

    This is the label precondition for the product A*B. Problems are written
    to stderr and the process is terminated.

    Args:
        column_labels: column labels of the first matrix.
        row_labels: row labels of the second matrix.

    Returns:
        None when both sequences have the same length and equal labels at
        every position.

    Raises:
        SystemExit: with code -11 when the label counts differ or when any
            label pair differs.
    """
    if len(column_labels) != len(row_labels):
        # The original called the non-existent sys.err(), which raised
        # AttributeError here instead of reporting the problem.
        sys.stderr.write("ERROR 1st matrix column count "+str(len(column_labels))+" not equal 2nd Matrix number row count "+str(len(row_labels))+"\n")
        sys.exit(-11)

    max_reported = 20  # stop after this many mismatches have been reported
    mismatches = 0
    for position, (col_label, row_label) in enumerate(zip(column_labels, row_labels)):
        if col_label == row_label:
            continue
        mismatches += 1
        sys.stderr.write("ERROR At column & row position "+str(position)+" Matrix 1 column value "+str(col_label)+" not equal 2nd Matrix row value "+str(row_label)+"\n")
        if mismatches >= max_reported:
            break

    if mismatches > 0:
        sys.exit(-11)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
152 #----------------------------------------------------------------------
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
153 # restores row and column labels in output
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def Labeler(matrix,column_labels,row_labels,output_file_txt):
    """Write a matrix to a tab-delimited text file with its labels restored.

    The first output line is an empty leading cell followed by the column
    labels. Each later line is a row label followed by that row's cells,
    each rendered with ``format()``.

    Args:
        matrix: sequence of rows, each a sequence of cell values.
        column_labels: labels written on the header line.
        row_labels: labels written at the start of each data line; must
            contain at least ``len(matrix)`` entries.
        output_file_txt: path of the file to create or overwrite.

    Raises:
        IndexError: if ``row_labels`` has fewer entries than ``matrix`` has rows.
    """
    with open(output_file_txt, 'w') as f:
        # A leading tab before every label leaves the top-left cell empty.
        f.write("".join('\t' + str(label) for label in column_labels) + '\n')

        for i, row in enumerate(matrix):
            # Every cell of the row is written; the old inner loop took its
            # bound from matrix[0] and so truncated or overran ragged rows.
            cells = "".join('\t' + format(cell) for cell in row)
            f.write(str(row_labels[i]) + cells + '\n')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
171
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
172
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
173 #----------------------------------------------------------------------
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
if __name__ == '__main__':
    # Command-line entry point: validate the matrix file named by the first
    # argument. reader() terminates the process itself when validation fails.
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaced as a bare IndexError.
        sys.stderr.write("Usage: " + sys.argv[0] + " <input_matrix.txt>\n")
        sys.exit(2)
    input_file_txt = str(sys.argv[1])

    matrix,column_labels,row_labels = reader(input_file_txt)
    print("Done")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
179