annotate Matrix_Statistics.py @ 1:f1bcd79cd923 draft default tip

Uploaded
author insilico-bob
date Tue, 27 Nov 2018 14:20:40 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
1 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
2 Created on Feb2018
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
3
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
4 @author: bob brown
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
5 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
6
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
7 import sys, traceback, argparse
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
8 import numpy as np
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
9 from Matrix_Validate_import import reader
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
10 #import matplotlib.pyplot as plt
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
11 from Matrix_Filters import Variance_Percent_Filter_row, Variance_Percent_Filter_col
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
12
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
13 #Define argparse Function
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def get_args(argv=None):
    """Parse the command-line arguments of the matrix statistics tool.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so existing
            zero-argument callers behave exactly as before.

    Returns:
        argparse.Namespace with the positional values ``input_file_txt``,
        ``choice``, ``thresh``, ``axes`` and ``output_file_txt``.  No
        ``type=`` conversion is applied to ``thresh``, so it is returned
        as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file_txt', help='tab delimited text file input matrix (include .txt in name)')
    # The choices listed here are the ones main() actually dispatches on.
    parser.add_argument('choice', type=str, help='Operation to run (Histogram, CreateFiles or Variance)')
    parser.add_argument('thresh', help='Threshold for Variance Filtering')
    parser.add_argument('axes', help='Axes to Filter on (Either Row or Column)')
    parser.add_argument('output_file_txt', help='tab delimited text file output name (include .txt in name)')
    return parser.parse_args(argv)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
23
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
24
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
25 #Define Function Which Labels Rows/Columns on Output
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def labeler(matrix, filter_rows, filter_cols, output_file_txt):
    """Write a labelled matrix to a tab-delimited text file.

    The first line holds the column labels, each preceded by a tab, so the
    header starts with an empty cell above the row labels.  Every following
    line holds one row label followed by that row's values, each preceded
    by a tab and rendered with ``format()``.

    Args:
        matrix: Indexable collection of rows; ``matrix[i]`` must be iterable
            and is written next to ``filter_rows[i]``.
        filter_rows: Sequence of row label strings, one per matrix row.
        filter_cols: Sequence of column label strings.
        output_file_txt: Path of the text file to create or overwrite.

    Raises:
        IndexError: If ``filter_rows`` has more entries than ``matrix``.
    """
    with open(output_file_txt, 'w') as f:
        # One write per line instead of one per cell.
        f.write(''.join('\t' + col_label for col_label in filter_cols) + '\n')
        for i, row_label in enumerate(filter_rows):
            cells = ''.join('\t' + format(value) for value in matrix[i])
            f.write(row_label + cells + '\n')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
39
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
40
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def _column_mean_histogram(matrix, numBins):
    """Return ``(counts, edges)`` for the histogram of column means, or None.

    Column means ignore NaN cells.  ``None`` is returned when the matrix has
    no rows or when no column has a finite mean.
    """
    if len(matrix) == 0:
        return None

    means = []
    for i in range(len(matrix[0])):
        column = np.asarray([row[i] for row in matrix], dtype=float)
        present = column[~np.isnan(column)]
        # An all-NaN column has no mean; record NaN rather than failing.
        means.append(present.mean() if present.size else np.nan)

    data = np.asarray(means, dtype=float)
    finite = data[np.isfinite(data)]
    if finite.size == 0:
        return None

    # floor/ceil guarantee the integer bounds enclose every mean;
    # int(x - 0.5) / int(x + 0.5) truncate toward zero and can exclude data.
    minBin = int(np.floor(finite.min()))
    maxBin = int(np.ceil(finite.max()))
    if maxBin == minBin:
        maxBin = minBin + 1  # keep a non-zero bin width
    return np.histogram(finite, bins=numBins, range=(minBin, maxBin))


def Histo(matrix, numBins=20):
    """Compute the distribution of the matrix's column means.

    Plotting is currently disabled: the matplotlib import at the top of the
    file is commented out, so the histogram is computed but not rendered or
    saved.

    Args:
        matrix: Indexable collection of numeric rows; the column count is
            taken from the first row.
        numBins: Number of equal-width histogram bins (default 20).

    Returns:
        An empty tuple, as before; callers ignore the result.
    """
    # The histogram is only needed once rendering is re-enabled.
    _column_mean_histogram(matrix, numBins)
    return ()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
80
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
81 #========== test create variable number output files in Galaxy
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def CreateFiles(output_file_info):
    """Write the test line ``File number = <i>`` to a file three times.

    Each pass reopens the same path in ``'w'`` mode, so the file ends up
    holding only the last line (``File number = 2``).

    NOTE(review): the section comment above this function describes it as a
    test for creating a variable number of output files in Galaxy; distinct
    file names per iteration may have been intended -- confirm before
    relying on it.

    Args:
        output_file_info: Path of the file to create or overwrite.

    Returns:
        An empty tuple.
    """
    for i in range(3):
        # Context manager closes the handle even if the write fails.
        with open(output_file_info, 'w') as fd:
            fd.write('File number = ' + str(i) + "\n")

    return ()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
90
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
91 #==================
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
92
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
93 #Define Main Function
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
def main():
    """Run the operation selected on the command line.

    Reads the input matrix, then dispatches on ``args.choice``:

    * ``Histogram``   -- calls ``Histo`` on the matrix.
    * ``CreateFiles`` -- calls ``CreateFiles`` with the output file path.
    * ``Variance``    -- applies the row or column variance filter chosen by
      ``args.axes`` (``Row`` or ``Column``) and writes the labelled result
      with ``labeler``.

    Exit codes: -1 for an invalid axes value, -2 for an invalid choice, and
    -3 for any unexpected exception (its traceback is printed first).
    """
    try:
        args = get_args()

        # reader's three results are unpacked as matrix, column labels, row labels.
        matrix, og_cols, og_rows = reader(args.input_file_txt)

        if args.choice == "Histogram":
            Histo(matrix)
        elif args.choice == "CreateFiles":
            # The parser defines no 'output_file_info' attribute; the only
            # output path it provides is 'output_file_txt'.
            CreateFiles(args.output_file_txt)
        elif args.choice == "Variance":
            variance_filters = {
                "Row": Variance_Percent_Filter_row,
                "Column": Variance_Percent_Filter_col,
            }
            variance_filter = variance_filters.get(args.axes)
            if variance_filter is None:
                print('Invalid Axes = '+str(args.axes))
                sys.exit(-1)

            # NOTE(review): args.thresh is parsed but not used; the filters
            # are called with the literal 1 and True, whose meaning is
            # defined in Matrix_Filters -- confirm this is intended.
            matrix, filter_rows, filter_cols, _del_cnt, _min_val, _max_val = variance_filter(
                matrix, 1, og_rows, og_cols, True)
            labeler(matrix, filter_rows, filter_cols, args.output_file_txt)
        else:
            print("Invalid Filter Choice = "+str(args.choice))
            sys.exit(-2)

    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the explicit exit codes above pass through untouched.
        traceback.print_exc()
        sys.exit(-3)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
141
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: run the tool, print the completion marker, and
    # finish with a success exit status.
    main()
    print("\nFini")
    raise SystemExit(0)