annotate Matrix_Multiply.py @ 1:f1bcd79cd923 draft default tip

Uploaded
author insilico-bob
date Tue, 27 Nov 2018 14:20:40 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
1 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
2 Created on March 6, 2018
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
3
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
4 @author: Bob Brown based on John Weinstein's algorithm
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
5 '''
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
6
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
7 import os
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
8 import re
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
9 import shutil
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
10 import traceback
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
11 import sys, traceback, argparse
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
12 import numpy as np
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
13 import warnings
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
14 #import scipy.stats as ss
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
15 from Matrix_Validate_import import reader, Labeler, MatchLabels
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
16 import math
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
17 warnings.filterwarnings('error')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
18
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
19 # John Weinsteins algorithm by bob brown https://discover.nci.nih.gov/CorrelateMatrices/help.do
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
20 #http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
21
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
22 def get_args():
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
23 parser = argparse.ArgumentParser()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
24 parser.add_argument('input_file1', help='text file input matrix(include .txt in name)')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
25 parser.add_argument('transpose', type=str, help='transpose matrix 1?')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
26 parser.add_argument('input_file2', help='text file input matrix(include .txt in name)')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
27 parser.add_argument('choice', type=str, help='Choose Normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
28 # parser.add_argument('scaleValue', help='optional scaling factor for matrix)')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
29 parser.add_argument('out_fileName', help='text file output matrix(include .txt in name)')
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
30 args = parser.parse_args()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
31 if args.transpose == "": args.transpose = 'n'
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
32 return args
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
33
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
34
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
35 def Matrix_Multiply(matrix1, matrix2):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
36
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
37 try:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
38 #TODO handle NANs
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
39
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
40 matrixOut= np.dot(matrix1, matrix2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
41
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
42
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
43 except Exception as err:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
44 traceback.print_exc()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
45 sys.exit(-5)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
46
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
47 return(matrixOut)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
48
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
49
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
50 #CorrelateMatrices correlation acorss 2 martices https://discover.nci.nih.gov/CorrelateMatrices/home.do
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
51 def Correlate_Matrices(matrix1, matrix2):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
52
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
53 #try:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
54 # Leave both matrices as size axn and bxn and treat a is column and b as row
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
55 #matrix1T = Transpose(matrix1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
56
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
57 #TODO handle NANs
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
58 numRows1,numColumns1= np.shape(matrix1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
59
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
60 numRows2,numColumns2= np.shape(matrix2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
61 matrixOut= []
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
62
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
63 if numColumns1 != numRows2:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
64 print("ERROR number columns Matrix 1 ", str(numColumns1), " not equal number rows for Matrix 2 ",str(numRows2))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
65 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
66 #TODO need to look for NANs??
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
67
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
68 for i in range(numRows1):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
69 vectorM1 = matrix1[i][:]
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
70 meanVec1 = np.nanmean(vectorM1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
71 varStdDev1 = np.nanstd(vectorM1, ddof=1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
72 lowStdDev1 = False
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
73 #if equals zero
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
74 if abs(varStdDev1) < .000001:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
75 print("ERROR Variance value almost zero", str(varStdDev1), " for Matrix 1 Row ",str(i+1))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
76 lowStdDev1= True
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
77 correlationRow= []
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
78
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
79 for j in range(numColumns2):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
80 vectorM2 = []
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
81 for t in range(numRows2):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
82 vectorM2.append(matrix2[t][j])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
83 meanVec2 = np.nanmean(vectorM2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
84 varStdDev2 = np.nanstd(vectorM2, ddof=1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
85 lowStdDev2= False
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
86 #if equals zero
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
87 if abs(varStdDev2) < .000001:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
88 print("ERROR Variance value almost zero", str(varStdDev2), " for Matrix 2 Column ",str(j+1))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
89 lowStdDev2= True
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
90
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
91 covarStdDev12= 0
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
92
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
93 if not lowStdDev1 and not lowStdDev2:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
94 #try:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
95 for pos in range(len(vectorM1)):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
96 covarStdDev12 += ((vectorM1[pos]-meanVec1)/varStdDev1)*((vectorM2[pos]-meanVec2)/varStdDev2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
97 # bottom= (numColumns1 -1)*(varStdDev1*varStdDev2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
98 # correlationRow.append( covarStdDev12/bottom)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
99 correlationRow.append( covarStdDev12/(numColumns1 -1))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
100 #except: bad value because of NAN or other
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
101 else:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
102 correlationRow.append("divide by 0") # cannot calculate correlation var too small
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
103
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
104 matrixOut.append(correlationRow)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
105
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
106 # except Exception as err:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
107 # traceback.print_exc()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
108 # sys.exit(-6)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
109
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
110 return(matrixOut)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
111
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
112 #----------------------------------------------------------------------
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
113 def Transpose(in_mat):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
114 out_mat = []
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
115 numRows,numColumns= np.shape(in_mat)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
116
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
117 for i in range(numColumns):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
118 temp= []
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
119 for j in range(numRows):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
120 temp.append(in_mat[j][i])
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
121 out_mat.append(temp)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
122 #print( str(out_mat))
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
123 return out_mat
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
124
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
125
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
126 #----------------------------------------------------------------------
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
127 if __name__ == "__main__":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
128
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
129 # input_file1 = "/Users/bobbrown/Desktop/Gene-by-var.txt"
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
130 # input_file2 = "/Users/bobbrown/Desktop/var-by-sample.txt"
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
131 # out_fileName = "/Users/bobbrown/Desktop/MatixMult-1-2-Out.txt"
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
132 # selection = "MatrixMultiply"
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
133 #TODO address NANs ???
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
134
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
135 try:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
136 args = get_args()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
137 selection= args.choice
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
138
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
139 matrix1,column_labels1,row_labels1 = reader(args.input_file1) # to be transposed later
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
140 matrix2,column_labels2,row_labels2 = reader(args.input_file2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
141
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
142
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
143 if args.transpose == 'y' or args.input_file1 == args.input_file2:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
144 matrix1 = Transpose(matrix1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
145 print("\n>>>NOTICE Transposed first matrix so matrix 1 columns = Matrix 2 number rows ")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
146 temp = row_labels1 #swap labels for output matrix
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
147 row_labels1 = column_labels1 #swap labels for output matrix
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
148 column_labels1= temp #swap labels for output matrix
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
149
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
150 MatchLabels(column_labels1,row_labels2) # verfiy labels and their order match
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
151
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
152 if len(column_labels1) != len(row_labels2):
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
153 print("\n>>> ERROR attempting to multiple Matrices of incompatible dimensions ")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
154 print("First Matrix is "+str(len(row_labels1))+" by "+str(len(column_labels1))+" where second Matrix is "+str(len(og_row2))+" by "+str(len(column_labels2))+"\n")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
155 print("Matrices must have dimensions AxB and BxC. A can equal C (square matrices)")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
156 sys.exit(-1)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
157
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
158 if selection == "MatrixMultiply":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
159 matrixOut= Matrix_Multiply(matrix1, matrix2 )
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
160
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
161 elif selection == "Corr2Matrices" or selection == "Corr1Matrix":
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
162 matrixOut = Correlate_Matrices(matrix1, matrix2)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
163
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
164 Labeler(matrixOut,column_labels2,row_labels1,args.out_fileName)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
165
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
166 print("Matrix Multiply "+str(len(row_labels1))+" by "+str(len(column_labels1))+" Matrix 1 by "+str(len(row_labels2))+" by "+str(len(column_labels2))+" matrix 2")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
167 print("Output Matrix dimensions are "+str(len(row_labels1))+" by "+str(len(column_labels2))+"\n")
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
168
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
169 except Exception as err:
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
170 traceback.print_exc()
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
171 sys.exit(-3)
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
172
f1bcd79cd923 Uploaded
insilico-bob
parents:
diff changeset
173 sys.exit(0)