1
|
1 '''
|
|
2 Created on March 6, 2018
|
|
3
|
|
4 @author: Bob Brown based on John Weinstein's algorithm
|
|
5 '''
|
|
6
|
|
7 import os
|
|
8 import re
|
|
9 import shutil
|
|
10 import traceback
|
|
11 import sys, traceback, argparse
|
|
12 import numpy as np
|
|
13 import warnings
|
|
14 #import scipy.stats as ss
|
|
15 from Matrix_Validate_import import reader, Labeler, MatchLabels
|
|
16 import math
|
|
17 warnings.filterwarnings('error')
|
|
18
|
|
19 # John Weinsteins algorithm by bob brown https://discover.nci.nih.gov/CorrelateMatrices/help.do
|
|
20 #http://www.blog.pythonlibrary.org/2014/04/30/reading-excel-spreadsheets-with-python-and-xlrd/
|
|
21
|
|
22 def get_args():
|
|
23 parser = argparse.ArgumentParser()
|
|
24 parser.add_argument('input_file1', help='text file input matrix(include .txt in name)')
|
|
25 parser.add_argument('transpose', type=str, help='transpose matrix 1?')
|
|
26 parser.add_argument('input_file2', help='text file input matrix(include .txt in name)')
|
|
27 parser.add_argument('choice', type=str, help='Choose Normalization Method: 1 = Z-score, 2 = Mean Centered, 3 = log2, 4= rank')
|
|
28 # parser.add_argument('scaleValue', help='optional scaling factor for matrix)')
|
|
29 parser.add_argument('out_fileName', help='text file output matrix(include .txt in name)')
|
|
30 args = parser.parse_args()
|
|
31 if args.transpose == "": args.transpose = 'n'
|
|
32 return args
|
|
33
|
|
34
|
|
35 def Matrix_Multiply(matrix1, matrix2):
|
|
36
|
|
37 try:
|
|
38 #TODO handle NANs
|
|
39
|
|
40 matrixOut= np.dot(matrix1, matrix2)
|
|
41
|
|
42
|
|
43 except Exception as err:
|
|
44 traceback.print_exc()
|
|
45 sys.exit(-5)
|
|
46
|
|
47 return(matrixOut)
|
|
48
|
|
49
|
|
50 #CorrelateMatrices correlation acorss 2 martices https://discover.nci.nih.gov/CorrelateMatrices/home.do
|
|
51 def Correlate_Matrices(matrix1, matrix2):
|
|
52
|
|
53 #try:
|
|
54 # Leave both matrices as size axn and bxn and treat a is column and b as row
|
|
55 #matrix1T = Transpose(matrix1)
|
|
56
|
|
57 #TODO handle NANs
|
|
58 numRows1,numColumns1= np.shape(matrix1)
|
|
59
|
|
60 numRows2,numColumns2= np.shape(matrix2)
|
|
61 matrixOut= []
|
|
62
|
|
63 if numColumns1 != numRows2:
|
|
64 print("ERROR number columns Matrix 1 ", str(numColumns1), " not equal number rows for Matrix 2 ",str(numRows2))
|
|
65 sys.exit(-1)
|
|
66 #TODO need to look for NANs??
|
|
67
|
|
68 for i in range(numRows1):
|
|
69 vectorM1 = matrix1[i][:]
|
|
70 meanVec1 = np.nanmean(vectorM1)
|
|
71 varStdDev1 = np.nanstd(vectorM1, ddof=1)
|
|
72 lowStdDev1 = False
|
|
73 #if equals zero
|
|
74 if abs(varStdDev1) < .000001:
|
|
75 print("ERROR Variance value almost zero", str(varStdDev1), " for Matrix 1 Row ",str(i+1))
|
|
76 lowStdDev1= True
|
|
77 correlationRow= []
|
|
78
|
|
79 for j in range(numColumns2):
|
|
80 vectorM2 = []
|
|
81 for t in range(numRows2):
|
|
82 vectorM2.append(matrix2[t][j])
|
|
83 meanVec2 = np.nanmean(vectorM2)
|
|
84 varStdDev2 = np.nanstd(vectorM2, ddof=1)
|
|
85 lowStdDev2= False
|
|
86 #if equals zero
|
|
87 if abs(varStdDev2) < .000001:
|
|
88 print("ERROR Variance value almost zero", str(varStdDev2), " for Matrix 2 Column ",str(j+1))
|
|
89 lowStdDev2= True
|
|
90
|
|
91 covarStdDev12= 0
|
|
92
|
|
93 if not lowStdDev1 and not lowStdDev2:
|
|
94 #try:
|
|
95 for pos in range(len(vectorM1)):
|
|
96 covarStdDev12 += ((vectorM1[pos]-meanVec1)/varStdDev1)*((vectorM2[pos]-meanVec2)/varStdDev2)
|
|
97 # bottom= (numColumns1 -1)*(varStdDev1*varStdDev2)
|
|
98 # correlationRow.append( covarStdDev12/bottom)
|
|
99 correlationRow.append( covarStdDev12/(numColumns1 -1))
|
|
100 #except: bad value because of NAN or other
|
|
101 else:
|
|
102 correlationRow.append("divide by 0") # cannot calculate correlation var too small
|
|
103
|
|
104 matrixOut.append(correlationRow)
|
|
105
|
|
106 # except Exception as err:
|
|
107 # traceback.print_exc()
|
|
108 # sys.exit(-6)
|
|
109
|
|
110 return(matrixOut)
|
|
111
|
|
112 #----------------------------------------------------------------------
|
|
113 def Transpose(in_mat):
|
|
114 out_mat = []
|
|
115 numRows,numColumns= np.shape(in_mat)
|
|
116
|
|
117 for i in range(numColumns):
|
|
118 temp= []
|
|
119 for j in range(numRows):
|
|
120 temp.append(in_mat[j][i])
|
|
121 out_mat.append(temp)
|
|
122 #print( str(out_mat))
|
|
123 return out_mat
|
|
124
|
|
125
|
|
126 #----------------------------------------------------------------------
|
|
127 if __name__ == "__main__":
|
|
128
|
|
129 # input_file1 = "/Users/bobbrown/Desktop/Gene-by-var.txt"
|
|
130 # input_file2 = "/Users/bobbrown/Desktop/var-by-sample.txt"
|
|
131 # out_fileName = "/Users/bobbrown/Desktop/MatixMult-1-2-Out.txt"
|
|
132 # selection = "MatrixMultiply"
|
|
133 #TODO address NANs ???
|
|
134
|
|
135 try:
|
|
136 args = get_args()
|
|
137 selection= args.choice
|
|
138
|
|
139 matrix1,column_labels1,row_labels1 = reader(args.input_file1) # to be transposed later
|
|
140 matrix2,column_labels2,row_labels2 = reader(args.input_file2)
|
|
141
|
|
142
|
|
143 if args.transpose == 'y' or args.input_file1 == args.input_file2:
|
|
144 matrix1 = Transpose(matrix1)
|
|
145 print("\n>>>NOTICE Transposed first matrix so matrix 1 columns = Matrix 2 number rows ")
|
|
146 temp = row_labels1 #swap labels for output matrix
|
|
147 row_labels1 = column_labels1 #swap labels for output matrix
|
|
148 column_labels1= temp #swap labels for output matrix
|
|
149
|
|
150 MatchLabels(column_labels1,row_labels2) # verfiy labels and their order match
|
|
151
|
|
152 if len(column_labels1) != len(row_labels2):
|
|
153 print("\n>>> ERROR attempting to multiple Matrices of incompatible dimensions ")
|
|
154 print("First Matrix is "+str(len(row_labels1))+" by "+str(len(column_labels1))+" where second Matrix is "+str(len(og_row2))+" by "+str(len(column_labels2))+"\n")
|
|
155 print("Matrices must have dimensions AxB and BxC. A can equal C (square matrices)")
|
|
156 sys.exit(-1)
|
|
157
|
|
158 if selection == "MatrixMultiply":
|
|
159 matrixOut= Matrix_Multiply(matrix1, matrix2 )
|
|
160
|
|
161 elif selection == "Corr2Matrices" or selection == "Corr1Matrix":
|
|
162 matrixOut = Correlate_Matrices(matrix1, matrix2)
|
|
163
|
|
164 Labeler(matrixOut,column_labels2,row_labels1,args.out_fileName)
|
|
165
|
|
166 print("Matrix Multiply "+str(len(row_labels1))+" by "+str(len(column_labels1))+" Matrix 1 by "+str(len(row_labels2))+" by "+str(len(column_labels2))+" matrix 2")
|
|
167 print("Output Matrix dimensions are "+str(len(row_labels1))+" by "+str(len(column_labels2))+"\n")
|
|
168
|
|
169 except Exception as err:
|
|
170 traceback.print_exc()
|
|
171 sys.exit(-3)
|
|
172
|
|
173 sys.exit(0) |