annotate src/breadcrumbs/src/Metric.py @ 0:0de566f21448 draft default tip

v2
author sagun98
date Thu, 03 Jun 2021 18:13:32 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
sagun98
parents:
diff changeset
1 """
sagun98
parents:
diff changeset
2 Author: Timothy Tickle
sagun98
parents:
diff changeset
3 Description: Calculates Metrics.
sagun98
parents:
diff changeset
4 """
sagun98
parents:
diff changeset
5
sagun98
parents:
diff changeset
6 #####################################################################################
sagun98
parents:
diff changeset
7 #Copyright (C) <2012>
sagun98
parents:
diff changeset
8 #
sagun98
parents:
diff changeset
9 #Permission is hereby granted, free of charge, to any person obtaining a copy of
sagun98
parents:
diff changeset
10 #this software and associated documentation files (the "Software"), to deal in the
sagun98
parents:
diff changeset
11 #Software without restriction, including without limitation the rights to use, copy,
sagun98
parents:
diff changeset
12 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
sagun98
parents:
diff changeset
13 #and to permit persons to whom the Software is furnished to do so, subject to
sagun98
parents:
diff changeset
14 #the following conditions:
sagun98
parents:
diff changeset
15 #
sagun98
parents:
diff changeset
16 #The above copyright notice and this permission notice shall be included in all copies
sagun98
parents:
diff changeset
17 #or substantial portions of the Software.
sagun98
parents:
diff changeset
18 #
sagun98
parents:
diff changeset
19 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
sagun98
parents:
diff changeset
20 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
sagun98
parents:
diff changeset
21 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
sagun98
parents:
diff changeset
22 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
sagun98
parents:
diff changeset
23 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
sagun98
parents:
diff changeset
24 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
sagun98
parents:
diff changeset
25 #####################################################################################
sagun98
parents:
diff changeset
26
sagun98
parents:
diff changeset
27 __author__ = "Timothy Tickle"
sagun98
parents:
diff changeset
28 __copyright__ = "Copyright 2012"
sagun98
parents:
diff changeset
29 __credits__ = ["Timothy Tickle"]
sagun98
parents:
diff changeset
30 __license__ = "MIT"
sagun98
parents:
diff changeset
31 __maintainer__ = "Timothy Tickle"
sagun98
parents:
diff changeset
32 __email__ = "ttickle@sph.harvard.edu"
sagun98
parents:
diff changeset
33 __status__ = "Development"
sagun98
parents:
diff changeset
34
sagun98
parents:
diff changeset
35 #Update path
sagun98
parents:
diff changeset
36 from ConstantsBreadCrumbs import ConstantsBreadCrumbs
sagun98
parents:
diff changeset
37 import csv
sagun98
parents:
diff changeset
38 import numpy as np
sagun98
parents:
diff changeset
39 from types import *
sagun98
parents:
diff changeset
40 from ValidateData import ValidateData
sagun98
parents:
diff changeset
41
sagun98
parents:
diff changeset
42 #External libraries
sagun98
parents:
diff changeset
43 from cogent.maths.unifrac.fast_unifrac import fast_unifrac_file
sagun98
parents:
diff changeset
44 import cogent.maths.stats.alpha_diversity
sagun98
parents:
diff changeset
45 import scipy.spatial.distance
sagun98
parents:
diff changeset
46
sagun98
parents:
diff changeset
47 class Metric:
sagun98
parents:
diff changeset
48 """
sagun98
parents:
diff changeset
49 Performs ecological measurements.
sagun98
parents:
diff changeset
50 """
sagun98
parents:
diff changeset
51
sagun98
parents:
diff changeset
#Names of the alpha diversity metrics implemented in this class
c_strSimpsonDiversity = "SimpsonD"
c_strInvSimpsonDiversity = "InSimpsonD"
c_strChao1Diversity = "Chao1"

#Names of the beta diversity metrics implemented in this class
c_strBrayCurtisDissimilarity = "B_Curtis"
c_strUnifracUnweighted = "unifrac_unweighted"
c_strUnifracWeighted = "unifrac_weighted"

#Additive inverse (1 - value) of a beta metric
c_strInvBrayCurtisDissimilarity = "InB_Curtis"

#Names of the richness metrics implemented in this class
c_strShannonRichness = "ShannonR"
c_strObservedCount = "Observed_Count"

#Alpha diversity metric names which are looked up by name in
#cogent.maths.stats.alpha_diversity (see funcGetAlphaDiversity)
setAlphaDiversities = {
    "observed_species",
    "margalef",
    "menhinick",
    "dominance",
    "reciprocal_simpson",
    "shannon",
    "equitability",
    "berger_parker_d",
    "mcintosh_d",
    "brillouin_d",
    "strong",
    "fisher_alpha",
    "simpson",
    "mcintosh_e",
    "heip_e",
    "simpson_e",
    "robbins",
    "michaelis_menten_fit",
    "chao1",
    "ACE",
}

#Beta diversity metric names which are handed by name to
#scipy.spatial.distance.pdist (see funcGetDissimilarityByName)
setBetaDiversities = {
    "braycurtis",
    "canberra",
    "chebyshev",
    "cityblock",
    "correlation",
    "cosine",
    "euclidean",
    "hamming",
    "sqeuclidean",
}
sagun98
parents:
diff changeset
78
sagun98
parents:
diff changeset
79 #Tested 4
sagun98
parents:
diff changeset
@staticmethod
def funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies=None):
    """
    Calculates the Simpsons diversity index, defined as sum(Pi*Pi).
    Note***: Assumes that the abundance measurements are already normalized by the total population N.

    :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample).
                                    A plain python list or a numpy array may be given.
    :type: List of doubles
    :return Double: Diversity metric
    """

    #Coerce to an array so the element-wise square also works for a plain list
    #(list * list raises a TypeError).
    npaAbundancies = np.asarray(ldSampleTaxaAbundancies)

    #Calculate metric
    return sum(npaAbundancies * npaAbundancies)
sagun98
parents:
diff changeset
93
sagun98
parents:
diff changeset
94 #Tested 4
sagun98
parents:
diff changeset
@staticmethod
def funcGetInverseSimpsonsDiversityIndex(ldSampleTaxaAbundancies=None):
    """
    Calculates the inverse Simpsons diversity index, 1/sum(Pi*Pi).
    This is the multiplicative inverse, which reverses the ordering given by the Simpsons diversity index.
    Note***: Assumes that the abundance measurements are already normalized by the total population N.

    :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample).
    :type: List of doubles
    :return Double: Diversity metric, or False when the Simpsons index is falsy (for example 0).
    """

    dSimpsonsIndex = Metric.funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies)

    #A falsy index cannot be inverted, so False is handed back instead
    return 1.0/dSimpsonsIndex if dSimpsonsIndex else False
sagun98
parents:
diff changeset
112
sagun98
parents:
diff changeset
113 #Tested 4
sagun98
parents:
diff changeset
@staticmethod
def funcGetShannonRichnessIndex(ldSampleTaxaAbundancies=None):
    """
    Calculates the Shannon richness index, -sum(Pi*ln(Pi)).
    Note***: Assumes that the abundance measurements are already normalized by the total population N.
    This is in base exp(1) like the default R Vegan package. Cogent is by default in bits (base=2).
    Both options are here for your use. See Metric.funcGetAlphaDiversity() to access cogent.

    :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample).
                                    A plain python list or a numpy array may be given.
    :type: List of doubles
    :return Double: Richness metric
    """

    #Coerce to an array so masking and np.log also work for a plain list
    npaAbundancies = np.asarray(ldSampleTaxaAbundancies)

    #Zero abundances are dropped before taking the log (log(0) is not finite)
    npaAbundancies = npaAbundancies[npaAbundancies != 0]

    #Calculate metric
    dSumPLogP = sum(npaAbundancies * np.log(npaAbundancies))

    #Return a plain 0.0 instead of negating zero (which would give -0.0)
    if dSumPLogP == 0.0:
        return 0.0
    return -1 * dSumPLogP
sagun98
parents:
diff changeset
134
sagun98
parents:
diff changeset
135 #Test 3
sagun98
parents:
diff changeset
@staticmethod
def funcGetChao1DiversityIndex(ldSampleTaxaAbundancies=None, fCorrectForBias=False):
    """
    Calculates the Chao1 diversity index.
    Note***: Not normalized by abundance (the measurements must be counts).

    :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample).
                                    A plain python list or a numpy array may be given.
    :type: List of doubles
    :param fCorrectForBias: Indicator to use bias correction.
    :type: Boolean False indicates uncorrected for bias (uncorrected = Chao 1984, corrected = Chao 1987, Eq. 2)
    :return Double: Diversity metric. False is returned if the data are not counts.
                    The observed count is returned if there are no singletons or no doubletons.
    """

    #Coerce to an array so the boolean masks below also work for a plain list
    npaCounts = np.asarray(ldSampleTaxaAbundancies)

    #If not counts return false (any nonzero value below 1 means the data are not counts)
    if np.any((npaCounts < 1) & (npaCounts != 0)):
        return False

    #Observed = number of measurements in the sample which are not 0
    totalObservedSpecies = len(npaCounts) - len(npaCounts[npaCounts == 0])

    #Singles = number of measurements in the sample which are exactly 1
    singlesObserved = len(npaCounts[npaCounts == 1.0])

    #Doubles = number of measurements in the sample which are exactly 2
    doublesObserved = len(npaCounts[npaCounts == 2.0])

    #If singles or doubles = 0, return observations so that a divided by zero error does not occur
    #NOTE(review): this shortcut is taken for the bias corrected estimator as well; confirm that is intended.
    if (singlesObserved == 0) or (doublesObserved == 0):
        return totalObservedSpecies

    #Calculate metric
    if fCorrectForBias:
        return cogent.maths.stats.alpha_diversity.chao1_bias_corrected(observed = totalObservedSpecies, singles = singlesObserved, doubles = doublesObserved)
    return cogent.maths.stats.alpha_diversity.chao1_uncorrected(observed = totalObservedSpecies, singles = singlesObserved, doubles = doublesObserved)
sagun98
parents:
diff changeset
169
sagun98
parents:
diff changeset
170 #Test 3
sagun98
parents:
diff changeset
@staticmethod
def funcGetObservedCount(ldSampleAbundances, dThreshold = 0.0):
    """
    Counts how many bugs / features have a value strictly greater than the threshold (0 by default).
    Expects a vector of abundances.
    ****Do not normalize data if using the threshold.

    :param ldSampleAbundances: List of measurements to calculate metric on (a sample).
    :type: List of doubles
    :param dThreshold: A measurement must be greater than this value to be counted as an observation.
    :type: Double
    :return Count: Number of features observed in a sample.
    """

    iObserved = 0
    for dMeasurement in ldSampleAbundances:
        if dMeasurement > dThreshold:
            iObserved += 1
    return iObserved
sagun98
parents:
diff changeset
186
sagun98
parents:
diff changeset
187 #Test Cases 6
sagun98
parents:
diff changeset
@staticmethod
def funcGetAlphaDiversity(liCounts,strMetric):
    """
    Hands counts to the cogent alpha diversity function named by strMetric.
    setAlphaDiversities are the names supported.

    :param liCounts: List of counts to calculate metric on (a sample).
    :type: List of ints
    :param strMetric: Name of the function in cogent.maths.stats.alpha_diversity to call.
    :type: String
    :return Diversity: Whatever the named cogent function returns for the counts.
    """

    #Look the metric function up by name, then apply it to the counts
    funcCogentMetric = getattr(cogent.maths.stats.alpha_diversity, strMetric)
    return funcCogentMetric(liCounts)
sagun98
parents:
diff changeset
200
sagun98
parents:
diff changeset
201 #Happy path tested 1
sagun98
parents:
diff changeset
@staticmethod
def funcGetDissimilarity(ldSampleTaxaAbundancies, funcDistanceFunction):
    """
    Calculates the distance between samples given a function.

    If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order.
    condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)].
    Note***: Assumes that the abundance measurements are already normalized by the total population N.

    :param ldSampleTaxaAbundancies: Samples (rows) x measurements (columns); distance is measured between rows.
    :type: List of lists of doubles
    :param funcDistanceFunction: Distance function (or metric name) handed to scipy.spatial.distance.pdist
    :type: Function
    :return List of doubles: Condensed distance vector, or False if pdist raises a ValueError.
    """

    #Calculate metric
    try:
        return scipy.spatial.distance.pdist(ldSampleTaxaAbundancies, funcDistanceFunction)
    except ValueError as error:
        #Single-argument print() behaves the same as the print statement in python 2 and parses in python 3
        print("".join(["Metric.funcGetDissimilarity. Error=",str(error)]))
        return False
sagun98
parents:
diff changeset
224
sagun98
parents:
diff changeset
225 #Test case 1
sagun98
parents:
diff changeset
@staticmethod
def funcGetDissimilarityByName(ldSampleTaxaAbundancies, strMetric):
    """
    Calculates a beta-diversity metric between lists of abundances.
    setBetaDiversities are the names supported.

    :param ldSampleTaxaAbundancies: Samples (rows) x measurements (columns); distance is measured between rows.
    :type: List of lists of doubles
    :param strMetric: Name of the distance function used to calculate distances
    :type: String
    :return list double: Dissimilarity metrics between each pair of samples (condensed form)
    """

    npaCondensedDistances = scipy.spatial.distance.pdist(ldSampleTaxaAbundancies, metric=strMetric)
    return npaCondensedDistances
sagun98
parents:
diff changeset
240
sagun98
parents:
diff changeset
241 #Test 3
sagun98
parents:
diff changeset
@staticmethod
def funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies):
    """
    Calculates the BrayCurtis Beta dissimilarity index.
    d(u,v)=sum(abs(row1-row2))/sum(row1+row2).
    This is scale invariant.
    If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order.
    condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)].
    Note***: Assumes that the abundance measurements are already normalized by the total population N.

    :param ldSampleTaxaAbundancies: Samples (rows) x measurements (columns); distance is measured between rows.
    :type: List of lists of doubles
    :return List of doubles: Condensed dissimilarity vector, or False if pdist raises a ValueError.
    """

    #Calculate metric
    try:
        return scipy.spatial.distance.pdist(X=ldSampleTaxaAbundancies, metric='braycurtis')
    except ValueError as error:
        #Single-argument print() behaves the same as the print statement in python 2 and parses in python 3.
        #The message names this function (the original named a function which does not exist).
        print("".join(["Metric.funcGetBrayCurtisDissimilarity. Error=",str(error)]))
        return False
sagun98
parents:
diff changeset
263
sagun98
parents:
diff changeset
264 #Test 3
sagun98
parents:
diff changeset
@staticmethod
def funcGetInverseBrayCurtisDissimilarity(ldSampleTaxaAbundancies):
    """
    Calculates 1 - the BrayCurtis Beta dissimilarity index.
    d(u,v)=1-(sum(abs(row1-row2))/sum(row1+row2)).
    This is scale invariant and ranges between 0 and 1.
    If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order.
    condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)].
    Note***: Assumes that the abundance measurements are already normalized by the total population N.

    :param ldSampleTaxaAbundancies: An np.array of samples (rows) x measurements (columns) in which distance is measured between rows
    :type: List List of doubles
    :return List of doubles: 1 - Bray-Curtis dissimilarity (condensed form), or False if the dissimilarity could not be calculated.
    """

    bcValue = Metric.funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies = ldSampleTaxaAbundancies)

    #funcGetBrayCurtisDissimilarity signals failure with the boolean False; pass that on unchanged.
    #isinstance(..., bool) replaces the identity check against types.BooleanType, which only exists in python 2.
    if isinstance(bcValue, bool):
        return False
    return 1.0-bcValue
sagun98
parents:
diff changeset
284
sagun98
parents:
diff changeset
285 #Test cases 8
sagun98
parents:
diff changeset
@staticmethod
def funcGetUnifracDistance(istrmTree,istrmEnvr,lsSampleOrder=None,fWeighted=True):
    """
    Gets a unifrac distance from files/filestreams.

    :param istrmTree: File path or stream which is a Newick format file
    :type: String or file stream
    :param istrmEnvr: File path or stream given to fast_unifrac_file as its second argument
                      (presumably the unifrac environment file; the earlier description repeated the tree's - TODO confirm).
    :type: String or file stream
    :param lsSampleOrder: Optional sample names; when given, only these samples are returned, in this order.
    :type: List of strings
    :param fWeighted: Passed to fast_unifrac_file as weighted.
    :type: Boolean
    :return Tuple: (condensed distance vector, list of sample names). False is returned if no distance
                   matrix is produced or if lsSampleOrder has names which are duplicated or not in the matrix.
    """

    #Streams opened here (from a path) are closed here; streams handed in by the caller are left open.
    lstrmOpenedHere = []
    try:
        if isinstance(istrmTree, str):
            istrmTree = open(istrmTree,"r")
            lstrmOpenedHere.append(istrmTree)
        if isinstance(istrmEnvr, str):
            istrmEnvr = open(istrmEnvr,"r")
            lstrmOpenedHere.append(istrmEnvr)
        xDistanceMatrix = fast_unifrac_file(istrmTree, istrmEnvr, weighted=fWeighted).get("distance_matrix",False)
    finally:
        for strmOpened in lstrmOpenedHere:
            strmOpened.close()

    #Without a distance matrix there is nothing to unpack; report failure the way the rest of the class does.
    if isinstance(xDistanceMatrix, bool):
        print("Metric.funcGetUnifracDistance. Error= No distance matrix was returned by fast_unifrac_file.")
        return False
    npaDist, lsSampleNames = xDistanceMatrix

    #Reorder / subset the full matrix if a sample order is given
    if lsSampleOrder:
        #{SampleName:OriginalIndex}, keeping the first occurrence of a name (as list.index would)
        dictOriginalIndex = {}
        for iIndex, sSampleName in enumerate(lsSampleNames):
            dictOriginalIndex.setdefault(sSampleName, iIndex)

        #Original index of each requested sample, in the requested order
        liOriginalIndices = [dictOriginalIndex[sSampleName] for sSampleName in lsSampleOrder if sSampleName in dictOriginalIndex]

        #Check to make sure all samples requested were found (and none was requested twice)
        if (len(liOriginalIndices) != len(lsSampleOrder)) or (len(set(lsSampleOrder)) != len(lsSampleOrder)):
            print("Metric.funcGetUnifracDistance. Error= The some or all sample names given (lsSampleOrder) were not contained in the matrix.")
            return False

        #Pick the requested rows and columns in one step
        npaDist = np.asarray(npaDist, dtype=float)[np.ix_(liOriginalIndices, liOriginalIndices)]
        lsSampleNames = lsSampleOrder

    #Condense the (possibly reordered) matrix and return
    return (scipy.spatial.distance.squareform(npaDist),lsSampleNames)
sagun98
parents:
diff changeset
326
sagun98
parents:
diff changeset
327
sagun98
parents:
diff changeset
328 #Test 7
sagun98
parents:
diff changeset
@staticmethod
def funcGetAlphaMetric(ldAbundancies, strMetric):
    """
    Gets the alpha metric named by strMetric for the vector.
    Note: Shannon is measured with base 2 ("shannon") or base exp(1) (Metric.c_strShannonRichness) depending which method is called.

    :param ldAbundancies: List of values to compute metric (a sample).
    :type: List List of doubles.
    :param strMetric: The metric to measure.
    :type: String Metric name (Use from constants above).
    :return Double: Metric specified by strMetric derived from ldAbundancies. False if the metric name is not known.
    """

    #Metrics implemented in this class, by name
    #Chao1 Needs NOT Normalized Abundance (Counts)
    dictLocalMetrics = {
        Metric.c_strShannonRichness: Metric.funcGetShannonRichnessIndex,
        Metric.c_strSimpsonDiversity: Metric.funcGetSimpsonsDiversityIndex,
        Metric.c_strInvSimpsonDiversity: Metric.funcGetInverseSimpsonsDiversityIndex,
        Metric.c_strObservedCount: Metric.funcGetObservedCount,
        Metric.c_strChao1Diversity: Metric.funcGetChao1DiversityIndex,
    }

    funcLocalMetric = dictLocalMetrics.get(strMetric)
    if funcLocalMetric is not None:
        return funcLocalMetric(ldAbundancies)

    #Otherwise try the metrics which are passed on to cogent by name
    if strMetric in Metric.setAlphaDiversities:
        return Metric.funcGetAlphaDiversity(liCounts=ldAbundancies, strMetric=strMetric)

    return False
sagun98
parents:
diff changeset
357
sagun98
parents:
diff changeset
358 #Test 5
sagun98
parents:
diff changeset
@staticmethod
def funcBuildAlphaMetricsMatrix(npaSampleAbundance = None, lsSampleNames = None, lsDiversityMetricAlpha = None):
    """
    Builds a matrix of alpha diversity metrics for each sample.
    Row = metric, column = sample

    :param npaSampleAbundance: Observations (Taxa (row) x sample (column)), indexed by sample name.
    :type: Numpy Array
    :param lsSampleNames: List of sample names of samples to measure (do not include the taxa id column name or other column names which should not be read).
    :type: List of strings Strings being samples to measure from the npaSampleAbundance.
    :param lsDiversityMetricAlpha: List of diversity metrics to use in measuring. A value which is not a valid list is treated as a single metric.
    :type: List of strings Strings being metrics to derived from the indicated samples.
    :return List of List of doubles: Each internal list holds one metric measured on every sample
        [[metric1-sample1, metric1-sample2, metric1-sample3],[metric2-sample1, metric2-sample2, metric2-sample3]]
    """

    #Wrap a single metric so it can be handled like a list of metrics
    if not ValidateData.funcIsValidList(lsDiversityMetricAlpha):
        lsDiversityMetricAlpha = [lsDiversityMetricAlpha]

    #One (initially empty) row per metric
    llMetricRows = [[] for _ in lsDiversityMetricAlpha]

    #Measure every metric on each sample, appending to that metric's row
    for sSampleName in lsSampleNames:
        npaCurrentSample = npaSampleAbundance[sSampleName]
        for lMetricRow, sMetricName in zip(llMetricRows, lsDiversityMetricAlpha):
            lMetricRow.append(Metric.funcGetAlphaMetric(ldAbundancies = npaCurrentSample, strMetric = sMetricName))

    return llMetricRows
sagun98
parents:
diff changeset
392
sagun98
parents:
diff changeset
393 #Testing 6 cases
sagun98
parents:
diff changeset
@staticmethod
def funcGetBetaMetric(npadAbundancies=None, sMetric=None, istrmTree=None, istrmEnvr=None, lsSampleOrder=None, fAdditiveInverse = False):
    """
    Takes a matrix of values and returns a beta metric matrix. The metric returned is indicated by name (sMetric).

    :param npadAbundancies: Numpy array of sample abundances to measure against.
    :type: Numpy Array Numpy array where row=samples and columns = features.
    :param sMetric: String name of beta metric. Possibilities are listed in microPITA.
    :type: String String name of beta metric. Possibilities are listed in microPITA.
    :param istrmTree: Tree file path or stream; only used by the unifrac metrics.
    :type: String or file stream
    :param istrmEnvr: Second file path or stream passed on to funcGetUnifracDistance; only used by the unifrac metrics.
    :type: String or file stream
    :param lsSampleOrder: Sample order passed on to funcGetUnifracDistance; only used by the unifrac metrics.
    :type: List of strings
    :param fAdditiveInverse: If True, 1 - distance is returned instead of the distance.
    :type: Boolean
    :return: Condensed distances for the metric; for the unifrac metrics whatever funcGetUnifracDistance
             returns (a (distances, sample names) tuple). False if the metric is unknown or the measurement failed.
    """

    if sMetric == Metric.c_strBrayCurtisDissimilarity:
        mtrxDistance = Metric.funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies=npadAbundancies)
    elif sMetric == Metric.c_strInvBrayCurtisDissimilarity:
        mtrxDistance = Metric.funcGetInverseBrayCurtisDissimilarity(ldSampleTaxaAbundancies=npadAbundancies)
    elif sMetric in Metric.setBetaDiversities:
        mtrxDistance = Metric.funcGetDissimilarityByName(ldSampleTaxaAbundancies=npadAbundancies, strMetric=sMetric)
    elif sMetric == Metric.c_strUnifracUnweighted:
        mtrxDistance = Metric.funcGetUnifracDistance(istrmTree=istrmTree,istrmEnvr=istrmEnvr,lsSampleOrder=lsSampleOrder,fWeighted=False)
    elif sMetric == Metric.c_strUnifracWeighted:
        mtrxDistance = Metric.funcGetUnifracDistance(istrmTree=istrmTree,istrmEnvr=istrmEnvr,lsSampleOrder=lsSampleOrder,fWeighted=True)
    else:
        mtrxDistance = False

    #A boolean result is the failure signal and is returned untouched.
    #isinstance(..., bool) replaces the identity check against types.BooleanType, which only exists in python 2.
    if fAdditiveInverse and not isinstance(mtrxDistance, bool):
        if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
            #Unifrac results are (distances, sample names); only the distances are inverted
            mtrxDistance = (1.0 - mtrxDistance[0],mtrxDistance[1])
        else:
            mtrxDistance = 1.0 - mtrxDistance
    return mtrxDistance
sagun98
parents:
diff changeset
426
sagun98
parents:
diff changeset
427 #Test Cases 11
sagun98
parents:
diff changeset
428 @staticmethod
sagun98
parents:
diff changeset
def funcReadMatrixFile(istmMatrixFile, lsSampleOrder=None):
    """
    Reads in a file with a precalculated beta-diversity matrix.

    The first row of the file is the header (id cell followed by sample names).
    Each following row starts with a sample name followed by its measurements.
    Every value read is written to both [row, column] and [column, row] of the
    returned matrix, so the input is expected to be a full symmetric matrix.

    :param istmMatrixFile: File with beta-diversity matrix
    :type: FileStream or String file path
    :param lsSampleOrder: Names of the samples to read, in the order wanted in the returned matrix.
                          If not given, the order of the file header (minus the id cell) is used.
    :type: List of strings
    :return: Tuple (matrix, sample names in matrix order).
             The matrix is replaced by an empty list if no row of it sums to more than 0
             (a 1x1 matrix is always returned as is).
             (False, False) is returned if the file is empty or if not all requested samples are in the header.
    """

    # Only close the stream if this function opened it; a stream handed in belongs to the caller
    fOpenedHere = isinstance(istmMatrixFile, str)
    stmMatrix = open(istmMatrixFile, "r") if fOpenedHere else istmMatrixFile

    try:
        #Read in data
        f = csv.reader(stmMatrix, delimiter=ConstantsBreadCrumbs.c_matrixFileDelim)

        #Get header (next() works for Python 2.6+ and Python 3 readers)
        try:
            lsHeader = next(f)
        except StopIteration:
            return (False, False)
        lsHeaderReducedToSamples = [sHeader for sHeader in lsHeader if sHeader in lsSampleOrder] if lsSampleOrder else lsHeader[1:]

        #If no sample ordering is given, set the ordering to what is in the file
        if not lsSampleOrder:
            lsSampleOrder = lsHeaderReducedToSamples

        #Make sure all samples requested are in the file
        #Return the same tuple shape as the empty file case so callers can always unpack the result
        if not len(lsSampleOrder) == len(lsHeaderReducedToSamples):
            return (False, False)

        #Preallocate matrix
        mtrxData = np.zeros(shape=(len(lsSampleOrder), len(lsSampleOrder)))

        #Position of each sample in the returned matrix (first occurrence wins, as list.index would)
        dictiSampleIndex = {}
        for iIndex, sSample in enumerate(lsSampleOrder):
            dictiSampleIndex.setdefault(sSample, iIndex)

        #For every requested sample: (column in the returned matrix, column in the file)
        #Computed once instead of once per row. Every sample is included, the first one as well,
        #so that the first column (and its diagonal cell) is read from the file
        ltpleColumns = [(dictiSampleIndex[sSample], lsHeader.index(sSample)) for sSample in lsSampleOrder]

        for lsLine in f:
            #Blank lines are given as empty lists by the csv reader
            if not lsLine:
                continue
            iRowIndex = dictiSampleIndex.get(lsLine[0])
            if iRowIndex is None:
                continue

            for iColumnIndexGoing, iColumnIndexComing in ltpleColumns:
                xValue = lsLine[iColumnIndexComing]
                mtrxData[iRowIndex, iColumnIndexGoing] = xValue
                mtrxData[iColumnIndexGoing, iRowIndex] = xValue
    finally:
        if fOpenedHere:
            stmMatrix.close()

    tpleMData = mtrxData.shape
    mtrxData = mtrxData if any(sum(ld)>0 for ld in mtrxData) or ((tpleMData[0]==1) and (tpleMData[1]==1)) else []
    return (mtrxData, lsSampleOrder)
sagun98
parents:
diff changeset
469
sagun98
parents:
diff changeset
470 #Test cases 2
sagun98
parents:
diff changeset
471 @staticmethod
sagun98
parents:
diff changeset
def funcWriteMatrixFile(mtrxMatrix, ostmMatrixFile, lsSampleNames=None):
    """
    Writes a square matrix to file.

    The first row written is a header (an empty id cell followed by the sample names).
    Each following row is a sample name followed by that sample's row of the matrix.

    :param mtrxMatrix: Matrix to write to file
    :type: Numpy array
    :param ostmMatrixFile: File to write to
    :type: String or file stream
    :param lsSampleNames: The names of the samples in the order of the matrix.
                          If not given, the row indices of the matrix are used as names.
    :type: List of strings
    :return: True if the matrix was written. False if the matrix has no dimensions, no output
             file was given, or the number of sample names does not match the number of rows.
    """

    if not sum(mtrxMatrix.shape)>0 or not ostmMatrixFile:
        return False

    #Check to make sure the sample names are the correct length
    tpleiShape = mtrxMatrix.shape
    if not lsSampleNames:
        lsSampleNames = range(tpleiShape[0])
    if not(len(lsSampleNames) == tpleiShape[0]):
        #Parenthesized single string prints identically in Python 2 and Python 3
        print("Metric.funcWriteMatrixFile. Error= Length of sample names ("+str(len(lsSampleNames))+") and matrix ("+str(mtrxMatrix.shape)+") not equal.")
        return False
    lsSampleNames = list(lsSampleNames)

    #Only close the stream if this function opened it; a stream handed in belongs to the caller
    fOpenedHere = isinstance(ostmMatrixFile, str)
    stmOut = open(ostmMatrixFile, "w") if fOpenedHere else ostmMatrixFile

    #Write to file
    try:
        ostmOut = csv.writer(stmOut, delimiter=ConstantsBreadCrumbs.c_matrixFileDelim)

        #Header: the additional empty cell at the beginning represents the id row/column
        ostmOut.writerow([""] + lsSampleNames)

        #One row per sample: name followed by its measurements
        for iIndex, xSampleName in enumerate(lsSampleNames):
            ostmOut.writerow([xSampleName] + mtrxMatrix[iIndex,].tolist())
    finally:
        #Closing here guarantees the data is flushed to disk before True is returned
        if fOpenedHere:
            stmOut.close()
    return True