Mercurial > repos > sagun98 > micropita
diff galaxy_micropita/src/breadcrumbs/src/Metric.py @ 3:8fb4630ab314 draft default tip
Uploaded
author | sagun98 |
---|---|
date | Thu, 03 Jun 2021 17:07:36 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy_micropita/src/breadcrumbs/src/Metric.py Thu Jun 03 17:07:36 2021 +0000 @@ -0,0 +1,504 @@ +""" +Author: Timothy Tickle +Description: Calculates Metrics. +""" + +##################################################################################### +#Copyright (C) <2012> +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal in the +#Software without restriction, including without limitation the rights to use, copy, +#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +#and to permit persons to whom the Software is furnished to do so, subject to +#the following conditions: +# +#The above copyright notice and this permission notice shall be included in all copies +#or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +##################################################################################### + +__author__ = "Timothy Tickle" +__copyright__ = "Copyright 2012" +__credits__ = ["Timothy Tickle"] +__license__ = "MIT" +__maintainer__ = "Timothy Tickle" +__email__ = "ttickle@sph.harvard.edu" +__status__ = "Development" + +#Update path +from ConstantsBreadCrumbs import ConstantsBreadCrumbs +import csv +import numpy as np +from types import * +from ValidateData import ValidateData + +#External libraries +from cogent.maths.unifrac.fast_unifrac import fast_unifrac_file +import cogent.maths.stats.alpha_diversity +import scipy.spatial.distance + +class Metric: + """ + Performs ecological measurements. + """ + + #Diversity metrics Alpha + c_strSimpsonDiversity = "SimpsonD" + c_strInvSimpsonDiversity = "InSimpsonD" + c_strChao1Diversity = "Chao1" + + #Diversity metrics Beta + c_strBrayCurtisDissimilarity = "B_Curtis" + c_strUnifracUnweighted = "unifrac_unweighted" + c_strUnifracWeighted = "unifrac_weighted" + + #Additive inverses of beta metrics + c_strInvBrayCurtisDissimilarity = "InB_Curtis" + + #Richness + c_strShannonRichness = "ShannonR" + c_strObservedCount = "Observed_Count" + + #Different alpha diversity metrics + setAlphaDiversities = set(["observed_species","margalef","menhinick", + "dominance","reciprocal_simpson","shannon","equitability","berger_parker_d", + "mcintosh_d","brillouin_d","strong","fisher_alpha","simpson", + "mcintosh_e","heip_e","simpson_e","robbins","michaelis_menten_fit","chao1","ACE"]) + + #Different beta diversity metrics + setBetaDiversities = set(["braycurtis","canberra","chebyshev","cityblock", + "correlation","cosine","euclidean","hamming","sqeuclidean"]) + + #Tested 4 + @staticmethod + def funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies=None): + """ + Calculates the Simpsons diversity index as defined as sum(Pi*Pi). + Note***: Assumes that the abundance measurements are already normalized by the total population N. + + :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample). + :type: List of doubles + :return Double: Diversity metric + """ + + #Calculate metric + return sum((ldSampleTaxaAbundancies)*(ldSampleTaxaAbundancies)) + + #Tested 4 + @staticmethod + def funcGetInverseSimpsonsDiversityIndex(ldSampleTaxaAbundancies=None): + """ + Calculates Inverse Simpsons diversity index 1/sum(Pi*Pi). + This is multiplicative inverse which reverses the order of the simpsons diversity index. + Note***: Assumes that the abundance measurements are already normalized by the total population N. + + :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample). + :type: List of doubles + :return Double: Diversity metric + """ + + simpsons = Metric.funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies) + #If simpsons is false return false, else return inverse + if not simpsons: + return False + return 1.0/simpsons + + #Tested 4 + @staticmethod + def funcGetShannonRichnessIndex(ldSampleTaxaAbundancies=None): + """ + Calculates the Shannon richness index. + Note***: Assumes that the abundance measurements are already normalized by the total population N. + If not normalized, include N in the parameter tempTotalN and it will be. + This is in base exp(1) like the default R Vegan package. Cogent is by defaul in bits (base=2) + Both options are here for your use. See Metric.funcGetAlphaDiversity() to access cogent + + :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample). + :type: List of doubles + :return Double: Richness metric + """ + + #Calculate metric + ldSampleTaxaAbundancies = ldSampleTaxaAbundancies[np.where(ldSampleTaxaAbundancies != 0)] + tempIntermediateNumber = sum(ldSampleTaxaAbundancies*(np.log(ldSampleTaxaAbundancies))) + if(tempIntermediateNumber == 0.0): + return 0.0 + return -1 * tempIntermediateNumber + + #Test 3 + @staticmethod + def funcGetChao1DiversityIndex(ldSampleTaxaAbundancies=None, fCorrectForBias=False): + """ + Calculates the Chao1 diversity index. + Note***: Not normalized by abundance. + + :param ldSampleTaxaAbundancies: List of measurements to calculate metric on (a sample). + :type: List of doubles + :param fCorrectForBias: Indicator to use bias correction. + :type: Boolean False indicates uncorrected for bias (uncorrected = Chao 1984, corrected = Chao 1987, Eq. 2) + :return Double: Diversity metric + """ + #If not counts return false + if [num for num in ldSampleTaxaAbundancies if((num<1) and (not num==0))]: return False + + #Observed = total number of species observed in all samples pooled + totalObservedSpecies = len(ldSampleTaxaAbundancies)-len(ldSampleTaxaAbundancies[ldSampleTaxaAbundancies == 0]) + + #Singles = number of species that occur in exactly 1 sample + singlesObserved = len(ldSampleTaxaAbundancies[ldSampleTaxaAbundancies == 1.0]) + + #Doubles = number of species that occue in exactly 2 samples + doublesObserved = len(ldSampleTaxaAbundancies[ldSampleTaxaAbundancies == 2.0]) + + #If singles or doubles = 0, return observations so that a divided by zero error does not occur + if((singlesObserved == 0) or (doublesObserved == 0)): + return totalObservedSpecies + + #Calculate metric + if fCorrectForBias: + return cogent.maths.stats.alpha_diversity.chao1_bias_corrected(observed = totalObservedSpecies, singles = singlesObserved, doubles = doublesObserved) + else: + return cogent.maths.stats.alpha_diversity.chao1_uncorrected(observed = totalObservedSpecies, singles = singlesObserved, doubles = doublesObserved) + + #Test 3 + @staticmethod + def funcGetObservedCount(ldSampleAbundances, dThreshold = 0.0): + """ + Count how many bugs / features have a value of greater than 0 or the threshold given. + Expects a vector of abundances. + ****Do not normalize data if using the threshold. + + :param ldSampleAbundances: List of measurements to calculate metric on (a sample). + :type: List of doubles + :param dThreshold: The lowest number the measurement can be to be counted as an observation. + :type: Double + :return Count: Number of features observed in a sample. + """ + + return sum([1 for observation in ldSampleAbundances if observation > dThreshold]) + + #Test Cases 6 + @staticmethod + def funcGetAlphaDiversity(liCounts,strMetric): + """ + Passes counts to cogent for an alpha diversity metric. + setAlphaDiversities are the names supported + + :param liCount: List of counts to calculate metric on (a sample). + :type: List of ints + :return Diversity: Double diversity metric. + """ + + return getattr(cogent.maths.stats.alpha_diversity,strMetric)(liCounts) + + #Happy path tested 1 + @staticmethod + def funcGetDissimilarity(ldSampleTaxaAbundancies, funcDistanceFunction): + """ + Calculates the distance between samples given a function. + + If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order. + condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)]. + Note***: Assumes that the abundance measurements are already normalized by the total population N. + + :param ldSampleTaxaAbundancies: + :type: List of doubles + :param funcDistanceFunction: Distance function used to calculate distances + :type: Function + :return Double: Dissimilarity metric + """ + + #Calculate metric + try: + return scipy.spatial.distance.pdist(ldSampleTaxaAbundancies, funcDistanceFunction) + except ValueError as error: + print "".join(["Metric.funcGetDissimilarity. Error=",str(error)]) + return False + + #Test case 1 + @staticmethod + def funcGetDissimilarityByName(ldSampleTaxaAbundancies, strMetric): + """ + Calculates beta-diversity metrics between lists of abundances + setBetaDiversities are the names supported + + :param ldSampleTaxaAbundancies: + :type: List of doubles + :param strMetric: Name of the distance function used to calculate distances + :type: String + :return list double: Dissimilarity metrics between each sample + """ + + return scipy.spatial.distance.pdist(ldSampleTaxaAbundancies,strMetric) + + #Test 3 + @staticmethod + def funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies): + """ + Calculates the BrayCurtis Beta dissimilarity index. + d(u,v)=sum(abs(row1-row2))/sum(row1+row2). + This is scale invariant. + If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order. + condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)]. + Note***: Assumes that the abundance measurements are already normalized by the total population N. + + :param ldSampleTaxaAbundancies: + :type: List of doubles + :return Double Matrix: Dissimilarity metric + """ + + #Calculate metric + try: + return scipy.spatial.distance.pdist(X=ldSampleTaxaAbundancies, metric='braycurtis') + except ValueError as error: + print "".join(["Metric.getBrayCurtisDissimilarity. Error=",str(error)]) + return False + + #Test 3 + @staticmethod + def funcGetInverseBrayCurtisDissimilarity(ldSampleTaxaAbundancies): + """ + Calculates 1 - the BrayCurtis Beta dissimilarity index. + d(u,v)=1-(sum(abs(row1-row2))/sum(row1+row2)). + This is scale invariant and ranges between 0 and 1. + If you have 5 rows (labeled r1,r2,r3,r4,r5) the vector are the distances in this order. + condensed form = [d(r1,r2), d(r1,r3), d(r1,r4), d(r1,r5), d(r2,r3), d(r2,r4), d(r2,r5), d(r3,r4), d(r3,r5), d(r4,r5)]. + Note***: Assumes that the abundance measurements are already normalized by the total population N. + + :param ldSampleTaxaAbundancies: An np.array of samples (rows) x measurements (columns) in which distance is measured between rows + :type: List List of doubles + :return Double Matrix: 1 - Bray-Curtis dissimilarity. + """ + + bcValue = Metric.funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies = ldSampleTaxaAbundancies) + if not type(bcValue) is BooleanType: + return 1.0-bcValue + return False + + #Test cases 8 + @staticmethod + def funcGetUnifracDistance(istrmTree,istrmEnvr,lsSampleOrder=None,fWeighted=True): + """ + Gets a unifrac distance from files/filestreams. + + :param istrmTree: File path or stream which is a Newick format file + :type: String of file stream + :param istrmEnvr: File path or stream which is a Newick format file + :type: String of file stream + """ + npaDist, lsSampleNames = fast_unifrac_file(open(istrmTree,"r") if isinstance(istrmTree, str) else istrmTree, + open(istrmEnvr,"r") if isinstance(istrmEnvr, str) else istrmEnvr, weighted=fWeighted).get("distance_matrix",False) + + #Was trying to avoid preallocating a matrix but if you only need a subset of the samples then it + #is simpler to preallocate so this is what I am doing but making a condensed matrix and not a full matrix + + #Dictionary to translate the current order of the samples to what is expected if given an input order + if lsSampleOrder: + #{NewOrder:OriginalOrder} way to convert from old to new sample location + dictTranslate = dict([[lsSampleOrder.index(sSampleName),lsSampleNames.index(sSampleName)] for sSampleName in lsSampleNames if sSampleName in lsSampleOrder]) + + #Check to make sure all samples requested were found + if not len(dictTranslate.keys()) == len(lsSampleOrder): + print "Metric.funcGetUnifracDistance. Error= The some or all sample names given (lsSampleOrder) were not contained in the matrix." + return False + + #Length of data + iLengthOfData = len(lsSampleOrder) + + #Preallocate matrix and shuffle + mtrxData = np.zeros(shape=(iLengthOfData,iLengthOfData)) + for x in xrange(iLengthOfData): + for y in xrange(iLengthOfData): + mtrxData[x,y] = npaDist[dictTranslate[x],dictTranslate[y]] + npaDist = mtrxData + + lsSampleNames = lsSampleOrder + + #If no sample order is given, condense the matrix and return + return (scipy.spatial.distance.squareform(npaDist),lsSampleNames) + + + #Test 7 + @staticmethod + def funcGetAlphaMetric(ldAbundancies, strMetric): + """ + Get alpha abundance of the metric for the vector. + Note: Shannon is measured with base 2 ("shannon") or base exp(1) (Metric.c_strShannonRichness) depending which method is called. + + :param ldAbundancies: List of values to compute metric (a sample). + :type: List List of doubles. + :param strMetric: The metric to measure. + :type: String Metric name (Use from constants above). + :return Double: Metric specified by strMetric derived from ldAbundancies. + """ + + if(strMetric == Metric.c_strShannonRichness): + return Metric.funcGetShannonRichnessIndex(ldSampleTaxaAbundancies=ldAbundancies) + elif(strMetric == Metric.c_strSimpsonDiversity): + return Metric.funcGetSimpsonsDiversityIndex(ldSampleTaxaAbundancies=ldAbundancies) + elif(strMetric == Metric.c_strInvSimpsonDiversity): + return Metric.funcGetInverseSimpsonsDiversityIndex(ldSampleTaxaAbundancies=ldAbundancies) + elif(strMetric == Metric.c_strObservedCount): + return Metric.funcGetObservedCount(ldSampleAbundances=ldAbundancies) + #Chao1 Needs NOT Normalized Abundance (Counts) + elif(strMetric == Metric.c_strChao1Diversity): + return Metric.funcGetChao1DiversityIndex(ldSampleTaxaAbundancies=ldAbundancies) + elif(strMetric in Metric.setAlphaDiversities): + return Metric.funcGetAlphaDiversity(liCounts=ldAbundancies, strMetric=strMetric) + else: + return False + + #Test 5 + @staticmethod + def funcBuildAlphaMetricsMatrix(npaSampleAbundance = None, lsSampleNames = None, lsDiversityMetricAlpha = None): + """ + Build a matrix of alpha diversity metrics for each sample + Row = metric, column = sample + + :param npaSampleAbundance: Observations (Taxa (row) x sample (column)) + :type: Numpy Array + :param lsSampleNames: List of sample names of samples to measure (do not include the taxa id column name or other column names which should not be read). + :type: List of strings Strings being samples to measure from the npaSampleAbundance. + :param lsDiversityMetricAlpha: List of diversity metrics to use in measuring. + :type: List of strings Strings being metrics to derived from the indicated samples. + :return List of List of doubles: Each internal list is a list of (floats) indicating a specific metric measurement method measuring multiple samples + [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]] + """ + + if not ValidateData.funcIsValidList(lsDiversityMetricAlpha): + lsDiversityMetricAlpha = [lsDiversityMetricAlpha] + + #Get amount of metrics + metricsCount = len(lsDiversityMetricAlpha) + + #Create return + returnMetricsMatrixRet = [[] for index in lsDiversityMetricAlpha] + + #For each sample get all metrics + #Place in list of lists + #[[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]] + for sample in lsSampleNames: + sampleAbundance = npaSampleAbundance[sample] + for metricIndex in xrange(0,metricsCount): + returnMetricsMatrixRet[metricIndex].append(Metric.funcGetAlphaMetric(ldAbundancies = sampleAbundance, strMetric = lsDiversityMetricAlpha[metricIndex])) + return returnMetricsMatrixRet + + #Testing 6 cases + @staticmethod + def funcGetBetaMetric(npadAbundancies=None, sMetric=None, istrmTree=None, istrmEnvr=None, lsSampleOrder=None, fAdditiveInverse = False): + """ + Takes a matrix of values and returns a beta metric matrix. The metric returned is indicated by name (sMetric). + + :param npadAbundancies: Numpy array of sample abundances to measure against. + :type: Numpy Array Numpy array where row=samples and columns = features. + :param sMetric: String name of beta metric. Possibilities are listed in microPITA. + :type: String String name of beta metric. Possibilities are listed in microPITA. + :return Double: Measurement indicated by metric for given abundance list + """ + + if sMetric == Metric.c_strBrayCurtisDissimilarity: + mtrxDistance = Metric.funcGetBrayCurtisDissimilarity(ldSampleTaxaAbundancies=npadAbundancies) + elif sMetric == Metric.c_strInvBrayCurtisDissimilarity: + mtrxDistance = Metric.funcGetInverseBrayCurtisDissimilarity(ldSampleTaxaAbundancies=npadAbundancies) + elif sMetric in Metric.setBetaDiversities: + mtrxDistance = Metric.funcGetDissimilarityByName(ldSampleTaxaAbundancies=npadAbundancies, strMetric=sMetric) + elif sMetric == Metric.c_strUnifracUnweighted: + mtrxDistance = Metric.funcGetUnifracDistance(istrmTree=istrmTree,istrmEnvr=istrmEnvr,lsSampleOrder=lsSampleOrder,fWeighted=False) +# mtrxDistance = xReturn[0] if not type(xReturn) is BooleanType else xReturn + elif sMetric == Metric.c_strUnifracWeighted: + mtrxDistance = Metric.funcGetUnifracDistance(istrmTree=istrmTree,istrmEnvr=istrmEnvr,lsSampleOrder=lsSampleOrder,fWeighted=True) +# mtrxDistance = xReturn[0] if not type(xReturn) is BooleanType else xReturn + else: + mtrxDistance = False + if fAdditiveInverse and not type(mtrxDistance) is BooleanType: + if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: + mtrxDistance = (1.0 - mtrxDistance[0],mtrxDistance[1]) + else: + mtrxDistance = 1.0 - mtrxDistance + return mtrxDistance + + #Test Cases 11 + @staticmethod + def funcReadMatrixFile(istmMatrixFile, lsSampleOrder=None): + """ + Reads in a file with a precalculated beta-diversty matrix. + + :param istmMatrixFile: File with beta-diversity matrix + :type: FileStream of String file path + """ + + #Read in data + f = csv.reader(open(istmMatrixFile,"r") if isinstance(istmMatrixFile, str) else istmMatrixFile, delimiter=ConstantsBreadCrumbs.c_matrixFileDelim ) + + #Get header + try: + lsHeader = f.next() + except StopIteration: + return (False,False) + lsHeaderReducedToSamples = [sHeader for sHeader in lsHeader if sHeader in lsSampleOrder] if lsSampleOrder else lsHeader[1:] + + #If no sample ordering is given, set the ordering to what is in the file + if not lsSampleOrder: + lsSampleOrder = lsHeaderReducedToSamples + + #Preallocate matrix + mtrxData = np.zeros(shape=(len(lsSampleOrder),len(lsSampleOrder))) + + #Make sure all samples requested are in the file + if(not len(lsSampleOrder) == len(lsHeaderReducedToSamples)): return False + + for lsLine in f: + if lsLine[0] in lsSampleOrder: + iRowIndex = lsSampleOrder.index(lsLine[0]) + + for i in xrange(1,len(lsSampleOrder)): + iColumnIndexComing = lsHeader.index(lsSampleOrder[i]) + iColumnIndexGoing = lsSampleOrder.index(lsSampleOrder[i]) + mtrxData[iRowIndex,iColumnIndexGoing] = lsLine[iColumnIndexComing] + mtrxData[iColumnIndexGoing,iRowIndex] = lsLine[iColumnIndexComing] + tpleMData = mtrxData.shape + mtrxData = mtrxData if any(sum(ld)>0 for ld in mtrxData) or ((tpleMData[0]==1) and (tpleMData[1]==1)) else [] + return (mtrxData,lsSampleOrder) + + #Test cases 2 + @staticmethod + def funcWriteMatrixFile(mtrxMatrix, ostmMatrixFile, lsSampleNames=None): + """ + Writes a square matrix to file. + + :param mtrxMatrix: Matrix to write to file + :type: Numpy array + :lsSampleNames: The names of the samples in the order of the matrix + :type: List of strings + :ostmBetaMatrixFile: File to write to + :type: String or file stream + """ + + if not sum(mtrxMatrix.shape)>0 or not ostmMatrixFile: + return False + + #Check to make sure the sample names are the correct length + tpleiShape = mtrxMatrix.shape + if not lsSampleNames: + lsSampleNames = range(tpleiShape[0]) + if not(len(lsSampleNames) == tpleiShape[0]): + print "".join(["Metric.funcWriteMatrixFile. Error= Length of sample names ("+str(len(lsSampleNames))+") and matrix ("+str(mtrxMatrix.shape)+") not equal."]) + return False + + #Write to file + ostmOut = csv.writer(open(ostmMatrixFile,"w") if isinstance(ostmMatrixFile,str) else ostmMatrixFile, delimiter=ConstantsBreadCrumbs.c_matrixFileDelim ) + + #Add the additional space at the beginning of the sample names to represent the id row/column + lsSampleNames = [""]+list(lsSampleNames) + + #Write header and each row to file + ostmOut.writerow(lsSampleNames) + [ostmOut.writerow([lsSampleNames[iIndex+1]]+mtrxMatrix[iIndex,].tolist()) for iIndex in xrange(tpleiShape[0])] + return True