Mercurial > repos > george-weingart > micropita
view MicroPITA.py @ 0:2f4f6f08c8c4 draft
Uploaded
author | george-weingart |
---|---|
date | Tue, 13 May 2014 21:58:57 -0400 |
parents | |
children | cd71e90abfab |
line wrap: on
line source
#!/usr/bin/env python """ Author: Timothy Tickle Description: Class to Run analysis for the microPITA paper """ ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### __author__ = "Timothy Tickle" __copyright__ = "Copyright 2012" __credits__ = ["Timothy Tickle"] __license__ = "MIT" __maintainer__ = "Timothy Tickle" __email__ = "ttickle@sph.harvard.edu" __status__ = "Development" import sys import argparse from src.breadcrumbs.src.AbundanceTable import AbundanceTable from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs from src.breadcrumbs.src.Metric import Metric from src.breadcrumbs.src.KMedoids import Kmedoids from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor from src.breadcrumbs.src.SVM import SVM from src.breadcrumbs.src.UtilityMath import UtilityMath from src.ConstantsMicropita import ConstantsMicropita import csv import logging import math import mlpy import numpy as np import operator import os import random import scipy.cluster.hierarchy as hcluster import scipy.spatial.distance from types import * class MicroPITA: """ Selects samples from a first tier of a multi-tiered study to be used in a second tier. Different methods can be used for selection. The expected input is an abundance table (and potentially a text file of targeted features, if using the targeted features option). Output is a list of samples exhibiting the characteristics of interest. """ #Constants #Diversity metrics Alpha c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity c_strChao1Diversity = Metric.c_strChao1Diversity #Diversity metrics Beta c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity #Additive inverses of diversity metrics beta c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity #Technique Names ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C" #Targeted feature settings c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance #Technique groupings # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2] #Converts ecology metrics into standardized method selection names dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2} # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity} dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative} dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme} #Linkage used in the Hierarchical clustering c_strHierarchicalClusterMethod = 'average' ####Group 1## Diversity #Testing: Happy path Testing (8) def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None): """ Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample names associated with the indices. :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]]. :type: List of lists List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample. :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional). :type: List of strings List of strings. :param iTopAmount: The amount of top measured samples (assumes the higher measurements are better). :type: integer Integer amount of sample names/ indices to return. :return List: List of samples to be selected. """ topRankListRet = [] for rowMetrics in lldMatrix: #Create 2 d array to hold value and index and sort liIndexX = [rowMetrics,range(len(rowMetrics))] liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True) if lsSampleNames: topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]]) else: topRankListRet.append(liIndexX[1][:iTopAmount]) return topRankListRet ####Group 2## Representative Dissimilarity #Testing: Happy path tested 1 def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): """ Gets centroid samples by k-medoids clustering of a given matrix. :param npaMatrix: Numpy array where row=features and columns=samples :type: Numpy array Abundance Data. :param sMetric: String name of beta metric used as the distance metric. :type: String String name of beta metric. :param lsSampleNames: The names of the sample :type: List List of strings :param iNumberSamplesReturned: Number of samples to return, each will be a centroid of a sample. :type: Integer Number of samples to return :return List: List of selected samples. :param istmBetaMatrix: File with beta-diversity matrix :type: File stream or file path string """ #Count of how many rows sampleCount = npaMatrix.shape[0] if iNumberSamplesReturned > sampleCount: logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".") return False #If the cluster count is equal to the sample count return all samples if sampleCount == iNumberSamplesReturned: return list(lsSampleNames) #Get distance matrix distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames) if type(distanceMatrix) is BooleanType: logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.") return False # Handle unifrac output if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: distanceMatrix = distanceMatrix[0] #Log distance matrix logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric)) distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True) #Create object to determine clusters/medoids medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance) #medoidsData includes(1d numpy array, medoids indexes; # 1d numpy array, non-medoids indexes; # 1d numpy array, cluster membership for non-medoids; # double, cost of configuration) #npaMatrix is samples x rows #Build a matrix of lists of indicies to pass to the distance matrix lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))] medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix)) logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:") logging.debug(str(medoidsData)) #If returning the same amount of clusters and samples #Return centroids selectedIndexes = medoidsData[0] return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)] ####Group 3## Highest Dissimilarity #Testing: Happy path tested def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None): """ Select extreme samples from HClustering. :param strBetaMetric: The beta metric to use for distance matrix generation. :type: String The name of the beta metric to use. :param npaAbundanceMatrix: Numpy array where row=samples and columns=features. :type: Numpy Array Abundance data. :param lsSampleNames: The names of the sample. :type: List List of strings. :param iSelectSampleCount: Number of samples to select (return). :type: Integer Integer number of samples returned. :return Samples: List of samples. :param istmBetaMatrix: File with beta-diversity matrix :type: File stream or file path string """ #If they want all the sample count, return all sample names iSampleCount=len(npaAbundanceMatrix[:,0]) if iSelectSampleCount==iSampleCount: return lsSampleNames #Holds the samples to be returned lsReturnSamplesRet = [] #Generate beta matrix #Returns condensed matrix tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True) if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]: tempDistanceMatrix = tempDistanceMatrix[0] if type(tempDistanceMatrix) is BooleanType: logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.") return False if istmBetaMatrix: tempDistanceMatrix = 1-tempDistanceMatrix #Feed beta matrix to linkage to cluster #Send condensed matrix linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod) #Extract cluster information from dendrogram #The linakge matrix is of the form #[[int1 int2 doube int3],...] #int1 and int1 are the paired samples indexed at 0 and up. #each list is an entry for a branch that is number starting with the first #list being sample count index + 1 #each list is then named by an increment as they appear #this means that if a number is in the list and is = sample count or greater it is not #terminal and is instead a branch. #This method just takes the lowest metric measurement (highest distance pairs/clusters) #Works much better than the original technique #get total number of samples iCurrentSelectCount = 0 for row in linkageMatrix: #Get nodes ofthe lowest pairing (so the furthest apart pair) iNode1 = int(row[0]) iNode2 = int(row[1]) #Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram #The branching in the dendrogram will start at the number of samples and increment higher. #Add each of the pair one at a time breaking when enough samples are selected. if iNode1<iSampleCount: lsReturnSamplesRet.append(lsSampleNames[iNode1]) iCurrentSelectCount = iCurrentSelectCount + 1 if iCurrentSelectCount == iSelectSampleCount: break if iNode2<iSampleCount: lsReturnSamplesRet.append(lsSampleNames[iNode2]) iCurrentSelectCount = iCurrentSelectCount + 1 if iCurrentSelectCount == iSelectSampleCount: break #Return selected samples return lsReturnSamplesRet ####Group 4## Rank Average of user Defined Taxa #Testing: Happy Path Tested def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False): """ Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped. :param abndTable: Abundance Table to analyse :type: AbundanceTable Abundance Table :param lsTargetedFeature: String names :type: list list of string names of features (bugs) which are measured after ranking against the full sample :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false) :type: boolean Flag indicating ranking abundance before calculating average feature measurement (false= no ranking) :return List of lists or boolean: List of lists or False on error. One internal list per sample indicating the sample, feature average abundance or ranked abundance. Lists will already be sorted. For not Ranked [[sample,average abundance of selected feature,1]] For Ranked [[sample,average ranked abundance, average abundance of selected feature]] Error Returns false """ llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature) if not llAbundance: logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") return False #Add a space for ranking if needed #Not ranked will be [[sSample,average abundance,1]] #(where 1 will not discriminant ties if used in later functions, so this generalizes) #Ranked will be [[sSample, average rank, average abundance]] llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance] #Rank if needed if fRank: abndRanked = abndTable.funcRankAbundance() if abndRanked == None: logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.") return False llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature) if not llRetRank: logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.") return False dictRanks = dict(llRetRank) llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance] #Sort first for ties and then for the main feature if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity: llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank) if fRank: llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank) return llRetAbundance #Testing: Happy Path Tested def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]): """ Selects samples with the highest ranks or abundance of targeted features. If ranked, select the highest abundance for tie breaking :param abndMatrix: Abundance table to analyse :type: AbundanceTable Abundance table :param lsTargetedTaxa: List of features :type: list list of strings :param iSampleSelectionCount: Number of samples to select :type: integer integer :param sMethod: Method to select targeted features :type: string String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues) :return List of strings: List of sample names which were selected List of strings Empty list is returned on an error. """ #Check data if(len(lsTargetedTaxa) < 1): logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.") return [] lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa, fRank=sMethod.lower() == self.c_strTargetedRanked.lower()) #If an error occured or the key word for the method was not recognized if lsTargetedSamples == False: logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.") return [] #Select from results return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]] ####Group 5## Random #Testing: Happy path Tested def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0): """ Returns random sample names of the number given. No replacement. :param lsSamples: List of sample names :type: list list of strings :param iNumberOfSamplesToReturn: Number of samples to select :type: integer integer. :return List: List of selected samples (strings). """ #Input matrix sample count sampleCount = len(lsSamples) #Return the full matrix if they ask for a return matrix where length == original if(iNumberOfSamplesToReturn >= sampleCount): return lsSamples #Get the random indices for the sample (without replacement) liRandomIndices = random.sample(range(sampleCount), iNumberOfSamplesToReturn) #Create a boolean array of if indexes are to be included in the reduced array return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices] #Happy path tested (case 3) def funcGetAveragePopulation(self, abndTable, lfCompress): """ Get the average row per column in the abndtable. :param abndTable: AbundanceTable of data to be averaged :type: AbudanceTable :param lfCompress: List of boolean flags (false means to remove sample before averaging :type: List of floats :return List of doubles: """ if sum(lfCompress) == 0: return [] #Get the average populations lAverageRet = [] for sFeature in abndTable.funcGetAbundanceCopy(): sFeature = list(sFeature)[1:] sFeature=np.compress(lfCompress,sFeature,axis=0) lAverageRet.append(sum(sFeature)/float(len(sFeature))) return lAverageRet #Happy path tested (2 cases) def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected): """ Given an abundance table and an average sample, this returns the distance of each sample (measured using brays-curtis dissimilarity) from the average. The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected (which is associated with the samples in the order of the samples in the abundance table; use abundancetable.funcGetSampleNames() to see the order if needed). :param abndTable: Abundance table holding the data to be analyzed. :type: AbundanceTable :param ldAverage: Average population (Average features of the abundance table of samples) :type: List of doubles which represent the average population :param lsSamples: These are the only samples used in the analysis :type: List of strings (sample ids) :param lfSelected: Samples to be included in the analysis :type: List of boolean (true means include) :return: List of distances (doubles) """ #Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists ldSelectedDistances = [] for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]: #Get the sample measurements ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sSampleName),ldAverage]))[0]) return ldSelectedDistances #Happy path tested (1 case) def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther): """ Get the distance of samples from one label from the average sample of not the label. Note: This assumes 2 classes. :param abndTable: Table of data to work out of. :type: Abundace Table :param lfGroupOfInterest: Boolean indicator of the sample being in the first group. :type: List of floats, true indicating an individual in the group of interest. :param lfGroupOther: Boolean indicator of the sample being in the other group. :type: List of floats, true indicating an individual in the :return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population] """ #Get all sample names lsAllSamples = abndTable.funcGetSampleNames() #Get average populations lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther) #Get the distance from the average of the other label (label 1) ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther, lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest) return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances) #Happy path tested (1 test case) def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest): """ Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group. An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group. :params abndTable: Abundance of measurements :type: AbundanceTable :params iSelectionCount: The number of samples selected per sample. :type: Integer Integer greater than 0 :params sLabel: ID of the metadata which is the supervised label :type: String :params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest. :type: String found in the abundance table metadata row indicated by sLabel. :return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]] """ lsMetadata = abndTable.funcGetMetadata(sLabel) #Other metadata values lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest)) #Get boolean indicator of values of interest lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata] #Get the distances of the items of interest from the other metadata values dictDistanceAverages = {} for sOtherLabel in lsUniqueOtherValues: #Get boolean indicator of labels not of interest lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata] #Get the distances of data from two different groups to the average of the other ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther)) for sKey in ldValueDistances: dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey] #Finish average by dividing by length of lsUniqueOtherValues ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages] #Sort to extract extremes ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1)) #Get the closest and farthest distances ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount] ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:] #Remove the selected samples from the larger population of distances (better visualization) ldSelected = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples] #Return discriminant tuples, distinct tuples, other tuples return [ltupleDiscriminantSamples, ltupleDistinctSamples, [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]] #Run the supervised method surrounding distance from centroids #Happy path tested (3 test cases) def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant, xOutputSupFile, xPredictSupFile, strSupervisedMetadata, iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False): """ Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group. :param abundanceTable: AbundanceTable :type: AbudanceTable Data to analyze :param fRunDistinct: Run distinct selection method :type: Boolean boolean (true runs method) :param fRunDiscriminant: Run discriminant method :type: Boolean boolean (true runs method) :param xOutputSupFile: File output from supervised methods detailing data going into the method. :type: String or FileStream :param xPredictSupFile: File output from supervised methods distance results from supervised methods. :type: String or FileStream :param strSupervisedMetadata: The metadata that will be used to group samples. :type: String :param iSampleSupSelectionCount: Number of samples to select :type: Integer int sample selection count :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable. :type: List of samples :param fAppendFiles: Indicates that output files already exist and appending is occuring. :type: Boolean :return Selected Samples: A dictionary of selected samples by selection ID Dictionary {"Selection Method":["SampleID","SampleID"...]} """ #Get labels and run one label against many lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata) dictlltpleDistanceMeasurements = {} for sMetadataValue in set(lstrMetadata): #For now perform the selection here for the label of interest against the other labels dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable, iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue)) #Make expected output files for supervised methods #1. Output file which is similar to an input file for SVMs #2. Output file that is similar to the probabilitic output of a SVM (LibSVM) #Manly for making output of supervised methods (Distance from Centroid) similar #MicropitaVis needs some of these files if xOutputSupFile: if fAppendFiles: SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) else: SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile, sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames) #Will contain the samples selected to return #One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type dictSelectedSamplesRet = dict() for sKey, ltplDistances in dictlltpleDistanceMeasurements.items(): if fRunDistinct: dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]]) if fRunDiscriminant: dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]]) if xPredictSupFile: dictFlattenedDistances = dict() [dictFlattenedDistances.setdefault(sKey, []).append(tple) for sKey, lltple in dictlltpleDistanceMeasurements.items() for ltple in lltple for tple in ltple] if fAppendFiles: self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) else: self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile, dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames) return dictSelectedSamplesRet #Two happy path test cases def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames): """ Manages updating the predict file. :param xPredictSupFile: File that has predictions (distances) from the supervised method. :type: FileStream or String file path :param xInputLabelsFile: File that as input to the supervised methods. :type: FileStream or String file path :param dictltpleDistanceMeasurements: :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} """ if not isinstance(xPredictSupFile, str): xPredictSupFile.close() xPredictSupFile = xPredictSupFile.name csvr = open(xPredictSupFile,'r') f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) lsHeader = f.next()[1:] dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader]) #Read data in iSampleIndex = 0 for sRow in f: sLabel = sRow[0] [dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:]) if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue] iSampleIndex += 1 #Combine dictltpleDistanceMeasurements with new data #If they share a key then merge keeping parameter data #If they do not share the key, keep the full data dictNew = {} for sKey in dictltpleDistanceMeasurements.keys(): lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]] dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey] for sKey in dictlltpleRead: if sKey not in dictltpleDistanceMeasurements.keys(): dictNew[sKey] = dictlltpleRead[sKey] #Call writer self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile, dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True) #2 happy path test cases def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False): """ Write to the predict file. :param xPredictSupFile: File that has predictions (distances) from the supervised method. :type: FileStream or String file path :param xInputLabelsFile: File that as input to the supervised methods. :type: FileStream or String file path :param dictltpleDistanceMeasurements: :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]} :param abundanceTable: An abundance table of the sample data. :type: AbundanceTable :param lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing. Otherwise will use the sample names from the abundance table. :type: List of strings :param fFromUpdate: Indicates if this is part of an update to the file or not. :type: Boolean """ xInputLabelsFileName = xInputLabelsFile if not isinstance(xInputLabelsFile,str): xInputLabelsFileName = xInputLabelsFile.name f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace) lsAllSampleNames = abundanceTable.funcGetSampleNames() lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames, isPredictFile=False) dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]]) #Dictionay keys will be used to order the predict file lsMeasurementKeys = dictltpleDistanceMeasurements.keys() #Make header f.writerow(["labels"]+lsMeasurementKeys) #Reformat dictionary to make it easier to use for sKey in dictltpleDistanceMeasurements: dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]]) for sSample in lsOriginalSampleNames: #Make body of file f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+ [str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)) for sKey in lsMeasurementKeys]) def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics, fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False): """ Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other for the set that should be normalized. :param abndData: Abundance table object holding the samples to be measured. :type: AbundanceTable :param iSampleSelectionCount The number of samples to select per method. :type: Integer :param dictSelectedSamples Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}. :type: Dictionary :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity). :type: List of strings :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative). :type: List of strings :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar). :type: List of strings :param fRunDiversity: Run Diversity based methods (true indicates run). :type: Boolean :param fRunRepresentative: Run Representative based methods (true indicates run). :type: Boolean :param fRunExtreme: Run Extreme based methods (true indicates run). :type: Boolean :param istmBetaMatrix: File that has a precalculated beta matrix :type: File stream or File path string :return Selected Samples: Samples selected by methods. Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} """ #Sample ids/names lsSampleNames = abndData.funcGetSampleNames() #Generate alpha metrics and get most diverse if fRunDiversity: #Get Alpha metrics matrix internalAlphaMatrix = None #Name of technique strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics #If given an alpha-diversity metadata if strAlphaMetadata: internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]] else: #Expects Observations (Taxa (row) x sample (column)) #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]] internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy() if not abndData.funcIsSummed() else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(), lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics) if internalAlphaMatrix: #Invert measurments if fInvertDiversity: lldNewDiversity = [] for lsLine in internalAlphaMatrix: lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine]) internalAlphaMatrix = lldNewDiversity #Get top ranked alpha diversity by most diverse #Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...] #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]] mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount) #Add to results for index in xrange(0,len(strMethod)): strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index]) dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index]) logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b") logging.info(dictSelectedSamples) #Generate beta metrics and if fRunRepresentative or fRunExtreme: #Abundance matrix transposed npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True) #Get center selection using clusters/tiling #This will be for beta metrics in normalized space if fRunRepresentative: if istmBetaMatrix: #Get representative dissimilarity samples medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) if medoidSamples: dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples) else: logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.") for bMetric in lsBetaMetrics: #Get representative dissimilarity samples medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) if medoidSamples: dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples) #Get extreme selection using clusters, tiling if fRunExtreme: logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.") if istmBetaMatrix: #Samples for representative dissimilarity #This involves inverting the distance metric, #Taking the dendrogram level of where the number cluster == the number of samples to select #Returning a repersentative sample from each cluster extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) #Add selected samples if extremeSamples: dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples) else: #Run KMedoids with inverse custom distance metric in normalized space for bMetric in lsInverseBetaMetrics: #Samples for representative dissimilarity #This involves inverting the distance metric, #Taking the dendrogram level of where the number cluster == the number of samples to select #Returning a repersentative sample from each cluster extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr) #Add selected samples if extremeSamples: dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples) logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b") logging.info(dictSelectedSamples) return dictSelectedSamples def funcRun(self, strIDName, strLastMetadataName, istmInput, ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput, cDelimiter, cFeatureNameDelimiter, strFeatureSelection, istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None, strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None, iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False): """ Manages the selection of samples given different metrics. :param strIDName: Sample Id metadata row :type: String :param strLastMetadataName: The id of the metadata positioned last in the abundance table. :type: String String metadata id. :param istmInput: File to store input data to supervised methods. :type: FileStream of String file path :param ostmInputPredictFile: File to store distances from supervised methods. :type: FileStream or String file path :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked. :type: FileStream or String file path :param ostmOutPut: File to store sample selection by methods of interest. :type: FileStream or String file path :param cDelimiter: Delimiter of abundance table. :type: Character Char (default TAB). :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades). :type: Character (default |). :param stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance). :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues). :param istmFeatures: File which holds the features of interest if using targeted feature methodology. :type: FileStream or String file path :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible. :type: Integer integer. :param lstrMethods: List of strings indicating selection techniques. :type: List of string method names :param strLabel: The metadata used for supervised labels. :type: String :param strStratify: The metadata used to stratify unsupervised data. :type: String :param strCustomAlpha: Custom alpha diversity metric :type: String :param strCustomBeta: Custom beta diversity metric :type: String :param strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling :type: String :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling :type: FileStream or String file path :param istrmTree: File containing tree for phylogentic beta-diversity analysis :type: FileStream or String file path :param istrmEnvr: File containing environment for phylogentic beta-diversity analysis :type: FileStream or String file path :param iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples. :type: Integer :param iMinSamples: Minimum sample count for the occurence filter. :type: Integer :param fInvertDiversity: When true will invert diversity measurements before using. :type: boolean :return Selected Samples: Samples selected by methods. Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} """ #Holds the top ranked samples from different metrics #dict[metric name] = [samplename,samplename...] selectedSamples = dict() #If a target feature file is given make sure that targeted feature is in the selection methods, if not add if ConstantsMicropita.c_strFeature in lstrMethods: if not istmFeatures: logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.") return False #Diversity metrics to run #Use custom metrics if specified #Custom beta metrics set to normalized only, custom alpha metrics set to count only diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity] diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity] # inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity] diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else [] diversityMetricsBetaNoNormalize = [] # inverseDiversityMetricsBetaNoNormalize = [] #Targeted taxa userDefinedTaxa = [] #Perform different flows flags c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods c_RUN_RANK_AVERAGE_USER_4 = False if ConstantsMicropita.c_strFeature in lstrMethods: c_RUN_RANK_AVERAGE_USER_4 = True if not istmFeatures: logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.") return False #Read in taxa list, break down to lines and filter out empty strings userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines())) c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods #Read in abundance data #Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0 #Abundance table object to read in and manage data totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples], cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata, sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile) if not totalAbundanceTable: logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+ " This often occurs when the Last Metadata is not specified correctly."+ " Please check to make sure the Last Metadata selection is the row of the last metadata,"+ " all values after this selection should be microbial measurements and should be numeric.") return False lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy() logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata)) #If there is only 1 unique value for the labels, do not run the Supervised methods if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ): logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[]))) return False #Run unsupervised methods### #Stratify the data if need be and drop the old data lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable] #For each stratified abundance block or for the unstratfified abundance #Run the unsupervised blocks fAppendSupFiles = False for stratAbundanceTable in lStratifiedAbundanceTables: logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName()) ###NOT SUMMED, NOT NORMALIZED #Only perform if the data is not yet normalized if not stratAbundanceTable.funcIsNormalized( ): #Need to first work with unnormalized data if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize, lsBetaMetrics=diversityMetricsBetaNoNormalize, lsInverseBetaMetrics=diversityMetricsBetaNoNormalize, fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) #Generate selection by the rank average of user defined taxa #Expects (Taxa (row) by Samples (column)) #Expects a column 0 of taxa id that is skipped #Returns [(sample name,average,rank)] #SUMMED AND NORMALIZED stratAbundanceTable.funcSumClades() #Normalize data at this point stratAbundanceTable.funcNormalize() if c_RUN_RANK_AVERAGE_USER_4: selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable, lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection) logging.info("MicroPITA.funcRun:: Selected Samples Rank") logging.info(selectedSamples) ###SUMMED AND NORMALIZED analysis block #Diversity based metric will move reduce to terminal taxa as needed if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3: self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount, dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha, lsBetaMetrics=diversityMetricsBeta, lsInverseBetaMetrics=diversityMetricsBeta, fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2, fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity) #5::Select randomly #Expects sampleNames = List of sample names [name, name, name...] if(c_RUN_RANDOM_5): #Select randomly from sample names selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount) logging.info("MicroPITA.funcRun:: Selected Samples Random") logging.info(selectedSamples) #Perform supervised selection if c_RUN_DISTINCT or c_RUN_DISCRIMINANT: if strLabel: dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable, fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT, xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile, strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount, lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(), lsOriginalLabels = lsOriginalLabels, fAppendFiles=fAppendSupFiles) [selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()] if not fAppendSupFiles: fAppendSupFiles = True logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised") logging.info(selectedSamples) return selectedSamples #Testing: Happy path tested @staticmethod def funcWriteSelectionToFile(dictSelection,xOutputFilePath): """ Writes the selection of samples by method to an output file. :param dictSelection: The dictionary of selections by method to be written to a file. :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]} :param xOutputFilePath: FileStream or String path to file inwhich the dictionary is written. :type: String FileStream or String path to file """ if not dictSelection: return #Open file f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim ) #Create output content from dictionary for sKey in dictSelection: f.writerow([sKey]+dictSelection[sKey]) logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey])) #Testing: Happy Path tested @staticmethod def funcReadSelectionFileToDictionary(xInputFile): """ Reads in an output selection file from micropita and formats it into a dictionary. :param xInputFile: String path to file or file stream to read and translate into a dictionary. {"method":["sample selected","sample selected"...]} :type: FileStream or String Path to file :return Dictionary: Samples selected by methods. Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]} """ #Open file istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim) #Dictionary to hold selection data return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader]) #Set up arguments reader argp = argparse.ArgumentParser( prog = "MicroPITA.py", description = """Selects samples from abundance tables based on various selection schemes.""" ) args = argp.add_argument_group( "Common", "Commonly modified options" ) args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp) args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp, choices = ConstantsMicropita.c_lsAllMethods, action = "append") args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" ) args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp, choices = Metric.setAlphaDiversities) args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp, choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]) args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp) args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp) args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp) args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp) args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp) args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" ) args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp) args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None, help= ConstantsMicropita.c_strLastMetadataNameHelp) args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0], choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp) args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp) args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp) args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" ) args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp) args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp) args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" ) args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" ) args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING", choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp) args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp) args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp) args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp) args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp) argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp, default = sys.stdin) argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp, default = sys.stdout) __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__ def _main( ): args = argp.parse_args( ) #Set up logger iLogLevel = getattr(logging, args.strLogLevel.upper(), None) logging.basicConfig(stream = args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode = 'w', level=iLogLevel) #Run micropita logging.info("MicroPITA:: Start microPITA") microPITA = MicroPITA() #Argparse will append to the default but will not remove the default so I do this here if not len(args.lstrMethods): args.lstrMethods = [ConstantsMicropita.c_strRepresentative] dictSelectedSamples = microPITA.funcRun( strIDName = args.strIDName, strLastMetadataName = args.strLastMetadataName, istmInput = args.istmInput, ostmInputPredictFile = args.ostmInputPredictFile, ostmPredictFile = args.ostmPredictFile, ostmCheckedFile = args.ostmCheckedFile, ostmOutput = args.ostmOutput, cDelimiter = args.cFileDelimiter, cFeatureNameDelimiter = args.cFeatureNameDelimiter, istmFeatures = args.istmFeatures, strFeatureSelection = args.strFeatureSelection, iCount = args.iCount, strLastRowMetadata = args.strLastFeatureMetadata, strLabel = args.strLabel, strStratify = args.strUnsupervisedStratify, strCustomAlpha = args.strAlphaDiversity, strCustomBeta = args.strBetaDiversity, strAlphaMetadata = args.strAlphaMetadata, istmBetaMatrix = args.istmBetaMatrix, istrmTree = args.istrmTree, istrmEnvr = args.istrmEnvr, lstrMethods = args.lstrMethods, fInvertDiversity = args.fInvertDiversity ) if not dictSelectedSamples: logging.error("MicroPITA:: Error, did not get a result from analysis.") return -1 logging.info("End microPITA") #Log output for debugging logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples)) #Write selection to file microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput) if __name__ == "__main__": _main( )