comparison galaxy_micropita/MicroPITA.py @ 3:8fb4630ab314 draft default tip

Uploaded
author sagun98
date Thu, 03 Jun 2021 17:07:36 +0000
parents
children
comparison
equal deleted inserted replaced
2:1c5736dc85ab 3:8fb4630ab314
1 #!/usr/bin/env python
2 """
3 Author: Timothy Tickle
4 Description: Class to Run analysis for the microPITA paper
5 """
6
7 #####################################################################################
8 #Copyright (C) <2012>
9 #
10 #Permission is hereby granted, free of charge, to any person obtaining a copy of
11 #this software and associated documentation files (the "Software"), to deal in the
12 #Software without restriction, including without limitation the rights to use, copy,
13 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
14 #and to permit persons to whom the Software is furnished to do so, subject to
15 #the following conditions:
16 #
17 #The above copyright notice and this permission notice shall be included in all copies
18 #or substantial portions of the Software.
19 #
20 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
21 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
22 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 #####################################################################################
27
28 __author__ = "Timothy Tickle"
29 __copyright__ = "Copyright 2012"
30 __credits__ = ["Timothy Tickle"]
31 __license__ = "MIT"
32 __maintainer__ = "Timothy Tickle"
33 __email__ = "ttickle@sph.harvard.edu"
34 __status__ = "Development"
35
36 import sys
37 import argparse
38 from src.breadcrumbs.src.AbundanceTable import AbundanceTable
39 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs
40 from src.breadcrumbs.src.Metric import Metric
41 from src.breadcrumbs.src.KMedoids import Kmedoids
42 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
43 from src.breadcrumbs.src.SVM import SVM
44 from src.breadcrumbs.src.UtilityMath import UtilityMath
45
46 from src.ConstantsMicropita import ConstantsMicropita
47 import csv
48 import logging
49 import math
50 import mlpy
51 import numpy as np
52 import operator
53 import os
54 import random
55 import scipy.cluster.hierarchy as hcluster
56 import scipy.spatial.distance
57 from types import *
58
59 class MicroPITA:
60 """
61 Selects samples from a first tier of a multi-tiered study to be used in a second tier.
62 Different methods can be used for selection.
63 The expected input is an abundance table (and potentially a text file of targeted features,
64 if using the targeted features option). Output is a list of samples exhibiting the
65 characteristics of interest.
66 """
67
68 #Constants
69 #Diversity metrics Alpha
70 c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
71 c_strChao1Diversity = Metric.c_strChao1Diversity
72
73 #Diversity metrics Beta
74 c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity
75
76 #Additive inverses of diversity metrics beta
77 c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity
78
79 #Technique Names
80 ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"
81
82 #Targeted feature settings
83 c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
84 c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance
85
86 #Technique groupings
87 # c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2]
88
89 #Converts ecology metrics into standardized method selection names
90 dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2}
91 # dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity}
92 dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative}
93 dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme}
94
95 #Linkage used in the Hierarchical clustering
96 c_strHierarchicalClusterMethod = 'average'
97
98 ####Group 1## Diversity
99 #Testing: Happy path Testing (8)
def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None):
    """
    For each list of measurements, returns the positions (or the sample names) of the highest values.

    Every inner list of lldMatrix is ranked on its own, from the highest to the lowest value.
    Equal values keep their original left-to-right order. When lsSampleNames is given (and not empty)
    the ranked positions are translated into the names found at those positions.

    :param lldMatrix: List of lists [[value,value,value,value],[value,value,value,value]].
    :type: List of lists    Each inner list is a different measurement, positionally related to the samples. None is treated as no measurements.
    :param lsSampleNames: List of sample names positionally related (the same) to each list (Optional).
    :type: List of strings    List of strings.
    :param iTopAmount: The amount of top measured samples to keep per list (assumes the higher measurements are better). None keeps every position.
    :type: integer    Integer amount of sample names / indices to return.
    :return List: One list per measurement holding the selected sample names, or indices when no names were given.
    """
    #Nothing to rank
    if lldMatrix is None:
        return []

    topRankListRet = []
    for rowMetrics in lldMatrix:
        #Rank the positions by the value they hold.
        #sorted() builds a new list so this also works where range() does not return a list.
        liRankedIndices = sorted(range(len(rowMetrics)), key = rowMetrics.__getitem__, reverse = True)[:iTopAmount]

        if lsSampleNames:
            topRankListRet.append([lsSampleNames[iIndex] for iIndex in liRankedIndices])
        else:
            topRankListRet.append(liRankedIndices)

    return topRankListRet
126
127 ####Group 2## Representative Dissimilarity
128 #Testing: Happy path tested 1
def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
    """
    Gets representative samples (medoids) by k-medoids clustering of a given matrix.

    :param npaMatrix: Numpy array of abundance data. The number of rows (shape[0]) is used as the sample count.
    :type: Numpy array    Abundance Data.
    :param sMetric: String name of beta metric used as the distance metric.
    :type: String    String name of beta metric.
    :param lsSampleNames: The names of the samples, in the order of the rows of npaMatrix.
    :type: List    List of strings
    :param iNumberSamplesReturned: Number of samples to return, each will be the medoid of a cluster.
    :type: Integer    Number of samples to return
    :param istmBetaMatrix: File with a precomputed beta-diversity matrix. When given it is read instead of calculating sMetric.
    :type: File stream or file path string
    :param istrmTree: Passed through to Metric.funcGetBetaMetric (presumably the tree used by the UniFrac metrics; confirm against Metric).
    :param istrmEnvr: Passed through to Metric.funcGetBetaMetric (presumably the environment file used by the UniFrac metrics; confirm against Metric).
    :return List: List of selected sample names. False is returned when more samples are requested than exist
                  or when the distance matrix could not be produced.
    """

    #Count of how many rows
    sampleCount = npaMatrix.shape[0]
    if iNumberSamplesReturned > sampleCount:
        logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
        return False

    #If the cluster count is equal to the sample count return all samples
    if sampleCount == iNumberSamplesReturned:
        return list(lsSampleNames)

    #Get distance matrix (read from file when supplied, otherwise calculated from the abundance data)
    if istmBetaMatrix:
        distanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0])
    else:
        distanceMatrix = Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)

    #A boolean here is treated as the failure signal of the metric code.
    #isinstance(..., bool) is the portable spelling of the old types.BooleanType check.
    if isinstance(distanceMatrix, bool):
        logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
        return False

    # Handle unifrac output (the matrix is the first element of what is returned)
    if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
        distanceMatrix = distanceMatrix[0]

    #Log distance matrix
    logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))

    distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)

    #Create object to determine clusters/medoids
    medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
    #medoidsData includes(1d numpy array, medoids indexes;
    #              1d numpy array, non-medoids indexes;
    #              1d numpy array, cluster membership for non-medoids;
    #              double, cost of configuration)
    #The clustering is run on sample indices; the adaptor looks the distances up by index
    lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in range(len(npaMatrix))]
    medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
    logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
    logging.debug(str(medoidsData))

    #Return the sample names of the medoids
    selectedIndexes = medoidsData[0]
    return [lsSampleNames[selectedIndexes[index]] for index in range(iNumberSamplesReturned)]
188
189 ####Group 3## Highest Dissimilarity
190 #Testing: Happy path tested
def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
    """
    Select extreme samples from hierarchical clustering.

    The clustering is run on the additive inverse of the dissimilarity, so the pairs merged first
    by the linkage are the most dissimilar samples; these are the samples selected.

    :param strBetaMetric: The beta metric to use for distance matrix generation.
    :type: String    The name of the beta metric to use.
    :param npaAbundanceMatrix: Numpy array where row=samples and columns=features.
    :type: Numpy Array    Abundance data.
    :param lsSampleNames: The names of the sample.
    :type: List    List of strings.
    :param iSelectSampleCount: Number of samples to select (return).
    :type: Integer    Integer number of samples returned.
    :param istmBetaMatrix: File with a precomputed beta-diversity matrix. When given it is read instead of calculating strBetaMetric.
    :type: File stream or file path string
    :param istrmTree: Passed through to Metric.funcGetBetaMetric (presumably the tree used by the UniFrac metrics; confirm against Metric).
    :param istrmEnvr: Passed through to Metric.funcGetBetaMetric (presumably the environment file used by the UniFrac metrics; confirm against Metric).
    :return Samples: List of samples. False is returned when the distance matrix could not be produced.
    """

    #If they want all the sample count, return all sample names
    iSampleCount=len(npaAbundanceMatrix[:,0])
    if iSelectSampleCount==iSampleCount:
        return lsSampleNames

    #Generate beta matrix
    #Returns condensed matrix
    if istmBetaMatrix:
        tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0])
    else:
        tempDistanceMatrix = Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True)

    #Check for failure BEFORE indexing into the result, otherwise a failed UniFrac call
    #raises a TypeError (False[0]) instead of being reported here.
    if isinstance(tempDistanceMatrix, bool):
        logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
        return False

    #Handle unifrac output (the matrix is the first element of what is returned)
    if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
        tempDistanceMatrix = tempDistanceMatrix[0]

    #A matrix read from file is a dissimilarity, convert it to the additive inverse here
    #(the calculated matrix already is, see fAdditiveInverse above)
    if istmBetaMatrix:
        tempDistanceMatrix = 1-tempDistanceMatrix

    #Feed beta matrix to linkage to cluster
    #Send condensed matrix
    linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)

    #Extract cluster information from dendrogram
    #The linkage matrix is of the form
    #[[int1 int2 double int3],...]
    #int1 and int2 are the paired nodes, samples are indexed at 0 and up.
    #Each row is a branch; branches are numbered starting at the sample count
    #and incremented in the order the rows appear.
    #This means that a node number equal to the sample count or greater is not
    #a terminal node (sample) and is instead a branch.
    #Rows come in merge order so the first rows hold the lowest measurements
    #(highest distance pairs/clusters).
    lsReturnSamplesRet = []
    for row in linkageMatrix:
        #Look at both nodes of the pairing, one at a time
        for iNode in (int(row[0]), int(row[1])):
            #Only terminal nodes (samples) can be selected
            if iNode<iSampleCount:
                lsReturnSamplesRet.append(lsSampleNames[iNode])
                #Stop as soon as enough samples are selected
                if len(lsReturnSamplesRet) == iSelectSampleCount:
                    return lsReturnSamplesRet

    #Return selected samples (fewer than requested if the dendrogram ran out)
    return lsReturnSamplesRet
268
269 ####Group 4## Rank Average of user Defined Taxa
270 #Testing: Happy Path Tested
def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
    """
    Averages feature abundance or ranked abundance per sample and orders the samples by it.

    :param abndTable: Abundance Table to analyse
    :type: AbundanceTable    Abundance Table
    :param lsTargetedFeature: String names
    :type: list    list of string names of features (bugs) which are measured after ranking against the full sample
    :param fRank: Indicates to rank the abundance before getting the average abundance of the features (default false)
    :type: boolean    Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
    :return List of lists or boolean: List of lists or False on error. One internal list per sample. Lists will already be sorted.
        For not Ranked [[sample, -1, average abundance of selected features]] sorted by average abundance, highest first.
        For Ranked [[sample, average rank, average abundance of selected features]] sorted by average rank, lowest first.
        Error Returns false
    """

    llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature)
    if not llAbundance:
        logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
        return False

    #Add a space for the rank so both modes share one layout
    #Not ranked will be [[sSample, -1, average abundance]] (-1 is only a placeholder)
    #Ranked will be [[sSample, average rank, average abundance]]
    llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance]

    #Rank if needed
    if fRank:
        abndRanked = abndTable.funcRankAbundance()
        if abndRanked is None:
            logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
            return False
        llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature)
        if not llRetRank:
            logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
            return False
        #Replace the placeholder with the average rank of each sample
        dictRanks = dict(llRetRank)
        llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance]

    #Sort first for ties and then for the main feature (sorted is stable so the first order survives in ties)
    #Not ranked: abundance from high to low.
    #Ranked: abundance from low to high, only when ties are to be broken.
    #NOTE(review): in ranked mode ties end up ordered lowest abundance first - confirm this is the intended tie break.
    if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
        llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank)
    if fRank:
        llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1])
    return llRetAbundance
316
317 #Testing: Happy Path Tested
def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
    """
    Selects the samples ordered first by funcGetAverageAbundanceSamples for the targeted features
    (by average rank when sMethod is the ranked method, otherwise by average abundance).

    :param abndMatrix: Abundance table to analyse
    :type: AbundanceTable    Abundance table
    :param lsTargetedTaxa: List of features
    :type: list    list of strings
    :param iSampleSelectionCount: Number of samples to select
    :type: integer    integer
    :param sMethod: Method to select targeted features (compared case-insensitively to the ranked method name)
    :type: string    String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
    :return List of strings: List of sample names which were selected
            List of strings    Empty list is returned on an error.
    """

    #Check data
    if lsTargetedTaxa is None or len(lsTargetedTaxa) < 1:
        logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
        return []

    #Any method name other than the ranked one selects on abundance
    fRanked = sMethod.lower() == self.c_strTargetedRanked.lower()
    lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa, fRank=fRanked)

    #False is the error signal of funcGetAverageAbundanceSamples
    if lsTargetedSamples is False:
        logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was not performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
        return []

    #Select from results (already sorted, the sample name is the first element of each entry)
    return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
349
350 ####Group 5## Random
351 #Testing: Happy path Tested
def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
    """
    Returns random sample names of the number given. No replacement.
    The selected names keep the order they have in lsSamples.

    :param lsSamples: List of sample names. None is treated as no samples.
    :type: list    list of strings
    :param iNumberOfSamplesToReturn: Number of samples to select
    :type: integer    integer.
    :return List: List of selected samples (strings). When as many or more samples are requested
                  than exist, lsSamples itself is returned.
    """

    #Nothing to select from
    if lsSamples is None:
        return []

    #Input sample count
    sampleCount = len(lsSamples)

    #Return everything if they ask for as many (or more) samples as there are
    if iNumberOfSamplesToReturn >= sampleCount:
        return lsSamples

    #Get the random indices for the sample (without replacement)
    #Kept as a set so the membership test below is constant time
    setRandomIndices = set(random.sample(range(sampleCount), iNumberOfSamplesToReturn))

    #Keep the samples at the selected indices, in their original order
    return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in setRandomIndices]
375
376 #Happy path tested (case 3)
def funcGetAveragePopulation(self, abndTable, lfCompress):
    """
    Builds the average sample: for every feature (row) of the abundance table the measurements
    of the kept samples are averaged.

    :param abndTable: AbundanceTable of data to be averaged
    :type: AbundanceTable
    :param lfCompress: One flag per sample (false means to remove the sample before averaging)
    :type: List of booleans
    :return List of doubles: One average per feature, in the order of the rows of the table.
                             An empty list when no sample is kept.
    """
    #No sample kept, nothing to average
    if not sum(lfCompress):
        return []

    ldFeatureAverages = []
    for xRow in abndTable.funcGetAbundanceCopy():
        #The first element of a row is the feature id, the rest are the measurements per sample
        ldKept = np.compress(lfCompress, list(xRow)[1:], axis=0)
        ldFeatureAverages.append(sum(ldKept)/float(len(ldKept)))
    return ldFeatureAverages
398
399 #Happy path tested (2 cases)
def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected):
    """
    Measures how far each selected sample is from an average sample, using
    Bray-Curtis dissimilarity.

    Only the samples of lsSamples whose flag in lfSelected (same position) is true are measured.
    Use abundancetable.funcGetSampleNames() to see the sample order of the table if needed.

    :param abndTable: Abundance table holding the data to be analyzed.
    :type: AbundanceTable
    :param ldAverage: Average population (Average features of the abundance table of samples)
    :type: List of doubles which represent the average population
    :param lsSamples: The candidate samples for the analysis
    :type: List of strings (sample ids)
    :param lfSelected: Flags positionally related to lsSamples
    :type: List of boolean (true means include)
    :return: List of distances (doubles), one per included sample, in the order of lsSamples
    """
    #Names of the samples flagged for inclusion
    lsIncluded = [sName for iPosition, sName in enumerate(lsSamples) if lfSelected[iPosition]]

    #Pair each included sample with the average and keep the first value returned by the metric
    return [Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sName),ldAverage]))[0]
            for sName in lsIncluded]
425
426 #Happy path tested (1 case)
def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
    """
    Get the distance of the samples of one group from the average sample of another group.
    Note: This assumes 2 classes.

    :param abndTable: Table of data to work out of.
    :type: Abundance Table
    :param lfGroupOfInterest: Indicator per sample (table order) of the sample being in the group of interest.
    :type: List of booleans, true indicating an individual in the group of interest.
    :param lfGroupOther: Indicator per sample (table order) of the sample being in the other group.
    :type: List of booleans, true indicating an individual in the other group.
    :return List of tuples: [(string sample name, double distance),...] one tuple per sample of the group of interest.
    """
    #Get all sample names
    lsAllSamples = abndTable.funcGetSampleNames()

    #Get the average sample of the other group
    lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther)

    #Get the distance of the samples of interest from that average
    ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
                                                          lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)

    #Names of the samples of interest, in the same order as the distances
    lsInterestSamples = [lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup]

    #Materialize the pairs so a real list is returned even where zip() is lazy
    return list(zip(lsInterestSamples, ldSelectedDistances))
451
452 #Happy path tested (1 test case)
def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
    """
    The samples having one metadata value (sValueOfInterest) are measured against the average (centroid) sample
    of each of the other metadata values; the distances are averaged over those other groups.
    An iSelectionCount of samples is selected from the group of interest closest to and furthest from the other centroids.

    :params abndTable: Abundance of measurements
    :type: AbundanceTable
    :params iSelectionCount: The number of samples selected per selection (closest / farthest).
    :type: Integer    Integer greater than 0 (0 selects nothing)
    :params sLabel: ID of the metadata which is the supervised label
    :type: String
    :params sValueOfInterest: Metadata value in the sLabel metadata row of the abundance table which defines the group of interest.
    :type: String found in the abundance table metadata row indicated by sLabel.
    :return list: list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
    """

    lsMetadata = abndTable.funcGetMetadata(sLabel)

    #Other metadata values
    #The value of interest is wrapped in a list: set() of a bare string would be a set of its
    #characters and would never remove a label longer than one character.
    lsUniqueOtherValues = list(set(lsMetadata)-set([sValueOfInterest]))

    #Get boolean indicator of values of interest
    lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata]

    #Sum the distances of the samples of interest from each of the other groups
    dictDistanceSums = {}
    for sOtherLabel in lsUniqueOtherValues:
        #Get boolean indicator of the other group
        lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]

        #Distances of the samples of interest to the average of this other group
        dictValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther))

        for sKey, dDistance in dictValueDistances.items():
            dictDistanceSums[sKey] = dictDistanceSums.get(sKey, 0) + dDistance

    #Finish average by dividing by the number of other groups
    #(no division happens when there are no other groups, the dictionary is then empty)
    dOtherGroupCount = float(len(lsUniqueOtherValues))
    ltpleAverageDistances = [(sKey, dSum/dOtherGroupCount) for sKey, dSum in dictDistanceSums.items()]

    #Sort to extract extremes
    ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1))

    #Get the closest and farthest distances
    #A count of 0 is guarded, [-0:] would otherwise return the whole list
    ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
    ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:] if iSelectionCount else []

    #Remove the selected samples from the larger population of distances (better visualization)
    setSelected = set(tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples)

    #Return discriminant tuples, distinct tuples, other tuples
    return [ltupleDiscriminantSamples, ltupleDistinctSamples,
            [tplData for tplData in ltpleAverageDistances if tplData[0] not in setSelected]]
504
505 #Run the supervised method surrounding distance from centroids
506 #Happy path tested (3 test cases)
def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
                                            xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
                                            iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False):
    """
    Runs the supervised selections which measure the distance of the samples of one label from the
    centroids of the other labels. Every value of the metadata is its own group (this includes NAs).

    :param abundanceTable: AbundanceTable
    :type: AbundanceTable    Data to analyze
    :param fRunDistinct: Run distinct selection method
    :type: Boolean    boolean (true runs method)
    :param fRunDiscriminant: Run discriminant method
    :type: Boolean    boolean (true runs method)
    :param xOutputSupFile: File output from supervised methods detailing data going into the method (skipped when empty).
    :type: String or FileStream
    :param xPredictSupFile: File output from supervised methods holding the distance results (skipped when empty).
    :type: String or FileStream
    :param strSupervisedMetadata: The metadata that will be used to group samples.
    :type: String
    :param iSampleSupSelectionCount: Number of samples to select
    :type: Integer    int sample selection count
    :param lsOriginalSampleNames: List of the sample names, order is important and should be preserved from the abundanceTable.
    :type: List of samples
    :param lsOriginalLabels: Passed on to the SVM file writers.
    :type: List
    :param fAppendFiles: Indicates that output files already exist and appending is occuring.
    :type: Boolean
    :return Selected Samples: A dictionary of selected samples by selection ID
        Dictionary    {"Selection Method":["SampleID","SampleID"...]}
    """
    #Run each label value against the other label values
    #Each entry is [closest tuples, farthest tuples, remaining tuples]
    dictlltpleByLabel = {}
    for sLabelValue in set(abundanceTable.funcGetMetadata(strSupervisedMetadata)):
        dictlltpleByLabel[sLabelValue] = list(self.funcPerformDistanceSelection(abndTable=abundanceTable,
            iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sLabelValue))

    #Make expected output files for supervised methods
    #1. Output file which is similar to an input file for SVMs
    #2. Output file that is similar to the probabilistic output of a SVM (LibSVM)
    #Mainly for making the output of the supervised methods (Distance from Centroid) similar
    #MicropitaVis needs some of these files
    if xOutputSupFile:
        if not fAppendFiles:
            SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
                sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
        else:
            SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
                lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)

    #Collect the selected sample names per active method, over all label values
    dictSelectedSamplesRet = {}
    for lltpleSelections in dictlltpleByLabel.values():
        #Index 1 holds the farthest tuples, index 0 the closest tuples
        if fRunDistinct:
            lsDistinct = dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[])
            lsDistinct.extend([tpleSample[0] for tpleSample in lltpleSelections[1]])
        if fRunDiscriminant:
            lsDiscriminant = dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[])
            lsDiscriminant.extend([tpleSample[0] for tpleSample in lltpleSelections[0]])

    if xPredictSupFile:
        #One flat list of tuples per label value (a label without any tuple gets no entry)
        dictFlattenedDistances = {}
        for sLabelValue, lltpleSelections in dictlltpleByLabel.items():
            for ltpleGroup in lltpleSelections:
                for tpleSample in ltpleGroup:
                    dictFlattenedDistances.setdefault(sLabelValue, []).append(tpleSample)

        #Same arguments either way, only the writer differs
        funcWritePredict = self._updatePredictFile if fAppendFiles else self._writeToPredictFile
        funcWritePredict(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
            dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
    return dictSelectedSamplesRet
577
578 #Two happy path test cases
def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
    """
    Manages updating the predict file: the existing file is read, merged with the new
    measurements and written again through _writeToPredictFile.

    :param xPredictSupFile: File that has predictions (distances) from the supervised method.
    :type: FileStream or String file path
    :param xInputLabelsFile: File that was input to the supervised methods.
    :type: FileStream or String file path
    :param dictltpleDistanceMeasurements: The new measurements. For a sample found both here and in the file the value given here is kept.
    :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
    :param abundanceTable: Passed on to _writeToPredictFile.
    :type: AbundanceTable
    :param lsOriginalSampleNames: Sample names positionally related to the data rows of the existing file.
    :type: List of strings
    """

    #Work from the path; a stream handed in is closed so the file can be reopened
    if not isinstance(xPredictSupFile, str):
        xPredictSupFile.close()
        xPredictSupFile = xPredictSupFile.name

    #Read data in
    #The handle is closed before the same path is written again below
    with open(xPredictSupFile,'r') as csvr:
        f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)

        #First column of the header is the label column, the rest are the label groups
        lsHeader = next(f)[1:]
        dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader])

        #Each data row is a sample, in the order of lsOriginalSampleNames
        #The values read stay strings as found in the file
        for iSampleIndex, sRow in enumerate(f):
            for iDistanceIndex, dDistance in enumerate(sRow[1:]):
                if dDistance != ConstantsMicropita.c_sEmptyPredictFileValue:
                    dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance))

    #Combine dictltpleDistanceMeasurements with the data read
    #If they share a key then merge, keeping the parameter data for samples in both
    #If they do not share the key, keep the full data
    dictNew = {}
    for sKey, ltpleNew in dictltpleDistanceMeasurements.items():
        if sKey in dictlltpleRead:
            setNewSamples = set(tple[0] for tple in ltpleNew)
            dictNew[sKey] = ltpleNew+[tple for tple in dictlltpleRead[sKey] if tple[0] not in setNewSamples]
        else:
            dictNew[sKey] = ltpleNew
    for sKey in dictlltpleRead:
        if sKey not in dictltpleDistanceMeasurements:
            dictNew[sKey] = dictlltpleRead[sKey]

    #Call writer
    self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
                             dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
                             lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
623
624 #2 happy path test cases
def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
    """
    Write to the predict file.

    A header row ("labels" followed by the label group keys) is written first, then one row
    per entry of lsOriginalSampleNames holding the label of the sample and its distance for
    every label group. A missing label or distance is written as
    ConstantsMicropita.c_sEmptyPredictFileValue.

    If xPredictSupFile is a path the file is opened and closed here; a stream handed in by
    the caller is written to but left open. The dictionary handed in is not modified.

    :param xPredictSupFile: File that has predictions (distances) from the supervised method.
    :type: FileStream or String file path
    :param xInputLabelsFile: File that was input to the supervised methods (the labels are read from it).
    :type: FileStream or String file path
    :param dictltpleDistanceMeasurements: Distances per label group.
    :type: Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
    :param abundanceTable: An abundance table of the sample data.
    :type: AbundanceTable
    :param lsOriginalSampleNames: Sample names, one output row is written for each (in this order).
                                  When fFromUpdate is true they are also used to read the labels,
                                  otherwise the sample names of the abundance table are used for that.
    :type: List of strings
    :param fFromUpdate: Indicates if this is part of an update to the file or not.
    :type: Boolean
    """

    xInputLabelsFileName = xInputLabelsFile if isinstance(xInputLabelsFile, str) else xInputLabelsFile.name

    #Read the labels before the output is opened so a failure here does not truncate the predict file
    lsLabelSampleNames = lsOriginalSampleNames if fFromUpdate else abundanceTable.funcGetSampleNames()
    lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames=lsLabelSampleNames,
                                          isPredictFile=False)
    dictLabels = dict((sSample, sLabel) for sLabel in lsLabels for sSample in lsLabels[sLabel])

    #Dictionary keys will be used to order the predict file
    #list() so the keys can be concatenated with the header on Python 2 and 3
    lsMeasurementKeys = list(dictltpleDistanceMeasurements)

    #Reformat into {"labelgroup":{"SampleName":dDistance}} to make it easier to use
    #A separate dictionary is built so the caller's data keeps its list of tuples form
    dictDistanceBySample = dict((sKey, dict(dictltpleDistanceMeasurements[sKey])) for sKey in lsMeasurementKeys)

    sEmpty = ConstantsMicropita.c_sEmptyPredictFileValue
    fOpenedHere = isinstance(xPredictSupFile, str)
    ostmPredict = open(xPredictSupFile, "w") if fOpenedHere else xPredictSupFile
    try:
        f = csv.writer(ostmPredict, delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)

        #Make header
        f.writerow(["labels"] + lsMeasurementKeys)

        #Make body of file
        for sSample in lsOriginalSampleNames:
            f.writerow([dictLabels.get(sSample, sEmpty)] +
                       [str(dictDistanceBySample[sKey].get(sSample, sEmpty)) for sKey in lsMeasurementKeys])
    finally:
        #Only close what was opened here
        if fOpenedHere:
            ostmPredict.close()
668
def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
                                      fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
                                      istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
    """
    Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other
    for the set that should be normalized.

    :param abndData: Abundance table object holding the samples to be measured.
    :type: AbundanceTable
    :param iSampleSelectionCount: The number of samples to select per method.
    :type: Integer
    :param dictSelectedSamples: Will be added to as samples are selected {"Method":["strSelectedSampleID","strSelectedSampleID"...]}.
    :type: Dictionary
    :param lsAlphaMetrics: List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
    :type: List of strings
    :param lsBetaMetrics: List of beta metrics to use on beta metric dependent assays (like most representative).
    :type: List of strings
    :param lsInverseBetaMetrics: List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
    :type: List of strings
    :param fRunDiversity: Run Diversity based methods (true indicates run).
    :type: Boolean
    :param fRunRepresentative: Run Representative based methods (true indicates run).
    :type: Boolean
    :param fRunExtreme: Run Extreme based methods (true indicates run).
    :type: Boolean
    :param strAlphaMetadata: Metadata id whose values are used as the diversity measurement instead of calculating lsAlphaMetrics.
    :type: String
    :param istmBetaMatrix: File that has a precalculated beta matrix. When given, the custom metric is used
                           for the representative and extreme selections and the metric lists are ignored.
    :type: File stream or File path string
    :param istrmTree: Passed through to the representative and extreme selection methods.
    :type: File stream or File path string
    :param istrmEnvr: Passed through to the representative and extreme selection methods.
    :type: File stream or File path string
    :param fInvertDiversity: When true each diversity value d is replaced by 1/max(d, ConstantsMicropita.c_smallNumber) before ranking.
    :type: Boolean
    :return Selected Samples: Samples selected by methods (the dictSelectedSamples object that was passed in).
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    #Sample ids/names
    lsSampleNames = abndData.funcGetSampleNames()

    #Generate alpha metrics and get most diverse
    if fRunDiversity:

        #Name of technique
        lsMethodNames = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics

        if strAlphaMetadata:
            #If given an alpha-diversity metadata, use its values as the only row of measurements
            internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
        else:
            #Expects Observations (Taxa (row) x sample (column))
            #A summed table is first reduced to its terminal nodes
            #Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
            npaSampleAbundance = (abndData.funcGetAbundanceCopy()
                                  if not abndData.funcIsSummed()
                                  else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy())
            internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance=npaSampleAbundance,
                                                                     lsSampleNames=lsSampleNames,
                                                                     lsDiversityMetricAlpha=lsAlphaMetrics)

        if internalAlphaMatrix:
            #Invert measurements
            #1.0 forces float division so integer measurements are not truncated to 0 on Python 2
            if fInvertDiversity:
                internalAlphaMatrix = [[1.0 / max(dValue, ConstantsMicropita.c_smallNumber) for dValue in lsLine]
                                       for lsLine in internalAlphaMatrix]

            #Get top ranked alpha diversity by most diverse
            #Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
            #Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
            mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames,
                                                                          iTopAmount=iSampleSelectionCount)

            #Add to results, one entry per method name (indexing keeps the IndexError if the result is too short)
            for index, strMetric in enumerate(lsMethodNames):
                strSelectionMethod = self.dictConvertAMetricDiversity.get(strMetric, ConstantsMicropita.c_strDiversity + "=" + strMetric)
                dictSelectedSamples.setdefault(strSelectionMethod, []).extend(mostDiverseAlphaSamplesIndexes[index])

    #NOTE(review): the indentation of the source was lost; these log calls are placed at function level - confirm
    logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
    logging.info(dictSelectedSamples)

    #Generate beta metrics and select
    if fRunRepresentative or fRunExtreme:

        #Abundance matrix transposed
        npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)

        #Get center selection using clusters/tiling
        #This will be for beta metrics in normalized space
        if fRunRepresentative:

            if istmBetaMatrix:
                #Get representative dissimilarity samples from the precalculated matrix
                medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom,
                                                                     lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount,
                                                                     istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                if medoidSamples:
                    dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative + "=" + ConstantsMicropita.c_custom, []).extend(medoidSamples)
            else:
                logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
                for bMetric in lsBetaMetrics:

                    #Get representative dissimilarity samples
                    medoidSamples = self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric,
                                                                         lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount,
                                                                         istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                    if medoidSamples:
                        dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric, ConstantsMicropita.c_strRepresentative + "=" + bMetric), []).extend(medoidSamples)

        #Get extreme selection using clusters, tiling
        if fRunExtreme:
            logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
            if istmBetaMatrix:

                #Samples for representative dissimilarity
                #This involves inverting the distance metric,
                #Taking the dendrogram level of where the number cluster == the number of samples to select
                #Returning a representative sample from each cluster
                extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance,
                                                                         lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount,
                                                                         istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                #Add selected samples
                if extremeSamples:
                    dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme + "=" + ConstantsMicropita.c_custom, []).extend(extremeSamples)

            else:
                #Same selection as above, once for each inverse beta metric in normalized space
                for bMetric in lsInverseBetaMetrics:

                    extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance,
                                                                             lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount,
                                                                             istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)

                    #Add selected samples
                    if extremeSamples:
                        dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric, ConstantsMicropita.c_strExtreme + "=" + bMetric), []).extend(extremeSamples)

    logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
    logging.info(dictSelectedSamples)
    return dictSelectedSamples
800
def funcRun(self, strIDName, strLastMetadataName, istmInput,
            ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
            cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
            istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None,
            strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None,
            iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False):
    """
    Manages the selection of samples given different metrics.

    :param strIDName: Sample Id metadata row
    :type: String
    :param strLastMetadataName: The id of the metadata positioned last in the abundance table.
    :type: String String metadata id.
    :param istmInput: File holding the abundance table to read.
    :type: FileStream or String file path
    :param ostmInputPredictFile: File to store input data to supervised methods.
    :type: FileStream or String file path
    :param ostmPredictFile: File to store distances from supervised methods.
    :type: FileStream or String file path
    :param ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
    :type: FileStream or String file path
    :param ostmOutput: File to store sample selection by methods of interest (not used inside this method).
    :type: FileStream or String file path
    :param cDelimiter: Delimiter of abundance table.
    :type: Character Char (default TAB).
    :param cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
    :type: Character (default |).
    :param strFeatureSelection: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
    :type: String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
    :param istmFeatures: File which holds the features of interest if using targeted feature methodology.
    :type: FileStream (readlines() is called on it)
    :param iCount: Number of samples to select in each methods, supervised methods select this amount per label if possible.
    :type: Integer integer.
    :param lstrMethods: List of strings indicating selection techniques.
    :type: List of string method names
    :param strLastRowMetadata: Passed to the abundance table reader as the last row metadata id.
    :type: String
    :param strLabel: The metadata used for supervised labels.
    :type: String
    :param strStratify: The metadata used to stratify unsupervised data.
    :type: String
    :param strCustomAlpha: Custom alpha diversity metric
    :type: String
    :param strCustomBeta: Custom beta diversity metric
    :type: String
    :param strAlphaMetadata: Metadata id which is a diversity metric to use in highest diversity sampling
    :type: String
    :param istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
    :type: FileStream or String file path
    :param istrmTree: File containing tree for phylogenetic beta-diversity analysis
    :type: FileStream or String file path
    :param istrmEnvr: File containing environment for phylogenetic beta-diversity analysis
    :type: FileStream or String file path
    :param iMinSeqs: Minimum sequence in the occurrence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
    :type: Integer
    :param iMinSamples: Minimum sample count for the occurrence filter.
    :type: Integer
    :param fInvertDiversity: When true will invert diversity measurements before using.
    :type: boolean
    :return Selected Samples: Samples selected by methods, or False when the run could not be performed
                              (missing feature file, unreadable abundance table, label with fewer than 2 values).
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    #Holds the top ranked samples from different metrics
    #dict[metric name] = [samplename,samplename...]
    selectedSamples = dict()

    #Targeted feature selection needs the file holding the features
    fRunFeature = ConstantsMicropita.c_strFeature in lstrMethods
    if fRunFeature and not istmFeatures:
        logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
        return False

    #Diversity metrics to run
    #Use custom metrics if specified
    #Custom beta metrics set to normalized only, custom alpha metrics set to count only
    diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity]
    diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity]
    diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else []
    diversityMetricsBetaNoNormalize = []

    #Perform different flows flags
    fRunDiversity = ConstantsMicropita.c_strDiversity in lstrMethods
    fRunRepresentative = ConstantsMicropita.c_strRepresentative in lstrMethods
    fRunExtreme = ConstantsMicropita.c_strExtreme in lstrMethods
    fRunRandom = ConstantsMicropita.c_strRandom in lstrMethods
    fRunDistinct = ConstantsMicropita.c_strDistinct in lstrMethods
    fRunDiscriminant = ConstantsMicropita.c_strDiscriminant in lstrMethods
    fRunNormalizeSensitive = fRunDiversity or fRunRepresentative or fRunExtreme

    #Targeted taxa
    #Read in taxa list, break down to lines and filter out empty strings
    #Built as a list (not filter()) because it is used again for every stratified table
    userDefinedTaxa = []
    if fRunFeature:
        userDefinedTaxa = [sTaxon for sTaxon in (sLine.strip() for sLine in istmFeatures.readlines()) if sTaxon]

    #Read in abundance data
    #Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
    #Abundance table object to read in and manage data
    totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples],
                                                          cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
                                                          sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
    if not totalAbundanceTable:
        logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+
                      " This often occurs when the Last Metadata is not specified correctly."+
                      " Please check to make sure the Last Metadata selection is the row of the last metadata,"+
                      " all values after this selection should be microbial measurements and should be numeric.")
        return False

    lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel

    dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
    logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata))
    #If there is only 1 unique value for the labels, do not run the Supervised methods
    if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ):
        logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[])))
        return False

    #Run unsupervised methods###
    #Stratify the data if need be and drop the old data
    lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable]

    #For each stratified abundance block or for the unstratified abundance
    #Run the unsupervised blocks
    #NOTE(review): the indentation of the source was lost; the nesting below is reconstructed - confirm
    fAppendSupFiles = False
    for stratAbundanceTable in lStratifiedAbundanceTables:
        logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName())

        ###NOT SUMMED, NOT NORMALIZED
        #Only perform if the data is not yet normalized
        #The precalculated beta matrix is not handed to this pass
        if not stratAbundanceTable.funcIsNormalized( ) and fRunNormalizeSensitive:
            self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
                                                   dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
                                                   lsBetaMetrics=diversityMetricsBetaNoNormalize,
                                                   lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
                                                   fRunDiversity=fRunDiversity, fRunRepresentative=fRunRepresentative,
                                                   fRunExtreme=fRunExtreme, strAlphaMetadata=strAlphaMetadata,
                                                   istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)

        #SUMMED AND NORMALIZED
        stratAbundanceTable.funcSumClades()
        #Normalize data at this point
        stratAbundanceTable.funcNormalize()

        #Generate selection by the rank average of user defined taxa
        #Expects (Taxa (row) by Samples (column))
        #Expects a column 0 of taxa id that is skipped
        if fRunFeature:
            #NOTE(review): this assignment replaces the selection of an earlier stratum, the
            #diversity/representative/extreme results are accumulated instead - confirm this is intended
            selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
                                                                   lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
            logging.info("MicroPITA.funcRun:: Selected Samples Rank")
            logging.info(selectedSamples)

        ###SUMMED AND NORMALIZED analysis block
        #Diversity based metric will move reduce to terminal taxa as needed
        if fRunNormalizeSensitive:
            self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
                                                   dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
                                                   lsBetaMetrics=diversityMetricsBeta,
                                                   lsInverseBetaMetrics=diversityMetricsBeta,
                                                   fRunDiversity=fRunDiversity, fRunRepresentative=fRunRepresentative,
                                                   fRunExtreme=fRunExtreme,
                                                   istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)

        #5::Select randomly
        #Expects sampleNames = List of sample names [name, name, name...]
        if fRunRandom:
            #Select randomly from sample names
            #NOTE(review): like the feature selection this replaces the selection of an earlier stratum - confirm
            selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
            logging.info("MicroPITA.funcRun:: Selected Samples Random")
            logging.info(selectedSamples)

        #Perform supervised selection
        if (fRunDistinct or fRunDiscriminant) and strLabel:
            dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
                                        fRunDistinct=fRunDistinct, fRunDiscriminant=fRunDiscriminant,
                                        xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile,
                                        strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
                                        lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(),
                                        lsOriginalLabels = lsOriginalLabels,
                                        fAppendFiles=fAppendSupFiles)

            for sKey, lValue in dictSelectionRet.items():
                selectedSamples.setdefault(sKey,[]).extend(lValue)

            #The supervised files are appended to from the second run on
            fAppendSupFiles = True
            logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised")
            logging.info(selectedSamples)
    return selectedSamples
998
999 #Testing: Happy path tested
@staticmethod
def funcWriteSelectionToFile(dictSelection,xOutputFilePath):
    """
    Writes the selection of samples by method to an output file.

    One row is written per method: the method name followed by its selected samples,
    delimited by ConstantsMicropita.c_outputFileDelim. Nothing is written (and no file
    is opened) when dictSelection is empty. A file opened here from a path is closed
    before returning; a stream handed in by the caller is left open.

    :param dictSelection: The dictionary of selections by method to be written to a file.
    :type: Dictionary The dictionary of selections by method {"method":["sample selected","sample selected"...]}
    :param xOutputFilePath: FileStream or String path to file in which the dictionary is written.
    :type: String FileStream or String path to file
    """

    if not dictSelection:
        return

    #Open file (only a path is opened, a stream is used as given)
    fOpenedHere = isinstance(xOutputFilePath, str)
    ostmSelection = open(xOutputFilePath,"w") if fOpenedHere else xOutputFilePath
    try:
        f = csv.writer(ostmSelection, delimiter=ConstantsMicropita.c_outputFileDelim )

        #Create output content from dictionary
        for sKey in dictSelection:
            f.writerow([sKey]+dictSelection[sKey])
            logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey]))
    finally:
        #Close what was opened here so the rows are flushed to disk
        if fOpenedHere:
            ostmSelection.close()
1021
1022 #Testing: Happy Path tested
@staticmethod
def funcReadSelectionFileToDictionary(xInputFile):
    """
    Reads in an output selection file from micropita and formats it into a dictionary.

    The first field of each row is used as the method name and the remaining fields as
    its selected samples. Rows without any field (blank lines) are skipped. A file opened
    here from a path is closed before returning; a stream handed in by the caller is left open.

    :param xInputFile: String path to file or file stream to read and translate into a dictionary.
                       {"method":["sample selected","sample selected"...]}
    :type: FileStream or String Path to file
    :return Dictionary: Samples selected by methods.
            Dictionary {"Selection Method":["SampleID","SampleID","SampleID",...]}
    """

    #Open file (only a path is opened, a stream is used as given)
    fOpenedHere = isinstance(xInputFile, str)
    istmSelection = open(xInputFile,'r') if fOpenedHere else xInputFile
    try:
        istmReader = csv.reader(istmSelection, delimiter = ConstantsMicropita.c_outputFileDelim)

        #Dictionary to hold selection data
        #An empty row would otherwise fail on lsLine[0]
        return dict((lsLine[0], lsLine[1:]) for lsLine in istmReader if lsLine)
    finally:
        if fOpenedHere:
            istmSelection.close()
1040
#Command line interface of microPITA
#argp is the parser used by _main; args is reused as the handle of the group currently being filled
argp = argparse.ArgumentParser(
    prog = "MicroPITA.py",
    description = """Selects samples from abundance tables based on various selection schemes.""" )

#Common: how many samples and which selection methods
args = argp.add_argument_group( "Common", "Commonly modified options" )
args.add_argument(
    ConstantsMicropita.c_strCountArgument, "--num",
    dest = "iCount", metavar = "samples", default = 10, type = int,
    help = ConstantsMicropita.c_strCountHelp)
#May be given several times, each use appends one method
args.add_argument(
    "-m", "--method",
    dest = "lstrMethods", metavar = "method", default = [],
    help = ConstantsMicropita.c_strSelectionTechniquesHelp,
    choices = ConstantsMicropita.c_lsAllMethods, action = "append")

#Custom: custom metrics and precalculated inputs
args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" )
args.add_argument(
    "-a", "--alpha",
    dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None,
    help = ConstantsMicropita.c_strCustomAlphaDiversityHelp,
    choices = Metric.setAlphaDiversities)
args.add_argument(
    "-b", "--beta",
    dest = "strBetaDiversity", metavar = "BetaDiversity", default = None,
    help = ConstantsMicropita.c_strCustomBetaDiversityHelp,
    choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted])
args.add_argument(
    "-q", "--alphameta",
    dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None,
    help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp)
args.add_argument(
    "-x", "--betamatrix",
    dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None,
    help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp)
args.add_argument(
    "-o", "--tree",
    dest = "istrmTree", metavar = "PhylogeneticTree", default = None,
    help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp)
args.add_argument(
    "-i", "--envr",
    dest = "istrmEnvr", metavar = "EnvironmentFile", default = None,
    help = ConstantsMicropita.c_strCustomEnvironmentFileHelp)
args.add_argument(
    "-f", "--invertDiversity",
    dest = "fInvertDiversity", action = "store_true", default = False,
    help = ConstantsMicropita.c_strInvertDiversityHelp)

#Miscellaneous: identifiers of rows/columns and targeted features
args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" )
args.add_argument(
    "-d", ConstantsMicropita.c_strIDNameArgument,
    dest = "strIDName", metavar = "sample_id",
    help = ConstantsMicropita.c_strIDNameHelp)
args.add_argument(
    "-l", ConstantsMicropita.c_strLastMetadataNameArgument,
    dest = "strLastMetadataName", metavar = "metadata_id", default = None,
    help = ConstantsMicropita.c_strLastMetadataNameHelp)
args.add_argument(
    "-r", ConstantsMicropita.c_strTargetedFeatureMethodArgument,
    dest = "strFeatureSelection", metavar = "targeting_method",
    default = ConstantsMicropita.lsTargetedFeatureMethodValues[0],
    choices = ConstantsMicropita.lsTargetedFeatureMethodValues,
    help = ConstantsMicropita.c_strTargetedFeatureMethodHelp)
args.add_argument(
    "-t", ConstantsMicropita.c_strTargetedSelectionFileArgument,
    dest = "istmFeatures", metavar = "feature_file", type = argparse.FileType("rU"),
    help = ConstantsMicropita.c_strTargetedSelectionFileHelp)
args.add_argument(
    "-w", ConstantsMicropita.c_strFeatureMetadataArgument,
    dest = "strLastFeatureMetadata", metavar = "Last_Feature_Metadata", default = None,
    help = ConstantsMicropita.c_strFeatureMetadataHelp)

#Data labeling: metadata used for supervised labels and for stratification
args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" )
args.add_argument(
    "-e", ConstantsMicropita.c_strSupervisedLabelArgument,
    dest = "strLabel", metavar = "supervised_id",
    help = ConstantsMicropita.c_strSupervisedLabelHelp)
args.add_argument(
    "-s", ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument,
    dest = "strUnsupervisedStratify", metavar = "stratify_id",
    help = ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp)

#File formatting: delimiters of the table and of the feature names
args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" )
args.add_argument(
    "-j", ConstantsMicropita.c_strFileDelimiterArgument,
    dest = "cFileDelimiter", metavar = "column_delimiter", default = "\t",
    help = ConstantsMicropita.c_strFileDelimiterHelp)
args.add_argument(
    "-k", ConstantsMicropita.c_strFeatureNameDelimiterArgument,
    dest = "cFeatureNameDelimiter", metavar = "taxonomy_delimiter", default = "|",
    help = ConstantsMicropita.c_strFeatureNameDelimiterHelp)

#Debugging: logging and intermediate output files (opened for writing by argparse)
args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" )
args.add_argument(
    "-v", ConstantsMicropita.c_strLoggingArgument,
    dest = "strLogLevel", metavar = "log_level", default = "WARNING",
    choices = ConstantsMicropita.c_lsLoggingChoices,
    help = ConstantsMicropita.c_strLoggingHelp)
args.add_argument(
    "-c", ConstantsMicropita.c_strCheckedAbundanceFileArgument,
    dest = "ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"),
    help = ConstantsMicropita.c_strCheckedAbundanceFileHelp)
args.add_argument(
    "-g", ConstantsMicropita.c_strLoggingFileArgument,
    dest = "ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"),
    help = ConstantsMicropita.c_strLoggingFileHelp)
args.add_argument(
    "-u", ConstantsMicropita.c_strSupervisedInputFile,
    dest = "ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"),
    help = ConstantsMicropita.c_strSupervisedInputFileHelp)
args.add_argument(
    "-p", ConstantsMicropita.c_strSupervisedPredictedFile,
    dest = "ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"),
    help = ConstantsMicropita.c_strSupervisedPredictedFileHelp)

#Positional arguments: the abundance table to read and the file receiving the selection
#NOTE(review): no nargs is given, so these defaults presumably never apply - confirm against the argparse documentation
argp.add_argument(
    "istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"),
    help = ConstantsMicropita.c_strAbundanceFileHelp,
    default = sys.stdin)
argp.add_argument(
    "ostmOutput", metavar = "output.txt", type = argparse.FileType("w"),
    help = ConstantsMicropita.c_strGenericOutputDataFileHelp,
    default = sys.stdout)

#Put the generated usage text in front of the module docstring
__doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__
1091
def _main( ):
    """
    Command line entry point: parses the arguments, sets up logging, runs the sample
    selection and writes the selection to the output file.

    :return: -1 when the analysis did not return a selection, otherwise None.
    """
    args = argp.parse_args( )

    #Set up logger, logging to the given file or to standard error
    iLogLevel = getattr(logging, args.strLogLevel.upper(), None)
    ostmLog = args.ostmLoggingFile or sys.stderr
    logging.basicConfig(stream = ostmLog, filemode = 'w', level=iLogLevel)

    #Run micropita
    logging.info("MicroPITA:: Start microPITA")
    microPITA = MicroPITA()

    #Argparse will append to the default but will not remove the default so I do this here
    if len(args.lstrMethods) == 0:
        args.lstrMethods = [ConstantsMicropita.c_strRepresentative]

    #Map the parsed command line values to the parameters of funcRun
    dictRunArguments = dict(
        strIDName = args.strIDName,
        strLastMetadataName = args.strLastMetadataName,
        istmInput = args.istmInput,
        ostmInputPredictFile = args.ostmInputPredictFile,
        ostmPredictFile = args.ostmPredictFile,
        ostmCheckedFile = args.ostmCheckedFile,
        ostmOutput = args.ostmOutput,
        cDelimiter = args.cFileDelimiter,
        cFeatureNameDelimiter = args.cFeatureNameDelimiter,
        istmFeatures = args.istmFeatures,
        strFeatureSelection = args.strFeatureSelection,
        iCount = args.iCount,
        strLastRowMetadata = args.strLastFeatureMetadata,
        strLabel = args.strLabel,
        strStratify = args.strUnsupervisedStratify,
        strCustomAlpha = args.strAlphaDiversity,
        strCustomBeta = args.strBetaDiversity,
        strAlphaMetadata = args.strAlphaMetadata,
        istmBetaMatrix = args.istmBetaMatrix,
        istrmTree = args.istrmTree,
        istrmEnvr = args.istrmEnvr,
        lstrMethods = args.lstrMethods,
        fInvertDiversity = args.fInvertDiversity)
    dictSelectedSamples = microPITA.funcRun(**dictRunArguments)

    #An empty or False result means nothing was selected
    if not dictSelectedSamples:
        logging.error("MicroPITA:: Error, did not get a result from analysis.")
        return -1
    logging.info("End microPITA")

    #Log output for debugging
    logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples))

    #Write selection to file
    microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput)
1143
#Run as a script
#Hand the result of _main to sys.exit so a failed run (-1) gives a non-zero exit status;
#a successful run returns None which exits with status 0
if __name__ == "__main__":
    sys.exit(_main( ))