| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 """
 | 
| 
 | 
     3 Author: Timothy Tickle
 | 
| 
 | 
     4 Description: Class to Run analysis for the microPITA paper
 | 
| 
 | 
     5 """
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 #####################################################################################
 | 
| 
 | 
     8 #Copyright (C) <2012>
 | 
| 
 | 
     9 #
 | 
| 
 | 
    10 #Permission is hereby granted, free of charge, to any person obtaining a copy of
 | 
| 
 | 
    11 #this software and associated documentation files (the "Software"), to deal in the
 | 
| 
 | 
    12 #Software without restriction, including without limitation the rights to use, copy,
 | 
| 
 | 
    13 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 | 
| 
 | 
    14 #and to permit persons to whom the Software is furnished to do so, subject to
 | 
| 
 | 
    15 #the following conditions:
 | 
| 
 | 
    16 #
 | 
| 
 | 
    17 #The above copyright notice and this permission notice shall be included in all copies
 | 
| 
 | 
    18 #or substantial portions of the Software.
 | 
| 
 | 
    19 #
 | 
| 
 | 
    20 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 | 
| 
 | 
    21 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
 | 
| 
 | 
    22 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 | 
| 
 | 
    23 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 | 
| 
 | 
    24 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
| 
 | 
    25 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
| 
 | 
    26 #####################################################################################
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 __author__ = "Timothy Tickle"
 | 
| 
 | 
    29 __copyright__ = "Copyright 2012"
 | 
| 
 | 
    30 __credits__ = ["Timothy Tickle"]
 | 
| 
 | 
    31 __license__ = "MIT"
 | 
| 
 | 
    32 __maintainer__ = "Timothy Tickle"
 | 
| 
 | 
    33 __email__ = "ttickle@sph.harvard.edu"
 | 
| 
 | 
    34 __status__ = "Development"
 | 
| 
 | 
    35 
 | 
| 
 | 
    36 import sys
 | 
| 
 | 
    37 import argparse
 | 
| 
 | 
    38 from src.breadcrumbs.src.AbundanceTable import AbundanceTable
 | 
| 
 | 
    39 from src.breadcrumbs.src.ConstantsBreadCrumbs import ConstantsBreadCrumbs
 | 
| 
 | 
    40 from src.breadcrumbs.src.Metric import Metric
 | 
| 
 | 
    41 from src.breadcrumbs.src.KMedoids import Kmedoids
 | 
| 
 | 
    42 from src.breadcrumbs.src.MLPYDistanceAdaptor import MLPYDistanceAdaptor
 | 
| 
 | 
    43 from src.breadcrumbs.src.SVM import SVM
 | 
| 
 | 
    44 from src.breadcrumbs.src.UtilityMath import UtilityMath
 | 
| 
 | 
    45 
 | 
| 
 | 
    46 from src.ConstantsMicropita import ConstantsMicropita
 | 
| 
 | 
    47 import csv
 | 
| 
 | 
    48 import logging
 | 
| 
 | 
    49 import math
 | 
| 
 | 
    50 import mlpy
 | 
| 
 | 
    51 import numpy as np
 | 
| 
 | 
    52 import operator
 | 
| 
 | 
    53 import os
 | 
| 
 | 
    54 import random
 | 
| 
 | 
    55 import scipy.cluster.hierarchy as hcluster
 | 
| 
 | 
    56 import scipy.spatial.distance
 | 
| 
 | 
    57 from types import *
 | 
| 
 | 
    58 
 | 
| 
 | 
    59 class MicroPITA:
 | 
| 
 | 
    60 	"""
 | 
| 
 | 
    61 	Selects samples from a first tier of a multi-tiered study to be used in a second tier.
 | 
| 
 | 
    62 	Different methods can be used for selection.
 | 
| 
 | 
    63 	The expected input is an abundance table (and potentially a text file of targeted features,
 | 
| 
 | 
    64 	if using the targeted features option). Output is a list of samples exhibiting the
 | 
| 
 | 
    65 	characteristics of interest.
 | 
| 
 | 
    66 	"""
 | 
| 
 | 
    67 
 | 
| 
 | 
	#Constants
	#Diversity metrics Alpha
	#These are aliases of the names defined on Metric so they can be reached through MicroPITA
	c_strInverseSimpsonDiversity = Metric.c_strInvSimpsonDiversity
	c_strChao1Diversity = Metric.c_strChao1Diversity

	#Diversity metrics Beta
	c_strBrayCurtisDissimilarity = Metric.c_strBrayCurtisDissimilarity

	#Additive inverses of diversity metrics beta
	c_strInvBrayCurtisDissimilarity = Metric.c_strInvBrayCurtisDissimilarity

	#Technique Names
	#Note: this assigns an attribute on ConstantsMicropita itself (not on MicroPITA)
	#at the moment this class body is executed.
	ConstantsMicropita.c_strDiversity2 = ConstantsMicropita.c_strDiversity+"_C"

	#Targeted feature settings
	c_strTargetedRanked = ConstantsMicropita.c_strTargetedRanked
	c_strTargetedAbundance = ConstantsMicropita.c_strTargetedAbundance

	#Technique groupings
#	c_lsDiversityMethods = [ConstantsMicropita.c_strDiversity,ConstantsMicropita.c_strDiversity2]

	#Converts ecology metrics into standardized method selection names
	#Alpha metric name -> diversity technique name
	dictConvertAMetricDiversity = {c_strInverseSimpsonDiversity:ConstantsMicropita.c_strDiversity, c_strChao1Diversity:ConstantsMicropita.c_strDiversity2}
#	dictConvertMicroPITAToAMetric = {ConstantsMicropita.c_strDiversity:c_strInverseSimpsonDiversity, ConstantsMicropita.c_strDiversity2:c_strChao1Diversity}
	#Beta metric name -> representative technique name
	dictConvertBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strRepresentative}
	#NOTE(review): keyed by the plain Bray-Curtis name, not c_strInvBrayCurtisDissimilarity - confirm this is intended
	dictConvertInvBMetricToMethod = {c_strBrayCurtisDissimilarity:ConstantsMicropita.c_strExtreme}

	#Linkage used in the Hierarchical clustering
	#Passed as the method argument of hcluster.linkage in funcSelectExtremeSamplesFromHClust
	c_strHierarchicalClusterMethod = 'average'
 | 
| 
 | 
    97 
 | 
| 
 | 
    98 ####Group 1## Diversity
 | 
| 
 | 
    99 	#Testing: Happy path Testing (8)
 | 
| 
 | 
   100 	def funcGetTopRankedSamples(self, lldMatrix = None, lsSampleNames = None, iTopAmount = None):
 | 
| 
 | 
   101 		"""
 | 
| 
 | 
   102 		Given a list of lists of measurements, for each list the indices of the highest values are returned. If lsSamplesNames is given
 | 
| 
 | 
   103 			it is treated as a list of string names that is in the order of the measurements in each list. Indices are returned or the sample
 | 
| 
 | 
   104 			names associated with the indices.
 | 
| 
 | 
   105 		
 | 
| 
 | 
   106 		:param	lldMatrix:	List of lists [[value,value,value,value],[value,value,value,value]].
 | 
| 
 | 
   107 		:type:	List of lists	List of measurements. Each list is a different measurement. Each measurement in positionally related to a sample.
 | 
| 
 | 
   108 		:param	lsSampleNames:	List of sample names positionally related (the same) to each list (Optional).
 | 
| 
 | 
   109 		:type:	List of strings	List of strings.
 | 
| 
 | 
   110 		:param	iTopAmount:	The amount of top measured samples (assumes the higher measurements are better).
 | 
| 
 | 
   111 		:type:	integer	Integer amount of sample names/ indices to return.
 | 
| 
 | 
   112 		:return	List:	List of samples to be selected.
 | 
| 
 | 
   113 		"""
 | 
| 
 | 
   114 		topRankListRet = []
 | 
| 
 | 
   115 		for rowMetrics in lldMatrix:
 | 
| 
 | 
   116 			#Create 2 d array to hold value and index and sort
 | 
| 
 | 
   117 			liIndexX = [rowMetrics,range(len(rowMetrics))]
 | 
| 
 | 
   118 			liIndexX[1].sort(key = liIndexX[0].__getitem__,reverse = True)
 | 
| 
 | 
   119 
 | 
| 
 | 
   120 			if lsSampleNames:
 | 
| 
 | 
   121 				topRankListRet.append([lsSampleNames[iIndex] for iIndex in liIndexX[1][:iTopAmount]])
 | 
| 
 | 
   122 			else:
 | 
| 
 | 
   123 				topRankListRet.append(liIndexX[1][:iTopAmount])
 | 
| 
 | 
   124 
 | 
| 
 | 
   125 		return topRankListRet
 | 
| 
 | 
   126 	
 | 
| 
 | 
   127 	####Group 2## Representative Dissimilarity
 | 
| 
 | 
   128 	#Testing: Happy path tested 1
 | 
| 
 | 
   129 	def funcGetCentralSamplesByKMedoids(self, npaMatrix=None, sMetric=None, lsSampleNames=None, iNumberSamplesReturned=0, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
 | 
| 
 | 
   130 		"""
 | 
| 
 | 
   131 		Gets centroid samples by k-medoids clustering of a given matrix.
 | 
| 
 | 
   132 		
 | 
| 
 | 
   133 		:param	npaMatrix:	Numpy array where row=features and columns=samples
 | 
| 
 | 
   134 		:type:	Numpy array	Abundance Data.
 | 
| 
 | 
   135 		:param	sMetric:	String name of beta metric used as the distance metric.
 | 
| 
 | 
   136 		:type:	String	String name of beta metric.
 | 
| 
 | 
   137 		:param	lsSampleNames:	The names of the sample
 | 
| 
 | 
   138 		:type:	List	List of strings
 | 
| 
 | 
   139 		:param	iNumberSamplesReturned:	Number of samples to return, each will be a centroid of a sample.
 | 
| 
 | 
   140 		:type:	Integer	Number of samples to return
 | 
| 
 | 
   141 		:return	List:	List of selected samples.
 | 
| 
 | 
   142 		:param	istmBetaMatrix: File with beta-diversity matrix
 | 
| 
 | 
   143 		:type:	File stream or file path string
 | 
| 
 | 
   144 		"""
 | 
| 
 | 
   145 
 | 
| 
 | 
   146 		#Count of how many rows
 | 
| 
 | 
   147 		sampleCount = npaMatrix.shape[0]
 | 
| 
 | 
   148 		if iNumberSamplesReturned > sampleCount:
 | 
| 
 | 
   149 			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: There are not enough samples to return the amount of samples specified. Return sample count = "+str(iNumberSamplesReturned)+". Sample number = "+str(sampleCount)+".")
 | 
| 
 | 
   150 			return False
 | 
| 
 | 
   151 
 | 
| 
 | 
   152 		#If the cluster count is equal to the sample count return all samples
 | 
| 
 | 
   153 		if sampleCount == iNumberSamplesReturned:
 | 
| 
 | 
   154 			return list(lsSampleNames)
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 		#Get distance matrix
 | 
| 
 | 
   157 		distanceMatrix=scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaMatrix, sMetric=sMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames)
 | 
| 
 | 
   158 		if type(distanceMatrix) is BooleanType:
 | 
| 
 | 
   159 			logging.error("MicroPITA.funcGetCentralSamplesByKMedoids:: Could not read in the supplied distance matrix, returning false.")
 | 
| 
 | 
   160 			return False
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 		# Handle unifrac output
 | 
| 
 | 
   163 		if sMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
 | 
| 
 | 
   164 			distanceMatrix = distanceMatrix[0]
 | 
| 
 | 
   165 	
 | 
| 
 | 
   166 		#Log distance matrix
 | 
| 
 | 
   167 		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Distance matrix for representative selection using metric="+str(sMetric))
 | 
| 
 | 
   168 	
 | 
| 
 | 
   169 		distance = MLPYDistanceAdaptor(npaDistanceMatrix=distanceMatrix, fIsCondensedMatrix=True)
 | 
| 
 | 
   170 	
 | 
| 
 | 
   171 		#Create object to determine clusters/medoids
 | 
| 
 | 
   172 		medoidsMaker = Kmedoids(k=iNumberSamplesReturned, dist=distance)
 | 
| 
 | 
   173 		#medoidsData includes(1d numpy array, medoids indexes; 
 | 
| 
 | 
   174 		#			  1d numpy array, non-medoids indexes;
 | 
| 
 | 
   175 		#			  1d numpy array, cluster membership for non-medoids;
 | 
| 
 | 
   176 		#			  double, cost of configuration)
 | 
| 
 | 
   177 		#npaMatrix is samples x rows
 | 
| 
 | 
   178 		#Build a matrix of lists of indicies to pass to the distance matrix
 | 
| 
 | 
   179 		lliIndicesMatrix = [[iIndexPosition] for iIndexPosition in xrange(0,len(npaMatrix))]
 | 
| 
 | 
   180 		medoidsData = medoidsMaker.compute(np.array(lliIndicesMatrix))
 | 
| 
 | 
   181 		logging.debug("MicroPITA.funcGetCentralSamplesByKMedoids:: Results from the kmedoid method in representative selection:")
 | 
| 
 | 
   182 		logging.debug(str(medoidsData))
 | 
| 
 | 
   183 	
 | 
| 
 | 
   184 		#If returning the same amount of clusters and samples
 | 
| 
 | 
   185 		#Return centroids
 | 
| 
 | 
   186 		selectedIndexes = medoidsData[0]
 | 
| 
 | 
   187 		return [lsSampleNames[selectedIndexes[index]] for index in xrange(0,iNumberSamplesReturned)]
 | 
| 
 | 
   188 	
 | 
| 
 | 
   189 	####Group 3## Highest Dissimilarity
 | 
| 
 | 
   190 	#Testing: Happy path tested
 | 
| 
 | 
   191 	def funcSelectExtremeSamplesFromHClust(self, strBetaMetric, npaAbundanceMatrix, lsSampleNames, iSelectSampleCount, istmBetaMatrix=None, istrmTree=None, istrmEnvr=None):
 | 
| 
 | 
   192 		"""
 | 
| 
 | 
   193 		Select extreme samples from HClustering.
 | 
| 
 | 
   194 		
 | 
| 
 | 
   195 		:param	strBetaMetric:	The beta metric to use for distance matrix generation.
 | 
| 
 | 
   196 		:type:	String	The name of the beta metric to use.
 | 
| 
 | 
   197 		:param	npaAbundanceMatrix:	Numpy array where row=samples and columns=features.
 | 
| 
 | 
   198 		:type:	Numpy Array	Abundance data.
 | 
| 
 | 
   199 		:param	lsSampleNames:	The names of the sample.
 | 
| 
 | 
   200 		:type:	List	List of strings.
 | 
| 
 | 
   201 		:param	iSelectSampleCount:	Number of samples to select (return).
 | 
| 
 | 
   202 		:type:	Integer	Integer number of samples returned.
 | 
| 
 | 
   203 		:return	Samples:	List of samples.
 | 
| 
 | 
   204 		:param	istmBetaMatrix: File with beta-diversity matrix
 | 
| 
 | 
   205 		:type:	File stream or file path string
 | 
| 
 | 
   206 		"""
 | 
| 
 | 
   207 	
 | 
| 
 | 
   208 		#If they want all the sample count, return all sample names
 | 
| 
 | 
   209 		iSampleCount=len(npaAbundanceMatrix[:,0])
 | 
| 
 | 
   210 		if iSelectSampleCount==iSampleCount:
 | 
| 
 | 
   211 		  return lsSampleNames
 | 
| 
 | 
   212 	
 | 
| 
 | 
   213 		#Holds the samples to be returned
 | 
| 
 | 
   214 		lsReturnSamplesRet = []
 | 
| 
 | 
   215 	
 | 
| 
 | 
   216 		#Generate beta matrix
 | 
| 
 | 
   217 		#Returns condensed matrix
 | 
| 
 | 
   218 		tempDistanceMatrix = scipy.spatial.distance.squareform(Metric.funcReadMatrixFile(istmMatrixFile=istmBetaMatrix,lsSampleOrder=lsSampleNames)[0]) if istmBetaMatrix else Metric.funcGetBetaMetric(npadAbundancies=npaAbundanceMatrix, sMetric=strBetaMetric, istrmTree=istrmTree, istrmEnvr=istrmEnvr, lsSampleOrder=lsSampleNames, fAdditiveInverse = True)
 | 
| 
 | 
   219 
 | 
| 
 | 
   220 		if strBetaMetric in [Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted]:
 | 
| 
 | 
   221 			tempDistanceMatrix = tempDistanceMatrix[0]
 | 
| 
 | 
   222 
 | 
| 
 | 
   223 		if type(tempDistanceMatrix) is BooleanType:
 | 
| 
 | 
   224 			logging.error("MicroPITA.funcSelectExtremeSamplesFromHClust:: Could not read in the supplied distance matrix, returning false.")
 | 
| 
 | 
   225 			return False
 | 
| 
 | 
   226 
 | 
| 
 | 
   227 		if istmBetaMatrix:
 | 
| 
 | 
   228 			tempDistanceMatrix = 1-tempDistanceMatrix
 | 
| 
 | 
   229 
 | 
| 
 | 
   230 		#Feed beta matrix to linkage to cluster
 | 
| 
 | 
   231 		#Send condensed matrix
 | 
| 
 | 
   232 		linkageMatrix = hcluster.linkage(tempDistanceMatrix, method=self.c_strHierarchicalClusterMethod)
 | 
| 
 | 
   233 	
 | 
| 
 | 
   234 		#Extract cluster information from dendrogram
 | 
| 
 | 
   235 		#The linakge matrix is of the form
 | 
| 
 | 
   236 		#[[int1 int2 doube int3],...]
 | 
| 
 | 
   237 		#int1 and int1 are the paired samples indexed at 0 and up.
 | 
| 
 | 
   238 		#each list is an entry for a branch that is number starting with the first
 | 
| 
 | 
   239 		#list being sample count index + 1
 | 
| 
 | 
   240 		#each list is then named by an increment as they appear
 | 
| 
 | 
   241 		#this means that if a number is in the list and is = sample count or greater it is not
 | 
| 
 | 
   242 		#terminal and is instead a branch.
 | 
| 
 | 
   243 		#This method just takes the lowest metric measurement (highest distance pairs/clusters)
 | 
| 
 | 
   244 		#Works much better than the original technique
 | 
| 
 | 
   245 		#get total number of samples
 | 
| 
 | 
   246 	
 | 
| 
 | 
   247 		iCurrentSelectCount = 0
 | 
| 
 | 
   248 		for row in linkageMatrix:
 | 
| 
 | 
   249 			#Get nodes ofthe lowest pairing (so the furthest apart pair)
 | 
| 
 | 
   250 			iNode1 = int(row[0])
 | 
| 
 | 
   251 			iNode2 = int(row[1])
 | 
| 
 | 
   252 			#Make sure the nodes are a terminal node (sample) and not a branch in the dendrogram
 | 
| 
 | 
   253 			#The branching in the dendrogram will start at the number of samples and increment higher.
 | 
| 
 | 
   254 			#Add each of the pair one at a time breaking when enough samples are selected.
 | 
| 
 | 
   255 			if iNode1<iSampleCount:
 | 
| 
 | 
   256 				lsReturnSamplesRet.append(lsSampleNames[iNode1])
 | 
| 
 | 
   257 				iCurrentSelectCount = iCurrentSelectCount + 1
 | 
| 
 | 
   258 			if iCurrentSelectCount == iSelectSampleCount:
 | 
| 
 | 
   259 				break
 | 
| 
 | 
   260 			if iNode2<iSampleCount:
 | 
| 
 | 
   261 				lsReturnSamplesRet.append(lsSampleNames[iNode2])
 | 
| 
 | 
   262 				iCurrentSelectCount = iCurrentSelectCount + 1
 | 
| 
 | 
   263 			if iCurrentSelectCount == iSelectSampleCount:
 | 
| 
 | 
   264 				break
 | 
| 
 | 
   265 	
 | 
| 
 | 
   266 		#Return selected samples
 | 
| 
 | 
   267 		return lsReturnSamplesRet
 | 
| 
 | 
   268 	
 | 
| 
 | 
   269 	####Group 4## Rank Average of user Defined Taxa
 | 
| 
 | 
   270 		#Testing: Happy Path Tested
 | 
| 
 | 
   271 	def funcGetAverageAbundanceSamples(self, abndTable, lsTargetedFeature, fRank=False):
 | 
| 
 | 
   272 		"""
 | 
| 
 | 
   273 		Averages feature abundance or ranked abundance. Expects a column 0 of taxa id that is skipped.
 | 
| 
 | 
   274 		
 | 
| 
 | 
   275 		:param	abndTable:	Abundance Table to analyse
 | 
| 
 | 
   276 		:type:	AbundanceTable	Abundance Table
 | 
| 
 | 
   277 		:param	lsTargetedFeature:	String names
 | 
| 
 | 
   278 		:type:	list	list of string names of features (bugs) which are measured after ranking against the full sample
 | 
| 
 | 
   279 		:param  fRank:	Indicates to rank the abundance before getting the average abundance of the features (default false)
 | 
| 
 | 
   280 		:type:   boolean	Flag indicating ranking abundance before calculating average feature measurement (false= no ranking)
 | 
| 
 | 
   281 		:return	List of lists or boolean:	List of lists or False on error. One internal list per sample indicating the sample,
 | 
| 
 | 
   282 				feature average abundance or ranked abundance. Lists will already be sorted.
 | 
| 
 | 
   283 				For not Ranked [[sample,average abundance of selected feature,1]]
 | 
| 
 | 
   284 				For Ranked [[sample,average ranked abundance, average abundance of selected feature]]
 | 
| 
 | 
   285 				Error Returns false
 | 
| 
 | 
   286 		"""
 | 
| 
 | 
   287 		
 | 
| 
 | 
   288 		llAbundance = abndTable.funcGetAverageAbundancePerSample(lsTargetedFeature)
 | 
| 
 | 
   289 		if not llAbundance:
 | 
| 
 | 
   290 			logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
 | 
| 
 | 
   291 			return False
 | 
| 
 | 
   292 		#Add a space for ranking if needed
 | 
| 
 | 
   293 		#Not ranked will be [[sSample,average abundance,1]]
 | 
| 
 | 
   294 		#(where 1 will not discriminant ties if used in later functions, so this generalizes)
 | 
| 
 | 
   295 		#Ranked will be [[sSample, average rank, average abundance]]
 | 
| 
 | 
   296 		llRetAbundance = [[llist[0],-1,llist[1]] for llist in llAbundance]
 | 
| 
 | 
   297 		#Rank if needed
 | 
| 
 | 
   298 		if fRank:
 | 
| 
 | 
   299 			abndRanked = abndTable.funcRankAbundance()
 | 
| 
 | 
   300 			if abndRanked == None:
 | 
| 
 | 
   301 				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not rank the abundance table, returned false.")
 | 
| 
 | 
   302 				return False
 | 
| 
 | 
   303 			llRetRank = abndRanked.funcGetAverageAbundancePerSample(lsTargetedFeature)
 | 
| 
 | 
   304 			if not llRetRank:
 | 
| 
 | 
   305 				logging.error("MicroPITA.funcGetAverageAbundanceSamples:: Could not get average ranked abundance, returned false. Make sure the features (bugs) are spelled correctly and in the abundance table.")
 | 
| 
 | 
   306 				return False
 | 
| 
 | 
   307 			dictRanks = dict(llRetRank)
 | 
| 
 | 
   308 			llRetAbundance = [[a[0],dictRanks[a[0]],a[2]] for a in llRetAbundance]
 | 
| 
 | 
   309 			
 | 
| 
 | 
   310 		#Sort first for ties and then for the main feature
 | 
| 
 | 
   311  		if not fRank or ConstantsMicropita.c_fBreakRankTiesByDiversity:
 | 
| 
 | 
   312 			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[2], reverse = not fRank)
 | 
| 
 | 
   313 		if fRank:
 | 
| 
 | 
   314 			llRetAbundance = sorted(llRetAbundance, key = lambda sampleData: sampleData[1], reverse = not fRank)
 | 
| 
 | 
   315 		return llRetAbundance
 | 
| 
 | 
   316 	
 | 
| 
 | 
   317 	#Testing: Happy Path Tested
 | 
| 
 | 
   318 	def funcSelectTargetedTaxaSamples(self, abndMatrix, lsTargetedTaxa, iSampleSelectionCount, sMethod = ConstantsMicropita.lsTargetedFeatureMethodValues[0]):
 | 
| 
 | 
   319 	  """
 | 
| 
 | 
   320 	  Selects samples with the highest ranks or abundance of targeted features.
 | 
| 
 | 
   321 	  If ranked, select the highest abundance for tie breaking
 | 
| 
 | 
   322 	
 | 
| 
 | 
   323 	  :param	abndMatrix:	Abundance table to analyse 
 | 
| 
 | 
   324 	  :type:	AbundanceTable	Abundance table
 | 
| 
 | 
   325 	  :param	lsTargetedTaxa:	List of features
 | 
| 
 | 
   326 	  :type:	list	list of strings
 | 
| 
 | 
   327 	  :param	iSampleSelectionCount:	Number of samples to select
 | 
| 
 | 
   328 	  :type:	integer	integer
 | 
| 
 | 
   329 	  :param	sMethod:	Method to select targeted features
 | 
| 
 | 
   330 	  :type:	string	String (Can be values found in ConstantsMicropita.lsTargetedFeatureMethodValues)
 | 
| 
 | 
   331 	  :return	List of strings:	List of sample names which were selected
 | 
| 
 | 
   332 	  List of strings	Empty list is returned on an error.
 | 
| 
 | 
   333 	  """
 | 
| 
 | 
   334 	
 | 
| 
 | 
   335 	  #Check data
 | 
| 
 | 
   336 	  if(len(lsTargetedTaxa) < 1):
 | 
| 
 | 
   337 		logging.error("MicroPITA.funcSelectTargetedTaxaSamples. Taxa defined selection was requested but no features were given.")
 | 
| 
 | 
   338 		return []
 | 
| 
 | 
   339 
 | 
| 
 | 
   340 	  lsTargetedSamples = self.funcGetAverageAbundanceSamples(abndTable=abndMatrix, lsTargetedFeature=lsTargetedTaxa,
 | 
| 
 | 
   341 	  	fRank=sMethod.lower() == self.c_strTargetedRanked.lower())
 | 
| 
 | 
   342 	  #If an error occured or the key word for the method was not recognized
 | 
| 
 | 
   343 	  if lsTargetedSamples == False: 
 | 
| 
 | 
   344 		  logging.error("MicroPITA.funcSelectTargetedTaxaSamples:: Was not able to select for the features given. So targeted feature selection was performed. Check to make sure the features are spelled correctly and exist in the abundance file.")
 | 
| 
 | 
   345 		  return []
 | 
| 
 | 
   346 	
 | 
| 
 | 
   347 	  #Select from results
 | 
| 
 | 
   348 	  return [sSample[0] for sSample in lsTargetedSamples[:iSampleSelectionCount]]
 | 
| 
 | 
   349 	
 | 
| 
 | 
   350 	####Group 5## Random
 | 
| 
 | 
   351 	#Testing: Happy path Tested
 | 
| 
 | 
   352 	def funcGetRandomSamples(self, lsSamples=None, iNumberOfSamplesToReturn=0):
 | 
| 
 | 
   353 		"""
 | 
| 
 | 
   354 		Returns random sample names of the number given. No replacement.
 | 
| 
 | 
   355 		
 | 
| 
 | 
   356 		:param	lsSamples:	List of sample names 
 | 
| 
 | 
   357 		:type:	list	list of strings
 | 
| 
 | 
   358 		:param	iNumberOfSamplesToReturn:	Number of samples to select
 | 
| 
 | 
   359 		:type:	integer	integer.
 | 
| 
 | 
   360 		:return	List:	List of selected samples (strings).
 | 
| 
 | 
   361 		"""
 | 
| 
 | 
   362 
 | 
| 
 | 
   363 		#Input matrix sample count
 | 
| 
 | 
   364 		sampleCount = len(lsSamples)
 | 
| 
 | 
   365 
 | 
| 
 | 
   366 		#Return the full matrix if they ask for a return matrix where length == original
 | 
| 
 | 
   367 		if(iNumberOfSamplesToReturn >= sampleCount):
 | 
| 
 | 
   368 			return lsSamples
 | 
| 
 | 
   369 	
 | 
| 
 | 
   370 		#Get the random indices for the sample (without replacement)
 | 
| 
 | 
   371 		liRandomIndices = random.sample(range(sampleCount), iNumberOfSamplesToReturn)
 | 
| 
 | 
   372 	
 | 
| 
 | 
   373 		#Create a boolean array of if indexes are to be included in the reduced array
 | 
| 
 | 
   374                 return [sSample for iIndex, sSample in enumerate(lsSamples) if iIndex in liRandomIndices]
 | 
| 
 | 
   375 
 | 
| 
 | 
   376 	#Happy path tested (case 3)
 | 
| 
 | 
   377 	def funcGetAveragePopulation(self, abndTable, lfCompress):
 | 
| 
 | 
   378 		"""
 | 
| 
 | 
   379 		Get the average row per column in the abndtable.
 | 
| 
 | 
   380 
 | 
| 
 | 
   381 		:param abndTable: AbundanceTable of data to be averaged
 | 
| 
 | 
   382 		:type: AbudanceTable
 | 
| 
 | 
   383 		:param lfCompress: List of boolean flags (false means to remove sample before averaging
 | 
| 
 | 
   384 		:type: List of floats
 | 
| 
 | 
   385 		:return List of doubles: 
 | 
| 
 | 
   386 		"""
 | 
| 
 | 
   387 		if sum(lfCompress) == 0:
 | 
| 
 | 
   388 			return []
 | 
| 
 | 
   389 
 | 
| 
 | 
   390 		#Get the average populations
 | 
| 
 | 
   391 		lAverageRet = []
 | 
| 
 | 
   392 
 | 
| 
 | 
   393 		for sFeature in abndTable.funcGetAbundanceCopy():
 | 
| 
 | 
   394 			sFeature = list(sFeature)[1:]
 | 
| 
 | 
   395 			sFeature=np.compress(lfCompress,sFeature,axis=0)
 | 
| 
 | 
   396 			lAverageRet.append(sum(sFeature)/float(len(sFeature)))
 | 
| 
 | 
   397 		return lAverageRet
 | 
| 
 | 
   398 
 | 
| 
 | 
   399 	#Happy path tested (2 cases)
 | 
| 
 | 
   400 	def funcGetDistanceFromAverage(self, abndTable,ldAverage,lsSamples,lfSelected):
 | 
| 
 | 
   401 		"""
 | 
| 
 | 
   402 		Given an abundance table and an average sample, this returns the distance of each sample
 | 
| 
 | 
   403 		(measured using brays-curtis dissimilarity) from the average.
 | 
| 
 | 
   404 		The distances are reduced by needing to be in the lsSamples and being a true in the lfSelected
 | 
| 
 | 
   405 		(which is associated with the samples in the order of the samples in the abundance table;
 | 
| 
 | 
   406 		use abundancetable.funcGetSampleNames() to see the order if needed).
 | 
| 
 | 
   407 
 | 
| 
 | 
   408 		:param abndTable: Abundance table holding the data to be analyzed.
 | 
| 
 | 
   409 		:type: AbundanceTable
 | 
| 
 | 
   410 		:param ldAverage: Average population (Average features of the abundance table of samples)
 | 
| 
 | 
   411 		:type: List of doubles which represent the average population
 | 
| 
 | 
   412 		:param lsSamples: These are the only samples used in the analysis
 | 
| 
 | 
   413 		:type: List of strings (sample ids)
 | 
| 
 | 
   414 		:param lfSelected: Samples to be included in the analysis
 | 
| 
 | 
   415 		:type: List of boolean (true means include)
 | 
| 
 | 
   416 		:return: List of distances (doubles)
 | 
| 
 | 
   417 		"""
 | 
| 
 | 
   418 		#Get the distance from label 1 of all samples in label0 splitting into selected and not selected lists
 | 
| 
 | 
   419 		ldSelectedDistances = []
 | 
| 
 | 
   420 
 | 
| 
 | 
   421 		for sSampleName in [sSample for iindex, sSample in enumerate(lsSamples) if lfSelected[iindex]]:
 | 
| 
 | 
   422 			#Get the sample measurements
 | 
| 
 | 
   423 			ldSelectedDistances.append(Metric.funcGetBrayCurtisDissimilarity(np.array([abndTable.funcGetSample(sSampleName),ldAverage]))[0])
 | 
| 
 | 
   424 		return ldSelectedDistances
 | 
| 
 | 
   425 
 | 
| 
 | 
   426 	#Happy path tested (1 case)
 | 
| 
 | 
   427 	def funcMeasureDistanceFromLabelToAverageOtherLabel(self, abndTable, lfGroupOfInterest, lfGroupOther):
 | 
| 
 | 
   428 		"""
 | 
| 
 | 
   429 		Get the distance of samples from one label from the average sample of not the label.
 | 
| 
 | 
   430 		Note: This assumes 2 classes.  
 | 
| 
 | 
   431 
 | 
| 
 | 
   432 		:param abndTable: Table of data to work out of.
 | 
| 
 | 
   433 		:type: Abundace Table
 | 
| 
 | 
   434 		:param lfGroupOfInterest: Boolean indicator of the sample being in the first group.
 | 
| 
 | 
   435 		:type: List of floats, true indicating an individual in the group of interest.
 | 
| 
 | 
   436 		:param lfGroupOther:	Boolean indicator of the sample being in the other group.
 | 
| 
 | 
   437 		:type:	List of floats, true indicating an individual in the 
 | 
| 
 | 
   438 		:return List of List of doubles: [list of tuples (string sample name,double distance) for the selected population, list of tuples for the not selected population]
 | 
| 
 | 
   439 		"""
 | 
| 
 | 
   440 		#Get all sample names
 | 
| 
 | 
   441 		lsAllSamples = abndTable.funcGetSampleNames()
 | 
| 
 | 
   442 
 | 
| 
 | 
   443 		#Get average populations
 | 
| 
 | 
   444 		lAverageOther = self.funcGetAveragePopulation(abndTable=abndTable, lfCompress=lfGroupOther)
 | 
| 
 | 
   445 
 | 
| 
 | 
   446 		#Get the distance from the average of the other label (label 1)
 | 
| 
 | 
   447 		ldSelectedDistances = self.funcGetDistanceFromAverage(abndTable=abndTable, ldAverage=lAverageOther,
 | 
| 
 | 
   448 			lsSamples=lsAllSamples, lfSelected=lfGroupOfInterest)
 | 
| 
 | 
   449 
 | 
| 
 | 
   450 		return zip([lsAllSamples[iindex] for iindex, fGroup in enumerate(lfGroupOfInterest) if fGroup],ldSelectedDistances)
 | 
| 
 | 
   451 
 | 
| 
 | 
   452 	#Happy path tested (1 test case)
 | 
| 
 | 
   453 	def funcPerformDistanceSelection(self, abndTable, iSelectionCount, sLabel, sValueOfInterest):
 | 
| 
 | 
   454 		"""
 | 
| 
 | 
   455 		Given metadata, metadata of one value (sValueOfInterest) is measured from the average (centroid) value of another label group.
 | 
| 
 | 
   456 		An iSelectionCount of samples is selected from the group of interest closest to and furthest from the centroid of the other group.
 | 
| 
 | 
   457 
 | 
| 
 | 
   458 		:params  abndTable: Abundance of measurements
 | 
| 
 | 
   459 		:type: AbundanceTable
 | 
| 
 | 
   460 		:params iSelectionCount: The number of samples selected per sample.
 | 
| 
 | 
   461 		:type: Integer Integer greater than 0
 | 
| 
 | 
   462 		:params sLabel: ID of the metadata which is the supervised label
 | 
| 
 | 
   463 		:type: String
 | 
| 
 | 
   464 		:params sValueOfInterest: Metadata value in the sLabel metadta row of the abundance table which defines the group of interest.
 | 
| 
 | 
   465 		:type: String found in the abundance table metadata row indicated by sLabel.
 | 
| 
 | 
   466 		:return list list of tuples (samplename, distance) [[iSelectionCount of tuples closest to the other centroid], [iSelectionCount of tuples farthest from the other centroid], [all tuples of samples not selected]]
 | 
| 
 | 
   467 		"""
 | 
| 
 | 
   468 
 | 
| 
 | 
   469 		lsMetadata = abndTable.funcGetMetadata(sLabel)
 | 
| 
 | 
   470 		#Other metadata values
 | 
| 
 | 
   471 		lsUniqueOtherValues = list(set(lsMetadata)-set(sValueOfInterest))
 | 
| 
 | 
   472 
 | 
| 
 | 
   473 		#Get boolean indicator of values of interest
 | 
| 
 | 
   474 		lfLabelsInterested = [sValueOfInterest == sValue for sValue in lsMetadata]
 | 
| 
 | 
   475 
 | 
| 
 | 
   476                 #Get the distances of the items of interest from the other metadata values
 | 
| 
 | 
   477 		dictDistanceAverages = {}
 | 
| 
 | 
   478                 for sOtherLabel in lsUniqueOtherValues:
 | 
| 
 | 
   479 			#Get boolean indicator of labels not of interest 
 | 
| 
 | 
   480 			lfLabelsOther = [sOtherLabel == sValue for sValue in lsMetadata]
 | 
| 
 | 
   481 
 | 
| 
 | 
   482 			#Get the distances of data from two different groups to the average of the other
 | 
| 
 | 
   483 			ldValueDistances = dict(self.funcMeasureDistanceFromLabelToAverageOtherLabel(abndTable, lfLabelsInterested, lfLabelsOther))
 | 
| 
 | 
   484 
 | 
| 
 | 
   485 			for sKey in ldValueDistances:
 | 
| 
 | 
   486 				dictDistanceAverages[sKey] = ldValueDistances[sKey] + dictDistanceAverages[sKey] if sKey in dictDistanceAverages else ldValueDistances[sKey]
 | 
| 
 | 
   487 
 | 
| 
 | 
   488 		#Finish average by dividing by length of lsUniqueOtherValues
 | 
| 
 | 
   489 		ltpleAverageDistances = [(sKey, dictDistanceAverages[sKey]/float(len(lsUniqueOtherValues))) for sKey in dictDistanceAverages]
 | 
| 
 | 
   490 
 | 
| 
 | 
   491                 #Sort to extract extremes
 | 
| 
 | 
   492                 ltpleAverageDistances = sorted(ltpleAverageDistances,key=operator.itemgetter(1))
 | 
| 
 | 
   493 
 | 
| 
 | 
   494 		#Get the closest and farthest distances
 | 
| 
 | 
   495 		ltupleDiscriminantSamples = ltpleAverageDistances[:iSelectionCount]
 | 
| 
 | 
   496 		ltupleDistinctSamples = ltpleAverageDistances[iSelectionCount*-1:]
 | 
| 
 | 
   497 
 | 
| 
 | 
   498 		#Remove the selected samples from the larger population of distances (better visualization)
 | 
| 
 | 
   499 		ldSelected = [tpleSelected[0] for tpleSelected in ltupleDiscriminantSamples+ltupleDistinctSamples]
 | 
| 
 | 
   500 
 | 
| 
 | 
   501 		#Return discriminant tuples, distinct tuples, other tuples
 | 
| 
 | 
   502 		return [ltupleDiscriminantSamples, ltupleDistinctSamples,
 | 
| 
 | 
   503 			   [tplData for tplData in ltpleAverageDistances if tplData[0] not in ldSelected]]
 | 
| 
 | 
   504 
 | 
| 
 | 
   505 	#Run the supervised method surrounding distance from centroids
 | 
| 
 | 
   506 	#Happy path tested (3 test cases)
 | 
| 
 | 
   507 	def funcRunSupervisedDistancesFromCentroids(self, abundanceTable, fRunDistinct, fRunDiscriminant,
 | 
| 
 | 
   508 						xOutputSupFile, xPredictSupFile, strSupervisedMetadata,
 | 
| 
 | 
   509 						iSampleSupSelectionCount, lsOriginalSampleNames, lsOriginalLabels, fAppendFiles = False):
 | 
| 
 | 
   510 		"""
 | 
| 
 | 
   511 		Runs supervised methods based on measuring distances of one label from the centroid of another. NAs are evaluated as theirown group.
 | 
| 
 | 
   512 
 | 
| 
 | 
   513 		:param	abundanceTable:	AbundanceTable
 | 
| 
 | 
   514 		:type:	AbudanceTable	Data to analyze
 | 
| 
 | 
   515 		:param	fRunDistinct:	Run distinct selection method
 | 
| 
 | 
   516 		:type:	Boolean	boolean (true runs method)
 | 
| 
 | 
   517 		:param	fRunDiscriminant:	Run discriminant method
 | 
| 
 | 
   518 		:type:	Boolean	boolean (true runs method)
 | 
| 
 | 
   519 		:param	xOutputSupFile:	File output from supervised methods detailing data going into the method.
 | 
| 
 | 
   520 		:type:	String or FileStream
 | 
| 
 | 
   521 		:param	xPredictSupFile:	File output from supervised methods distance results from supervised methods.
 | 
| 
 | 
   522 		:type:	String or FileStream
 | 
| 
 | 
   523 		:param strSupervisedMetadata:	The metadata that will be used to group samples.
 | 
| 
 | 
   524 		:type:	String
 | 
| 
 | 
   525 		:param	iSampleSupSelectionCount:	Number of samples to select
 | 
| 
 | 
   526 		:type:	Integer	int sample selection count
 | 
| 
 | 
   527 		:param lsOriginalSampleNames:	List of the sample names, order is important and should be preserved from the abundanceTable.
 | 
| 
 | 
   528 		:type:	List of samples	
 | 
| 
 | 
   529 		:param	fAppendFiles:	Indicates that output files already exist and appending is occuring.
 | 
| 
 | 
   530 		:type:	Boolean
 | 
| 
 | 
   531 		:return	Selected Samples:	A dictionary of selected samples by selection ID
 | 
| 
 | 
   532 		Dictionary	{"Selection Method":["SampleID","SampleID"...]}
 | 
| 
 | 
   533 		"""
 | 
| 
 | 
   534 		#Get labels and run one label against many
 | 
| 
 | 
   535 		lstrMetadata = abundanceTable.funcGetMetadata(strSupervisedMetadata)
 | 
| 
 | 
   536 		dictlltpleDistanceMeasurements = {}
 | 
| 
 | 
   537 		for sMetadataValue in set(lstrMetadata):
 | 
| 
 | 
   538 
 | 
| 
 | 
   539 			#For now perform the selection here for the label of interest against the other labels
 | 
| 
 | 
   540 			dictlltpleDistanceMeasurements.setdefault(sMetadataValue,[]).extend(self.funcPerformDistanceSelection(abndTable=abundanceTable,
 | 
| 
 | 
   541 				iSelectionCount=iSampleSupSelectionCount, sLabel=strSupervisedMetadata, sValueOfInterest=sMetadataValue))
 | 
| 
 | 
   542 
 | 
| 
 | 
   543 		#Make expected output files for supervised methods
 | 
| 
 | 
   544 		#1. Output file which is similar to an input file for SVMs
 | 
| 
 | 
   545 		#2. Output file that is similar to the probabilitic output of a SVM (LibSVM)
 | 
| 
 | 
   546 		#Manly for making output of supervised methods (Distance from Centroid) similar
 | 
| 
 | 
   547 		#MicropitaVis needs some of these files
 | 
| 
 | 
   548 		if xOutputSupFile:
 | 
| 
 | 
   549 			if fAppendFiles:
 | 
| 
 | 
   550 				SVM.funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
 | 
| 
 | 
   551 					lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
 | 
| 
 | 
   552 			else:
 | 
| 
 | 
   553 				SVM.funcConvertAbundanceTableToSVMFile(abndAbundanceTable=abundanceTable, xOutputSVMFile=xOutputSupFile,
 | 
| 
 | 
   554 					sMetadataLabel=strSupervisedMetadata, lsOriginalLabels=lsOriginalLabels, lsSampleOrdering=lsOriginalSampleNames)
 | 
| 
 | 
   555 
 | 
| 
 | 
   556 		#Will contain the samples selected to return
 | 
| 
 | 
   557 		#One or more of the methods may be active so this is why I am extending instead of just returning the result of each method type
 | 
| 
 | 
   558 		dictSelectedSamplesRet = dict()
 | 
| 
 | 
   559 		for sKey, ltplDistances in dictlltpleDistanceMeasurements.items():
 | 
| 
 | 
   560 			if fRunDistinct:
 | 
| 
 | 
   561 				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDistinct,[]).extend([ltple[0] for ltple in ltplDistances[1]])
 | 
| 
 | 
   562 			if fRunDiscriminant:
 | 
| 
 | 
   563 				dictSelectedSamplesRet.setdefault(ConstantsMicropita.c_strDiscriminant,[]).extend([ltple[0] for ltple in ltplDistances[0]])
 | 
| 
 | 
   564 
 | 
| 
 | 
   565 		if xPredictSupFile:
 | 
| 
 | 
   566 			dictFlattenedDistances = dict()
 | 
| 
 | 
   567 			[dictFlattenedDistances.setdefault(sKey, []).append(tple)
 | 
| 
 | 
   568 				for sKey, lltple in dictlltpleDistanceMeasurements.items()
 | 
| 
 | 
   569 				for ltple in lltple for tple in ltple]
 | 
| 
 | 
   570 			if fAppendFiles:
 | 
| 
 | 
   571 				self._updatePredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
 | 
| 
 | 
   572 					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
 | 
| 
 | 
   573 			else:
 | 
| 
 | 
   574 				self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xOutputSupFile,
 | 
| 
 | 
   575 					dictltpleDistanceMeasurements=dictFlattenedDistances, abundanceTable=abundanceTable, lsOriginalSampleNames=lsOriginalSampleNames)
 | 
| 
 | 
   576 		return dictSelectedSamplesRet
 | 
| 
 | 
   577 
 | 
| 
 | 
   578 	#Two happy path test cases
 | 
| 
 | 
   579 	def _updatePredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames):
 | 
| 
 | 
   580 		"""
 | 
| 
 | 
   581 		Manages updating the predict file.
 | 
| 
 | 
   582 
 | 
| 
 | 
   583 		:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
 | 
| 
 | 
   584 		:type:	FileStream or String file path
 | 
| 
 | 
   585 		:param	xInputLabelsFile: File that as input to the supervised methods.
 | 
| 
 | 
   586 		:type:	FileStream or String file path
 | 
| 
 | 
   587 		:param	dictltpleDistanceMeasurements: 
 | 
| 
 | 
   588 		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
 | 
| 
 | 
   589 		"""
 | 
| 
 | 
   590 
 | 
| 
 | 
   591 		if not isinstance(xPredictSupFile, str):
 | 
| 
 | 
   592 			xPredictSupFile.close()
 | 
| 
 | 
   593 			xPredictSupFile = xPredictSupFile.name
 | 
| 
 | 
   594 		csvr = open(xPredictSupFile,'r')
 | 
| 
 | 
   595 
 | 
| 
 | 
   596 		f = csv.reader(csvr,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
 | 
| 
 | 
   597 		lsHeader = f.next()[1:]
 | 
| 
 | 
   598 		dictlltpleRead = dict([(sHeader,[]) for sHeader in lsHeader])
 | 
| 
 | 
   599 
 | 
| 
 | 
   600 		#Read data in 
 | 
| 
 | 
   601 		iSampleIndex = 0
 | 
| 
 | 
   602 		for sRow in f:
 | 
| 
 | 
   603 			sLabel = sRow[0]
 | 
| 
 | 
   604 			[dictlltpleRead[lsHeader[iDistanceIndex]].append((lsOriginalSampleNames[iSampleIndex],dDistance)) for iDistanceIndex, dDistance in enumerate(sRow[1:])
 | 
| 
 | 
   605 				if not dDistance == ConstantsMicropita.c_sEmptyPredictFileValue]
 | 
| 
 | 
   606 			iSampleIndex += 1
 | 
| 
 | 
   607 
 | 
| 
 | 
   608 		#Combine dictltpleDistanceMeasurements with new data
 | 
| 
 | 
   609 		#If they share a key then merge keeping parameter data
 | 
| 
 | 
   610 		#If they do not share the key, keep the full data
 | 
| 
 | 
   611 		dictNew = {}
 | 
| 
 | 
   612 		for sKey in dictltpleDistanceMeasurements.keys():
 | 
| 
 | 
   613 			lsSamples = [tple[0] for tple in dictltpleDistanceMeasurements[sKey]]
 | 
| 
 | 
   614 			dictNew[sKey] = dictltpleDistanceMeasurements[sKey]+[tple for tple in dictlltpleRead[sKey] if tple[0] not in lsSamples] if sKey in dictlltpleRead.keys() else dictltpleDistanceMeasurements[sKey]
 | 
| 
 | 
   615                 for sKey in dictlltpleRead:
 | 
| 
 | 
   616 			if sKey not in dictltpleDistanceMeasurements.keys():
 | 
| 
 | 
   617 				dictNew[sKey] = dictlltpleRead[sKey]
 | 
| 
 | 
   618 
 | 
| 
 | 
   619 		#Call writer
 | 
| 
 | 
   620 		self._writeToPredictFile(xPredictSupFile=xPredictSupFile, xInputLabelsFile=xInputLabelsFile,
 | 
| 
 | 
   621 			dictltpleDistanceMeasurements=dictNew, abundanceTable=abundanceTable,
 | 
| 
 | 
   622 			lsOriginalSampleNames=lsOriginalSampleNames, fFromUpdate=True)
 | 
| 
 | 
   623 
 | 
| 
 | 
   624 	#2 happy path test cases
 | 
| 
 | 
   625         def _writeToPredictFile(self, xPredictSupFile, xInputLabelsFile, dictltpleDistanceMeasurements, abundanceTable, lsOriginalSampleNames, fFromUpdate=False):
 | 
| 
 | 
   626 		"""
 | 
| 
 | 
   627 		Write to the predict file.
 | 
| 
 | 
   628 
 | 
| 
 | 
   629 		:param	xPredictSupFile: File that has predictions (distances) from the supervised method.
 | 
| 
 | 
   630 		:type:	FileStream or String file path
 | 
| 
 | 
   631 		:param	xInputLabelsFile: File that as input to the supervised methods.
 | 
| 
 | 
   632 		:type:	FileStream or String file path
 | 
| 
 | 
   633 		:param	dictltpleDistanceMeasurements: 
 | 
| 
 | 
   634 		:type:	Dictionary of lists of tuples {"labelgroup":[("SampleName",dDistance)...], "labelgroup":[("SampleName",dDistance)...]}
 | 
| 
 | 
   635 		:param	abundanceTable: An abundance table of the sample data.
 | 
| 
 | 
   636 		:type:	AbundanceTable
 | 
| 
 | 
   637 		:param	lsOriginalSampleNames: Used if the file is being updated as the sample names so that it may be passed in and consistent with other writing.
 | 
| 
 | 
   638 			Otherwise will use the sample names from the abundance table.
 | 
| 
 | 
   639 		:type:	List of strings
 | 
| 
 | 
   640 		:param	fFromUpdate:	Indicates if this is part of an update to the file or not.
 | 
| 
 | 
   641 		:type:	Boolean
 | 
| 
 | 
   642 		"""
 | 
| 
 | 
   643 
 | 
| 
 | 
   644 		xInputLabelsFileName = xInputLabelsFile
 | 
| 
 | 
   645 		if not isinstance(xInputLabelsFile,str):
 | 
| 
 | 
   646 			xInputLabelsFileName = xInputLabelsFile.name
 | 
| 
 | 
   647 		f = csv.writer(open(xPredictSupFile,"w") if isinstance(xPredictSupFile, str) else xPredictSupFile,delimiter=ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
 | 
| 
 | 
   648 
 | 
| 
 | 
   649 		lsAllSampleNames = abundanceTable.funcGetSampleNames()
 | 
| 
 | 
   650 		lsLabels = SVM.funcReadLabelsFromFile(xSVMFile=xInputLabelsFileName, lsAllSampleNames= lsOriginalSampleNames if fFromUpdate else lsAllSampleNames,
 | 
| 
 | 
   651 						isPredictFile=False)
 | 
| 
 | 
   652 		dictLabels = dict([(sSample,sLabel) for sLabel in lsLabels.keys() for sSample in lsLabels[sLabel]])
 | 
| 
 | 
   653 
 | 
| 
 | 
   654 		#Dictionay keys will be used to order the predict file
 | 
| 
 | 
   655 		lsMeasurementKeys = dictltpleDistanceMeasurements.keys()
 | 
| 
 | 
   656 		#Make header
 | 
| 
 | 
   657 		f.writerow(["labels"]+lsMeasurementKeys)
 | 
| 
 | 
   658 
 | 
| 
 | 
   659 		#Reformat dictionary to make it easier to use
 | 
| 
 | 
   660 		for sKey in dictltpleDistanceMeasurements:
 | 
| 
 | 
   661 			dictltpleDistanceMeasurements[sKey] = dict([ltpl for ltpl in dictltpleDistanceMeasurements[sKey]])
 | 
| 
 | 
   662 
 | 
| 
 | 
   663 		for sSample in lsOriginalSampleNames:
 | 
| 
 | 
   664 			#Make body of file
 | 
| 
 | 
   665 			f.writerow([dictLabels.get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue)]+
 | 
| 
 | 
   666 				[str(dictltpleDistanceMeasurements[sKey].get(sSample,ConstantsMicropita.c_sEmptyPredictFileValue))
 | 
| 
 | 
   667 				for sKey in lsMeasurementKeys])
 | 
| 
 | 
   668 
 | 
| 
 | 
   669 	def _funcRunNormalizeSensitiveMethods(self, abndData, iSampleSelectionCount, dictSelectedSamples, lsAlphaMetrics, lsBetaMetrics, lsInverseBetaMetrics,
 | 
| 
 | 
   670 												fRunDiversity, fRunRepresentative, fRunExtreme, strAlphaMetadata=None,
 | 
| 
 | 
   671 												istmBetaMatrix=None, istrmTree=None, istrmEnvr=None, fInvertDiversity=False):
 | 
| 
 | 
   672 		"""
 | 
| 
 | 
   673 		Manages running methods that are sensitive to normalization. This is called twice, once for the set of methods which should not be normalized and the other
 | 
| 
 | 
   674 		for the set that should be normalized.
 | 
| 
 | 
   675 	
 | 
| 
 | 
   676 		:param	abndData:	Abundance table object holding the samples to be measured.
 | 
| 
 | 
   677 		:type:	AbundanceTable
 | 
| 
 | 
   678 		:param	iSampleSelectionCount	The number of samples to select per method.
 | 
| 
 | 
   679 		:type:	Integer
 | 
| 
 | 
   680 		:param	dictSelectedSamples	Will be added to as samples are selected {"Method:["strSelectedSampleID","strSelectedSampleID"...]}.
 | 
| 
 | 
   681 		:type:	Dictionary
 | 
| 
 | 
   682 		:param	lsAlphaMetrics:	List of alpha metrics to use on alpha metric dependent assays (like highest diversity).
 | 
| 
 | 
   683 		:type:	List of strings
 | 
| 
 | 
   684 		:param	lsBetaMetrics:	List of beta metrics to use on beta metric dependent assays (like most representative).
 | 
| 
 | 
   685 		:type:	List of strings
 | 
| 
 | 
   686 		:param	lsInverseBetaMetrics:	List of inverse beta metrics to use on inverse beta metric dependent assays (like most dissimilar).
 | 
| 
 | 
   687 		:type:	List of strings
 | 
| 
 | 
   688 		:param	fRunDiversity:	Run Diversity based methods (true indicates run).
 | 
| 
 | 
   689 		:type:	Boolean	
 | 
| 
 | 
   690 		:param	fRunRepresentative:	Run Representative based methods (true indicates run).
 | 
| 
 | 
   691 		:type:	Boolean	
 | 
| 
 | 
   692 		:param	fRunExtreme:	Run Extreme based methods (true indicates run).
 | 
| 
 | 
   693 		:type:	Boolean	
 | 
| 
 | 
   694 		:param	istmBetaMatrix:	File that has a precalculated beta matrix
 | 
| 
 | 
   695 		:type:	File stream or File path string
 | 
| 
 | 
   696 		:return	Selected Samples:	Samples selected by methods.
 | 
| 
 | 
   697 				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
 | 
| 
 | 
   698 		"""
 | 
| 
 | 
   699 
 | 
| 
 | 
   700 		#Sample ids/names
 | 
| 
 | 
   701 		lsSampleNames = abndData.funcGetSampleNames()
 | 
| 
 | 
   702 	
 | 
| 
 | 
   703 		#Generate alpha metrics and get most diverse
 | 
| 
 | 
   704 		if fRunDiversity:
 | 
| 
 | 
   705 
 | 
| 
 | 
   706 			#Get Alpha metrics matrix
 | 
| 
 | 
   707 			internalAlphaMatrix = None
 | 
| 
 | 
   708 			#Name of technique
 | 
| 
 | 
   709 			strMethod = [strAlphaMetadata] if strAlphaMetadata else lsAlphaMetrics
 | 
| 
 | 
   710 
 | 
| 
 | 
   711 			#If given an alpha-diversity metadata
 | 
| 
 | 
   712 			if strAlphaMetadata:
 | 
| 
 | 
   713 				internalAlphaMatrix = [[float(strNum) for strNum in abndData.funcGetMetadata(strAlphaMetadata)]]
 | 
| 
 | 
   714 			else:
 | 
| 
 | 
   715 				#Expects Observations (Taxa (row) x sample (column))
 | 
| 
 | 
   716 				#Returns [[metric1-sample1, metric1-sample2, metric1-sample3],[metric1-sample1, metric1-sample2, metric1-sample3]]
 | 
| 
 | 
   717 				internalAlphaMatrix = Metric.funcBuildAlphaMetricsMatrix(npaSampleAbundance = abndData.funcGetAbundanceCopy()
 | 
| 
 | 
   718 							if not abndData.funcIsSummed()
 | 
| 
 | 
   719 							else abndData.funcGetFeatureAbundanceTable(abndData.funcGetTerminalNodes()).funcGetAbundanceCopy(),
 | 
| 
 | 
   720 							lsSampleNames = lsSampleNames, lsDiversityMetricAlpha = lsAlphaMetrics)
 | 
| 
 | 
   721 	
 | 
| 
 | 
   722 			if internalAlphaMatrix:
 | 
| 
 | 
   723 				#Invert measurments
 | 
| 
 | 
   724 				if fInvertDiversity:
 | 
| 
 | 
   725 					lldNewDiversity = []
 | 
| 
 | 
   726 					for lsLine in internalAlphaMatrix:
 | 
| 
 | 
   727 						lldNewDiversity.append([1/max(dValue,ConstantsMicropita.c_smallNumber) for dValue in lsLine])
 | 
| 
 | 
   728 					internalAlphaMatrix = lldNewDiversity
 | 
| 
 | 
   729 				#Get top ranked alpha diversity by most diverse
 | 
| 
 | 
   730 				#Expects [[sample1,sample2,sample3...],[sample1,sample2,sample3..],...]
 | 
| 
 | 
   731 				#Returns [[sampleName1, sampleName2, sampleNameN],[sampleName1, sampleName2, sampleNameN]]
 | 
| 
 | 
   732 				mostDiverseAlphaSamplesIndexes = self.funcGetTopRankedSamples(lldMatrix=internalAlphaMatrix, lsSampleNames=lsSampleNames, iTopAmount=iSampleSelectionCount)
 | 
| 
 | 
   733 
 | 
| 
 | 
   734 				#Add to results
 | 
| 
 | 
   735 				for index in xrange(0,len(strMethod)):
 | 
| 
 | 
   736 					strSelectionMethod = self.dictConvertAMetricDiversity.get(strMethod[index],ConstantsMicropita.c_strDiversity+"="+strMethod[index])
 | 
| 
 | 
   737 					dictSelectedSamples.setdefault(strSelectionMethod,[]).extend(mostDiverseAlphaSamplesIndexes[index])
 | 
| 
 | 
   738 
 | 
| 
 | 
   739 		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 1b")
 | 
| 
 | 
   740 		logging.info(dictSelectedSamples)
 | 
| 
 | 
   741 	
 | 
| 
 | 
   742 		#Generate beta metrics and 
 | 
| 
 | 
   743 		if fRunRepresentative or fRunExtreme:
 | 
| 
 | 
   744 
 | 
| 
 | 
   745 			#Abundance matrix transposed
 | 
| 
 | 
   746 			npaTransposedAbundance = UtilityMath.funcTransposeDataMatrix(abndData.funcGetAbundanceCopy(), fRemoveAdornments=True)
 | 
| 
 | 
   747 	
 | 
| 
 | 
   748 			#Get center selection using clusters/tiling
 | 
| 
 | 
   749 			#This will be for beta metrics in normalized space
 | 
| 
 | 
   750 			if fRunRepresentative:
 | 
| 
 | 
   751 
 | 
| 
 | 
   752 				if istmBetaMatrix:
 | 
| 
 | 
   753 					#Get representative dissimilarity samples
 | 
| 
 | 
   754 					medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=ConstantsMicropita.c_custom, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
 | 
| 
 | 
   755 
 | 
| 
 | 
   756 					if medoidSamples:
 | 
| 
 | 
   757 						dictSelectedSamples.setdefault(ConstantsMicropita.c_strRepresentative+"="+ConstantsMicropita.c_custom,[]).extend(medoidSamples)
 | 
| 
 | 
   758 				else:
 | 
| 
 | 
   759 					logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing representative selection on normalized data.")
 | 
| 
 | 
   760 					for bMetric in lsBetaMetrics:
 | 
| 
 | 
   761 
 | 
| 
 | 
   762 						#Get representative dissimilarity samples
 | 
| 
 | 
   763 						medoidSamples=self.funcGetCentralSamplesByKMedoids(npaMatrix=npaTransposedAbundance, sMetric=bMetric, lsSampleNames=lsSampleNames, iNumberSamplesReturned=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
 | 
| 
 | 
   764 
 | 
| 
 | 
   765 						if medoidSamples:
 | 
| 
 | 
   766 							dictSelectedSamples.setdefault(self.dictConvertBMetricToMethod.get(bMetric,ConstantsMicropita.c_strRepresentative+"="+bMetric),[]).extend(medoidSamples)
 | 
| 
 | 
   767 
 | 
| 
 | 
   768 			#Get extreme selection using clusters, tiling
 | 
| 
 | 
   769 			if fRunExtreme:
 | 
| 
 | 
   770 				logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Performing extreme selection on normalized data.")
 | 
| 
 | 
   771 				if istmBetaMatrix:
 | 
| 
 | 
   772 
 | 
| 
 | 
   773 					#Samples for representative dissimilarity
 | 
| 
 | 
   774 					#This involves inverting the distance metric,
 | 
| 
 | 
   775 					#Taking the dendrogram level of where the number cluster == the number of samples to select
 | 
| 
 | 
   776 					#Returning a repersentative sample from each cluster
 | 
| 
 | 
   777 					extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=ConstantsMicropita.c_custom, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
 | 
| 
 | 
   778 	
 | 
| 
 | 
   779 					#Add selected samples
 | 
| 
 | 
   780 					if extremeSamples:
 | 
| 
 | 
   781 						dictSelectedSamples.setdefault(ConstantsMicropita.c_strExtreme+"="+ConstantsMicropita.c_custom,[]).extend(extremeSamples)
 | 
| 
 | 
   782 
 | 
| 
 | 
   783 				else:
 | 
| 
 | 
   784 					#Run KMedoids with inverse custom distance metric in normalized space
 | 
| 
 | 
   785 					for bMetric in lsInverseBetaMetrics:
 | 
| 
 | 
   786 
 | 
| 
 | 
   787 						#Samples for representative dissimilarity
 | 
| 
 | 
   788 						#This involves inverting the distance metric,
 | 
| 
 | 
   789 						#Taking the dendrogram level of where the number cluster == the number of samples to select
 | 
| 
 | 
   790 						#Returning a repersentative sample from each cluster
 | 
| 
 | 
   791 						extremeSamples = self.funcSelectExtremeSamplesFromHClust(strBetaMetric=bMetric, npaAbundanceMatrix=npaTransposedAbundance, lsSampleNames=lsSampleNames, iSelectSampleCount=iSampleSelectionCount, istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr)
 | 
| 
 | 
   792 	
 | 
| 
 | 
   793 						#Add selected samples
 | 
| 
 | 
   794 						if extremeSamples:
 | 
| 
 | 
   795 							dictSelectedSamples.setdefault(self.dictConvertInvBMetricToMethod.get(bMetric,ConstantsMicropita.c_strExtreme+"="+bMetric),[]).extend(extremeSamples)
 | 
| 
 | 
   796 
 | 
| 
 | 
   797 		logging.info("MicroPITA.funcRunNormalizeSensitiveMethods:: Selected Samples 2,3b")
 | 
| 
 | 
   798 		logging.info(dictSelectedSamples)
 | 
| 
 | 
   799 		return dictSelectedSamples
 | 
| 
 | 
   800 
 | 
| 
 | 
   801 	def funcRun(self, strIDName, strLastMetadataName, istmInput,
 | 
| 
 | 
   802 					  ostmInputPredictFile, ostmPredictFile, ostmCheckedFile, ostmOutput,
 | 
| 
 | 
   803 					  cDelimiter, cFeatureNameDelimiter, strFeatureSelection,
 | 
| 
 | 
   804 					  istmFeatures, iCount, lstrMethods, strLastRowMetadata = None, strLabel = None, strStratify = None,
 | 
| 
 | 
   805 					  strCustomAlpha = None, strCustomBeta = None, strAlphaMetadata = None, istmBetaMatrix = None, istrmTree = None, istrmEnvr = None, 
 | 
| 
 | 
   806 					  iMinSeqs = ConstantsMicropita.c_liOccurenceFilter[0], iMinSamples = ConstantsMicropita.c_liOccurenceFilter[1], fInvertDiversity = False):
 | 
| 
 | 
   807 		"""
 | 
| 
 | 
   808 		Manages the selection of samples given different metrics.
 | 
| 
 | 
   809 
 | 
| 
 | 
   810 		:param	strIDName: Sample Id metadata row
 | 
| 
 | 
   811 		:type:	String
 | 
| 
 | 
   812 		:param	strLastMetadataName: The id of the metadata positioned last in the abundance table.
 | 
| 
 | 
   813 		:type:	String	String metadata id.
 | 
| 
 | 
   814 		:param	istmInput: File to store input data to supervised methods.
 | 
| 
 | 
   815 		:type:	FileStream of String file path
 | 
| 
 | 
   816 		:param	ostmInputPredictFile: File to store distances from supervised methods.
 | 
| 
 | 
   817 		:type:	FileStream or String file path
 | 
| 
 | 
   818 		:param	ostmCheckedFile: File to store the AbundanceTable data after it is being checked.
 | 
| 
 | 
   819 		:type:	FileStream or String file path
 | 
| 
 | 
   820 		:param	ostmOutPut: File to store sample selection by methods of interest.
 | 
| 
 | 
   821 		:type:	FileStream or String file path
 | 
| 
 | 
   822 		:param	cDelimiter: Delimiter of abundance table.
 | 
| 
 | 
   823 		:type:	Character Char (default TAB).
 | 
| 
 | 
   824 		:param	cFeatureNameDelimiter: Delimiter of the name of features (for instance if they contain consensus lineages indicating clades).
 | 
| 
 | 
   825 		:type:	Character (default |).
 | 
| 
 | 
   826 		:param	stFeatureSelectionMethod: Which method to use to select features in a targeted manner (Using average ranked abundance or average abundance).
 | 
| 
 | 
   827 		:type:	String (specific values indicated in ConstantsMicropita.lsTargetedFeatureMethodValues).
 | 
| 
 | 
   828 		:param	istmFeatures: File which holds the features of interest if using targeted feature methodology.
 | 
| 
 | 
   829 		:type:	FileStream or String file path
 | 
| 
 | 
   830 		:param	iCount:	Number of samples to select in each methods, supervised methods select this amount per label if possible.
 | 
| 
 | 
   831 		:type:	Integer	integer.
 | 
| 
 | 
   832 		:param	lstrMethods: List of strings indicating selection techniques.
 | 
| 
 | 
   833 		:type:	List of string method names
 | 
| 
 | 
   834 		:param	strLabel: The metadata used for supervised labels.
 | 
| 
 | 
   835 		:type:	String
 | 
| 
 | 
   836 		:param	strStratify: The metadata used to stratify unsupervised data.
 | 
| 
 | 
   837 		:type:	String
 | 
| 
 | 
   838 		:param	strCustomAlpha: Custom alpha diversity metric
 | 
| 
 | 
   839 		:type:	String
 | 
| 
 | 
   840 		:param	strCustomBeta: Custom beta diversity metric
 | 
| 
 | 
   841 		:type:	String
 | 
| 
 | 
   842 		:param	strAlphaMetadata: Metadata id which is a diveristy metric to use in highest diversity sampling
 | 
| 
 | 
   843 		:type:	String
 | 
| 
 | 
   844 		:param	istmBetaMatrix: File containing precalculated beta-diversity matrix for representative sampling
 | 
| 
 | 
   845 		:type:	FileStream or String file path
 | 
| 
 | 
   846 		:param	istrmTree: File containing tree for phylogentic beta-diversity analysis
 | 
| 
 | 
   847 		:type:	FileStream or String file path
 | 
| 
 | 
   848 		:param	istrmEnvr: File containing environment for phylogentic beta-diversity analysis
 | 
| 
 | 
   849 		:type:	FileStream or String file path
 | 
| 
 | 
   850 		:param	iMinSeqs: Minimum sequence in the occurence filter which filters all features not with a minimum number of sequences in each of a minimum number of samples.
 | 
| 
 | 
   851 		:type:	Integer
 | 
| 
 | 
   852 		:param	iMinSamples: Minimum sample count for the occurence filter.
 | 
| 
 | 
   853 		:type:	Integer
 | 
| 
 | 
   854 		:param	fInvertDiversity: When true will invert diversity measurements before using.
 | 
| 
 | 
   855 		:type:	boolean
 | 
| 
 | 
   856 		:return	Selected Samples:	Samples selected by methods.
 | 
| 
 | 
   857 				Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
 | 
| 
 | 
   858 		"""
 | 
| 
 | 
   859 
 | 
| 
 | 
   860 		#Holds the top ranked samples from different metrics
 | 
| 
 | 
   861 		#dict[metric name] = [samplename,samplename...]
 | 
| 
 | 
   862 		selectedSamples = dict()
 | 
| 
 | 
   863 	
 | 
| 
 | 
   864 		#If a target feature file is given make sure that targeted feature is in the selection methods, if not add
 | 
| 
 | 
   865 		if ConstantsMicropita.c_strFeature in lstrMethods:
 | 
| 
 | 
   866 		  if not istmFeatures:
 | 
| 
 | 
   867 			logging.error("MicroPITA.funcRun:: Did not receive both the Targeted feature file and the feature selection method. MicroPITA did not run.")
 | 
| 
 | 
   868 			return False
 | 
| 
 | 
   869 
 | 
| 
 | 
   870 		#Diversity metrics to run
 | 
| 
 | 
   871 		#Use custom metrics if specified
 | 
| 
 | 
   872                 #Custom beta metrics set to normalized only, custom alpha metrics set to count only
 | 
| 
 | 
   873 		diversityMetricsAlpha = [] if strCustomAlpha or strAlphaMetadata else [MicroPITA.c_strInverseSimpsonDiversity]
 | 
| 
 | 
   874 		diversityMetricsBeta = [] if istmBetaMatrix else [strCustomBeta] if strCustomBeta else [MicroPITA.c_strBrayCurtisDissimilarity]
 | 
| 
 | 
   875 #		inverseDiversityMetricsBeta = [MicroPITA.c_strInvBrayCurtisDissimilarity]
 | 
| 
 | 
   876 		diversityMetricsAlphaNoNormalize = [strAlphaMetadata] if strAlphaMetadata else [strCustomAlpha] if strCustomAlpha else []
 | 
| 
 | 
   877 		diversityMetricsBetaNoNormalize = []
 | 
| 
 | 
   878 #		inverseDiversityMetricsBetaNoNormalize = []
 | 
| 
 | 
   879 
 | 
| 
 | 
   880 		#Targeted taxa
 | 
| 
 | 
   881 		userDefinedTaxa = []
 | 
| 
 | 
   882 	
 | 
| 
 | 
   883 		#Perform different flows flags
 | 
| 
 | 
   884 		c_RUN_MAX_DIVERSITY_1 = ConstantsMicropita.c_strDiversity in lstrMethods
 | 
| 
 | 
   885 		c_RUN_REPRESENTIVE_DISSIMILARITY_2 = ConstantsMicropita.c_strRepresentative in lstrMethods
 | 
| 
 | 
   886 		c_RUN_MAX_DISSIMILARITY_3 = ConstantsMicropita.c_strExtreme in lstrMethods
 | 
| 
 | 
   887 		c_RUN_RANK_AVERAGE_USER_4 = False
 | 
| 
 | 
   888 		if ConstantsMicropita.c_strFeature in lstrMethods:
 | 
| 
 | 
   889 			c_RUN_RANK_AVERAGE_USER_4 = True
 | 
| 
 | 
   890 			if not istmFeatures:
 | 
| 
 | 
   891 				logging.error("MicroPITA.funcRun:: No taxa file was given for taxa selection.") 
 | 
| 
 | 
   892 				return False
 | 
| 
 | 
   893 			#Read in taxa list, break down to lines and filter out empty strings
 | 
| 
 | 
   894 			userDefinedTaxa = filter(None,(s.strip( ) for s in istmFeatures.readlines()))
 | 
| 
 | 
   895 		c_RUN_RANDOM_5 = ConstantsMicropita.c_strRandom in lstrMethods
 | 
| 
 | 
   896 		c_RUN_DISTINCT = ConstantsMicropita.c_strDistinct in lstrMethods
 | 
| 
 | 
   897 		c_RUN_DISCRIMINANT = ConstantsMicropita.c_strDiscriminant in lstrMethods
 | 
| 
 | 
   898 
 | 
| 
 | 
   899 		#Read in abundance data
 | 
| 
 | 
   900 		#Abundance is a structured array. Samples (column) by Taxa (rows) with the taxa id row included as the column index=0
 | 
| 
 | 
   901 		#Abundance table object to read in and manage data
 | 
| 
 | 
   902 		totalAbundanceTable = AbundanceTable.funcMakeFromFile(xInputFile=istmInput, lOccurenceFilter = [iMinSeqs, iMinSamples],
 | 
| 
 | 
   903 								cDelimiter=cDelimiter, sMetadataID=strIDName, sLastMetadataRow=strLastRowMetadata,
 | 
| 
 | 
   904 								sLastMetadata=strLastMetadataName, cFeatureNameDelimiter=cFeatureNameDelimiter, xOutputFile=ostmCheckedFile)
 | 
| 
 | 
   905 		if not totalAbundanceTable:
 | 
| 
 | 
   906 			logging.error("MicroPITA.funcRun:: Could not read in the abundance table. Analysis was not performed."+
 | 
| 
 | 
   907 				" This often occurs when the Last Metadata is not specified correctly."+
 | 
| 
 | 
   908 				" Please check to make sure the Last Metadata selection is the row of the last metadata,"+
 | 
| 
 | 
   909 				" all values after this selection should be microbial measurements and should be numeric.")
 | 
| 
 | 
   910 			return False
 | 
| 
 | 
   911 
 | 
| 
 | 
   912 		lsOriginalLabels = SVM.funcMakeLabels(totalAbundanceTable.funcGetMetadata(strLabel)) if strLabel else strLabel
 | 
| 
 | 
   913 
 | 
| 
 | 
   914 		dictTotalMetadata = totalAbundanceTable.funcGetMetadataCopy()
 | 
| 
 | 
   915 		logging.debug("MicroPITA.funcRun:: Received metadata=" + str(dictTotalMetadata))
 | 
| 
 | 
   916 		#If there is only 1 unique value for the labels, do not run the Supervised methods
 | 
| 
 | 
   917 		if strLabel and ( len(set(dictTotalMetadata.get(strLabel,[]))) < 2 ):
 | 
| 
 | 
   918 			logging.error("The label " + strLabel + " did not have 2 or more values. Labels found=" + str(dictTotalMetadata.get(strLabel,[])))
 | 
| 
 | 
   919 			return False
 | 
| 
 | 
   920 
 | 
| 
 | 
   921 		#Run unsupervised methods###
 | 
| 
 | 
   922 		#Stratify the data if need be and drop the old data
 | 
| 
 | 
   923 		lStratifiedAbundanceTables = totalAbundanceTable.funcStratifyByMetadata(strStratify) if strStratify else [totalAbundanceTable]
 | 
| 
 | 
   924 
 | 
| 
 | 
   925 		#For each stratified abundance block or for the unstratfified abundance
 | 
| 
 | 
   926 		#Run the unsupervised blocks
 | 
| 
 | 
   927 		fAppendSupFiles = False
 | 
| 
 | 
   928 		for stratAbundanceTable in lStratifiedAbundanceTables:
 | 
| 
 | 
   929 			logging.info("MicroPITA.funcRun:: Running abundance block:"+stratAbundanceTable.funcGetName())
 | 
| 
 | 
   930 
 | 
| 
 | 
   931  			###NOT SUMMED, NOT NORMALIZED			
 | 
| 
 | 
   932 			#Only perform if the data is not yet normalized
 | 
| 
 | 
   933 			if not stratAbundanceTable.funcIsNormalized( ):
 | 
| 
 | 
   934 				#Need to first work with unnormalized data
 | 
| 
 | 
   935 				if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
 | 
| 
 | 
   936 
 | 
| 
 | 
   937 					self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
 | 
| 
 | 
   938 													 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlphaNoNormalize,
 | 
| 
 | 
   939 													 lsBetaMetrics=diversityMetricsBetaNoNormalize,
 | 
| 
 | 
   940 													 lsInverseBetaMetrics=diversityMetricsBetaNoNormalize,
 | 
| 
 | 
   941 													 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
 | 
| 
 | 
   942 													 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3, strAlphaMetadata=strAlphaMetadata, 
 | 
| 
 | 
   943                                                                                                          istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
 | 
| 
 | 
   944 
 | 
| 
 | 
   945 
 | 
| 
 | 
   946 			#Generate selection by the rank average of user defined taxa
 | 
| 
 | 
   947 			#Expects (Taxa (row) by Samples (column))
 | 
| 
 | 
   948 			#Expects a column 0 of taxa id that is skipped
 | 
| 
 | 
   949 			#Returns [(sample name,average,rank)]
 | 
| 
 | 
   950 			#SUMMED AND NORMALIZED
 | 
| 
 | 
   951 			stratAbundanceTable.funcSumClades()
 | 
| 
 | 
   952 			#Normalize data at this point
 | 
| 
 | 
   953 			stratAbundanceTable.funcNormalize()
 | 
| 
 | 
   954 			if c_RUN_RANK_AVERAGE_USER_4:
 | 
| 
 | 
   955 				selectedSamples[ConstantsMicropita.c_strFeature] = self.funcSelectTargetedTaxaSamples(abndMatrix=stratAbundanceTable,
 | 
| 
 | 
   956 						lsTargetedTaxa=userDefinedTaxa, iSampleSelectionCount=iCount, sMethod=strFeatureSelection)
 | 
| 
 | 
   957 				logging.info("MicroPITA.funcRun:: Selected Samples Rank")
 | 
| 
 | 
   958 				logging.info(selectedSamples)
 | 
| 
 | 
   959 
 | 
| 
 | 
   960  			###SUMMED AND NORMALIZED analysis block
 | 
| 
 | 
   961 			#Diversity based metric will move reduce to terminal taxa as needed
 | 
| 
 | 
   962 			if c_RUN_MAX_DIVERSITY_1 or c_RUN_REPRESENTIVE_DISSIMILARITY_2 or c_RUN_MAX_DISSIMILARITY_3:
 | 
| 
 | 
   963 
 | 
| 
 | 
   964 				self._funcRunNormalizeSensitiveMethods(abndData=stratAbundanceTable, iSampleSelectionCount=iCount,
 | 
| 
 | 
   965 												 dictSelectedSamples=selectedSamples, lsAlphaMetrics=diversityMetricsAlpha,
 | 
| 
 | 
   966 												 lsBetaMetrics=diversityMetricsBeta,
 | 
| 
 | 
   967 												 lsInverseBetaMetrics=diversityMetricsBeta,
 | 
| 
 | 
   968 												 fRunDiversity=c_RUN_MAX_DIVERSITY_1,fRunRepresentative=c_RUN_REPRESENTIVE_DISSIMILARITY_2,
 | 
| 
 | 
   969 												 fRunExtreme=c_RUN_MAX_DISSIMILARITY_3,
 | 
| 
 | 
   970                                                                                                  istmBetaMatrix=istmBetaMatrix, istrmTree=istrmTree, istrmEnvr=istrmEnvr, fInvertDiversity=fInvertDiversity)
 | 
| 
 | 
   971 
 | 
| 
 | 
   972 			#5::Select randomly
 | 
| 
 | 
   973 			#Expects sampleNames = List of sample names [name, name, name...]
 | 
| 
 | 
   974 			if(c_RUN_RANDOM_5):
 | 
| 
 | 
   975 				#Select randomly from sample names
 | 
| 
 | 
   976 				selectedSamples[ConstantsMicropita.c_strRandom] = self.funcGetRandomSamples(lsSamples=stratAbundanceTable.funcGetSampleNames(), iNumberOfSamplesToReturn=iCount)
 | 
| 
 | 
   977 				logging.info("MicroPITA.funcRun:: Selected Samples Random")
 | 
| 
 | 
   978 				logging.info(selectedSamples)
 | 
| 
 | 
   979 
 | 
| 
 | 
   980 			#Perform supervised selection
 | 
| 
 | 
   981 			if c_RUN_DISTINCT or c_RUN_DISCRIMINANT:
 | 
| 
 | 
   982  				if strLabel:
 | 
| 
 | 
   983 					dictSelectionRet = self.funcRunSupervisedDistancesFromCentroids(abundanceTable=stratAbundanceTable,
 | 
| 
 | 
   984 								fRunDistinct=c_RUN_DISTINCT, fRunDiscriminant=c_RUN_DISCRIMINANT,
 | 
| 
 | 
   985 								xOutputSupFile=ostmInputPredictFile,xPredictSupFile=ostmPredictFile,
 | 
| 
 | 
   986 								strSupervisedMetadata=strLabel, iSampleSupSelectionCount=iCount,
 | 
| 
 | 
   987 								lsOriginalSampleNames = totalAbundanceTable.funcGetSampleNames(),
 | 
| 
 | 
   988 								lsOriginalLabels = lsOriginalLabels,
 | 
| 
 | 
   989 								fAppendFiles=fAppendSupFiles)
 | 
| 
 | 
   990 
 | 
| 
 | 
   991 					[selectedSamples.setdefault(sKey,[]).extend(lValue) for sKey,lValue in dictSelectionRet.items()]
 | 
| 
 | 
   992 
 | 
| 
 | 
   993 					if not fAppendSupFiles:
 | 
| 
 | 
   994 						fAppendSupFiles = True
 | 
| 
 | 
   995 					logging.info("MicroPITA.funcRun:: Selected Samples Unsupervised")
 | 
| 
 | 
   996 					logging.info(selectedSamples)
 | 
| 
 | 
   997 		return selectedSamples
 | 
| 
 | 
   998 	
 | 
| 
 | 
   999 	#Testing: Happy path tested
 | 
| 
 | 
  1000 	@staticmethod
 | 
| 
 | 
  1001 	def funcWriteSelectionToFile(dictSelection,xOutputFilePath):
 | 
| 
 | 
  1002 		"""
 | 
| 
 | 
  1003 		Writes the selection of samples by method to an output file.
 | 
| 
 | 
  1004 		
 | 
| 
 | 
  1005 		:param	dictSelection:	The dictionary of selections by method to be written to a file.
 | 
| 
 | 
  1006 		:type:	Dictionary	The dictionary of selections by method {"method":["sample selected","sample selected"...]}
 | 
| 
 | 
  1007 		:param	xOutputFilePath:	FileStream or String path to file inwhich the dictionary is written.
 | 
| 
 | 
  1008 		:type:	String	FileStream or String path to file
 | 
| 
 | 
  1009 		"""
 | 
| 
 | 
  1010 	
 | 
| 
 | 
  1011 		if not dictSelection:
 | 
| 
 | 
  1012 			return
 | 
| 
 | 
  1013 
 | 
| 
 | 
  1014 		#Open file
 | 
| 
 | 
  1015 		f = csv.writer(open(xOutputFilePath,"w") if isinstance(xOutputFilePath, str) else xOutputFilePath, delimiter=ConstantsMicropita.c_outputFileDelim )
 | 
| 
 | 
  1016 
 | 
| 
 | 
  1017 		#Create output content from dictionary
 | 
| 
 | 
  1018 		for sKey in dictSelection:
 | 
| 
 | 
  1019 			f.writerow([sKey]+dictSelection[sKey])
 | 
| 
 | 
  1020 			logging.debug("MicroPITA.funcRun:: Selected samples output to file:"+str(dictSelection[sKey]))
 | 
| 
 | 
  1021 	
 | 
| 
 | 
  1022 	#Testing: Happy Path tested
 | 
| 
 | 
  1023 	@staticmethod
 | 
| 
 | 
  1024 	def funcReadSelectionFileToDictionary(xInputFile):
 | 
| 
 | 
  1025 		"""
 | 
| 
 | 
  1026 		Reads in an output selection file from micropita and formats it into a dictionary.
 | 
| 
 | 
  1027 		
 | 
| 
 | 
  1028 		:param	xInputFile:	String path to file or file stream to read and translate into a dictionary.
 | 
| 
 | 
  1029 									{"method":["sample selected","sample selected"...]}
 | 
| 
 | 
  1030 		:type:	FileStream or String Path to file
 | 
| 
 | 
  1031 		:return	Dictionary:	Samples selected by methods.
 | 
| 
 | 
  1032 					Dictionary	{"Selection Method":["SampleID","SampleID","SampleID",...]}
 | 
| 
 | 
  1033 		"""
 | 
| 
 | 
  1034 
 | 
| 
 | 
  1035 		#Open file
 | 
| 
 | 
  1036 		istmReader = csv.reader(open(xInputFile,'r') if isinstance(xInputFile, str) else xInputFile, delimiter = ConstantsMicropita.c_outputFileDelim)
 | 
| 
 | 
  1037 
 | 
| 
 | 
  1038 		#Dictionary to hold selection data
 | 
| 
 | 
  1039 		return dict([(lsLine[0], lsLine[1:]) for lsLine in istmReader])
 | 
| 
 | 
  1040 
 | 
| 
 | 
  1041 #Set up arguments reader
 | 
| 
 | 
  1042 argp = argparse.ArgumentParser( prog = "MicroPITA.py", 
 | 
| 
 | 
  1043 	description = """Selects samples from abundance tables based on various selection schemes.""" )
 | 
| 
 | 
  1044 
 | 
| 
 | 
  1045 args = argp.add_argument_group( "Common", "Commonly modified options" )
 | 
| 
 | 
  1046 args.add_argument(ConstantsMicropita.c_strCountArgument,"--num", dest="iCount", metavar = "samples", default = 10, type = int, help = ConstantsMicropita.c_strCountHelp)
 | 
| 
 | 
  1047 args.add_argument("-m","--method", dest = "lstrMethods", metavar = "method", default = [], help = ConstantsMicropita.c_strSelectionTechniquesHelp,
 | 
| 
 | 
  1048 	choices = ConstantsMicropita.c_lsAllMethods, action = "append")
 | 
| 
 | 
  1049 
 | 
| 
 | 
  1050 args = argp.add_argument_group( "Custom", "Selecting and inputing custom metrics" )
 | 
| 
 | 
  1051 args.add_argument("-a","--alpha", dest = "strAlphaDiversity", metavar = "AlphaDiversity", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityHelp,  choices = Metric.setAlphaDiversities)
 | 
| 
 | 
  1052 args.add_argument("-b","--beta", dest = "strBetaDiversity", metavar = "BetaDiversity", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityHelp,  choices = list(Metric.setBetaDiversities)+[Metric.c_strUnifracUnweighted,Metric.c_strUnifracWeighted])
 | 
| 
 | 
  1053 args.add_argument("-q","--alphameta", dest = "strAlphaMetadata", metavar = "AlphaDiversityMetadata", default = None, help = ConstantsMicropita.c_strCustomAlphaDiversityMetadataHelp)
 | 
| 
 | 
  1054 args.add_argument("-x","--betamatrix", dest = "istmBetaMatrix", metavar = "BetaDiversityMatrix", default = None, help = ConstantsMicropita.c_strCustomBetaDiversityMatrixHelp)
 | 
| 
 | 
  1055 args.add_argument("-o","--tree", dest = "istrmTree", metavar = "PhylogeneticTree", default = None, help = ConstantsMicropita.c_strCustomPhylogeneticTreeHelp)
 | 
| 
 | 
  1056 args.add_argument("-i","--envr", dest = "istrmEnvr", metavar = "EnvironmentFile", default = None, help = ConstantsMicropita.c_strCustomEnvironmentFileHelp)
 | 
| 
 | 
  1057 args.add_argument("-f","--invertDiversity", dest = "fInvertDiversity", action="store_true", default = False, help = ConstantsMicropita.c_strInvertDiversityHelp)
 | 
| 
 | 
  1058 
 | 
| 
 | 
  1059 args = argp.add_argument_group( "Miscellaneous", "Row/column identifiers and feature targeting options" )
 | 
| 
 | 
  1060 args.add_argument("-d",ConstantsMicropita.c_strIDNameArgument, dest="strIDName", metavar="sample_id", help= ConstantsMicropita.c_strIDNameHelp)
 | 
| 
 | 
  1061 args.add_argument("-l",ConstantsMicropita.c_strLastMetadataNameArgument, dest="strLastMetadataName", metavar = "metadata_id", default = None,
 | 
| 
 | 
  1062 				  help= ConstantsMicropita.c_strLastMetadataNameHelp)
 | 
| 
 | 
  1063 args.add_argument("-r",ConstantsMicropita.c_strTargetedFeatureMethodArgument, dest="strFeatureSelection", metavar="targeting_method", default=ConstantsMicropita.lsTargetedFeatureMethodValues[0],
 | 
| 
 | 
  1064 				  choices=ConstantsMicropita.lsTargetedFeatureMethodValues, help= ConstantsMicropita.c_strTargetedFeatureMethodHelp)
 | 
| 
 | 
  1065 args.add_argument("-t",ConstantsMicropita.c_strTargetedSelectionFileArgument, dest="istmFeatures", metavar="feature_file", type=argparse.FileType("rU"), help=ConstantsMicropita.c_strTargetedSelectionFileHelp)
 | 
| 
 | 
  1066 args.add_argument("-w",ConstantsMicropita.c_strFeatureMetadataArgument, dest="strLastFeatureMetadata", metavar="Last_Feature_Metadata", default=None, help=ConstantsMicropita.c_strFeatureMetadataHelp)
 | 
| 
 | 
  1067 
 | 
| 
 | 
  1068 args = argp.add_argument_group( "Data labeling", "Metadata IDs for strata and supervised label values" )
 | 
| 
 | 
  1069 args.add_argument("-e",ConstantsMicropita.c_strSupervisedLabelArgument, dest="strLabel", metavar= "supervised_id", help=ConstantsMicropita.c_strSupervisedLabelHelp)
 | 
| 
 | 
  1070 args.add_argument("-s",ConstantsMicropita.c_strUnsupervisedStratifyMetadataArgument, dest="strUnsupervisedStratify", metavar="stratify_id", 
 | 
| 
 | 
  1071 				  help= ConstantsMicropita.c_strUnsupervisedStratifyMetadataHelp)
 | 
| 
 | 
  1072 
 | 
| 
 | 
  1073 args = argp.add_argument_group( "File formatting", "Rarely modified file formatting options" )
 | 
| 
 | 
  1074 args.add_argument("-j",ConstantsMicropita.c_strFileDelimiterArgument, dest="cFileDelimiter", metavar="column_delimiter", default="\t", help=ConstantsMicropita.c_strFileDelimiterHelp) 
 | 
| 
 | 
  1075 args.add_argument("-k",ConstantsMicropita.c_strFeatureNameDelimiterArgument, dest="cFeatureNameDelimiter", metavar="taxonomy_delimiter", default="|", help=ConstantsMicropita.c_strFeatureNameDelimiterHelp) 
 | 
| 
 | 
  1076 
 | 
| 
 | 
  1077 args = argp.add_argument_group( "Debugging", "Debugging options - modify at your own risk!" )
 | 
| 
 | 
  1078 args.add_argument("-v",ConstantsMicropita.c_strLoggingArgument, dest="strLogLevel", metavar = "log_level", default="WARNING", 
 | 
| 
 | 
  1079 				  choices=ConstantsMicropita.c_lsLoggingChoices, help= ConstantsMicropita.c_strLoggingHelp)
 | 
| 
 | 
  1080 args.add_argument("-c",ConstantsMicropita.c_strCheckedAbundanceFileArgument, dest="ostmCheckedFile", metavar = "output_qc", type = argparse.FileType("w"), help = ConstantsMicropita.c_strCheckedAbundanceFileHelp)
 | 
| 
 | 
  1081 args.add_argument("-g",ConstantsMicropita.c_strLoggingFileArgument, dest="ostmLoggingFile", metavar = "output_log", type = argparse.FileType("w"), help = ConstantsMicropita.c_strLoggingFileHelp)
 | 
| 
 | 
  1082 args.add_argument("-u",ConstantsMicropita.c_strSupervisedInputFile, dest="ostmInputPredictFile", metavar = "output_scaled", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedInputFileHelp)
 | 
| 
 | 
  1083 args.add_argument("-p",ConstantsMicropita.c_strSupervisedPredictedFile, dest="ostmPredictFile", metavar = "output_labels", type = argparse.FileType("w"), help = ConstantsMicropita.c_strSupervisedPredictedFileHelp)
 | 
| 
 | 
  1084 
 | 
| 
 | 
  1085 argp.add_argument("istmInput", metavar = "input.pcl/biome", type = argparse.FileType("rU"), help = ConstantsMicropita.c_strAbundanceFileHelp,
 | 
| 
 | 
  1086 	default = sys.stdin)
 | 
| 
 | 
  1087 argp.add_argument("ostmOutput", metavar = "output.txt", type = argparse.FileType("w"), help = ConstantsMicropita.c_strGenericOutputDataFileHelp,
 | 
| 
 | 
  1088 	default = sys.stdout)
 | 
| 
 | 
  1089 
 | 
| 
 | 
  1090 __doc__ = "::\n\n\t" + argp.format_help( ).replace( "\n", "\n\t" ) + __doc__
 | 
| 
 | 
  1091 
 | 
| 
 | 
  1092 def _main( ):
 | 
| 
 | 
  1093 	args = argp.parse_args( )
 | 
| 
 | 
  1094 
 | 
| 
 | 
  1095 	#Set up logger
 | 
| 
 | 
  1096 	iLogLevel = getattr(logging, args.strLogLevel.upper(), None)
 | 
| 
 | 
  1097 	logging.basicConfig(stream = args.ostmLoggingFile if args.ostmLoggingFile else sys.stderr, filemode = 'w', level=iLogLevel)
 | 
| 
 | 
  1098 
 | 
| 
 | 
  1099 	#Run micropita
 | 
| 
 | 
  1100 	logging.info("MicroPITA:: Start microPITA")
 | 
| 
 | 
  1101 	microPITA = MicroPITA()
 | 
| 
 | 
  1102 
 | 
| 
 | 
  1103 	#Argparse will append to the default but will not remove the default so I do this here
 | 
| 
 | 
  1104 	if not len(args.lstrMethods):
 | 
| 
 | 
  1105 		args.lstrMethods = [ConstantsMicropita.c_strRepresentative]
 | 
| 
 | 
  1106 
 | 
| 
 | 
  1107 	dictSelectedSamples = microPITA.funcRun(
 | 
| 
 | 
  1108 		strIDName		= args.strIDName,
 | 
| 
 | 
  1109 		strLastMetadataName	= args.strLastMetadataName,
 | 
| 
 | 
  1110 		istmInput		= args.istmInput,
 | 
| 
 | 
  1111 		ostmInputPredictFile	= args.ostmInputPredictFile,
 | 
| 
 | 
  1112 		ostmPredictFile		= args.ostmPredictFile,
 | 
| 
 | 
  1113 		ostmCheckedFile		= args.ostmCheckedFile,
 | 
| 
 | 
  1114 		ostmOutput		= args.ostmOutput,
 | 
| 
 | 
  1115 		cDelimiter		= args.cFileDelimiter,
 | 
| 
 | 
  1116 		cFeatureNameDelimiter	= args.cFeatureNameDelimiter,
 | 
| 
 | 
  1117 		istmFeatures		= args.istmFeatures,
 | 
| 
 | 
  1118 		strFeatureSelection	= args.strFeatureSelection,
 | 
| 
 | 
  1119 		iCount			= args.iCount,
 | 
| 
 | 
  1120 		strLastRowMetadata	= args.strLastFeatureMetadata,
 | 
| 
 | 
  1121 		strLabel		= args.strLabel,
 | 
| 
 | 
  1122 		strStratify		= args.strUnsupervisedStratify,
 | 
| 
 | 
  1123 		strCustomAlpha		= args.strAlphaDiversity,
 | 
| 
 | 
  1124 		strCustomBeta		= args.strBetaDiversity,
 | 
| 
 | 
  1125 		strAlphaMetadata	= args.strAlphaMetadata,
 | 
| 
 | 
  1126 		istmBetaMatrix		= args.istmBetaMatrix,
 | 
| 
 | 
  1127 		istrmTree		= args.istrmTree,
 | 
| 
 | 
  1128 		istrmEnvr		= args.istrmEnvr,
 | 
| 
 | 
  1129 		lstrMethods		= args.lstrMethods,
 | 
| 
 | 
  1130 		fInvertDiversity	= args.fInvertDiversity
 | 
| 
 | 
  1131 	)
 | 
| 
 | 
  1132 
 | 
| 
 | 
  1133 	if not dictSelectedSamples:
 | 
| 
 | 
  1134 		logging.error("MicroPITA:: Error, did not get a result from analysis.")
 | 
| 
 | 
  1135 		return -1
 | 
| 
 | 
  1136 	logging.info("End microPITA")
 | 
| 
 | 
  1137 
 | 
| 
 | 
  1138 	#Log output for debugging
 | 
| 
 | 
  1139 	logging.debug("MicroPITA:: Returned the following samples:"+str(dictSelectedSamples))
 | 
| 
 | 
  1140 
 | 
| 
 | 
  1141 	#Write selection to file
 | 
| 
 | 
  1142 	microPITA.funcWriteSelectionToFile(dictSelection=dictSelectedSamples, xOutputFilePath=args.ostmOutput)
 | 
| 
 | 
  1143 
 | 
| 
 | 
  1144 if __name__ == "__main__":
 | 
| 
 | 
  1145 	_main( )
 |