Mercurial > repos > george-weingart > micropita
diff src/breadcrumbs/src/Cladogram.py @ 0:2f4f6f08c8c4 draft
Uploaded
author | george-weingart |
---|---|
date | Tue, 13 May 2014 21:58:57 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/breadcrumbs/src/Cladogram.py Tue May 13 21:58:57 2014 -0400 @@ -0,0 +1,950 @@ +""" +Author: Timothy Tickle +Description: Class to call circlader and create dendrograms. +""" + +##################################################################################### +#Copyright (C) <2012> +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal in the +#Software without restriction, including without limitation the rights to use, copy, +#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +#and to permit persons to whom the Software is furnished to do so, subject to +#the following conditions: +# +#The above copyright notice and this permission notice shall be included in all copies +#or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#####################################################################################

__author__ = "Timothy Tickle"
__copyright__ = "Copyright 2012"
__credits__ = ["Timothy Tickle"]
__license__ = "MIT"
__maintainer__ = "Timothy Tickle"
__email__ = "ttickle@sph.harvard.edu"
__status__ = "Development"

#External libraries
from AbundanceTable import AbundanceTable
from CommandLine import CommandLine
from ConstantsBreadCrumbs import ConstantsBreadCrumbs
from ConstantsFiguresBreadCrumbs import ConstantsFiguresBreadCrumbs
import math
import numpy as np
import os
import re
import scipy.stats
from ValidateData import ValidateData
#import scipy.stats.stats as stats

class Cladogram:
    """
    This class manages creating the input files for Circlader and calling Circlader.

    Typical use: set the abundance data (setAbundanceData), optionally configure
    filters, colors, ticks, highlights and circles, then call generate().
    """

    #Script name
    circladerScript=None

    #Constants (keys of the per-circle dictionaries stored in ldictCircleData)
    c_sTaxa="Taxa"
    c_sCircle="Circle"
    c_sBorder="Border"
    c_sShape="Shape"
    c_sAlpha="Alpha"
    c_sForced="Forced"

    #Numpy array (structured array) holding data
    #Should be SampleID, Sample Abundances/Data (samples = columns).....
    npaAbundance = None
    #List of sample names
    lsSampleNames = None
    #Name of output image
    strImageName = "Cladogram.png"
    #String used to call the sample id column
    strSampleID = "ID"
    strUnclassified = "unclassified"

    #Minimum size of clade (terminal node count for clade)
    iMinCladeSize = 1
    #Level of ancestry to filter at (starts with 0 and based on the input file)
    iCladeLevelToMeasure = 1
    iCladeLevelToReduce = 1
    cFeatureDelimiter = "|"

    #Flags
    #Turns on (True) or off (False) abundance-based filtering
    fAbundanceFilter = False
    #Turns on (True) or off (False) clade size-based filtering
    fCladeSizeFilter = False
    #Indicate if the following files were made
    fSizeFileMade=False
    fCircleFileMade=False
    fColorFileMade=False
    fTickFileMade=False
    fHighlightFileMade=False

    #Circlader files
    strTreeFilePath="_Taxa.txt"
    strCircleFilePath = "_Circle.txt"
    strColorFilePath="_Color.txt"
    strTickFilePath="_Tick.txt"
    strHighLightFilePath="_HighLight.txt"
    strSizeFilePath="_Size.txt"
    strStyleFilePath=""

    #Thresholds
    #Controls the showing of taxa
    c_dPercentileCutOff = 90.0
    c_dPercentageAbovePercentile = 1.0

    #Minimum average abundance score when using log scale
    c_dMinLogSize = 0.0000000001
    #Constant used to magnify the size difference in the taxa (log only)
    c_dLogScale = 1000000
    #After log10, an additional scaling adjustment (use this)
    c_dCircleScale = 3

    #Data for circular files
    #Used to change IDs to proper labels
    dictConvertIDs = None
    #Labels to be relabeled
    dictRelabels = None
    #Colors
    dictColors = None
    #Elements that are forced to be highlighted
    dictForcedHighLights = None
    #Ticks
    llsTicks = None
    #Forced root of the tree, discarding data as needed.
    strRoot = None
    #Holds circle data as a list of dictionaries
    #One dictionary per circle
    ldictCircleData = None

    def __init__(self):
        #Per-instance dictionary so highlights are not shared between instances
        self.dictForcedHighLights = dict()

    #Private helpers
    @staticmethod
    def _funcSplitLineage(sID):
        """
        Split a "|" delimited lineage into its non-empty levels.

        :param sID: Lineage / clade id
        :type: sID String
        :return: List of strings, one per non-empty level
        """
        return [sElement for sElement in re.split(r"\|", sID) if sElement]

    @staticmethod
    def _funcGetForcedTaxa(lsTaxa, datAlpha):
        """
        Select the taxa of a forced circle that should be forced into the plot.
        A taxon is skipped when its alpha equals the string "0.0" (the same comparison the
        list case has always used). A scalar alpha applies to every taxon.

        :param lsTaxa: Taxa of the circle
        :type: lsTaxa List of strings
        :param datAlpha: Alpha of the circle, either one value or a list positionally matching lsTaxa
        :type: datAlpha Scalar or list
        :return: List of taxa to force
        """
        if isinstance(datAlpha, list):
            return [sTaxon for sTaxon, alpha in zip(lsTaxa, datAlpha) if not alpha == "0.0"]
        if datAlpha == "0.0":
            return []
        return list(lsTaxa)

    def _funcRootDictionaryKeys(self, dictData):
        """
        Return a copy of a dictionary with its keys re-rooted by updateToRoot.
        Keys that do not survive re-rooting (do not contain the root, or are the root itself) are dropped.

        :param dictData: Dictionary keyed by clade id
        :type: dictData Dictionary
        :return: Dictionary keyed by rooted clade id
        """
        dictRooted = dict()
        for sKey in dictData:
            lsRootedKey = self.updateToRoot([sKey])
            if lsRootedKey:
                dictRooted[lsRootedKey[0]] = dictData[sKey]
        return dictRooted

    #Happy Path Tested
    def addHighLights(self, dictClades,fOverwrite):
        """
        This method allows highlighting to be added.
        When an element is added in this manner it will not be filtered out.
        These elements, if existing in the tree, will be highlighted with the named color given.
        This color name should be supplied in the set Color Data method
        {strName1:strColorName1,strName2:strColorName2,...}

        :param dictClades: Names of elements, if found in the tree which should be highlighted
        :type: dictClades Dictionary of element name (string) and element color (string)
        :param fOverwrite: If element is already indicated to be highlighted, overwrite the color to the one provided here.
        :type: fOverwrite boolean (True == overwrite color)
        """
        if not ValidateData.funcIsValidDictionary(dictClades):
            return
        if not ValidateData.funcIsValidBoolean(fOverwrite):
            return
        for strElement in dictClades:
            #New elements are always stored, existing ones only when overwriting
            if fOverwrite or (strElement not in self.dictForcedHighLights):
                self.dictForcedHighLights[strElement] = dictClades[strElement]

    #Not tested
    def getHighLights(self):
        """
        Get the forced highlights.

        :return: Dictionary of element name (string) and color name (string)
        """
        return self.dictForcedHighLights

    #Not tested
    def forceRoot(self, strRoot):
        """
        This method allows one to root the tree at a certain level and value
        Only taxa that contain this value in their ancestry will be plotted
        The root will be the value given, any previous hierarchy will be ignored
        Highlights and relabels that do not contain the root are dropped when generating.

        :param strRoot: Where to root the tree
        :type: strRoot String
        """
        self.strRoot = strRoot

    def generate(self, strImageName, strStyleFile, sTaxaFileName, strCircladerScript = ConstantsBreadCrumbs.c_strCircladerScript, iTerminalCladeLevel = 10, sColorFileName=None, sTickFileName=None, sHighlightFileName=None, sSizeFileName=None, sCircleFileName=None):
        """
        This is the method to call to generate a cladogram using circlader.
        The data plotted is the abundance data given in setAbundanceData.

        :param strImageName: File name to save the output cladogram image
        :type: strImageName File name (string)
        :param strStyleFile: File path indicating the style file to use
        :type: strStyleFile File path (string)
        :param sTaxaFileName: File path indicating the taxa file to use
        :type: sTaxaFileName File path (string)
        :param strCircladerScript: File path to the Circlader script
        :type: String
        :param iTerminalCladeLevel: Clade level to use as terminal in plotting
        :type: iTerminalCladeLevel integer starting with 1
        :param sColorFileName: File path indicating the color file to use
        :type: sColorFileName File path (string)
        :param sTickFileName: File path indicating the tick file to use
        :type: sTickFileName File path (string)
        :param sHighlightFileName: File path indicating the highlight file to use
        :type: sHighlightFileName File path (string)
        :param sSizeFileName: File path indicating the size file to use
        :type: sSizeFileName File path (string)
        :param sCircleFileName: File path of circlader circle file.
        :type: String
        :return boolean: True when the input files were written and Circlader was called, False on error
        """

        #"is None" (not "== None"): the data is a numpy array and == compares element-wise
        if self.npaAbundance is None:
            print("Cladogram::generate. The data was not set so an image could not be generated")
            return False

        #Set script
        self.circladerScript = strCircladerScript

        #Set output file name
        self.strImageName = strImageName

        #Check files exist and remove files which will be written
        #Stop here if the style file is missing; the file paths are not set in that case
        if not self.manageFilePaths(sTaxaFileName, strStyleFile, sColorFileName, sTickFileName, sHighlightFileName, sSizeFileName, sCircleFileName):
            return False

        #Get IDs
        lsIDs = list(self.npaAbundance[self.strSampleID])

        #Generate a dictionary to convert the ids to correct format
        #Fix unclassified names
        #Make numeric labels as indicated
        self.dictConvertIDs = self.generateLabels(lsIDs)

        #Remove taxa lower than the display clade level
        lsIDs = [sFeature for sFeature in lsIDs
                 if len(sFeature.split(self.cFeatureDelimiter)) <= iTerminalCladeLevel]

        #Filter by abundance
        if self.fAbundanceFilter:
            lsIDs = self.filterByAbundance(lsIDs)

        #Update to the correct root
        lsIDs = self.updateToRoot(lsIDs)

        #Set highlights and relabels to root for consistency
        if self.strRoot is not None:
            if self.dictForcedHighLights is not None:
                self.dictForcedHighLights = self._funcRootDictionaryKeys(self.dictForcedHighLights)
            if self.dictRelabels is not None:
                self.dictRelabels = self._funcRootDictionaryKeys(self.dictRelabels)

        #Filter by clade size Should be the last filter.
        #It is not a strong filter but cleans up images
        if self.fCladeSizeFilter:
            lsIDs = self.filterByCladeSize(lsIDs)

        #Add in forced highlighting
        if self.dictForcedHighLights is not None:
            lsIDs.extend(self.dictForcedHighLights.keys())

        #Add in forced circle data (there may be no circles at all)
        for dictCircleData in (self.ldictCircleData or []):
            if dictCircleData[self.c_sForced]:
                lsIDs.extend(self._funcGetForcedTaxa(dictCircleData[self.c_sTaxa], dictCircleData[self.c_sAlpha]))
        lsIDs = list(set(lsIDs))

        #Create circle files (needs to be after any filtering because it has a forcing option).
        #Only attempted when circles were added; createCircleFile reports False without circle data.
        if self.ldictCircleData is not None:
            if not self.createCircleFile(lsIDs):
                return False

        #Generate / Write Tree file
        if not self.createTreeFile(lsIDs):
            return False

        #Generate / Write Highlight file
        if not self.createHighlightFile(lsIDs):
            return False

        #Generate / write color file
        if self.dictColors is not None:
            lsColorData = [ConstantsBreadCrumbs.c_cTab.join([sColorKey,self.dictColors[sColorKey]]) for sColorKey in self.dictColors]
            self.writeToFile(self.strColorFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsColorData), False)
            self.fColorFileMade=True

        #Generate / write tick file
        if self.llsTicks is not None:
            lsTickData = [ConstantsBreadCrumbs.c_cTab.join(lsTicks) for lsTicks in self.llsTicks]
            self.writeToFile(self.strTickFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsTickData), False)
            self.fTickFileMade=True

        #Generate / Write size data
        if not self.createSizeFile(lsIDs):
            return False

        #Call commandline (only files which were made are handed to Circlader)
        lsCommand = [self.circladerScript, self.strTreeFilePath, self.strImageName, "--style_file", self.strStyleFilePath, "--tree_format", "tabular"]
        if self.fSizeFileMade:
            lsCommand.extend(["--size_file", self.strSizeFilePath])
        if self.fColorFileMade:
            lsCommand.extend(["--color_file", self.strColorFilePath])
        if self.fTickFileMade:
            lsCommand.extend(["--tick_file", self.strTickFilePath])
        if self.fHighlightFileMade:
            lsCommand.extend(["--highlight_file", self.strHighLightFilePath])
        if self.fCircleFileMade:
            lsCommand.extend(["--circle_file", self.strCircleFilePath])
        CommandLine().runCommandLine(lsCommand)
        return True

    #Happy path tested
    def setColorData(self, dictColors):
        """
        This method allows color information to be specified.
        Need to give a dictionary having a name (key)(string) and color (value)(string RGB)data
        {strName1:Color,strName2:Color...}
        Name will be a string name that references what needs to be this color
        Color data should be a string in the RGB format 0-255,0-255,0-255
        The background color is added to the given dictionary when it is not already in it.

        :param dictColors: Color Name and RGB specification
        :type: dictColors Dictionary of strings
        """
        if ValidateData.funcIsValidDictionary(dictColors):
            self.dictColors = dictColors
            if ConstantsFiguresBreadCrumbs.c_strBackgroundColorName not in self.dictColors:
                self.dictColors[ConstantsFiguresBreadCrumbs.c_strBackgroundColorName]=ConstantsFiguresBreadCrumbs.c_strBackgroundColor

    #Not tested
    def setAbundanceData(self, abtbAbundanceTable):
        """
        Sets the abundance data the Cladogram will use to plot

        :param abtbAbundanceTable: AbundanceTable to set
        :type: AbundanceTable
        """
        self.npaAbundance = abtbAbundanceTable.funcGetAbundanceCopy()
        self.strSampleID = abtbAbundanceTable.funcGetIDMetadataName()
        self.lsSampleNames = abtbAbundanceTable.funcGetSampleNames()

    #Not tested
    def setFilterByAbundance(self, fAbundanceFilter, dPercentileCutOff = 90.0, dPercentageAbovePercentile = 1.0):
        """
        Switch filtering by abundance on and off.
        fAbundanceFilter == True indicates filtering is on

        :param fAbundanceFilter: Switch to turn on (true) and off (false) abundance-based filtering
        :type: fAbundanceFilter boolean
        :param dPercentileCutOff: Percentage between 100.0 to 0.0.
        :type: double
        :param dPercentageAbovePercentile: Percentage between 100.0 to 1.0.
        :type: double
        """
        self.fAbundanceFilter = fAbundanceFilter
        self.c_dPercentileCutOff = dPercentileCutOff
        self.c_dPercentageAbovePercentile = dPercentageAbovePercentile

    #Not Tested
    def setCircleScale(self, iScale):
        """
        Is a scale used to increase or decrease node sizes in the cladogram to make more visible
        iScale default is 3

        :param iScale: Integer to increase the relative sizes of nodes
        :type: iScale integer
        """
        self.c_dCircleScale = iScale

    #Not tested
    def setFeatureDelimiter(self, cDelimiter):
        """
        Set the delimiter used to parse the consensus lineages of features.
        An empty / None delimiter is ignored.

        :param cDelimiter: The delimiter used to parse the consensus lineage of features.
        :type: Character
        """
        if cDelimiter:
            self.cFeatureDelimiter = cDelimiter

    #Not tested
    def setFilterByCladeSize(self, fCladeSizeFilter, iCladeLevelToMeasure = 3, iCladeLevelToReduce = 1, iMinimumCladeSize = 5, cFeatureDelimiter = None, strUnclassified="unclassified"):
        """
        Switch filtering by clade size on and off.
        fCladeSizeFilter == True indicates filtering is on
        NOT 0 based.
        Values which are not positive (levels, size) or are empty (delimiter, unclassified) leave the current setting unchanged.

        :param fCladeSizeFilter: Switch to turn on (true) and off (false) clade size-based filtering
        :type: fCladeSizeFilter boolean
        :param iCladeLevelToMeasure: The level of the consensus lineage that is measured or counted. Should be greater than iCladeLevelToReduce (Root is 1)
        :type: iCladeLevelToMeasure int
        :param iCladeLevelToReduce: The level of the consensus lineage that is reduced if the measured level are not the correct count (Root is 1)
        :type: iCladeLevelToReduce int
        :param iMinimumCladeSize: Minimum count of the measured clade for the clade to be kept
        :type: iMinimumCladeSize int
        :param cFeatureDelimiter: One may set the feature delimiter if needed.
        :type: Character
        :param strUnclassified: String indicating unclassified features
        :type: String
        """
        self.fCladeSizeFilter = fCladeSizeFilter
        if iCladeLevelToMeasure > 0:
            self.iCladeLevelToMeasure = iCladeLevelToMeasure
        if iCladeLevelToReduce > 0:
            self.iCladeLevelToReduce = iCladeLevelToReduce
        if iMinimumCladeSize > 0:
            self.iMinCladeSize = iMinimumCladeSize
        if cFeatureDelimiter:
            self.cFeatureDelimiter = cFeatureDelimiter
        if strUnclassified:
            self.strUnclassified = strUnclassified

    #Not tested
    def setTicks(self, llsTicks):
        """
        This method allows tick information to be specified.
        Need to generate a list of lists each having a tick level (number starting at 0 as a string), and tick name
        #Lowest numbers are closest to the center of the tree
        [[#,Name1],[#,Name2]...]

        :param llsTicks: Level # and Name of level
        :type: llsTicks List of lists of strings
        """
        self.llsTicks = llsTicks

    #Happy Path tested with createCircleFile
    def addCircle(self, lsTaxa, strCircle, dBorder=0.0, strShape="R", dAlpha=1.0, fForced=False):
        """
        This method allows one to add a circle to the outside of the cladogram.

        :param lsTaxa: Taxa to highlight with this circle
        :type: lsTaxa List of strings (taxa names)
        :param strCircle: Circle the elements will be in, indicates color and circle level.
        :type: strCircle String circle
        :param dBorder: Border size for the circle element border (between 0.0 and 1.0)
            can also be a list of dBorders. If list, position must match lsTaxa.
        :type: dBorder Float of border size (or list of floats).
        :param strShape: String Indicator of shape or method to determine shape.
            Can also be a list of shapes. If list, position must match lsTaxa.
        :type: strShape String to indicate the shape (may also be a list of strings).
            Default value is square.
            Valid shapes are R(Square), v(inward pointing triangle), ^(outward pointing triangle)
        :param dAlpha: The transparency of the circle element (between 0.0[clear] and 1.0[solid]).
            Can also be a list of floats. If list, position must match lsTaxa.
        :type: dAlpha Float to indicate the transparency of the shape (may also be a list of strings).
        :param fForced: Forces items in the circle to be displayed in the cladogram whether or not they pass the filters.
        :type: Boolean
        :return boolean: Always True
        """
        if self.ldictCircleData is None:
            self.ldictCircleData = list()

        self.ldictCircleData.append({self.c_sTaxa: lsTaxa,
                                     self.c_sCircle: strCircle,
                                     self.c_sBorder: dBorder,
                                     self.c_sShape: strShape,
                                     self.c_sAlpha: dAlpha,
                                     self.c_sForced: fForced})
        return True

    #Happy Path tested with AddCircle
    def createCircleFile(self, lsIDs):
        """
        Write circle data to file.
        One line is written per taxon found in any circle: the "." delimited taxon followed by
        one tab separated "circle:alpha!shape#border" entry per circle the taxon belongs to.
        Circle names missing from the ticks are appended to the ticks (when ticks are set).

        :param lsIDs: Ids plotted in the cladogram. Taxa of forced circles are added to this list in place.
        :type: lsIDs List of strings
        :return boolean: True on success. False when there is no circle data, no circle file path,
            or a list of shapes / borders / alphas does not match the length of the taxa list.
        """
        #No circle data, nothing to do
        if self.ldictCircleData is None:
            self.fCircleFileMade=False
            return False

        if self.strCircleFilePath is None:
            print("Error, there is no circle file specified to write to.")
            return False

        #Holds circle data {Taxaname:string updates correctly for output to file}
        dictCircleDataMethods = dict()

        for dictCircleData in self.ldictCircleData:
            lsTaxa = dictCircleData[self.c_sTaxa]
            #Shape/s for taxa
            datShape = dictCircleData[self.c_sShape]
            fShapeIsList = isinstance(datShape, list)
            #Border/s for taxa
            datBorder = dictCircleData[self.c_sBorder]
            fBorderIsList = isinstance(datBorder, list)
            #Alpha/s for taxa
            datAlpha = dictCircleData[self.c_sAlpha]
            fAlphaIsList = isinstance(datAlpha, list)
            #Circle name
            sCircleMethod = dictCircleData[self.c_sCircle]

            #Check to make sure the lengths of the array match up
            if fShapeIsList and (not len(datShape) == len(lsTaxa)):
                print("".join(["Error, Shapes were given as an list not of the size of the taxa list. Shape list length: ",str(len(datShape)),". Taxa list length: ",str(len(lsTaxa)),"."]))
                return False
            if fBorderIsList and (not len(datBorder) == len(lsTaxa)):
                print("".join(["Error, Border sizes were given as an list not of the size of the taxa list. Border list length: ",str(len(datBorder)),". Taxa list length: ",str(len(lsTaxa)),"."]))
                return False
            if fAlphaIsList and (not len(datAlpha) == len(lsTaxa)):
                print("".join(["Error, Alpha sizes were given as an list not of the size of the taxa list. Alpha list length: ",str(len(datAlpha)),". Taxa list length: ",str(len(lsTaxa)),"."]))
                return False

            #Update taxa to root if needed
            #If a taxa is kept, keep associated list information
            #If not a list data, leave alone, it will be used globally for all taxa.
            lsUpdatedTaxa = list()
            lsUpdatedShapes = list()
            lsUpdatedBorders = list()
            lsUpdatedAlphas = list()
            for iTaxaIndex, sTaxa in enumerate(lsTaxa):
                lsRootedTaxa = self.updateToRoot([sTaxa])
                if len(lsRootedTaxa) == 1:
                    lsUpdatedTaxa.append(lsRootedTaxa[0])
                    if fShapeIsList:
                        lsUpdatedShapes.append(datShape[iTaxaIndex])
                    if fBorderIsList:
                        lsUpdatedBorders.append(datBorder[iTaxaIndex])
                    if fAlphaIsList:
                        lsUpdatedAlphas.append(datAlpha[iTaxaIndex])

            #Reset data to rooted data
            lsTaxa = lsUpdatedTaxa
            if fShapeIsList:
                datShape = lsUpdatedShapes
            if fBorderIsList:
                datBorder = lsUpdatedBorders
            if fAlphaIsList:
                datAlpha = lsUpdatedAlphas

            #QC passes so we will add the circle to the figure and the ticks.
            #If there are ticks and if the circle is not already in the ticks.
            if self.llsTicks is not None:
                fFound = False
                iHighestNumber = -1
                for tick in self.llsTicks:
                    #Look for name
                    if tick[1] == sCircleMethod:
                        fFound = True
                    #Find highest count
                    if int(tick[0]) > iHighestNumber:
                        iHighestNumber = int(tick[0])
                if not fFound:
                    self.llsTicks.append([str(iHighestNumber+1),sCircleMethod])

            #If the circle is forced, add the taxa to the lsIDs
            #Added in place (the caller's list is updated) and without duplicating ids
            if dictCircleData[self.c_sForced]:
                for sForcedTaxa in self._funcGetForcedTaxa(lsTaxa, datAlpha):
                    if sForcedTaxa not in lsIDs:
                        lsIDs.append(sForcedTaxa)

            #For all taxa in the circle
            for iTaxaIndex, sTaxa in enumerate(lsTaxa):
                #Start the line of the taxa with the name reset to . delimited
                if sTaxa not in dictCircleDataMethods:
                    dictCircleDataMethods[sTaxa] = ".".join(self._funcSplitLineage(sTaxa))

                #Get border, shape and alpha (positional when lists, otherwise global for the circle)
                sBorder = str(datBorder[iTaxaIndex]) if fBorderIsList else str(datBorder)
                sShape = datShape[iTaxaIndex] if fShapeIsList else datShape
                sAlpha = str(datAlpha[iTaxaIndex]) if fAlphaIsList else str(datAlpha)
                dictCircleDataMethods[sTaxa] = dictCircleDataMethods[sTaxa]+"".join([ConstantsBreadCrumbs.c_cTab,sCircleMethod,":",sAlpha,"!",sShape,"#",sBorder])

        if len(dictCircleDataMethods)>0:
            sCircleContent = ConstantsBreadCrumbs.c_strEndline.join([dictCircleDataMethods[sTaxaKey] for sTaxaKey in dictCircleDataMethods])
            self.writeToFile(self.strCircleFilePath, sCircleContent, False)
            self.fCircleFileMade=True

        return True

    #Happy Path tested
    def createHighlightFile(self, lsIDs):
        """
        Write highlight data to file.
        One tab delimited line per id: "." delimited taxon, display name, label, color.

        :param lsIDs: Ids to include in the highlight file
        :type: lsIDs List of strings
        :return boolean: Always True
        """
        lsHighLightData = list()
        #Each taxa name
        for sID in lsIDs:
            #Rename taxa to be consistent with the . delimited format
            asNameElements = self._funcSplitLineage(sID)
            sCurTaxaName = asNameElements[-1]
            if (len(asNameElements)>1) and (sCurTaxaName=="unclassified"):
                sCurTaxaName = ".".join([asNameElements[-2],sCurTaxaName])
            sCurTaxa = ".".join(asNameElements)

            #Get color (stays empty when not highlighted or when the color name is unknown)
            sCurColor = ""
            sColorKey = ""
            if (self.dictForcedHighLights is not None) and (sID in self.dictForcedHighLights):
                sColorKey = self.dictForcedHighLights[sID]
            if (self.dictColors is not None) and (sColorKey in self.dictColors):
                sCurColor = self.formatRGB(self.dictColors[sColorKey])

            #Get label
            sCurLabel = ""
            if (self.dictRelabels is not None) and (sID in self.dictRelabels):
                sCurLabel = self.dictRelabels[sID]

            #A relabel replaces the display name
            if sCurLabel == "":
                lsHighLightData.append(ConstantsBreadCrumbs.c_cTab.join([sCurTaxa,sCurTaxaName,sCurLabel,sCurColor]))
            else:
                lsHighLightData.append(ConstantsBreadCrumbs.c_cTab.join([sCurTaxa,sCurLabel,sCurLabel,sCurColor]))

        if len(lsHighLightData)>0:
            self.writeToFile(self.strHighLightFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsHighLightData), False)
            self.fHighlightFileMade=True
        return True

    #Happy path tested
    def createSizeFile(self, lsIDs):
        """
        Write size data to file.
        The size of a taxon is log10((average abundance * c_dLogScale) + 1) * c_dCircleScale,
        with the average floored at c_dMinLogSize.

        :param lsIDs: Ids to include in the size file
        :type: lsIDs List of strings
        :return boolean: Always True
        """
        if self.npaAbundance is not None:
            dMinimumValue = (self.c_dMinLogSize*self.c_dLogScale)+1
            lsWriteData = list()
            for rowData in self.npaAbundance:
                strCurrentId = rowData[0]
                #Reset to root if needed to match current data
                if self.strRoot is not None:
                    lsRootedId = self.updateToRoot([strCurrentId])
                    #Ids that do not survive re-rooting are not plotted
                    if not lsRootedId:
                        continue
                    strCurrentId = lsRootedId[0]
                if strCurrentId in lsIDs:
                    dAverage = np.average(list(rowData)[1:])
                    dSize = max([dMinimumValue,(dAverage*self.c_dLogScale)+1])
                    lsWriteData.append(".".join(re.split(r"\|",strCurrentId))+ConstantsBreadCrumbs.c_cTab+str(math.log10(dSize)*self.c_dCircleScale))
            if len(lsWriteData)>0:
                self.writeToFile(self.strSizeFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsWriteData), False)
                self.fSizeFileMade=True
        return True

    #Happy path tested 1
    def createTreeFile(self, lsIDs):
        """
        Write tree data to file. The tree file defines the internal cladogram and all its points.
        Every id and each of its ancestors is written once, "." delimited, in order of first appearance.

        :param lsIDs: Ids to include in the tree file as well as their ancestors
        :type: lsIDs List of strings
        :return boolean: Always True
        """
        lsFullTree = list()
        #Set used for fast membership tests, the list keeps the output order
        setSeenNodes = set()
        for sID in lsIDs:
            lsIDElements = self._funcSplitLineage(sID)
            for iEndLevel in range(1,len(lsIDElements)+1):
                sNodePath = ".".join(lsIDElements[0:iEndLevel])
                if sNodePath not in setSeenNodes:
                    setSeenNodes.add(sNodePath)
                    lsFullTree.append(sNodePath)

        if len(lsFullTree)>0:
            self.writeToFile(self.strTreeFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsFullTree), False)
        return True

    #Happy Path tested
    def filterByAbundance(self, lsIDs):
        """
        Filter by abundance. Specifically this version requires elements of
        the tree to have a certain percentage of a certain percentile in samples.
        A taxon is kept when, in at least c_dPercentageAbovePercentile percent of the samples,
        its abundance is at or above that sample's c_dPercentileCutOff percentile score.

        :param lsIDs: Ids to filter
        :type: lsIDs List of strings
        :return: List of ids (strings) that survived the filtering
        """
        #list of ids to return that survived the filtering
        retls = list()
        if self.npaAbundance is None:
            return retls

        #Sample columns (Ignore sample id [position 0] which is not a sample)
        lsSampleColumns = self.npaAbundance.dtype.names[1:]
        #Cutoff score (threshold) for the percentile of interest, in sample column order
        #so it lines up positionally with the measurements of each row
        ldThresholds = [scipy.stats.scoreatpercentile(self.npaAbundance[sSample],self.c_dPercentileCutOff)
                        for sSample in lsSampleColumns]
        dSampleCount = float(len(lsSampleColumns))

        #Check each taxa
        for rowTaxaData in self.npaAbundance:
            sCurTaxaName = rowTaxaData[0]
            #Only look at the IDs given
            if sCurTaxaName in lsIDs:
                ldAbundanceMeasures = list(rowTaxaData)[1:]
                #Count the samples in which the abundance score meets the threshold
                dCountAbovePercentile = float(sum(1 for dMeasure, dThreshold in zip(ldAbundanceMeasures, ldThresholds)
                                                  if dMeasure >= dThreshold))
                dPercentOverPercentile = dCountAbovePercentile / dSampleCount
                if dPercentOverPercentile >= (self.c_dPercentageAbovePercentile/100.0):
                    retls.append(sCurTaxaName)
        return retls

    #Happy Path Tested
    def filterByCladeSize(self, lsIDs):
        """
        Filter by the count of individuals in the clade.
        Terminal nodes are counted per clade (the first iCladeLevelToReduce levels of their lineage);
        ids in clades with fewer than iMinCladeSize counted terminal nodes are removed.
        Ids shorter than iCladeLevelToReduce are always kept.

        :param lsIDs: Ids to filter
        :type: lsIDs List of strings
        :return: List of ids (strings) that survived the filtering
        """
        #First get terminal nodes
        lsTerminalNodes = AbundanceTable.funcGetTerminalNodesFromList(lsIDs,self.cFeatureDelimiter)

        #Count up clades
        cladeCounts = dict()

        #For each terminal node count the
        #Clades at clade levels
        for sTerminalNode in lsTerminalNodes:
            lsLineage = sTerminalNode.split(self.cFeatureDelimiter)
            iLineageCount = len(lsLineage)
            #If the lineage is shorter than the reduced clade level then no need to filter it
            if iLineageCount >= self.iCladeLevelToReduce:
                #If the lineage is longer than the reduced clade level and measuring clade level then count
                #or If the lineage is longer than the reduced clade level but shorter than the measuring clade,
                #only count if the last element is unclassified
                if (iLineageCount >= self.iCladeLevelToMeasure) or (lsLineage[-1] == self.strUnclassified):
                    sLineage = self.cFeatureDelimiter.join(lsLineage[0:self.iCladeLevelToReduce])
                    cladeCounts[sLineage] = cladeCounts.get(sLineage,0) + 1

        #Go through the IDs and reduce as needed using the clade counts
        retls = list()
        for sID in lsIDs:
            lsID = sID.split(self.cFeatureDelimiter)
            iIDCount = len(lsID)

            #Too short to filter (judged on this id, not on the last terminal node counted above)
            if iIDCount < self.iCladeLevelToReduce:
                retls.append(sID)
                continue

            #Check to see if the clade which is being reduced made the cut
            #A clade which was never counted has a size of 0
            if (iIDCount >= self.iCladeLevelToMeasure) or (lsID[-1] == self.strUnclassified):
                if cladeCounts.get(self.cFeatureDelimiter.join(lsID[0:self.iCladeLevelToReduce]),0) >= self.iMinCladeSize:
                    retls.append(sID)

        return retls

    #Happy path tested
    def formatRGB(self, sColor):
        """
        Takes a string that is of the format 0-255,0-255,0-255 and converts it to the
        color format of circlader _c_[0-1,0-1,0-1]
        White (_c_[1,1,1]) is returned when the color is None or does not have 3 elements.

        :param sColor: String RGB format
        :type: sColor String
        :return: Color in circlader format (string)
        """
        sCircladerColor = "_c_[1,1,1]"
        if sColor is not None:
            sColorElements = [sElement for sElement in re.split(",",sColor) if sElement]
            if len(sColorElements)==3:
                iR = int(sColorElements[0])/255.0
                iG = int(sColorElements[1])/255.0
                iB = int(sColorElements[2])/255.0
                sCircladerColor = "".join(["_c_[",str(iR),",",str(iG),",",str(iB),"]"])
        return sCircladerColor

    #Happy path tested
    def generateLabels(self, lsIDs):
        """
        Labels for visualization.
        Changes unclassified to one_level_higher.unclassified and enables numeric labeling / relabeling.
        Will only rename, will not add the label. The key must exist for the value to be used in replacing.

        :param lsIDs: Ids to include in the labels file
        :type: lsIDs List of strings
        :return: Dictionary of id (string) to label
        """
        dictRet = dict()
        for sID in lsIDs:
            lsIDElements = self._funcSplitLineage(sID)
            iIDElementsCount = len(lsIDElements)
            sLabel = lsIDElements[iIDElementsCount-1]
            #Fix unclassified
            if (sLabel == "unclassified") and (iIDElementsCount > 1):
                sLabel = ".".join([lsIDElements[iIDElementsCount-2],sLabel])
            #Change to relabels if given
            if self.dictRelabels is not None:
                if sLabel in self.dictRelabels:
                    sLabel = self.dictRelabels[sLabel]
            #Store label
            dictRet[sID] = sLabel
        return dictRet

    #Happy path tested
    def manageFilePaths(self, sTaxaFileName, strStyleFile, sColorFileName=None, sTickFileName=None, sHighlightFileName=None, sSizeFileName=None, sCircleFileName=None):
        """
        This method sets the naming to the files generated that Circlader acts on.
        These files include the tree, color, highlight, tick, circle, and size files.
        Checks to make sure the file path to the style file provided is an existing file.
        Deletes any existing files with these generated names (except for the style files).

        :param sTaxaFileName: File path indicating the taxa file to use
        :type: String
        :param strStyleFile: File path indicating the style file to use
        :type: String
        :param sColorFileName: File path indicating the color file to use
        :type: String
        :param sTickFileName: File path indicating the tick file to use
        :type: String
        :param sHighlightFileName: File path indicating the highlight file to use
        :type: String
        :param sSizeFileName: File path indicating the size file to use
        :type: String
        :param sCircleFileName: File path for circle files
        :type: String
        :return boolean: True indicates success, false indicates error
        """
        #Do not remove the style file, it is static
        if strStyleFile is None:
            print("Error, style file is None")
            return False
        if not os.path.exists(strStyleFile):
            print("Error, no style file found.")
            return False
        self.strStyleFilePath = strStyleFile

        #Set output files and remove if needed
        self.strTreeFilePath = sTaxaFileName
        self.strColorFilePath = sColorFileName
        self.strTickFilePath = sTickFileName
        self.strHighLightFilePath = sHighlightFileName
        self.strSizeFilePath = sSizeFileName
        self.strCircleFilePath = sCircleFileName
        for sFile in [self.strTreeFilePath,self.strColorFilePath,self.strTickFilePath,
                      self.strHighLightFilePath,self.strSizeFilePath,self.strCircleFilePath]:
            if (sFile is not None) and os.path.exists(sFile):
                os.remove(sFile)
        return True

    #Not tested
    def relabelIDs(self, dictLabels):
        """
        Allows the relabeling of ids. Can be used to make numeric labeling of ids or renaming

        :param dictLabels: Should be label (key) (after unclassified is modified) and new label (value)
        :type: dictLabels Dictionary of string (key:label to replace) string (value:new label to use in replacing)
        """
        self.dictRelabels = dictLabels

    #Happy path tested
    def updateToRoot(self, lsIDs):
        """
        Updates the clade to the root given. The clade must contain the root and the level of the
        root in the clade will be reset to its first level, ignoring the previous levels of the clade.
        Clades which do not contain the root, or end at the root, are dropped.
        When no root is set the given list is returned as is.

        :param lsIDs: List of Clades that will be reset to the root specified by forceRoot
        :type: lsIDs List of strings. Each string representing a clade.
        :return: List of rooted clades (strings)
        """

        if self.strRoot is None:
            return lsIDs
        #Force root tree if indicated to do so
        lsRootedIDs = list()
        for sID in lsIDs:
            sIDElements = self._funcSplitLineage(sID)
            if self.strRoot in sIDElements:
                #Keep the levels of the clade after the new root, if there are any
                lsAfterRoot = sIDElements[sIDElements.index(self.strRoot)+1:]
                if lsAfterRoot:
                    lsRootedIDs.append("|".join(lsAfterRoot))
        return lsRootedIDs

    #Testing: Used extensively in other tests
    def writeToFile(self, strFileName, strDataToWrite, fAppend):
        """
        Helper function that writes a string to a file

        :param strFileName: File to write to
        :type: strFileName File path (string)
        :param strDataToWrite: Data to write to file
        :type: strDataToWrite String
        :param fAppend: Indicates if an append should occur (True == Append)
        :type: fAppend boolean
        """

        cMode = 'a' if fAppend else 'w'
        with open(strFileName,cMode) as f:
            f.write(strDataToWrite)