comparison src/breadcrumbs/src/Cladogram.py @ 0:2f4f6f08c8c4 draft

Uploaded
author george-weingart
date Tue, 13 May 2014 21:58:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2f4f6f08c8c4
1 """
2 Author: Timothy Tickle
3 Description: Class to call circlader and create dendrograms.
4 """
5
6 #####################################################################################
7 #Copyright (C) <2012>
8 #
9 #Permission is hereby granted, free of charge, to any person obtaining a copy of
10 #this software and associated documentation files (the "Software"), to deal in the
11 #Software without restriction, including without limitation the rights to use, copy,
12 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
13 #and to permit persons to whom the Software is furnished to do so, subject to
14 #the following conditions:
15 #
16 #The above copyright notice and this permission notice shall be included in all copies
17 #or substantial portions of the Software.
18 #
19 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
20 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
21 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #####################################################################################
26
27 __author__ = "Timothy Tickle"
28 __copyright__ = "Copyright 2012"
29 __credits__ = ["Timothy Tickle"]
30 __license__ = "MIT"
31 __maintainer__ = "Timothy Tickle"
32 __email__ = "ttickle@sph.harvard.edu"
33 __status__ = "Development"
34
35 #External libraries
36 from AbundanceTable import AbundanceTable
37 from CommandLine import CommandLine
38 from ConstantsBreadCrumbs import ConstantsBreadCrumbs
39 from ConstantsFiguresBreadCrumbs import ConstantsFiguresBreadCrumbs
40 import math
41 import numpy as np
42 import os
43 import re
44 import scipy.stats
45 from ValidateData import ValidateData
46 #import scipy.stats.stats as stats
47
48 class Cladogram:
49 """
50 This class manages creating the input files for Circlader and calling Circlader.
51 """
52
53 #Script name
54 circladerScript=None
55
56 #Constants
57 c_sTaxa="Taxa"
58 c_sCircle="Circle"
59 c_sBorder="Border"
60 c_sShape="Shape"
61 c_sAlpha="Alpha"
62 c_sForced="Forced"
63
64 #Numpy array (structured array) holding data
65 #Should be SampleID, Sample Abundances/Data (samples = columns).....
66 npaAbundance = None
67 #List of sample names
68 lsSampleNames = None
69 #Name of output image
70 strImageName = "Cladogram.png"
71 #String used to call the sample id column
72 strSampleID = "ID"
73 strUnclassified = "unclassified"
74
75 #Minimum size of clade (terminal node count for clade)
76 iMinCladeSize = 1
77 #Level of ancestry to filter at (starts with 0 and based on the input file)
78 iCladeLevelToMeasure = 1
79 iCladeLevelToReduce = 1
80 cFeatureDelimiter = "|"
81
82 #Flags
83 #Turns on (True) or off (False) abundance-based filtering
84 fAbundanceFilter = False
85 #Turns on (True) or off (False) clade size-based filtering
86 fCladeSizeFilter = False
87 #Indicate if the following files were made
88 fSizeFileMade=False
89 fCircleFileMade=False
90 fColorFileMade=False
91 fTickFileMade=False
92 fHighlightFileMade=False
93
94 #Circlader files
95 strTreeFilePath="_Taxa.txt"
96 strCircleFilePath = "_Circle.txt"
97 strColorFilePath="_Color.txt"
98 strTickFilePath="_Tick.txt"
99 strHighLightFilePath="_HighLight.txt"
100 strSizeFilePath="_Size.txt"
101 strStyleFilePath=""
102
103 #Thresholds
104 #Controls the showing of taxa
105 c_dPercentileCutOff = 90.0
106 c_dPercentageAbovePercentile = 1.0
107
108 #Minimum average abundance score when using log scale
109 c_dMinLogSize = 0.0000000001
110 #Constant used to magnify the size difference in the taxa (log only)
111 c_dLogScale = 1000000
112 #Additional scaling factor applied to node sizes after the log10 transform
113 c_dCircleScale = 3
114
115 #Data for circular files
116 #Used to change IDs to proper labels
117 dictConvertIDs = None
118 #Labels to be relabeled
119 dictRelabels = None
120 #Colors
121 dictColors = None
122 #Elements that are forced to be highlighted
123 dictForcedHighLights = None
124 #Ticks
125 llsTicks = None
126 #Forced root of the tree, discarding data as needed.
127 strRoot = None
128 #Holds circle data as a list of dictionaries
129 #One dictionary per circle
130 ldictCircleData = None
131
132 def __init__(self):
133 self.dictForcedHighLights = dict()
134
135 #Happy Path Tested
def addHighLights(self, dictClades, fOverwrite):
    """
    Register elements that must be highlighted (and therefore kept) in the cladogram.
    Each element is mapped to a color name; that color name is expected to be
    defined through setColorData.
    {strName1:strColorName1,strName2:strColorName2,...}

    Nothing is changed when dictClades is not a valid dictionary or fOverwrite
    is not a valid boolean.

    :param dictClades: Element name (string) to color name (string)
    :type: dictClades Dictionary
    :param fOverwrite: True replaces the color of an element that is already registered,
                       False leaves already registered elements untouched.
    :type: fOverwrite boolean
    """
    if not ValidateData.funcIsValidDictionary(dictClades):
        return
    if not ValidateData.funcIsValidBoolean(fOverwrite):
        return

    for sName, sColorName in dictClades.items():
        #New elements are always stored, known elements only when overwriting
        if fOverwrite or (sName not in self.dictForcedHighLights):
            self.dictForcedHighLights[sName] = sColorName
157
158 #Not tested
def getHighLights(self):
    """
    Give the forced highlights currently registered.

    :return: Element name (string) to color name (string)
    :type: Dictionary
    """
    dictHighLights = self.dictForcedHighLights
    return dictHighLights
161
162 #Not tested
def forceRoot(self, strRoot):
    """
    Store the value the tree should be rooted at.
    The value is only recorded here; it is read later when the cladogram
    files are generated.

    :param strRoot: Where to root the tree
    :type: strRoot String
    """
    sNewRoot = strRoot
    self.strRoot = sNewRoot
174
def generate(self, strImageName, strStyleFile, sTaxaFileName, strCircladerScript = ConstantsBreadCrumbs.c_strCircladerScript, iTerminalCladeLevel = 10, sColorFileName=None, sTickFileName=None, sHighlightFileName=None, sSizeFileName=None, sCircleFileName=None):
    """
    Generate a cladogram by writing the Circlader input files and calling Circlader.
    The abundance data must have been set (see setAbundanceData) before calling this.

    :param strImageName: File name to save the output cladogram image
    :type: strImageName File name (string)
    :param strStyleFile: File path indicating the style file to use
    :type: strStyleFile File path (string)
    :param sTaxaFileName: File path indicating the taxa file to use
    :type: sTaxaFileName File path (string)
    :param strCircladerScript: File path to the Circlader script
    :type: String
    :param iTerminalCladeLevel: Clade level to use as terminal in plotting
    :type: iTerminalCladeLevel integer starting with 1
    :param sColorFileName: File path indicating the color file to use
    :type: sColorFileName File path (string)
    :param sTickFileName: File path indicating the tick file to use
    :type: sTickFileName File path (string)
    :param sHighlightFileName: File path indicating the highlight file to use
    :type: sHighlightFileName File path (string)
    :param sSizeFileName: File path indicating the size file to use
    :type: sSizeFileName File path (string)
    :param sCircleFileName: File path of circlader circle file.
    :type: String
    :return: False when the data is missing or one of the input files could not be created.
             Nothing is returned after Circlader has been called.
    """
    #Identity test on purpose: "== None" on a numpy array is an element-wise comparison
    if self.npaAbundance is None:
        print("Cladogram::generate. The data was not set so an image could not be generated")
        return False

    #Set script and output file name
    self.circladerScript = strCircladerScript
    self.strImageName = strImageName

    #Check files exist and remove files which will be written
    self.manageFilePaths(sTaxaFileName, strStyleFile, sColorFileName, sTickFileName, sHighlightFileName, sSizeFileName, sCircleFileName)

    #Get IDs
    lsIDs = list(self.npaAbundance[self.strSampleID])

    #Generate a dictionary to convert the ids to correct format
    #Fix unclassified names
    #Make numeric labels as indicated
    self.dictConvertIDs = self.generateLabels(lsIDs)

    #Remove taxa lower than the display clade level
    lsIDs = [sFeature for sFeature in lsIDs
             if len(sFeature.split(self.cFeatureDelimiter)) <= iTerminalCladeLevel]

    #Filter by abundance
    if self.fAbundanceFilter:
        lsIDs = self.filterByAbundance(lsIDs)

    #Update to the correct root
    lsIDs = self.updateToRoot(lsIDs)

    #Set highlights and relabels to root for consistency
    if self.strRoot is not None:
        if self.dictForcedHighLights is not None:
            dictRootedHighLights = dict()
            for sKey, sColorName in self.dictForcedHighLights.items():
                lsUpdatedKey = self.updateToRoot([sKey])
                #Keys that do not map under the root are dropped instead of raising an IndexError
                if lsUpdatedKey:
                    dictRootedHighLights[lsUpdatedKey[0]] = sColorName
            self.dictForcedHighLights = dictRootedHighLights

        if self.dictRelabels is not None:
            dictRootedLabels = dict()
            for sKey, sLabel in self.dictRelabels.items():
                lsUpdatedKey = self.updateToRoot([sKey])
                if lsUpdatedKey:
                    dictRootedLabels[lsUpdatedKey[0]] = sLabel
            self.dictRelabels = dictRootedLabels

    #Filter by clade size Should be the last filter.
    #It is not a strong filter but cleans up images
    if self.fCladeSizeFilter:
        lsIDs = self.filterByCladeSize(lsIDs)

    #Add in forced highlighting
    if self.dictForcedHighLights:
        lsIDs.extend(self.dictForcedHighLights.keys())
    lsIDs = list(set(lsIDs))

    #Circle data is optional, nothing to force or write when no circle was added
    if self.ldictCircleData is not None:
        #Add in forced circle data
        for dictCircleData in self.ldictCircleData:
            if not dictCircleData[self.c_sForced]:
                continue
            lsTaxa = dictCircleData[self.c_sTaxa]
            datAlpha = dictCircleData[self.c_sAlpha]
            #Alpha may be one value for all taxa or a list matching the taxa.
            #A taxon is forced unless its alpha is the clear value 0.0 (float or string).
            if isinstance(datAlpha, list):
                lsIDs.extend([sTaxon for sTaxon, alpha in zip(lsTaxa, datAlpha) if str(alpha) != '0.0'])
            elif str(datAlpha) != '0.0':
                lsIDs.extend(lsTaxa)
        lsIDs = list(set(lsIDs))

        #Create circle files (needs to be after any filtering because it has a forcing option).
        if not self.createCircleFile(lsIDs):
            return False

    #Generate / Write Tree file
    if not self.createTreeFile(lsIDs):
        return False

    #Generate / Write Highlight file
    if not self.createHighlightFile(lsIDs):
        return False

    #Generate / write color file
    if self.dictColors is not None:
        lsColorData = [ConstantsBreadCrumbs.c_cTab.join([sColorKey, self.dictColors[sColorKey]]) for sColorKey in self.dictColors]
        self.writeToFile(self.strColorFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsColorData), False)
        self.fColorFileMade = True

    #Generate / write tick file
    if self.llsTicks is not None:
        lsTickData = [ConstantsBreadCrumbs.c_cTab.join(lsTicks) for lsTicks in self.llsTicks]
        self.writeToFile(self.strTickFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsTickData), False)
        self.fTickFileMade = True

    #Generate / Write size data
    if not self.createSizeFile(lsIDs):
        return False

    #Call commandline, only passing the optional files that were actually made
    lsCommand = [self.circladerScript, self.strTreeFilePath, self.strImageName, "--style_file", self.strStyleFilePath, "--tree_format", "tabular"]
    if self.fSizeFileMade:
        lsCommand.extend(["--size_file", self.strSizeFilePath])
    if self.fColorFileMade:
        lsCommand.extend(["--color_file", self.strColorFilePath])
    if self.fTickFileMade:
        lsCommand.extend(["--tick_file", self.strTickFilePath])
    if self.fHighlightFileMade:
        lsCommand.extend(["--highlight_file", self.strHighLightFilePath])
    if self.fCircleFileMade:
        lsCommand.extend(["--circle_file", self.strCircleFilePath])
    CommandLine().runCommandLine(lsCommand)
315
316 #Happy path tested
def setColorData(self, dictColors):
    """
    Store the colors available to the cladogram.
    {strName1:Color,strName2:Color...}
    The name is how the color is referenced elsewhere, the color is a string
    in the RGB format 0-255,0-255,0-255.
    When the background color name is not among the given names it is added
    with the default background color. The given dictionary itself is kept
    (not copied), so the addition is visible to the caller.

    Nothing is changed when dictColors is not a valid dictionary.

    :param dictColors: Color name to RGB specification
    :type: dictColors Dictionary of strings
    """
    if not ValidateData.funcIsValidDictionary(dictColors):
        return

    self.dictColors = dictColors
    sBackgroundName = ConstantsFiguresBreadCrumbs.c_strBackgroundColorName
    if sBackgroundName not in self.dictColors:
        self.dictColors[sBackgroundName] = ConstantsFiguresBreadCrumbs.c_strBackgroundColor
332
333 #Not tested
def setAbundanceData(self, abtbAbundanceTable):
    """
    Take the data to plot from an abundance table: a copy of its abundance
    data, the name of its ID column and its sample names.

    :param abtbAbundanceTable: AbundanceTable to read from
    :type: AbundanceTable
    """
    npaCopy = abtbAbundanceTable.funcGetAbundanceCopy()
    sIDName = abtbAbundanceTable.funcGetIDMetadataName()
    lsNames = abtbAbundanceTable.funcGetSampleNames()

    self.npaAbundance = npaCopy
    self.strSampleID = sIDName
    self.lsSampleNames = lsNames
344
345 #Not tested
def setFilterByAbundance(self, fAbundanceFilter, dPercentileCutOff = 90.0, dPercentageAbovePercentile = 1.0):
    """
    Turn abundance-based filtering on or off and set its two thresholds.

    :param fAbundanceFilter: True turns abundance-based filtering on, False turns it off
    :type: fAbundanceFilter boolean
    :param dPercentileCutOff: Percentile (0.0 to 100.0) used as the per-sample threshold
    :type: double
    :param dPercentageAbovePercentile: Percentage of samples in which a feature must reach the threshold
    :type: double
    """
    self.c_dPercentageAbovePercentile = dPercentageAbovePercentile
    self.c_dPercentileCutOff = dPercentileCutOff
    self.fAbundanceFilter = fAbundanceFilter
361
362 #Not Tested
def setCircleScale(self, iScale):
    """
    Set the factor the node sizes are multiplied by, to make nodes more
    or less visible in the cladogram.

    :param iScale: Factor applied to the node sizes
    :type: iScale integer
    """
    iNewScale = iScale
    self.c_dCircleScale = iNewScale
372
373 #Not tested
def setFeatureDelimiter(self, cDelimiter):
    """
    Set the delimiter that separates the levels of a feature's consensus lineage.
    An empty or None delimiter is ignored and the current one is kept.

    :param cDelimiter: The delimiter used to parse the consensus lineage of features.
    :type: Character
    """
    if not cDelimiter:
        return
    self.cFeatureDelimiter = cDelimiter
383
384 #Not tested
def setFilterByCladeSize(self, fCladeSizeFilter, iCladeLevelToMeasure = 3, iCladeLevelToReduce = 1, iMinimumCladeSize = 5, cFeatureDelimiter = None, strUnclassified="unclassified"):
    """
    Turn clade size-based filtering on or off and configure it.
    Levels are NOT 0 based (root is 1).
    Numeric settings are only taken when greater than 0 and the delimiter and
    unclassified string only when not empty; otherwise the current setting is kept.

    :param fCladeSizeFilter: True turns clade size-based filtering on, False turns it off
    :type: fCladeSizeFilter boolean
    :param iCladeLevelToMeasure: The level of the consensus lineage that is counted. Should be greater than iCladeLevelToReduce (Root is 1)
    :type: iCladeLevelToMeasure int
    :param iCladeLevelToReduce: The level of the consensus lineage that is reduced if the measured level does not reach the count (Root is 1)
    :type: iCladeLevelToReduce int
    :param iMinimumCladeSize: Minimum count of the measured clade for the clade to be kept
    :type: iMinimumCladeSize int
    :param cFeatureDelimiter: One may set the feature delimiter if needed.
    :type: Character
    :param strUnclassified: String indicating unclassified features
    :type: String
    """
    self.fCladeSizeFilter = fCladeSizeFilter

    #Numeric settings, only positive values replace the current setting
    ltplNumericSettings = (("iCladeLevelToMeasure", iCladeLevelToMeasure),
                           ("iCladeLevelToReduce", iCladeLevelToReduce),
                           ("iMinCladeSize", iMinimumCladeSize))
    for sAttribute, iValue in ltplNumericSettings:
        if iValue > 0:
            setattr(self, sAttribute, iValue)

    #String settings, only non-empty values replace the current setting
    ltplStringSettings = (("cFeatureDelimiter", cFeatureDelimiter),
                          ("strUnclassified", strUnclassified))
    for sAttribute, sValue in ltplStringSettings:
        if sValue:
            setattr(self, sAttribute, sValue)
415
416 #Not tested
def setTicks(self, llsTicks):
    """
    Store the tick information.
    Expected as a list of lists, each holding a tick level (number starting
    at 0, as a string) and the tick name. Lowest numbers are closest to the
    center of the tree.
    [[#,Name1],[#,Name2]...]

    :param llsTicks: Level # and Name of level
    :type: llsTicks List of lists of strings
    """
    llsNewTicks = llsTicks
    self.llsTicks = llsNewTicks
428
429 #Happy Path tested with createCircleFile
def addCircle(self, lsTaxa, strCircle, dBorder=0.0, strShape="R", dAlpha=1.0, fForced=False):
    """
    Add a circle (ring of markers) to the outside of the cladogram.
    The settings are stored as given; they are interpreted when the circle file is written.

    :param lsTaxa: Taxa to mark with this circle
    :type: lsTaxa List of strings (taxa names)
    :param strCircle: Circle the elements will be in, indicates color and circle level.
    :type: strCircle String circle
    :param dBorder: Border size for the circle element border (between 0.0 and 1.0)
                    can also be a list of borders. If list, position must match lsTaxa.
    :type: dBorder Float of border size (or list of floats).
    :param strShape: Shape of the circle elements.
                     Can also be a list of shapes. If list, position must match lsTaxa.
                     Valid shapes are R(Square), v(inward pointing triangle), ^(outward pointing triangle)
    :type: strShape String to indicate the shape (may also be a list of strings).
    :param dAlpha: The transparency of the circle element (between 0.0[clear] and 1.0[solid]).
                   Can also be a list. If list, position must match lsTaxa.
    :type: dAlpha Float to indicate the transparency of the shape (may also be a list).
    :param fForced: Forces the features in the circle to be displayed in the cladogram whether or not they pass the filters.
    :type: Boolean
    :return: Always True
    """
    #The list of circles is created with the first circle
    if self.ldictCircleData is None:
        self.ldictCircleData = []

    self.ldictCircleData.append({
        self.c_sTaxa: lsTaxa,
        self.c_sCircle: strCircle,
        self.c_sBorder: dBorder,
        self.c_sShape: strShape,
        self.c_sAlpha: dAlpha,
        self.c_sForced: fForced,
    })
    return True
464
465 #Happy Path tested with AddCircle
def createCircleFile(self, lsIDs):
    """
    Write circle data to file.
    Every taxon named by a circle gets one output line: its '.' delimited name
    followed by one tab separated "circle:alpha!shape#border" entry per circle naming it.
    Circles missing from the ticks are appended to the ticks.

    :param lsIDs: Ids of the cladogram. Taxa of forced circles that are not in this list
                  are appended to it (the list is changed in place, without adding duplicates).
    :type: lsIDs List of strings
    :return: False when there is no circle data, no circle file path, or a border / shape / alpha
             list does not match the length of its taxa list. True otherwise, also when no taxa
             were left to write.
    :type: Boolean
    """
    #No circle data
    if self.ldictCircleData is None:
        self.fCircleFileMade = False
        return False

    if self.strCircleFilePath is None:
        print("Error, there is no circle file specified to write to.")
        return False

    #Holds circle data {Taxaname:string updates correctly for output to file}
    dictCircleDataMethods = dict()

    for dictCircleData in self.ldictCircleData:
        lsTaxa = dictCircleData[self.c_sTaxa]
        #Shape/s, border/s and alpha/s are either one value for all taxa or a list matching the taxa
        datShape = dictCircleData[self.c_sShape]
        fShapeIsList = isinstance(datShape, list)
        datBorder = dictCircleData[self.c_sBorder]
        fBorderIsList = isinstance(datBorder, list)
        datAlpha = dictCircleData[self.c_sAlpha]
        fAlphaIsList = isinstance(datAlpha, list)
        #Circle name
        sCircleMethod = dictCircleData[self.c_sCircle]

        #Check to make sure the lengths of the lists match up
        if fShapeIsList and not len(datShape) == len(lsTaxa):
            print("".join(["Error, Shapes were given as an list not of the size of the taxa list. Shape list length: ",str(len(datShape)),". Taxa list length: ",str(len(lsTaxa)),"."]))
            return False
        if fBorderIsList and not len(datBorder) == len(lsTaxa):
            print("".join(["Error, Border sizes were given as an list not of the size of the taxa list. Border list length: ",str(len(datBorder)),". Taxa list length: ",str(len(lsTaxa)),"."]))
            return False
        if fAlphaIsList and not len(datAlpha) == len(lsTaxa):
            print("".join(["Error, Alpha sizes were given as an list not of the size of the taxa list. Alpha list length: ",str(len(datAlpha)),". Taxa list length: ",str(len(lsTaxa)),"."]))
            return False

        #Update taxa to root if needed
        if not (fShapeIsList or fBorderIsList or fAlphaIsList):
            lsTaxa = self.updateToRoot(lsTaxa)
        else:
            #Rooting may drop taxa, so taxa are rooted one at a time and
            #the list settings of dropped taxa are dropped with them.
            lsUpdatedTaxa = list()
            lsKeptShapes = list()
            lsKeptBorders = list()
            lsKeptAlphas = list()
            for iTaxaIndex, sTaxa in enumerate(lsTaxa):
                lsRootedTaxa = self.updateToRoot([sTaxa])
                if len(lsRootedTaxa) == 1:
                    lsUpdatedTaxa.append(lsRootedTaxa[0])
                    if fShapeIsList:
                        lsKeptShapes.append(datShape[iTaxaIndex])
                    if fBorderIsList:
                        lsKeptBorders.append(datBorder[iTaxaIndex])
                    if fAlphaIsList:
                        lsKeptAlphas.append(datAlpha[iTaxaIndex])

            #Reset data to rooted data, settings that are not lists are used for all taxa
            lsTaxa = lsUpdatedTaxa
            if fShapeIsList:
                datShape = lsKeptShapes
            if fBorderIsList:
                datBorder = lsKeptBorders
            if fAlphaIsList:
                datAlpha = lsKeptAlphas

        #QC passes so we will add the circle to the figure and the ticks.
        #If there are ticks and if the circle is not already in the ticks.
        if self.llsTicks is not None:
            fFound = False
            iHighestNumber = -1
            for lsTick in self.llsTicks:
                if lsTick[1] == sCircleMethod:
                    fFound = True
                iHighestNumber = max(iHighestNumber, int(lsTick[0]))
            if not fFound:
                self.llsTicks.append([str(iHighestNumber+1), sCircleMethod])

        #If the circle is forced, add its taxa to the lsIDs unless their alpha is the clear value 0.0.
        #Alpha is compared as a string so both 0.0 and "0.0" count as clear.
        if dictCircleData[self.c_sForced]:
            if fAlphaIsList:
                lsForcedTaxa = [sTaxon for sTaxon, alpha in zip(lsTaxa, datAlpha) if str(alpha) != "0.0"]
            elif str(datAlpha) != "0.0":
                lsForcedTaxa = list(lsTaxa)
            else:
                lsForcedTaxa = list()
            setKnownIDs = set(lsIDs)
            for sTaxon in lsForcedTaxa:
                if sTaxon not in setKnownIDs:
                    setKnownIDs.add(sTaxon)
                    lsIDs.append(sTaxon)

        #For all taxa of the circle
        #NOTE(review): lsIDs is not used to select the taxa that are written - confirm this is intended
        for sTaxa in lsTaxa:
            #Start the line of a taxon with its . delimited name
            if sTaxa not in dictCircleDataMethods:
                dictCircleDataMethods[sTaxa] = ".".join([sElement for sElement in sTaxa.split("|") if sElement])

            #Settings come from the first occurrence of the taxon in the circle
            iTaxaIndex = lsTaxa.index(sTaxa)
            sBorder = str(datBorder[iTaxaIndex]) if fBorderIsList else str(datBorder)
            sShape = datShape[iTaxaIndex] if fShapeIsList else datShape
            sAlpha = str(datAlpha[iTaxaIndex]) if fAlphaIsList else str(datAlpha)
            dictCircleDataMethods[sTaxa] = dictCircleDataMethods[sTaxa]+"".join([ConstantsBreadCrumbs.c_cTab,sCircleMethod,":",sAlpha,"!",sShape,"#",sBorder])

    if len(dictCircleDataMethods) > 0:
        sCircleContent = ConstantsBreadCrumbs.c_strEndline.join(dictCircleDataMethods.values())
        self.writeToFile(self.strCircleFilePath, sCircleContent, False)
        self.fCircleFileMade = True

    return True
627
628 #Happy Path tested
def createHighlightFile(self, lsIDs):
    """
    Write highlight data to file.
    One tab separated line per id: '.' delimited name, displayed name, label, color.
    The displayed name is the relabel when there is one, otherwise the last element
    of the id (prefixed with its parent when it is "unclassified").
    The color is only given for forced highlights whose color name is a known color.

    :param lsIDs: Ids to include in the highlight file
    :type: lsIDs List of strings
    :return: Always True
    :type: Boolean
    """
    lsHighLightData = list()
    for sID in lsIDs:
        #Rename taxa to be consistent with the . delimited format
        lsNameElements = [sElement for sElement in sID.split("|") if sElement]
        #An id without any name element can not be named in the file
        if not lsNameElements:
            continue
        sCurTaxaName = lsNameElements[-1]
        if (len(lsNameElements) > 1) and (sCurTaxaName == "unclassified"):
            sCurTaxaName = ".".join(lsNameElements[-2:])
        sCurTaxa = ".".join(lsNameElements)

        #Get color, colors are optional so highlights without color data get no color
        sCurColor = ""
        if (self.dictForcedHighLights is not None) and (sID in self.dictForcedHighLights):
            sColorKey = self.dictForcedHighLights[sID]
            if (self.dictColors is not None) and (sColorKey in self.dictColors):
                sCurColor = self.formatRGB(self.dictColors[sColorKey])

        #Get label
        sCurLabel = ""
        if (self.dictRelabels is not None) and (sID in self.dictRelabels):
            sCurLabel = self.dictRelabels[sID]

        sDisplayName = sCurTaxaName if sCurLabel == "" else sCurLabel
        lsHighLightData.append(ConstantsBreadCrumbs.c_cTab.join([sCurTaxa,sDisplayName,sCurLabel,sCurColor]))

    if len(lsHighLightData) > 0:
        self.writeToFile(self.strHighLightFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsHighLightData), False)
        self.fHighlightFileMade = True
    return True
668
669 #Happy path tested
def createSizeFile(self, lsIDs):
    """
    Write size data to file.
    For every feature of the abundance data that is in lsIDs one line is written:
    the '.' delimited name, a tab and the node size. The node size is the log10 of
    the scaled average abundance (not lower than the minimum size) times the circle scale.

    :param lsIDs: Ids to include in the size file
    :type: lsIDs List of strings
    :return: Always True
    :type: Boolean
    """
    if self.npaAbundance is not None:
        dMinimumValue = (self.c_dMinLogSize*self.c_dLogScale)+1
        #Set for constant time lookups, this is checked once per abundance row
        setIDs = set(lsIDs)
        lsWriteData = list()
        for rowData in self.npaAbundance:
            strCurrentId = rowData[0]
            #Reset to root if needed to match current data
            if self.strRoot is not None:
                lsRootedId = self.updateToRoot([strCurrentId])
                #Features that do not map under the root can not be in the ids
                if len(lsRootedId) == 0:
                    continue
                strCurrentId = lsRootedId[0]
            if strCurrentId in setIDs:
                dAverage = np.average(list(rowData)[1:])
                dSize = max([dMinimumValue,(dAverage*self.c_dLogScale)+1])
                lsWriteData.append(".".join(strCurrentId.split("|"))+ConstantsBreadCrumbs.c_cTab+str(math.log10(dSize)*self.c_dCircleScale))
        if len(lsWriteData) > 0:
            self.writeToFile(self.strSizeFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsWriteData), False)
            self.fSizeFileMade = True
    return True
695
696 #Happy path tested 1
def createTreeFile(self, lsIDs):
    """
    Write tree data to file. The tree file defines the internal cladogram and all its points.
    Each id contributes itself and all its ancestors as '.' delimited paths;
    every path is written once, in the order it is first met.

    :param lsIDs: Ids to include in the tree file as well as their ancestors
    :type: lsIDs List of strings
    :return: Always True
    :type: Boolean
    """
    #The list keeps the output order, the set gives constant time "already written" checks
    lsFullTree = list()
    setKnownPaths = set()
    for sID in lsIDs:
        #Ids without any name element contribute nothing
        lsIDElements = [sElement for sElement in sID.split("|") if sElement]
        for iEndLevel in range(1, len(lsIDElements)+1):
            sNodePath = ".".join(lsIDElements[0:iEndLevel])
            if sNodePath not in setKnownPaths:
                setKnownPaths.add(sNodePath)
                lsFullTree.append(sNodePath)

    if len(lsFullTree) > 0:
        self.writeToFile(self.strTreeFilePath, ConstantsBreadCrumbs.c_strEndline.join(lsFullTree), False)
    return True
721
722 #Happy Path tested
def filterByAbundance(self, lsIDs):
    """
    Filter by abundance. A feature is kept when, in at least the configured percentage
    of samples, its abundance reaches that sample's configured percentile.

    The first column of the abundance data is taken as the feature id, all other
    columns as samples.

    :param lsIDs: Ids to filter
    :type: lsIDs List of strings
    :return: Ids that survived the filtering, in the order of the abundance data.
             Empty when there is no abundance data or no sample column.
    :type: List of strings
    """
    #list of ids to return that survived the filtering
    retls = list()
    if self.npaAbundance is None:
        return retls

    #Sample columns (Ignore sample id [position 0] which is not a sample)
    lsSampleColumns = self.npaAbundance.dtype.names[1:]
    if len(lsSampleColumns) == 0:
        return retls

    #Cutoff score (threshold) for the percentile of interest, one per sample column.
    #Kept in column order so each measurement is matched to its threshold by position
    #and not through a second list of sample names.
    ldThresholds = [scipy.stats.scoreatpercentile(self.npaAbundance[sSampleColumn], self.c_dPercentileCutOff)
                    for sSampleColumn in lsSampleColumns]

    dSampleCount = float(len(lsSampleColumns))
    dRequiredFraction = self.c_dPercentageAbovePercentile/100.0
    setIDs = set(lsIDs)

    #Check each taxa, only looking at the IDs given
    for rowTaxaData in self.npaAbundance:
        sCurTaxaName = rowTaxaData[0]
        if sCurTaxaName not in setIDs:
            continue
        ldAbundanceMeasures = list(rowTaxaData)[1:]
        #Count the samples in which the abundance meets the threshold
        iCountAbovePercentile = sum(1 for dMeasure, dThreshold in zip(ldAbundanceMeasures, ldThresholds)
                                    if dMeasure >= dThreshold)
        if (iCountAbovePercentile/dSampleCount) >= dRequiredFraction:
            retls.append(sCurTaxaName)
    return retls
758
759 #Happy Path Tested
def filterByCladeSize(self, lsIDs):
    """
    Filter by the count of individuals in the clade.
    Terminal nodes are counted per clade (the lineage cut at the level to reduce).
    Only terminal nodes at least as deep as the level to measure, or ending in the
    unclassified string, are counted.
    Ids shallower than the level to reduce are always kept. Deeper ids are kept when
    they are at least as deep as the level to measure (or end in the unclassified string)
    and their clade reaches the minimum clade size.

    :param lsIDs: Ids to filter
    :type: lsIDs List of strings
    :return: Ids that survived the filtering, in their given order
    :type: List of strings
    """
    cDelimiter = self.cFeatureDelimiter

    #First get terminal nodes
    lsTerminalNodes = AbundanceTable.funcGetTerminalNodesFromList(lsIDs, cDelimiter)

    #Count up the terminal nodes per clade {clade lineage: count}
    cladeCounts = dict()
    for sTerminalNode in lsTerminalNodes:
        lsLineage = sTerminalNode.split(cDelimiter)
        iLineageCount = len(lsLineage)
        #If the lineage is shorter than the reduced clade level there is nothing to count
        if iLineageCount < self.iCladeLevelToReduce:
            continue
        if (iLineageCount >= self.iCladeLevelToMeasure) or (lsLineage[-1] == self.strUnclassified):
            sLineage = cDelimiter.join(lsLineage[0:self.iCladeLevelToReduce])
            cladeCounts[sLineage] = cladeCounts.get(sLineage, 0) + 1

    #Go through the IDs and reduce as needed using the clade counts
    retls = list()
    for sID in lsIDs:
        lsID = sID.split(cDelimiter)
        iIDCount = len(lsID)

        #Too short to filter. The length of this id is what matters here,
        #not the length of the last terminal node of the counting loop.
        if iIDCount < self.iCladeLevelToReduce:
            retls.append(sID)
        #Check to see if the clade which is being reduced made the cut
        elif (iIDCount >= self.iCladeLevelToMeasure) or (lsID[-1] == self.strUnclassified):
            #A clade without counted terminal nodes has size 0
            if cladeCounts.get(cDelimiter.join(lsID[0:self.iCladeLevelToReduce]), 0) >= self.iMinCladeSize:
                retls.append(sID)

    return retls
803
804 #Happy path tested
def formatRGB(self, sColor):
    """
    Takes a string that is of the format 0-255,0-255,0-255 and converts it to the
    color format of circlader _c_[0-1,0-1,0-1]
    White (_c_[1,1,1]) is given when sColor is None or does not hold exactly
    three non-empty comma separated elements.

    :param sColor: String RGB format
    :type: sColor String
    :return: Color in circlader format
    :type: String
    :raises ValueError: When one of the three elements is not an integer
    """
    sCircladerColor = "_c_[1,1,1]"
    if sColor is not None:
        #Empty elements (for example from a doubled comma) are ignored.
        #A list is built explicitly so its length can be taken on any Python version.
        lsColorElements = [sElement for sElement in sColor.split(",") if sElement]
        if len(lsColorElements) == 3:
            lsScaled = [str(int(sElement)/255.0) for sElement in lsColorElements]
            sCircladerColor = "".join(["_c_[", ",".join(lsScaled), "]"])
    return sCircladerColor
822
823 #Happy path tested
def generateLabels(self, lsIDs):
    """
    Labels for visualization.
    The label of an ID is its last non-empty, "|"-delimited element.
    Changes unclassified to one_level_higher.unclassified and enables numeric labeling / relabeling.
    Will only rename, will not add the label. The key must exist in
    self.dictRelabels for the value to be used in replacing.

    :param lsIDs: Ids to include in the labels file
    :type: lsIDs List of strings
    :return: Dictionary of string (key: id) string (value: label)
    :raises IndexError: If an id holds no non-empty element (for example an empty string)
    """
    dictRet = dict()
    for sID in lsIDs:
        #Build a real list (not filter()): on Python 3 filter() returns an iterator
        #which has no len() and can not be indexed.
        lsIDElements = [sElement for sElement in sID.split("|") if sElement]
        sLabel = lsIDElements[-1]
        #Fix unclassified: prefix it with its parent level so it is distinguishable
        if sLabel == "unclassified" and len(lsIDElements) > 1:
            sLabel = ".".join([lsIDElements[-2], sLabel])
        #Change to relabels if given
        if self.dictRelabels is not None:
            if sLabel in self.dictRelabels:
                sLabel = self.dictRelabels[sLabel]
        #Store label
        dictRet[sID] = sLabel
    return dictRet
848
849 #Happy path tested
def manageFilePaths(self, sTaxaFileName, strStyleFile, sColorFileName=None, sTickFileName=None, sHighlightFileName=None, sSizeFileName=None, sCircleFileName=None):
    """
    Records on the instance the paths of the files Circlader acts on
    (tree, color, tick, highlight, size and circle files) and the style file.
    The style file must already exist; it is validated and never removed.
    Any file already existing at one of the other given paths is deleted.
    When the style file is missing nothing is recorded and nothing is deleted.

    :param sTaxaFileName: File path to use for the tree (taxa) file
    :type: String
    :param strStyleFile: File path of the existing style file to use
    :type: String
    :param sColorFileName: File path to use for the color file
    :type: String
    :param sTickFileName: File path to use for the tick file
    :type: String
    :param sHighlightFileName: File path to use for the highlight file
    :type: String
    :param sSizeFileName: File path to use for the size file
    :type: String
    :param sCircleFileName: File path to use for the circle file
    :type: String
    :return boolean: True indicates success, False indicates error
    """
    #The style file is static input, check it before touching anything else
    if strStyleFile is None:
        print("Error, style file is None")
        return False
    if not os.path.exists(strStyleFile):
        print("Error, no style file found.")
        return False
    self.strStyleFilePath = strStyleFile

    #Remember where each generated file goes
    self.strTreeFilePath = sTaxaFileName
    self.strColorFilePath = sColorFileName
    self.strTickFilePath = sTickFileName
    self.strHighLightFilePath = sHighlightFileName
    self.strSizeFilePath = sSizeFileName
    self.strCircleFilePath = sCircleFileName

    #Clear out files left at those paths
    lsGeneratedPaths = (sTaxaFileName, sColorFileName, sTickFileName,
                        sHighlightFileName, sSizeFileName, sCircleFileName)
    for strPath in lsGeneratedPaths:
        if strPath is not None and os.path.exists(strPath):
            os.remove(strPath)
    return True
896
897 #Not tested
def relabelIDs(self, dictLabels):
    """
    Registers the relabeling dictionary on the instance as self.dictRelabels.
    Can be used to give ids numeric labels or to rename them.

    :param dictLabels: Current label (key, in its form after unclassified is modified) and new label (value)
    :type: dictLabels Dictionary of string (key:label to replace) string (value:new label to use in replacing)
    """
    #Stored as given, no copy is made
    self.dictRelabels = dictLabels
906
907 #Happy path tested
def updateToRoot(self, lsIDs):
    """
    Re-roots the clades at self.strRoot. For every clade containing the root, only
    the levels after the root are kept (the root itself and all levels before it
    are dropped). Clades which do not contain the root, or in which the root is
    the last level, are left out of the result.
    If no root is set (self.strRoot is None) the ids are returned unchanged.

    :param lsIDs: List of Clades that will be reset to the root specified by setRoot
    :type: lsIDs List of strings. Each string representing a clade.
    :return: List of strings. The re-rooted clades, in the order of lsIDs.
    """
    if self.strRoot is None:
        return lsIDs

    #Force root tree if indicated to do so
    lsRootedIDs = list()
    for sID in lsIDs:
        #Build a real list (not filter()): on Python 3 filter() returns an iterator
        #which is consumed by "in" and has no index() method.
        lsIDElements = [sElement for sElement in sID.split("|") if sElement]
        if self.strRoot not in lsIDElements:
            continue
        #Keep whatever follows the first occurrence of the root. One remaining level
        #joins to just that level, so a single join covers both cases.
        lsAfterRoot = lsIDElements[lsIDElements.index(self.strRoot) + 1:]
        if lsAfterRoot:
            lsRootedIDs.append("|".join(lsAfterRoot))
    return lsRootedIDs
932
933 #Testing: Used extensively in other tests
def writeToFile(self, strFileName, strDataToWrite, fAppend):
    """
    Helper function that writes a string to a file, either appending to it
    or replacing its content.

    :param strFileName: File to write to
    :type: strFileName File path (string)
    :param strDataToWrite: Data to write to file
    :type: strDataToWrite String
    :param fAppend: Indicates if an append should occur (True == Append, otherwise the file is overwritten)
    :type: fAppend boolean
    """
    #'a' keeps what is already in the file, 'w' starts the file over
    cMode = 'a' if fAppend else 'w'
    with open(strFileName, cMode) as hndlOut:
        hndlOut.write(strDataToWrite)