comparison src/breadcrumbs/src/AbundanceTable.py @ 0:0de566f21448 draft default tip

v2
author sagun98
date Thu, 03 Jun 2021 18:13:32 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0de566f21448
1 """
2 Author: Timothy Tickle
3 Description: Class to abstract an abundance table and methods to run on such a table.
4 """
5
6 #####################################################################################
7 #Copyright (C) <2012>
8 #
9 #Permission is hereby granted, free of charge, to any person obtaining a copy of
10 #this software and associated documentation files (the "Software"), to deal in the
11 #Software without restriction, including without limitation the rights to use, copy,
12 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
13 #and to permit persons to whom the Software is furnished to do so, subject to
14 #the following conditions:
15 #
16 #The above copyright notice and this permission notice shall be included in all copies
17 #or substantial portions of the Software.
18 #
19 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
20 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
21 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #####################################################################################
26
27 __author__ = "Timothy Tickle"
28 __copyright__ = "Copyright 2012"
29 __credits__ = ["Timothy Tickle"]
30 __license__ = "MIT"
31 __maintainer__ = "Timothy Tickle"
32 __email__ = "ttickle@sph.harvard.edu"
33 __status__ = "Development"
34
35 import csv
36 import sys
37 import blist
38 from CClade import CClade
39 from ConstantsBreadCrumbs import ConstantsBreadCrumbs
40 import copy
41 from datetime import date
42 import numpy as np
43 import os
44 import re
45 import scipy.stats
46 import string
47 from ValidateData import ValidateData
48 from biom.parse import *
49 from biom.table import *
50
# Module-level tuning constants. Their exact semantics are defined by the
# methods that consume them (not all visible in this chunk) -- confirm against
# the normalization/summation routines before changing.
c_dTarget = 1.0
c_fRound = False
c_iSumAllCladeLevels = -1
c_fOutputLeavesOnly = False

import warnings
# NOTE(review): globally silences FutureWarnings (e.g. numpy's warning about
# elementwise comparison of structured arrays with None, which this module
# performs) -- confirm this suppression is still desired.
warnings.simplefilter(action = "ignore", category = FutureWarning)
58
59
class RowMetadata:
	"""
	Holds the row (feature) metadata and associated functions.
	"""

	def __init__(self, dictRowMetadata, iLongestMetadataEntry=None, lsRowMetadataIDs=None):
		""" Constructor requires a dictionary of row metadata.

		:param dictRowMetadata: The row metadata values with the ids as the keys, must be stable (keep order)
		:type: {string feature id: {'metadata': {'taxonomy': [list of metadata values]}}}
		:param iLongestMetadataEntry: Length of the longest metadata entry (optional).
		:type: Integer
		:param lsRowMetadataIDs: Predefined, ordered list of row metadata ids (optional).
		:type: List of strings
		"""

		self.dictRowMetadata = dictRowMetadata
		self.iLongestMetadataEntry = iLongestMetadataEntry
		self.lsRowMetadataIDs = lsRowMetadataIDs

		# Maps each metadata id to the longest value-list length seen for it
		# across all features.
		self.dictMetadataIDs = {}
		if self.dictRowMetadata:
			for dictMetadata in self.dictRowMetadata.values():
				dictMetadata = dictMetadata.get(ConstantsBreadCrumbs.c_metadata_lowercase, None)

				if dictMetadata:
					for key, lxValues in dictMetadata.items():
						# Membership test (not truthiness) so a stored length of
						# 0 is not mistaken for a missing key.
						if key in self.dictMetadataIDs:
							self.dictMetadataIDs[key] = max(self.dictMetadataIDs[key], len(lxValues))
						else:
							self.dictMetadataIDs[key] = len(lxValues)

	def funcMakeIDs(self):
		""" There should be a one to one mapping of row metadata ids and the values associated here with a feature ID.
		If not make ids from the key by appending numbers.

		:return: The given id list when one was supplied at construction,
			otherwise [list of generated ids, list of original keys].
		"""

		# If an ids list already exists return it (this allows ID order to be
		# given and preserved).
		if self.lsRowMetadataIDs:
			return self.lsRowMetadataIDs

		lsIDs = []
		lsKeys = []

		for key, iCount in self.dictMetadataIDs.items():
			lsKeys.append( key )
			if iCount > 1:
				# Multi-valued metadata get one id per value: key_0, key_1, ...
				# range (not xrange) behaves identically here in Python 2 and
				# keeps the code importable under Python 3.
				lsIDs.extend( [ "_".join( [ key, str( iIndex ) ] ) for iIndex in range( iCount ) ] )
			else:
				lsIDs.append( key )
		return [ lsIDs, lsKeys ]

	def funGetFeatureMetadata(self, sFeature, sMetadata):
		"""
		Returns a list of values in the order of row metadata ids for a microbial feature given an id.

		:param sFeature: Feature id to get metadata
		:type: string
		:param sMetadata: Metadata to get
		:type: string
		:return: list of metadata associated with the metadata (empty list when
			the feature or metadata key is unknown)
		"""
		lsMetadata = []
		if self.dictRowMetadata:
			dictFeature = self.dictRowMetadata.get( sFeature, None )
			if dictFeature:
				dictFeatureMetadata = dictFeature.get(ConstantsBreadCrumbs.c_metadata_lowercase, None)
				if dictFeatureMetadata:
					# Default to [] (not None) so the documented list contract
					# holds even when the metadata key is absent.
					lsMetadata = dictFeatureMetadata.get(sMetadata, [])
		return lsMetadata
128
129 class AbundanceTable:
130 """
131 Represents an abundance table and contains common function to perform on the object.
132
133 This class is made from an abundance data file. What is expected is a text file delimited by
134 a character (which is given to the object). The first column is expected to be the id column
135 for each of the rows. Metadata is expected before measurement data. Columns are samples and
136 rows are features (bugs).
137
138 This object is currently not hashable.
139 """
140
141 def __init__(self, npaAbundance, dictMetadata, strName, strLastMetadata, rwmtRowMetadata = None, dictFileMetadata = None, lOccurenceFilter = None, cFileDelimiter = ConstantsBreadCrumbs.c_cTab, cFeatureNameDelimiter = "|"):
142 """
143 Constructor for an abundance table.
144
145 :param npaAbundance: Structured Array of abundance data (Row=Features, Columns=Samples)
146 :type: Numpy Structured Array abundance data (Row=Features, Columns=Samples)
147 :param dictMetadata: Dictionary of metadata {"String ID":["strValue","strValue","strValue","strValue","strValue"]}
148 :type: Dictionary Dictionary
149 :param npaRowMetdata Structured Array of row (feature) metadata (optional)
150 :type: Numpy Structured Array abundance data (Row=Features, Columns=Feature metadata)
151 :param strName: The name of the metadata that serves as the ID for the columns (For example a sample ID)
152 :type: string
153 :param strLastMetadata: The string last metadata name
154 :type: string
155 :param lOccurenceFilter: List of integers used in an occurence filter. [Min abundance, Min sample]
156 :type: List of integers
157 :param cFileDelimiter: Character used as the delimiter of the file that is read in to create the abundance table.
158 Will also be used to write the abudance table file to a file to keep file consistency.
159 :type: Character delimiter for reading the data in (default = TAB)
160 :param cFeatureNameDelimiter: Character used as the delimiter of the feature names (column 1). This is useful if the name are complex, for instance consensus lineages in metagenomics.
161 :type: Character delimiter for feature names (default = |)
162 """
163
164 ### File Metadata
165
166 #Date
167 self.dateCreationDate = dictFileMetadata.get(ConstantsBreadCrumbs.c_strDateKey,None) if dictFileMetadata else None
168
169 #Indicates if the table has been filtered and how
170 self._strCurrentFilterState = ""
171
172 #The delimiter from the source file
173 self._cDelimiter = cFileDelimiter
174
175 #The feature name delimiter
176 self._cFeatureDelimiter = cFeatureNameDelimiter
177
178 #File type
179 self.strFileFormatType = dictFileMetadata.get(ConstantsBreadCrumbs.c_strFormatKey,None) if dictFileMetadata else None
180
181 #File generation source
182 self.strFileGenerationSource = dictFileMetadata.get(ConstantsBreadCrumbs.c_strSourceKey,None) if dictFileMetadata else None
183
184 #File type
185 self.strFileType = dictFileMetadata.get(ConstantsBreadCrumbs.c_strTypekey,None) if dictFileMetadata else None
186
187 #File url
188 self.strFileURL = dictFileMetadata.get(ConstantsBreadCrumbs.c_strURLKey,None) if dictFileMetadata else None
189
190 #The id of the file
191 self.strId = dictFileMetadata.get(ConstantsBreadCrumbs.c_strIDKey,None) if dictFileMetadata else None
192
193 #The lastmetadata name (which should be preserved when writing the file)
194 # Can be a None if biom file is read in.
195 self._strLastMetadataName = strLastMetadata
196
197 #The original number of features in the table
198 self._iOriginalFeatureCount = -1
199
200 #The name of the object relating to the file it was read from or would have been read from if it exists
201 #Keeps tract of changes to the file through the name
202 #Will be used to write out the object to a file as needed
203 self._strOriginalName = strName
204
205 #The original number of samples in the table
206 self._iOriginalSampleCount = -1
207
208 #Data sparsity type
209 self.fSparseMatrix = dictFileMetadata.get(ConstantsBreadCrumbs.c_strSparsityKey,False) if dictFileMetadata else False
210
211 ### Data metadata
212 #The column (sample) metdata
213 self._dictTableMetadata = dictMetadata
214
215 #The row (feature) metadata (Row Metadata object)
216 self.rwmtRowMetadata = rwmtRowMetadata
217
218 ### Data
219
220 #The abundance data
221 self._npaFeatureAbundance = npaAbundance
222
223
224 ### Logistical
225
226 #Clade prefixes for biological samples
227 self._lsCladePrefixes = ["k__","p__","c__","o__","f__","g__","s__"]
228
229 #This is not a hashable object
230 self.__hash__ = None
231
232
233 ### Prep the object
234
235 self._fIsNormalized = self._fIsSummed = None
236 #If contents is not a false then set contents to appropriate objects
237 # Checking to see if the data is normalized, summed and if we need to run a filter on it.
238 if ( self._npaFeatureAbundance != None ) and self._dictTableMetadata:
239 self._iOriginalFeatureCount = self._npaFeatureAbundance.shape[0]
240 self._iOriginalSampleCount = len(self.funcGetSampleNames())
241
242 self._fIsNormalized = ( max( [max( list(a)[1:] or [0] ) for a in self._npaFeatureAbundance] or [0] ) <= 1 )
243
244 lsLeaves = AbundanceTable.funcGetTerminalNodesFromList( [a[0] for a in self._npaFeatureAbundance], self._cFeatureDelimiter )
245 self._fIsSummed = ( len( lsLeaves ) != len( self._npaFeatureAbundance ) )
246
247 #Occurence filtering
248 #Removes features that do not have a given level iLowestAbundance in a given amount of samples iLowestSampleOccurence
249 if ( not self._fIsNormalized ) and lOccurenceFilter:
250 iLowestAbundance, iLowestSampleOccurrence = lOccurenceFilter
251 self.funcFilterAbundanceBySequenceOccurence( iLowestAbundance, iLowestSampleOccurrence )
252 # else:
253 # sys.stderr.write( "Abundance or metadata was None, should be atleast an empty object\n" )
254
255 @staticmethod
256 def funcMakeFromFile(xInputFile, cDelimiter = ConstantsBreadCrumbs.c_cTab, sMetadataID = None, sLastMetadataRow = None, sLastMetadata = None,
257 lOccurenceFilter = None, cFeatureNameDelimiter="|", xOutputFile = None):
258 """
259 Creates an abundance table from a table file.
260
261 :param xInputFile: Path to input file.
262 :type: String String path.
263 :param cDelimiter: Delimiter for parsing the input file.
264 :type: Character Character
265 :param sMetadataID: String ID that is a metadata row ID (found on the first column) and used as an ID for samples
266 :type: String String ID
267 :param sLastRowMetadata: The id of the last (most right column) row metadata
268 :type: String String ID
269 :param sLastMetadata: The ID of the metadata that is the last metadata before measurement or feature rows.
270 :type: String String ID
271 :param lOccurenceFilter: List of integers used in an occurence filter. [Min abundance, Min sample]
272 :type: List of integers
273 :param cFeatureNameDelimiter: Used to parse feature (bug) names if they are complex.
274 For example if they are consensus lineages and contain parent clade information.
275 :type: Character Delimiting letter
276 :param xOutputFile: File to output the abundance table which was read in.
277 :type: FileStream or String file path
278 :return AbundanceTable: Will return an AbundanceTable object on no error. Returns False on error.
279 """
280
281 #Get output file and remove if existing
282 outputFile = open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile
283
284 #################################################################################
285 # Check if file is a biom file - if so invoke the biom routine #
286 #################################################################################
287 strFileName = xInputFile if isinstance(xInputFile, str) else xInputFile.name
288 # Determine the file read function by file extension
289 if strFileName.endswith(ConstantsBreadCrumbs.c_strBiomFile):
290 BiomCommonArea = AbundanceTable._funcBiomToStructuredArray(xInputFile)
291 if BiomCommonArea:
292 lContents = [BiomCommonArea[ConstantsBreadCrumbs.c_BiomTaxData],
293 BiomCommonArea[ConstantsBreadCrumbs.c_Metadata],
294 BiomCommonArea[ ConstantsBreadCrumbs.c_dRowsMetadata],
295 BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo]
296 ]
297
298 # Update last metadata and id if given
299 if not sLastMetadata:
300 strLastMetadata = BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata]
301 else:
302 # return false on failure
303 lContents = False
304 else:
305 #Read in from text file to create the abundance and metadata structures
306 lContents = AbundanceTable._funcTextToStructuredArray(xInputFile=xInputFile, cDelimiter=cDelimiter,
307 sMetadataID = sMetadataID, sLastMetadataRow = sLastMetadataRow, sLastMetadata = sLastMetadata, ostmOutputFile = outputFile)
308
309 #If contents is not a false then set contents to appropriate objects
310 return AbundanceTable(npaAbundance=lContents[0], dictMetadata=lContents[1], strName=str(xInputFile), strLastMetadata=sLastMetadata, rwmtRowMetadata = lContents[2],
311 dictFileMetadata = lContents[3], lOccurenceFilter = lOccurenceFilter, cFileDelimiter=cDelimiter, cFeatureNameDelimiter=cFeatureNameDelimiter) if lContents else False
312
313 #Testing Status: Light happy path testing
314 @staticmethod
315 def funcCheckRawDataFile(strReadDataFileName, iFirstDataIndex = -1, sLastMetadataName = None, lOccurenceFilter = None, strOutputFileName = "", cDelimiter = ConstantsBreadCrumbs.c_cTab):
316 """
317 Check the input abundance table.
318 Currently reduces the features that have no occurence.
319 Also inserts a NA for blank metadata and a 0 for blank abundance data.
320 Gives the option to filter features through an occurence filter (a feature must have a level of abundance in a minimal number of samples to be included).
321 Either iFirstDataIndex or sLastMetadataName must be given
322
323 :param strReadDataFileName: File path of file to read and check.
324 :type: String File path.
325 :param iFirstDataIndex: First (row) index of data not metadata in the abundance file.
326 :type: Integer Index starting at 0.
327 :param sLastMetadataName: The ID of the last metadata in the file. Rows of measurements should follow this metadata.
328 :type: String
329 :param lOccurenceFilter: The lowest number of occurences in the lowest number of samples needed for a feature to be kept
330 :type: List[2] List length 2 [lowest abundance (not normalized), lowest number of samples to occur in] (eg. [2.0,2.0])
331 :param strOutputFileName: File path of out put file.
332 :type: String File path.
333 :param cDelimiter: Character delimiter for reading and writing files.
334 :type: Character Delimiter.
335 :return Output Path: Output path for written checked file.
336 """
337
338 #Validate parameters
339 if (iFirstDataIndex == -1) and (sLastMetadataName == None):
340 print "AbundanceTable:checkRawDataFile::Error, either iFirstDataIndex or sLastMetadataNamemust be given."
341 return False
342
343 #Get output file and remove if existing
344 outputFile = strOutputFileName
345 if not strOutputFileName:
346 outputFile = os.path.splitext(strReadDataFileName)[0]+ConstantsBreadCrumbs.OUTPUT_SUFFIX
347
348 #Read input file lines
349 #Drop blank lines
350 readData = ""
351 with open(strReadDataFileName,'rU') as f:
352 readData = f.read()
353 readData = filter(None,readData.split(ConstantsBreadCrumbs.c_strEndline))
354
355 #Read the length of each line and make sure there is no jagged data
356 #Also hold row count for the metadata
357 iLongestLength = len(readData[0].split(cDelimiter))
358 iMetadataRow = -1
359 if not sLastMetadataName:
360 sLastMetadataName = "None"
361 for iIndex, strLine in enumerate(readData):
362 sLineElements = strLine.split(cDelimiter)
363 if sLineElements[0] == sLastMetadataName:
364 iMetadataRow = iIndex
365 iLongestLength = max(iLongestLength, len(sLineElements))
366
367 #If not already set, set iFirstDataIndex
368 if iFirstDataIndex < 0:
369 iFirstDataIndex = iMetadataRow + 1
370
371 #Used to substitute . to -
372 reSubPeriod = re.compile('\.')
373
374 #File writer
375 with open(outputFile,'w') as f:
376
377 #Write metadata
378 #Empty data is changed to a default
379 #Jagged ends are filled with a default
380 for strDataLine in readData[:iFirstDataIndex]:
381 lsLineElements = strDataLine.split(cDelimiter)
382 for iindex, sElement in enumerate(lsLineElements):
383 if not sElement.strip():
384 lsLineElements[iindex] = ConstantsBreadCrumbs.c_strEmptyDataMetadata
385 if len(lsLineElements) < iLongestLength:
386 lsLineElements = lsLineElements + ([ConstantsBreadCrumbs.c_strEmptyDataMetadata]*(iLongestLength-len(lsLineElements)))
387 f.write(cDelimiter.join(lsLineElements)+ConstantsBreadCrumbs.c_strEndline)
388
389 #For each data line in the table
390 for line in readData[iFirstDataIndex:]:
391 writeToFile = False
392 cleanLine = list()
393 #Break line into delimited elements
394 lineElements = line.split(cDelimiter)
395
396 #Clean feature name
397 sCleanFeatureName = reSubPeriod.sub("-",lineElements[0])
398
399 #For each element but the first (taxa name)
400 #Element check to see if not == zero
401 #If so add to output
402 for element in lineElements[1:]:
403 if(element.strip() in string.whitespace):
404 cleanLine.append(ConstantsBreadCrumbs.c_strEmptyAbundanceData)
405 #Set abundance of 0 but do not indicate the line should be saved
406 elif(element == "0"):
407 cleanLine.append(element)
408 #If an abundance is found set the line to be saved.
409 else:
410 cleanLine.append(element)
411 writeToFile = True
412
413 #Occurence filtering
414 #Removes features that do not have a given level iLowestAbundance in a given amount of samples iLowestSampleOccurence
415 if lOccurenceFilter:
416 iLowestAbundance, iLowestSampleOccurence = lOccurenceFilter
417 if iLowestSampleOccurence > sum([1 if float(sEntry) >= iLowestAbundance else 0 for sEntry in cleanLine]):
418 writeToFile = False
419
420 #Write to file
421 if writeToFile:
422 f.write(sCleanFeatureName+cDelimiter+cDelimiter.join(cleanLine)+ConstantsBreadCrumbs.c_strEndline)
423 return outputFile
424
425 def __repr__(self):
426 """
427 Represent or print object.
428 """
429 return "AbundanceTable"
430
431 def __str__(self):
432 """
433 Create a string representation of the Abundance Table.
434 """
435
436 return "".join(["Sample count:", str(len(self._npaFeatureAbundance.dtype.names[1:])),
437 os.linesep+"Feature count:", str(len(self._npaFeatureAbundance[self._npaFeatureAbundance.dtype.names[0]])),
438 os.linesep+"Id Metadata:", self._npaFeatureAbundance.dtype.names[0],
439 os.linesep+"Metadata ids:", str(self._dictTableMetadata.keys()),
440 os.linesep+"Metadata count:", str(len(self._dictTableMetadata.keys())),
441 os.linesep+"Originating source:",self._strOriginalName,
442 os.linesep+"Original feature count:", str(self._iOriginalFeatureCount),
443 os.linesep+"Original sample count:", str(self._iOriginalSampleCount),
444 os.linesep+"Is normalized:", str(self._fIsNormalized),
445 os.linesep+"Is summed:", str(self._fIsSummed),
446 os.linesep+"Current filtering state:", str(self._strCurrentFilterState),
447 os.linesep+"Feature delimiter:", self._cFeatureDelimiter,
448 os.linesep+"File delimiter:",self._cDelimiter])
449
	def __eq__(self, objOther):
		"""
		Check if an object is equivalent in data to this object.

		Equality requires: objOther is not None, is an AbundanceTable, and
		matches this table's delimiters, sample metadata, file metadata (date,
		format, type, URL), abundance rows (compared order-insensitively by
		feature id), and sample name order. Name, generation source and string
		representation are deliberately NOT compared (see commented blocks).

		:param objOther: Object to compare against.
		:return: True when equivalent in data, False otherwise.
		"""
		# Check for none
		if objOther is None:
			return False

		#Check for object type
		if isinstance(objOther,AbundanceTable) != True:
			return False

		#Check feature delimiter
		if self.funcGetFeatureDelimiter() != objOther.funcGetFeatureDelimiter():
			return False

		#Check file delimiter
		if self.funcGetFileDelimiter() != objOther.funcGetFileDelimiter():
			return False

		#**************************************************
		#* Commented out                                  *
		#**************************************************
		#Check name - Commented out by GW on 2013/09/14 because
		#If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same
		#Check name
		#if self.funcGetName() != objOther.funcGetName():
			#return False

		#Check sample metadata
		#Go through the metadata
		result1 = self.funcGetMetadataCopy()
		result2 = objOther.funcGetMetadataCopy()
		if sorted(result1.keys()) != sorted(result2.keys()):
			return False
		# NOTE(review): the key sets already match at this point, so the
		# `not in` branch below is effectively unreachable defensive code.
		for strKey in result1.keys():
			if strKey not in result2:
				return False
			if result1[strKey] != result2[strKey]:
				return False

		#TODO check the row (feature) metadata

		#TODO check the file metadata
		#Check the ID
		# NOTE(review): this repeats the file-delimiter comparison performed
		# above; the surrounding comment suggests self.strId was intended --
		# confirm before changing behavior.
		if self.funcGetFileDelimiter() != objOther.funcGetFileDelimiter():
			return False

		#Check the date
		if self.dateCreationDate != objOther.dateCreationDate:
			return False

		#Check the format
		if self.strFileFormatType != objOther.strFileFormatType:
			return False

		#**************************************************
		#* Commented out                                  *
		#**************************************************
		#Check source - Commented out by GW on 2013/09/14 because
		#If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same
		#Check the source
		#if self.strFileGenerationSource != objOther.strFileGenerationSource:
			#return False

		#Check the type
		if self.strFileType != objOther.strFileType:
			return False

		#Check the URL
		if self.strFileURL != objOther.strFileURL:
			return False

		#Check data
		#TODO go through the data
		#TODO also check the data type
		result1 = self.funcGetAbundanceCopy()
		result2 = objOther.funcGetAbundanceCopy()
		if len(result1) != len(result2):
			return False

		# Compare rows order-insensitively by sorting on the feature id
		# (first element of each row tuple).
		sorted_result1 = sorted(result1, key=lambda tup: tup[0])
		sorted_result2 = sorted(result2, key=lambda tup: tup[0])

		if sorted_result1 != sorted_result2 :
			return False

		#**************************************************
		#* Commented out                                  *
		#**************************************************
		#Check AbundanceTable.__str__(self) - Commented out by GW on 2013/09/14 because
		#If we import pcl file into biom file and then export to pcl, the file names might be different but the tables might be the same

		#Check string representation
		#if AbundanceTable.__str__(self) != AbundanceTable.__str__(objOther):
			#return False

		#Check if sample ids are the same and in the same order
		if self.funcGetSampleNames() != objOther.funcGetSampleNames():
			return False

		return True
565
566
567 def __ne__(self, objOther):
568 return not self == objOther
569
570
	#Testing Status: Light happy path testing
	#TODO: Tim change static to class methods
	@staticmethod
	def _funcTextToStructuredArray(xInputFile = None, cDelimiter = ConstantsBreadCrumbs.c_cTab, sMetadataID = None, sLastMetadataRow = None, sLastMetadata = None, ostmOutputFile = None):
		"""
		Private method.
		Used to read in a file that is samples (column) and taxa (rows) into a structured array.

		:param xInputFile: File stream or path to input file.
		:type: String File stream or string path.
		:param cDelimiter: Delimiter for parsing the input file.
		:type: Character Character.
		:param sMetadataID: String ID that is a metadata row ID (found on the first column) and used as an ID for samples.
			If not given it is assumed to be position 0
		:type: String String ID
		:param sLastMetadataRow: String ID that is the last row metadata id (id of the most right column with row/feature metadata)
		:type: String String ID
		:param sLastMetadata: The ID of the metadata that is the last metadata before measurement or feature rows.
		:type: String String ID
		:param ostmOutputFile: Output File to write to if needed. None does not write the file.
		:type: FileStream or String
		:return [taxData,metadata,rowmetadata,file metadata]: Numpy Structured Array of abundance data and dictionary of metadata.
			Metadata is a dictionary as such {"ID", [value,value,values...]}
			Values are in the order they are read in (and the order of the sample names).
			ID is the first column in each metadata row.
			rowmetadata is an optional RowMetadata object (can be None if not made).
			The rowmetadata and taxData row Ids should match.
			The last dict is a collection of BIOM file parameters used when converting from a BIOM file.
			[Numpy structured Array, Dictionary, RowMetadata, dict]
			Returns False on parse error.
		"""

		# Open file from a stream or file path
		# NOTE(review): 'rU' (universal newlines) is Python-2 era -- confirm
		# before running under newer Python versions.
		istmInput = open( xInputFile, 'rU' ) if isinstance(xInputFile, str) else xInputFile
		# Flag that when incremented will switch from metadata parsing to data parsing
		iFirstDataRow = -1
		# Sample id row
		namesRow = None
		# Row metadata names
		lsRowMetadataIDs = None
		# Index of the last row metadata
		iIndexLastMetadataRow = None
		# Holds metadata {ID:[list of values]}
		metadata = dict()
		# Holds the data measurements [(tuple of values)]
		dataMatrix = []
		# Holds row metadata { sID : [ list of values ] }
		dictRowMetadata = {}
		# Positional index
		iIndex = -1
		# File handle
		csvw = None

		# Read in files
		if ostmOutputFile:
			csvw = csv.writer( open(ostmOutputFile,'w') if isinstance(ostmOutputFile, str) else ostmOutputFile, csv.excel_tab, delimiter = cDelimiter )
		# For each line in the file, and assume the tax id is the first element and the data follows
		for lsLineElements in csv.reader( istmInput, dialect = csv.excel_tab, delimiter = cDelimiter ):
			iIndex += 1
			taxId, sampleReads = lsLineElements[0], lsLineElements[1:]

			# Read through data measurements
			# Process them as a list of tuples (needed for structured array)
			if iFirstDataRow > 0:
				try:
					# Parse the sample reads, removing row metadata and storing row metadata if it exists
					if lsRowMetadataIDs:
						# Build expected dict for row metadata dictionary {string feature id: {'metadata': {metadatakey: [list of metadata values]}}}
						dictFeature = dict([ [sID, [sKey]] for sID, sKey in zip( lsRowMetadataIDs, sampleReads[ 0 : iIndexLastMetadataRow ]) ])
						if len( dictFeature ):
							dictRowMetadata[ taxId ] = { ConstantsBreadCrumbs.c_metadata_lowercase: dictFeature }
						# Blank measurement cells default to 0.
						dataMatrix.append(tuple([taxId] + [( float(s) if s.strip( ) else 0 ) for s in sampleReads[ iIndexLastMetadataRow: ]]))
					else:
						dataMatrix.append(tuple([taxId] + [( float(s) if s.strip( ) else 0 ) for s in sampleReads]))
				except ValueError:
					# A non-numeric value in a measurement row aborts the parse.
					sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, non-numerical value on data row. File:" + str(xInputFile) +
						" Row:" + str(lsLineElements) + "\n" )
					return False
			# Go through study measurements
			else:
				# Read in metadata values, if the entry is blank then give it the default empty metadata value.
				for i, s in enumerate( sampleReads ):
					if not s.strip( ):
						sampleReads[i] = ConstantsBreadCrumbs.c_strEmptyDataMetadata

				# If no id metadata (sample ids) is given then the first row is assumed to be the id row, otherwise look for the id for the metadata.
				# Add the metadata to the containing dict
				if ( ( not sMetadataID ) and ( iIndex == 0 ) ) or ( taxId == sMetadataID ):
					namesRow = lsLineElements
					# Remove the row metadata ids, these names are for the column ID and the samples ids
					if sLastMetadataRow:
						iIndexLastMetadataRow = lsLineElements.index(sLastMetadataRow)
						lsRowMetadataIDs = namesRow[ 1 : iIndexLastMetadataRow + 1 ]
						namesRow = [ namesRow[ 0 ] ] + namesRow[ iIndexLastMetadataRow + 1: ]

						# If the sample metadata dictionary already has entries then remove the row metadata info from it.
						if len( metadata ) and len( lsRowMetadataIDs ):
							for sKey, lsValues in metadata.items():
								metadata[ sKey ] = lsValues[ iIndexLastMetadataRow: ]

				# Set the metadata without row metadata entries
				metadata[taxId] = sampleReads[ iIndexLastMetadataRow: ] if (lsRowMetadataIDs and len( lsRowMetadataIDs )) else sampleReads

				# If the last metadata was just processed switch to data processing
				# If the last metadata name is not given it is assumed that there is only one metadata
				if ( not sLastMetadata ) or ( taxId == sLastMetadata ):
					iFirstDataRow = iIndex + 1

			# If writing out the data write back out the line read in.
			# This happens at the end so that the above cleaning is captured and written.
			if csvw:
				csvw.writerow( [taxId] + sampleReads )

		if sLastMetadata and ( not dataMatrix ):
			sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, did not find the row for the last metadata ID. File:" + str(xInputFile) +
				" Identifier:" + sLastMetadata + "\n" )
			return False

		# Make sure the names are found
		if namesRow == None:
			sys.stderr.write( "AbundanceTable:textToStructuredArray::Error, did not find the row for the unique sample/column. File:" + str(xInputFile) +
				" Identifier:" + sMetadataID + "\n" )
			return False

		# Now we know the longest taxId we can define the first column holding the tax id
		# Gross requirement of Numpy structured arrays, a = ASCII followed by max # of characters (as a string)
		# The id width is doubled, presumably as slack for later renames -- confirm.
		longestTaxId = max( len(a[0]) for a in dataMatrix )
		dataTypeVector = [(namesRow[0],'a' + str(longestTaxId*2))] + [(s, "f4") for s in namesRow[1:]]
		# Create structured array
		taxData = np.array(dataMatrix,dtype=np.dtype(dataTypeVector))

		# Returns a none currently because the PCL file specification this originally worked on did not have feature metadata
		# Can be updated in the future.
		# [Data (structured array), column metadata (dict), row metadata (RowMetadata), file metadata (dict)]
		return [taxData, metadata, RowMetadata(dictRowMetadata = dictRowMetadata, lsRowMetadataIDs = lsRowMetadataIDs), {
			ConstantsBreadCrumbs.c_strIDKey:ConstantsBreadCrumbs.c_strDefaultPCLID,
			ConstantsBreadCrumbs.c_strDateKey:str(date.today()),
			ConstantsBreadCrumbs.c_strFormatKey:ConstantsBreadCrumbs.c_strDefaultPCLFileFormateType,
			ConstantsBreadCrumbs.c_strSourceKey:ConstantsBreadCrumbs.c_strDefaultPCLGenerationSource,
			ConstantsBreadCrumbs.c_strTypekey:ConstantsBreadCrumbs.c_strDefaultPCLFileTpe,
			ConstantsBreadCrumbs.c_strURLKey:ConstantsBreadCrumbs.c_strDefaultPCLURL,
			ConstantsBreadCrumbs.c_strSparsityKey:ConstantsBreadCrumbs. c_fDefaultPCLSparsity}]
714
715 # def funcAdd(self,abndTwo,strFileName=None):
716 # """
717 # Allows one to add an abundance table to an abundance table. They both must be the same state of normalization or summation
718 # or they will be summed or normalized if one of the two are.
719 #
720 # :param abndTwo: AbundanceTable object 2
721 # :type: AbundanceTable
722 # :return AbudanceTable:
723 # """
724 #
725 # #Check summation and normalization
726 # if(self.funcIsSummed() or abndTwo.funcIsSummed()):
727 # self.funcSum()
728 # abndTwo.funcSum()
729 # if(self.funcIsNormalized() or abndTwo.funcIsNormalized()):
730 # self.funcNormalize()
731 # abndTwo.funcNormalize()
732 #
733 # #Normalize Feature names
734 # #Get if the abundance tables have clades
735 # fAbndInputHasClades = self.funcHasFeatureHierarchy()
736 # fAbndCompareHasClades = abndTwo.funcHasFeatureHierarchy()
737 #
738 # if(fAbndInputHasClades or fAbndCompareHasClades):
739 # #If feature delimiters do not match, switch
740 # if not self.funcGetFeatureDelimiter() == abndTwo.funcGetFeatureDelimiter():
741 # abndTwo.funcSetFeatureDelimiter(self.funcGetFeatureDelimiter())
742 #
743 # #Add prefixes if needed.
744 # self.funcAddCladePrefixToFeatures()
745 # abndTwo.funcAddCladePrefixToFeatures()
746 #
747 # #Get feature Names
748 # lsFeatures1 = self.funcGetFeatureNames()
749 # lsFeatures2 = abndTwo.funcGetFeatureNames()
750 #
751 # #Make one feature name list
#		lsFeaturesCombined = list(set(lsFeatures1+lsFeatures2))
753 #
754 # #Add samples by features (Use 0.0 for empty data features, use NA for empty metadata features)
755 #
756 #
757 # #Combine metadata
758 # dictMetadata1 = self.funcGetMetadataCopy()
759 # dictMetadata2 = abndTwo.funcGetMetadataCopy()
760 #
761 # #Get first table metadata and add NA for metadata it is missing for the length of the current metadata
762 # lsMetadataOnlyInTwo = list(set(dictMetadata2.keys())-set(dictMetadata1.keys()))
763 # dictCombinedMetadata = dictMetadata1
764 # lsEmptyMetadata = ["NA" for i in xrange(self.funcGetSampleCount())]
765 # for sKey in lsMetadataOnlyInTwo:
766 # dictCombinedMetadata[sKey]=lsEmptyMetadata
767 # #Add in the other metadata dictionary
768 # lsCombinedKeys = dictCombinedMetadata.keys()
769 # lsEmptyMetadata = ["NA" for i in xrange(abndTwo.funcGetSampleCount())]
#		for sKey in lsCombinedKeys:
771 # if sKey in dictMetadata2:
772 # dictCombinedMetadata[sKey] = dictCombinedMetadata[sKey]+dictMetadata2[sKey]
773 # else:
774 # dictCombinedMetadata[sKey] = dictCombinedMetadata[sKey]+lsEmptyMetadata
775 #
776 # #Make Abundance table
777 # return AbundanceTable(npaAbundance=npaAbundance,
778 # dictMetadata = dictCombinedMetadata,
779 # strName = strFileName if strFileName else os.path.splitext(self)[0]+"_combined_"+os.path.splitext(abndTwo)[0],
780 # strLastMetadata = self.funcGetLastMetadataName(),
781 # cFileDelimiter = self.funcGetFileDelimiter(), cFeatureNameDelimiter=self.funcGetFeatureDelimiter())
782
783 #TODO This does not adjust for sample ordering, needs to
784 def funcAddDataFeature(self, lsNames, npdData):
785 """
786 Adds a data or group of data to the underlying table.
787 Names should be in the order of the data
788 Each row is considered a feature (not sample).
789
790 :param lsNames: Names of the features being added to the data of the table
791 :type: List List of string names
792 :param npdData: Rows of features to add to the table
793 :type: Numpy array accessed by row.
794 """
795 if ( self._npaFeatureAbundance == None ):
796 return False
797
798 # Check number of input data rows
799 iDataRows = npdData.shape[0]
800 if (len(lsNames) != iDataRows):
801 print "Error:The names and the rows of data features to add must be of equal length"
802
803 # Grow the array by the neccessary amount and add the new rows
804 iTableRowCount = self.funcGetFeatureCount()
805 iRowElementCount = self.funcGetSampleCount()
806 self._npaFeatureAbundance.resize(iTableRowCount+iDataRows)
807 for iIndexData in xrange(iDataRows):
808 self._npaFeatureAbundance[iTableRowCount+iIndexData] = tuple([lsNames[iIndexData]]+list(npdData[iIndexData]))
809
810 return True
811
812 #TODO This does not adjust for sample ordering, needs to
813 def funcAddMetadataFeature(self,lsNames,llsMetadata):
814 """
815 Adds metadata feature to the underlying table.
816 Names should be in the order of the lists of metadata
817 Each internal list is considered a metadata and paired to a name
818 """
819 if ( self._dictTableMetadata == None ):
820 return False
821
822 # Check number of input data rows
823 iMetadataCount = len(llsMetadata)
824 if (len(lsNames) != iMetadataCount):
825 print "Error:The names and the rows of metadata features to add must be of equal length"
826
827 # Add the metadata
828 for tpleMetadata in zip(lsNames,llsMetadata):
829 self._dictTableMetadata[tpleMetadata[0]]=tpleMetadata[1]
830 return True
831
832 #2 test Cases
833 def funcSetFeatureDelimiter(self, cDelimiter):
834 """
835 Changes the feature delimiter to the one provided.
836 Updates the feature names.
837
838 :param cDelimiter: Character feature delimiter
839 :type: Character
840 :return Boolean: Indicator of success or not (false)
841 """
842 if ( self._npaFeatureAbundance == None ):
843 return False
844 cDelimiterCurrent = self.funcGetFeatureDelimiter()
845 if ( not cDelimiter or not cDelimiterCurrent):
846 return False
847
848 #Make new feature names
849 lsNewFeatureNames = [sFeatureName.replace(cDelimiterCurrent,cDelimiter) for sFeatureName in self.funcGetFeatureNames()]
850
851 #Update new feature names to abundance table
852 if (not self.funcGetIDMetadataName() == None):
853 self._npaFeatureAbundance[self.funcGetIDMetadataName()] = np.array(lsNewFeatureNames)
854
855 #Update delimiter
856 self._cFeatureDelimiter = cDelimiter
857 return True
858
859 #Happy path tested
860 def funcGetSampleNames(self):
861 """
862 Returns the sample names (IDs) contained in the abundance table.
863
864 :return Sample Name: A List of sample names indicated by the metadata associated with the sMetadataId given in table creation.
865 A list of string names or empty list on error as well as no underlying table.
866 """
867
868 return self._npaFeatureAbundance.dtype.names[1:] if ( self._npaFeatureAbundance != None ) else []
869
870 #Happy Path Tested
871 def funcGetIDMetadataName(self):
872 """
873 Returns the metadata id.
874
875 :return ID: The metadata id (the sample Id).
876 Returns none on error.
877 """
878
879 return self._npaFeatureAbundance.dtype.names[0] if ( self._npaFeatureAbundance != None ) else None
880
881 #Happy path tested
882 def funcGetAbundanceCopy(self):
883 """
884 Returns a deep copy of the abundance table.
885
886 :return Numpy Structured Array: The measurement data in the Abundance table. Can use sample names to access each column of measurements.
887 Returns none on error.
888 """
889
890 return self._npaFeatureAbundance.copy() if ( self._npaFeatureAbundance != None ) else None
891
892 #Happy path tested
893 def funcGetAverageAbundancePerSample(self, lsTargetedFeatures):
894 """
895 Averages feature abundance within a sample.
896
897 :param lsTargetedFeatures: String names of features to average
898 :type: List of string names of features which are measured
899 :return List: List of lists or boolean (False on error). One internal list per sample indicating the sample and the feature's average abudance
900 [[sample,average abundance of selected taxa]] or False on error
901 """
902
903 #Sample rank averages [[sample,average abundance of selected taxa]]
904 sampleAbundanceAverages = []
905
906 sampleNames = self.funcGetSampleNames()
907 allTaxaNames = self.funcGetFeatureNames()
908 #Get an abundance table compressed to features of interest
909 abndReducedTable = self.funcGetFeatureAbundanceTable(lsTargetedFeatures)
910 if abndReducedTable == None:
911 return False
912
913 #If the taxa to be selected are not in the list, Return nothing and log
914 lsMissing = []
915 for sFeature in lsTargetedFeatures:
916 if not sFeature in allTaxaNames:
917 lsMissing.append(sFeature)
918 else:
919 #Check to make sure the taxa of interest is not average abundance of 0
920 if not abndReducedTable.funcGetFeatureSumAcrossSamples(sFeature):
921 lsMissing.append(sFeature)
922 if len(lsMissing) > 0:
923 sys.stderr.write( "Could not find features for averaging: " + str(lsMissing) )
924 return False
925
926 #For each sample name get average abundance
927 for sName in sampleNames:
928 npaFeaturesSample = abndReducedTable.funcGetSample(sName)
929 sampleAbundanceAverages.append([sName,sum(npaFeaturesSample)/float(len(npaFeaturesSample))])
930
931 #Sort based on average
932 return sorted(sampleAbundanceAverages, key = lambda sampleData: sampleData[1], reverse = True)
933
934 #Happy path tested 1
935 def funcGetAverageSample(self):
936 """
937 Returns the average sample of the abundance table.
938 This average sample is made of the average of each feature.
939 :return list: A list of averages in the order of the feature names.
940 """
941
942 ldAverageSample = []
943 #If there are no samples then return empty list.
944 if len(self.funcGetSampleNames()) < 1:
945 return ldAverageSample
946
947 #If there are samples return the average of each feature in the order of the feature names.
948 for sFeature in self._npaFeatureAbundance:
949 npFeaturesAbundance = list(sFeature)[1:]
950 ldAverageSample.append(sum(npFeaturesAbundance)/float(len(npFeaturesAbundance)))
951
952 return ldAverageSample
953
954 #Tested 2 cases
955 def funcHasFeatureHierarchy(self):
956 """
957 Returns an indicator of having a hierarchy in the features indicated by the existance of the
958 feature delimiter.
959
960 :return Boolean: True (Has a hierarchy) or False (Does not have a hierarchy)
961 """
962
963 if ( self._npaFeatureAbundance == None ):
964 return None
965 cDelimiter = self.funcGetFeatureDelimiter()
966 if ( not cDelimiter ):
967 return False
968
969 #For each feature name, check to see if the delimiter is in the name
970 for sFeature in self.funcGetFeatureNames():
971 if cDelimiter in sFeature:
972 return True
973 return False
974
975 def funcGetCladePrefixes(self):
976 """
977 Returns the list of prefixes to use on biological sample hierarchy
978
979 :return List: List of strings
980 """
981 return self._lsCladePrefixes
982
983 #3 test cases
984 def funcAddCladePrefixToFeatures(self):
985 """
986 As a standardized clade prefix to indicate biological clade given hierarchy.
987 Will not add a prefix to already prefixes feature names.
988 Will add prefix to feature names that do not have them or clades in a feature name that
989 do not have them while leaving ones that do as is.
990
991 :return Boolean: True (Has a hierarchy) or False (Does not have a hierarchy)
992 """
993
994 if ( self._npaFeatureAbundance == None ):
995 return None
996 cDelimiter = self.funcGetFeatureDelimiter()
997 lsPrefixes = self.funcGetCladePrefixes()
998 iPrefixLength = len(lsPrefixes)
999 if ( not cDelimiter ):
1000 return False
1001
1002 #Append prefixes to feature names
1003 lsUpdatedFeatureNames = []
1004 lsFeatureNames = self.funcGetFeatureNames()
1005
1006 for sFeatureName in lsFeatureNames:
1007 lsClades = sFeatureName.split(cDelimiter)
1008 #If there are not enough then error
1009 if(len(lsClades) > iPrefixLength):
1010 print "Error:: Too many clades given to be biologically meaningful"
1011 return False
1012 lsUpdatedFeatureNames.append(cDelimiter.join([lsPrefixes[iClade]+lsClades[iClade] if not(lsClades[iClade][0:len(lsPrefixes[iClade])]==lsPrefixes[iClade]) else lsClades[iClade] for iClade in xrange(len(lsClades))]))
1013
1014 #Update new feature names to abundance table
1015 if not self.funcGetIDMetadataName() == None:
1016 self._npaFeatureAbundance[self.funcGetIDMetadataName()] = np.array(lsUpdatedFeatureNames)
1017
1018 return True
1019
1020 #Happy Path Tested
1021 def funcGetFeatureAbundanceTable(self, lsFeatures):
1022 """
1023 Returns a copy of the current abundance table with the abundance of just the given features.
1024
1025 :param lsFeatures: String Feature IDs that are kept in the compressed abundance table.
1026 :type: List of strings Feature IDs (found as the first entry of a filter in the input file.
1027 :return AbundanceTable: A compressed version of the abundance table.
1028 On an error None is returned.
1029 """
1030
1031 if ( self._npaFeatureAbundance == None ) or ( lsFeatures == None ):
1032 return None
1033
1034 #Get a list of boolean indicators that the row is from the features list
1035 lfFeatureData = [sRowID in lsFeatures for sRowID in self.funcGetFeatureNames()]
1036 #compressed version as an Abundance table
1037 lsNamePieces = os.path.splitext(self._strOriginalName)
1038 abndFeature = AbundanceTable(npaAbundance=np.compress(lfFeatureData, self._npaFeatureAbundance, axis = 0),
1039 dictMetadata = self.funcGetMetadataCopy(),
1040 strName = lsNamePieces[0] + "-" + str(len(lsFeatures)) +"-Features"+lsNamePieces[1],
1041 strLastMetadata=self.funcGetLastMetadataName(),
1042 cFileDelimiter = self.funcGetFileDelimiter(), cFeatureNameDelimiter= self.funcGetFeatureDelimiter())
1043 #Table is no longer normalized
1044 abndFeature._fIsNormalized = False
1045 return abndFeature
1046
1047 #Happy path tested
1048 def funcGetFeatureDelimiter(self):
1049 """
1050 The delimiter of the feature names (For example to use on concensus lineages).
1051
1052 :return Character: Delimiter for the feature name pieces if it is complex.
1053 """
1054
1055 return self._cFeatureDelimiter
1056
1057 #Happy path tested
1058 def funcGetFeatureCount(self):
1059 """
1060 Returns the current feature count.
1061
1062 :return Count: Returns the int count of features in the abundance table.
1063 Returns None on error.
1064 """
1065
1066 return self._npaFeatureAbundance.shape[0] if not self._npaFeatureAbundance is None else 0
1067
1068 #Happy path tested
1069 def funcGetFeatureSumAcrossSamples(self,sFeatureName):
1070 """
1071 Returns float sum of feature values across the samples.
1072
1073 :param sFeatureName: The feature ID to get the sum from.
1074 :type: String.
1075 :return Double: Sum of one feature across samples.
1076 """
1077 return sum(self.funcGetFeature(sFeatureName))
1078
1079 def funcGetFeature(self,sFeatureName):
1080 """
1081 Returns feature values across the samples.
1082
1083 :param sFeatureName: The feature ID to get the sum from.
1084 :type: String.
1085 :return Double: Feature across samples.
1086 """
1087
1088 for sFeature in self._npaFeatureAbundance:
1089 if sFeature[0] == sFeatureName:
1090 return list(sFeature)[1:]
1091 return None
1092
1093 #Happy path tested
1094 def funcGetFeatureNames(self):
1095 """
1096 Return the feature names as a list.
1097
1098 :return Feature Names: List of feature names (or IDs) as strings.
1099 As an error returns empty list.
1100 """
1101
1102 if (not self._npaFeatureAbundance == None):
1103 return self._npaFeatureAbundance[self.funcGetIDMetadataName()]
1104 return []
1105
1106 #Happy path tested
1107 def funcGetFileDelimiter(self):
1108 """
1109 The delimiter of the file the data was read from and which is also the delimiter which would be used to write the data to a file.
1110
1111 :return Character: Delimiter for the parsing and writing the file.
1112 """
1113
1114 return self._cDelimiter
1115
1116 def funcGetLastMetadataName(self):
1117 """
1118 Get the last metadata name that seperates abundance and metadata measurements.
1119
1120 :return string: Metadata name
1121 """
1122 return self._strLastMetadataName
1123
1124 #Happy path tested
1125 def funcGetSample(self,sSampleName):
1126 """
1127 Return a copy of the feature measurements of a sample.
1128
1129 :param sSampleName: Name of sample to return.
1130 :type: String
1131 :return Sample: Measurements Feature measurements of a sample.
1132 Empty numpy array returned on error.
1133 """
1134
1135 if (not self._npaFeatureAbundance == None):
1136 return self._npaFeatureAbundance[sSampleName].copy()
1137 return np.array([])
1138
1139 #Happy path tested
1140 def funcGetMetadata(self, strMetadataName):
1141 """
1142 Returns a list of metadata that is associated with the given metadata name (id).
1143
1144 :param strMetadataName: String metadata ID to be returned
1145 :type: String ID
1146 :return Metadata: List of metadata
1147 """
1148
1149 return copy.deepcopy( self._dictTableMetadata.get(strMetadataName) ) \
1150 if self._dictTableMetadata else None
1151
1152 #Happy path tested
1153 def funcGetMetadataCopy(self):
1154 """
1155 Returns a deep copy of the metadata.
1156
1157 :return Metadata copy: {"ID":[value,value...]}
1158 """
1159
1160 return copy.deepcopy(self._dictTableMetadata)
1161
1162 #Happy path tested
1163 def funcGetName(self):
1164 """
1165 Returns the name of the object which is the file name that generated it.
1166 If the object was generated from an Abundance Table (for instance through stratification)
1167 the name is still in the form of a file that could be written to which is informative
1168 of the changes that have occurred on the data set.
1169 :return string: Name
1170 """
1171 return self._strOriginalName
1172
1173 #Happy path tested. could do more
1174 def funcGetTerminalNodes(self):
1175 """
1176 Returns the terminal nodes given the current feature names in the abundance table. The
1177 features must contain a consensus lineage or all will be returned.
1178 :return List: List of strings of the terminal nodes given the abundance table.
1179 """
1180 return AbundanceTable.funcGetTerminalNodesFromList(lsNames=self.funcGetFeatureNames(),cNameDelimiter=self.funcGetFeatureDelimiter())
1181
1182 #Tested 2 test cases
1183 @staticmethod
1184 def funcGetTerminalNodesFromList(lsNames,cNameDelimiter):
1185 """
1186 Returns the terminal nodes given the current feature names in the abundance table. The
1187 features must contain a consensus lineage or all will be returned.
1188
1189 :param lsNames: The list of string names to parse and filter.
1190 :type: List of strings
1191 :param cNameDelimiter: The delimiter for the name of the features.
1192 :type: Character Delimiter
1193 :return list: A list of terminal elements in the list (given only the list).
1194 """
1195
1196 #Build hash
1197 dictCounts = dict()
1198 for strTaxaName in lsNames:
1199 #Split into the elements of the clades
1200 lsClades = filter(None,strTaxaName.split(cNameDelimiter))
1201 #Count clade levels
1202 iCladeLength = len(lsClades)
1203
1204 #Evaluate first element
1205 sClade = lsClades[0]
1206 dictCounts[sClade] = sClade not in dictCounts
1207
1208 #Evaluate the rest of the elements
1209 if iCladeLength < 2:
1210 continue
1211 for iIndex in xrange(1,iCladeLength):
1212 prevClade = sClade
1213 sClade = cNameDelimiter.join([sClade,lsClades[iIndex]])
1214 if sClade in dictCounts:
1215 dictCounts[sClade] = dictCounts[prevClade] = False
1216 else:
1217 dictCounts[sClade] = True
1218 dictCounts[prevClade] = False
1219
1220 #Return only the elements that were of count 1
1221 return filter( lambda s: dictCounts[s] == True, dictCounts )
1222
1223 #Happy path tested
1224 def funcIsNormalized(self):
1225 """
1226 Returns if the data has been normalized.
1227
1228 :return Boolean: Indicates if the data is normalized.
1229 True indicates it the data is normalized.
1230 """
1231
1232 return self._fIsNormalized
1233
1234 #Happy path tested
1235 def funcIsPrimaryIdMetadata(self,sMetadataName):
1236 """
1237 Checks the metadata data associated with the sMetadatName and returns if the metadata is unique.
1238 This is important to some of the functions in the Abundance Table specifically when translating from one metadata to another.
1239
1240 :param sMetadataName: ID of metadata to check for uniqueness.
1241 :type: String Metadata ID.
1242 :return Boolean: Returns indicator of uniqueness.
1243 True indicates unique.
1244 """
1245
1246 lMetadata = self.funcGetMetadata(sMetadataName)
1247 if not lMetadata:
1248 return False
1249 return (len(lMetadata) == len(set(lMetadata)))
1250
1251 #Happy path tested
1252 def funcIsSummed(self):
1253 """
1254 Return is the data is summed.
1255
1256 :return Boolean: Indicator of being summed. True indicates summed.
1257 """
1258
1259 return self._fIsSummed
1260
1261 #Happy path tested
1262 def funcFilterAbundanceByPercentile(self, dPercentileCutOff = 95.0, dPercentageAbovePercentile=1.0):
1263 """
1264 Filter on features.
1265 A feature is removed if it's abundance is not found in the top X percentile a certain percentage of the samples.
1266
1267 :param dPercentileCutOff: The percentile used for filtering.
1268 :type: double A double between 0.0 and 100.0
1269 :param dPercentageAbovePercentile: The percentage above the given percentile (dPercentileCutOff) that must exist to keep the feature.
1270 :type: double Between 0.0 and 100.0
1271 :return Boolean: Indicator of filtering occuring without error. True indicates filtering occuring.
1272 """
1273
1274 #No need to do anything
1275 if(dPercentileCutOff==0.0) or (dPercentageAbovePercentile==0.0):
1276 return True
1277
1278 #Sample names
1279 lsSampleNames = self.funcGetSampleNames()
1280
1281 #Scale percentage out of 100
1282 dPercentageAbovePercentile = dPercentageAbovePercentile/100.0
1283
1284 #Sample count
1285 iSampleCount = len(lsSampleNames)
1286
1287 #Get a threshold score of the value at the specified percentile for each sample
1288 #In the order of the sample names
1289 ldScoreAtPercentile = [scipy.stats.scoreatpercentile(self._npaFeatureAbundance[lsSampleNames[iIndex]],dPercentileCutOff) for iIndex in xrange(iSampleCount)]
1290
1291 #Record how many entries for each feature have a value equal to or greater than the dPercentileCutOff
1292 #If the percentile of entries passing the criteria are above the dPercentageAbovePercentile put index in list to keep
1293 liKeepIndices = []
1294 iSampleCount = float(iSampleCount)
1295 for iRowIndex, npaRow in enumerate(self._npaFeatureAbundance):
1296 iCountPass = sum([1 if dValue >= ldScoreAtPercentile[iValueIndex] else 0 for iValueIndex, dValue in enumerate(list(npaRow)[1:])])
1297 if (iCountPass / iSampleCount) >= dPercentageAbovePercentile:
1298 liKeepIndices.append(iRowIndex)
1299
1300 #Compress array
1301 self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepIndices,:]
1302
1303 #Update filter state
1304 self._strCurrentFilterState += ":dPercentileCutOff=" + str(dPercentileCutOff) + ",dPercentageAbovePercentile=" + str(dPercentageAbovePercentile)
1305
1306 #Table is no longer normalized
1307 self._fIsNormalized = False
1308
1309 return True
1310
1311 def funcFilterAbundanceByMinValue(self, dMinAbundance = 0.0001, iMinSamples = 3):
1312 """
1313 Filter abundance by requiring features to have a minimum relative abundance in a minimum number of samples.
1314 Will evaluate greater than or equal to the dMinAbundance and iMinSamples.
1315
1316 :param dMinAbundance: Minimum relative abundance.
1317 :type: Real Number Less than 1.
1318 :param iMinSamples: Minimum samples to have the relative abundnace or greater in.
1319 :type: Integer Number greater than 1.
1320 :return Boolean: Indicator of the filter running without error. False indicates error.
1321 """
1322
1323 #No need to do anything
1324 if(dMinAbundance==0) or (iMinSamples==0):
1325 return True
1326
1327 #This normalization requires the data to be relative abundance
1328 if not self._fIsNormalized:
1329 #sys.stderr.write( "Could not filter by sequence occurence because the data is already normalized.\n" )
1330 return False
1331
1332 #Holds which indexes are kept
1333 liKeepFeatures = []
1334 for iRowIndex, dataRow in enumerate( self._npaFeatureAbundance ):
1335 #See which rows meet the criteria and keep the index if needed.
1336 if len( filter( lambda d: d >= dMinAbundance, list(dataRow)[1:] ) ) >= iMinSamples:
1337 liKeepFeatures.append(iRowIndex)
1338
1339 #Compress array
1340 self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:]
1341 #Update filter state
1342 self._strCurrentFilterState += ":dMinAbundance=" + str(dMinAbundance) + ",iMinSamples=" + str(iMinSamples)
1343
1344 return True
1345
1346 #Happy path tested
1347 def funcFilterAbundanceBySequenceOccurence(self, iMinSequence = 2, iMinSamples = 2):
1348 """
1349 Filter occurence by requiring features to have a minimum sequence occurence in a minimum number of samples.
1350 Will evaluate greater than or equal to the iMinSequence and iMinSamples.
1351
1352 :param iMinSequence: Minimum sequence to occur.
1353 :type: Integer Number Greater than 1.
1354 :param iMinSamples: Minimum samples to occur in.
1355 :type: Integer Number greater than 1.
1356 :return Boolean: Indicator of the filter running without error. False indicates error.
1357 """
1358
1359 #No need to do anything
1360 if(iMinSequence==0) or (iMinSamples==0):
1361 return True
1362
1363 #This normalization requires the data to be reads
1364 if self._fIsNormalized:
1365 #sys.stderr.write( "Could not filter by sequence occurence because the data is already normalized.\n" )
1366 return False
1367
1368 #Holds which indexes are kept
1369 liKeepFeatures = []
1370 for iRowIndex, dataRow in enumerate( self._npaFeatureAbundance ):
1371 #See which rows meet the criteria and keep the index if needed.
1372 if len( filter( lambda d: d >= iMinSequence, list(dataRow)[1:] ) ) >= iMinSamples:
1373 liKeepFeatures.append(iRowIndex)
1374
1375 #Compress array
1376 self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:]
1377 #Update filter state
1378 self._strCurrentFilterState += ":iMinSequence=" + str(iMinSequence) + ",iMinSamples=" + str(iMinSamples)
1379
1380 return True
1381
1382 #1 Happy path test
1383 def funcFilterFeatureBySD(self, dMinSDCuttOff = 0.0):
1384 """
1385 A feature is removed if it's abundance is not found to have standard deviation more than the given dMinSDCutoff.
1386
1387 :param dMinSDCuttOff: Standard deviation threshold.
1388 :type: Double A double greater than 0.0.
1389 :return Boolean: Indicator of success. False indicates error.
1390 """
1391
1392 #No need to do anything
1393 if(dMinSDCuttOff==0.0):
1394 return True
1395
1396 #Holds which indexes are kept
1397 liKeepFeatures = []
1398
1399 #Evaluate each sample
1400 for iRowIndex, dataRow in enumerate(self._npaFeatureAbundance):
1401 if(np.std(list(dataRow)[1:])>=dMinSDCuttOff):
1402 liKeepFeatures.append(iRowIndex)
1403
1404 #Compress array
1405 self._npaFeatureAbundance = self._npaFeatureAbundance[liKeepFeatures,:]
1406
1407 #Update filter state
1408 self._strCurrentFilterState += ":dMinSDCuttOff=" + str(dMinSDCuttOff)
1409
1410 #Table is no longer normalized
1411 self._fIsNormalized = False
1412
1413 return True
1414
1415 #Happy path tested 2 tests
1416 def funcGetWithoutOTUs(self):
1417 """
1418 Remove features that are terminal otus. Terminal otus are identified as being an integer.
1419 """
1420
1421 #Get the feature names
1422 lsFeatures = self.funcGetFeatureNames()
1423
1424 #Reduce, filter the feature names
1425 lsFeatures = [sFeature for sFeature in lsFeatures if not (ValidateData.funcIsValidStringInt(sFeature.split(self.funcGetFeatureDelimiter())[-1]))]
1426
1427 return self.funcGetFeatureAbundanceTable(lsFeatures)
1428
1429 #Happy path tested
1430 def funcNormalize(self):
1431 """
1432 Convenience method which will call which ever normalization is approriate on the data.
1433 :return Boolean: Indicator of success (true).
1434 """
1435
1436 if self._fIsSummed:
1437 return self.funcNormalizeColumnsWithSummedClades()
1438 else:
1439 return self.funcNormalizeColumnsBySum()
1440
1441 #Testing Status: Light happy path testing
1442 def funcNormalizeColumnsBySum(self):
1443 """
1444 Normalize the data in a manner that is approrpiate for NOT summed data.
1445 Normalize the columns (samples) of the abundance table.
1446 Normalizes as a fraction of the total (number/(sum of all numbers in the column)).
1447 Will not act on summed tables.
1448
1449 :return Boolean: Indicator of success. False indicates error.
1450 """
1451
1452 if self._fIsNormalized:
1453 # sys.stderr.write( "This table is already normalized, did not perform new normalization request.\n" )
1454 return False
1455
1456 if self._fIsSummed:
1457 sys.stderr.write( "This table has clades summed, this normalization is not appropriate. Did not perform.\n" )
1458 return False
1459
1460 #Normalize
1461 for columnName in self.funcGetSampleNames():
1462 column = self._npaFeatureAbundance[columnName]
1463 columnTotal = sum(column)
1464 if(columnTotal > 0.0):
1465 column = column/columnTotal
1466 self._npaFeatureAbundance[columnName] = column
1467
1468 #Indicate normalization has occured
1469 self._fIsNormalized = True
1470
1471 return True
1472
1473 #Happy path tested
	def funcNormalizeColumnsWithSummedClades(self):
		"""
		Normalizes a summed Abundance Table.
		If this is called on a dataset which is not summed and not normalized,
		the data will be summed first and then normalized.
		If already normalized, the current normalization is kept.

		Each feature row is divided, element-wise, by the abundances of its root
		feature: the row sharing the feature's first lineage element that has the
		shallowest (shortest) lineage.

		:return Boolean: Indicator of success. False indicates error.
		"""

		# Refuse double-normalization.
		if self._fIsNormalized:
#			sys.stderr.write( "This table is already normalized, did not perform new summed normalization request.\n" )
			return False

		# Sum the clades first if that has not been done yet.
		if not self._fIsSummed:
			sys.stderr.write( "This table does not have clades summed, this normalization is not appropriate until the clades are summed. The clades are being summed now before normalization.\n" )
			self.funcSumClades()

		#Load a hash table with root data {sKey: npaAbundances}
		# Maps each root clade name to [lineage depth, abundance vector] of the
		# shallowest row seen for that root.
		hashRoots = {}
		for npaRow in self._npaFeatureAbundance:

			# Abundance values (everything after the feature id column).
			curldAbundance = np.array(list(npaRow)[1:])
			# Depth of this feature's lineage, and its root clade entry so far.
			curFeatureNameLength = len(npaRow[0].split(self._cFeatureDelimiter))
			curlRootData = hashRoots.get(npaRow[0].split(self._cFeatureDelimiter)[0])

			# Keep the shallowest row per root (first seen, or shorter lineage).
			if not curlRootData:
				hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]] = [curFeatureNameLength, curldAbundance]
			elif curlRootData[0] > curFeatureNameLength:
				hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]] = [curFeatureNameLength, curldAbundance]

		#Normalize each feature by thier root feature
		# Zero entries in the root produce 0 (avoids division by zero).
		dataMatrix = list()
		for npaRow in self._npaFeatureAbundance:

			curHashRoot = list(hashRoots[npaRow[0].split(self._cFeatureDelimiter)[0]][1])
			dataMatrix.append(tuple([npaRow[0]]+[npaRow[i+1]/curHashRoot[i] if curHashRoot[i] > 0 else 0 for i in xrange(len(curHashRoot))]))

		# Rebuild the structured array with the same dtype.
		self._npaFeatureAbundance = np.array(dataMatrix,self._npaFeatureAbundance.dtype)

		#Indicate normalization has occured
		self._fIsNormalized = True

		return True
1518
1519 def _funcRankAbundanceHelper( self, aaTodo, iRank, lRankAbundance ):
1520 """
1521 Helper method for ranking abudance which are tied.
1522
1523 :params aaTodo: List of tied ranks to change to a rank.
1524 :type: List of Enumerates of samples.
1525 :params iRank: Current Rank
1526 :type: Integer
1527 :params lRankAbundance: Sample of abundance
1528 :type: List of integers
1529 """
1530
1531 # Subtract one from iRank (each time) to account for next loop iteration
1532 # Then average it with itself minus (the length of aaTodo + 1)
1533 dRank = ( iRank + iRank - len( aaTodo ) - 1 ) / 2.0
1534 for a in aaTodo:
1535 lRankAbundance[a[0]] = dRank
1536
1537 #1 Happy path test
1538 def funcRankAbundance(self):
1539 """
1540 Rank abundances of features with in a sample.
1541
1542 :return AbundanceTable: Abundance table data ranked (Features with in samples).
1543 None is returned on error.
1544 """
1545
1546 if self._npaFeatureAbundance == None:
1547 return None
1548
1549 lsSampleNames = self.funcGetSampleNames()
1550 npRankAbundance = self.funcGetAbundanceCopy()
1551 liRanks = []
1552 #For each sample name get the ranks
1553 for sName in lsSampleNames:
1554 #Enumerate for order and sort abundances
1555 lfSample = list(enumerate(npRankAbundance[sName]))
1556 lfSample = sorted(lfSample, key = lambda a: a[1], reverse = True)
1557
1558 # Accumulate indices until a new value is encountered to detect + handle ties
1559 aaTodo = []
1560 for i, a in enumerate( lfSample ):
1561 if ( not aaTodo ) or ( a[1] == aaTodo[-1][1] ):
1562 aaTodo.append( a )
1563 else:
1564 # Make multiple tied ranks = average of first and last
1565 self._funcRankAbundanceHelper( aaTodo, i, npRankAbundance[sName] )
1566 aaTodo = [a]
1567 self._funcRankAbundanceHelper( aaTodo, i + 1, npRankAbundance[sName] )
1568
1569 abndRanked = AbundanceTable(npaAbundance=npRankAbundance, dictMetadata=self.funcGetMetadataCopy(),
1570 strName= self.funcGetName() + "-Ranked",
1571 strLastMetadata=self.funcGetLastMetadataName(),
1572 cFileDelimiter=self.funcGetFileDelimiter(),
1573 cFeatureNameDelimiter=self.funcGetFeatureDelimiter())
1574
1575 #Table is no longer normalized
1576 abndRanked._fIsNormalized = False
1577 return abndRanked
1578
1579 def funcGetSampleCount(self):
1580 """
1581 Returns the sample count of the abundance table.
1582 """
1583 return len(self.funcGetSampleNames())
1584
1585 #Happy Path Tested
1586 def funcReduceFeaturesToCladeLevel(self, iCladeLevel):
1587 """
1588 Reduce the current table to a certain clade level.
1589
1590 :param iCladeLevel: The level of the clade to trim the features to.
1591 :type: Integer The higher the number the more clades are presevered in the consensus lineage contained in the feature name.
1592 :return Boolean: Indicator of success. False indicates error.
1593 """
1594
1595 if iCladeLevel < 1: return False
1596 if not self._npaFeatureAbundance == None:
1597 liFeatureKeep = []
1598 [liFeatureKeep.append(tplFeature[0]) if (len(tplFeature[1][0].split(self.funcGetFeatureDelimiter())) <= iCladeLevel) else 0
1599 for tplFeature in enumerate(self._npaFeatureAbundance)]
1600 #Compress array
1601 self._npaFeatureAbundance = self._npaFeatureAbundance[liFeatureKeep,:]
1602
1603 #Update filter state
1604 self._strCurrentFilterState += ":iCladeLevel=" + str(iCladeLevel)
1605 return True
1606 else:
1607 return False
1608
	#Happy path tested
	def funcRemoveSamples(self,lsSampleNames):
		"""
		Removes the samples given in the list.

		:param lsSampleNames: A list of string names of samples to remove.
		:type: List of strings Unique values
		:return Boolean: Indicator of success (True = success, no error)
			Note: success is judged by comparing the new sample count to the old
			count minus the number of unique names requested, so asking to remove
			a name that is not a sample yields False.
		"""

		#Samples to remove
		setSamples = set(lsSampleNames)

		#Get orignal sample count
		iOriginalCount = self._iOriginalSampleCount

		#The samples to keep
		lsKeepSamples = [sSample for sSample in self.funcGetSampleNames() if not sSample in setSamples]
		#The sample to keep as boolean flags for compressing the metadata
		lfKeepSamples = [not sSample in setSamples for sSample in self.funcGetSampleNames()]

		#Reduce the abundance data and update
		#Numpy structured-array multi-field selection: the ID column plus the kept sample columns
		self._npaFeatureAbundance = self._npaFeatureAbundance[[self.funcGetIDMetadataName()]+lsKeepSamples]

		#Reduce the metadata and update
		#Each metadata list is positionally aligned with the samples, so filter by the same flags
		for sKey in self._dictTableMetadata:
			self._dictTableMetadata[sKey] = [value for iindex, value in enumerate(self._dictTableMetadata[sKey]) if lfKeepSamples[iindex]]

		#Update sample number count
		self._iOriginalSampleCount = len(self.funcGetSampleNames())

		return self._iOriginalSampleCount == (iOriginalCount-len(setSamples))
1641
1642 #Happy path tested
1643 def funcRemoveSamplesByMetadata(self, sMetadata, lValuesToRemove):
1644 """
1645 Removes samples from the abundance table based on values of a metadata.
1646 If a metadata has any value given the associated sample is removed.
1647
1648 :param sMetadata: ID of the metdata to check the given values.
1649 :type: String Metadata ID
1650 :param lValuesToRemove: A list of values which if equal to a metadata entry indicate to remove the associated sample.
1651 :type: List of values: List
1652 :return Boolean: Indicator of success (True = success, no error)
1653 """
1654
1655 lsSampleNames = self.funcGetSampleNames()
1656 return self.funcRemoveSamples([lsSampleNames[iindex] for iindex, sValue in enumerate(self.funcGetMetadata(sMetadata)) if sValue in lValuesToRemove])
1657
	#Happy path testing
	def funcSumClades(self):
		"""
		Sums abundance data by clades indicated in the feature name (as consensus lineages).

		:return Boolean: Indicator of success.
			False indicates an error.
		"""

		if not self.funcIsSummed():

			#Read in the data
			#Find the header column (iCol) assumed to be 1 or 2 depending on the location of "NAME"
			#Create a list (adSeq) that will eventually hold the sum of the columns of data
			astrHeaders = iCol = None
			adSeqs = np.array([0] * len(self.funcGetSampleNames()))
			pTree = CClade( )
			aastrRaw = []

			#For each row in the npaAbundance
			#Get the feature name, feature abundances, and sum up the abudance columns
			#Keep the sum for later normalization
			#Give a tree the feature name and abundance
			for dataRow in self._npaFeatureAbundance:

				sFeatureName = dataRow[0]
				ldAbundances = list(dataRow)[1:]

				#Add to the sum of the columns (samples)
				adSeqs = adSeqs + np.array(list(dataRow)[1:])

				#Build tree: the consensus lineage (split on the feature delimiter) is the path,
				#and the sample abundances are attached at that node
				pTree.get( sFeatureName.split(self._cFeatureDelimiter) ).set( ldAbundances )

			#Create tree of data
			#Input missing data
			#Fill hashFeatures with the clade name (key) and a blist of values (value) of the specified level interested.
			pTree.impute( )
			hashFeatures = {}
			pTree.freeze( hashFeatures, c_iSumAllCladeLevels, c_fOutputLeavesOnly )
			#NOTE(review): relies on Python 2 dict.keys() returning a list (remove() is called on it below)
			setstrFeatures = hashFeatures.keys( )

			#Remove parent clades that are identical to child clades
			#NOTE(review): relies on Python 2 items() returning a snapshot list, since hashFeatures is mutated inside the loop
			for strFeature, adCounts in hashFeatures.items( ):
				astrFeature = strFeature.strip( ).split( "|" )
				#Walk up the lineage, dropping any ancestor whose counts equal this feature's counts
				while len( astrFeature ) > 1:
					astrFeature = astrFeature[:-1]
					strParent = "|".join( astrFeature )
					adParent = hashFeatures.get( strParent )
					if adParent == adCounts:
						del hashFeatures[strParent]
						setstrFeatures.remove( strParent )

			#Sort features to be nice
			astrFeatures = sorted( setstrFeatures )

			#Change the hash table to an array, keeping the original dtype so field names survive
			dataMatrix = list()
			for sFeature in astrFeatures:
				dataMatrix.append(tuple([sFeature]+list(hashFeatures[sFeature])))
			self._npaFeatureAbundance=np.array(dataMatrix,self._npaFeatureAbundance.dtype)

			#Indicate summation has occured
			self._fIsSummed = True

		return True
1724
	#Happy path tested
	def funcStratifyByMetadata(self, strMetadata, fWriteToFile=False):
		"""
		Stratifies the AbundanceTable by the given metadata.
		Will write each stratified abundance table to file
		if fWriteToFile is True the object will used it's internally stored name as a file to write to
		if fWriteToFile is a string then it should be a directory and end with "." This will rebase the file
		and store it in a different directory but with an otherwise unchanged name.
		Note: If the metadata used for stratification has NAs, they will be segregated to thier own table and returned.

		:param strMetadata: Metadata ID to stratify data with.
		:type: String ID for a metadata.
		:param fWriteToFile: Indicator to write to file.
		:type: Boolean True indicates to write to file.
		:return List: List of AbundanceTables which are deep copies of the original.
			Empty list on error.
		"""

		if self._npaFeatureAbundance is None or self._dictTableMetadata is None:
			return []

		#Get unique metadata values to stratify by
		lsMetadata = self._dictTableMetadata.get(strMetadata,[])
		setValues = set(lsMetadata)
		#If there is only one metadata value then no need to stratify so return the original in the list (and write if needed)
		#NOTE(review): the comment above does not match the code — an empty list is
		#returned only when the metadata is missing/empty (len == 0); a single
		#metadata value still produces one stratified table below. Confirm intent.
		if len(setValues) == 0:
			return []

		retlAbundanceTables = []
		#NOTE(review): dictAbundanceBlocks is never used below
		dictAbundanceBlocks = dict()
		#Given here there are multiple metadata values, continue to stratify
		lsNames = self.funcGetSampleNames()
		#Get index of values to break up
		for value in setValues:
			#Boolean mask over the samples: True where this sample carries the current metadata value
			lfDataIndex = [sData==value for sData in lsMetadata]
			#Get abundance data for the metadata value
			#The true is added to keep the first column which should be the feature id
			npaStratfiedAbundance = self._npaFeatureAbundance[[self.funcGetIDMetadataName()]+list(np.compress(lfDataIndex,lsNames))]

			#Get metadata for the metadata value
			dictStratifiedMetadata = dict()
			for metadataType in self._dictTableMetadata:
				dictValues = self.funcGetMetadata(metadataType)
				dictStratifiedMetadata[metadataType] = np.compress(lfDataIndex,dictValues).tolist()

			#Make abundance table
			#Add abundance table to the list
			lsNamePieces = os.path.splitext(self._strOriginalName)
			objStratifiedAbundanceTable = AbundanceTable(npaAbundance=npaStratfiedAbundance, dictMetadata=dictStratifiedMetadata,
				strName=lsNamePieces[0] + "-StratBy-" + value+lsNamePieces[1],
				strLastMetadata=self.funcGetLastMetadataName(),
				cFeatureNameDelimiter=self._cFeatureDelimiter, cFileDelimiter = self._cDelimiter)
			if fWriteToFile:
				objStratifiedAbundanceTable.funcWriteToFile(lsNamePieces[0] + "-StratBy-" + value+lsNamePieces[1])
			#Append abundance table to returning list
			retlAbundanceTables.append(objStratifiedAbundanceTable)

		return retlAbundanceTables
1783
1784 #Happy Path Tested
1785 def funcTranslateIntoMetadata(self, lsValues, sMetadataFrom, sMetadataTo, fFromPrimaryIds=True):
1786 """
1787 Takes the given data values in one metadata and translates it to values in another
1788 metadata of the sample samples holding the values of the first metadata
1789 FPrimaryIds, if true the sMetadataFrom are checked for unique values,
1790 If FPrimaryIds is not true, duplicate values can stop the preservation of order
1791 Or may cause duplication in the "to" group. This is not advised.
1792 if the sMetadataFrom has any duplicates the function fails and return false.
1793
1794 :param lsValues: Values to translate.
1795 :type: List List of values.
1796 :param sMetadataFrom: The metadata the lsValues come from.
1797 :type: String ID for the metadata.
1798 :param sMetadataTo: The metadata the lsValues will be translated into keeping the samples the same.
1799 :type: String ID for the metadata.
1800 :param fFromPrimaryIds: The metadata that are in the from metadata list must be unique in each sample.
1801 :type: Boolean True indicates the metadata list should be unique in each sample. Otherwise a false will return.
1802 :return List: List of new values or False on error.
1803 """
1804
1805 #Get metadata
1806 lFromMetadata = self.funcGetMetadata(sMetadataFrom)
1807 if not lFromMetadata:
1808 sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. Did not receive lFromMetadata.\n" )
1809 return False
1810
1811 lToMetadata = self.funcGetMetadata(sMetadataTo)
1812 if not lToMetadata:
1813 sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. Did not receive lToMetadata.\n" )
1814 return False
1815
1816 #Check to see if the values are unique if indicated to do so
1817 if fFromPrimaryIds:
1818 if not len(lFromMetadata) == len(set(lFromMetadata)):
1819 sys.stderr.write( "Abundancetable::funcTranlateIntoMetadata. sMetadataFrom did not have unique values.\n" )
1820 return False
1821
1822 #Translate over
1823 if lFromMetadata and lToMetadata:
1824 return [lToMetadata[iIndex] for iIndex in [lFromMetadata.index(value) for value in lsValues]]
1825
1826 return False
1827
1828 #Happy path tested
1829 def funcToArray(self):
1830 """
1831 Returns a numpy array of the current Abundance Table.
1832 Removes the first ID head column and the numpy array is
1833 Made of lists, not tuples.
1834
1835 :return Numpy Array: np.array([[float,float,...],[float,float,...],[float,float,...]])
1836 None is returned on error.
1837 """
1838
1839 if not self._npaFeatureAbundance == None:
1840 return np.array([list(tplRow)[1:] for tplRow in self._npaFeatureAbundance],'float')
1841 return None
1842
1843 #Happy Path tested
1844 def funcWriteToFile(self, xOutputFile, cDelimiter=None, cFileType=ConstantsBreadCrumbs.c_strPCLFile):
1845 """
1846 Writes the AbundanceTable to a file strOutputFile.
1847 Will rewrite over a file as needed.
1848 Will use the cDelimiter to delimit columns if provided.
1849
1850 :param xOutputFile: File stream or File path to write the file to.
1851 :type: String File Path
1852 :param cDelimiter: Delimiter for the output file.
1853 :type: Character If cDlimiter is not specified, the internally stored file delimiter is used.
1854 """
1855
1856 if not xOutputFile:
1857 return
1858 # Check delimiter argument
1859 if not cDelimiter:
1860 cDelimiter = self._cDelimiter
1861
1862 # Check file type: If pcl: Write pcl file; If biom: write biom file; If None - write pcl file
1863 if(cFileType == None):
1864 cFileType == ConstantsBreadCrumbs.c_strPCLFile
1865
1866 if(cFileType == ConstantsBreadCrumbs.c_strPCLFile):
1867 # Write as a pcl file
1868 self._funcWritePCLFile(xOutputFile, cDelimiter=cDelimiter)
1869 elif(cFileType == ConstantsBreadCrumbs.c_strBiomFile):
1870 #Write as a biom file
1871 self._funcWriteBiomFile(xOutputFile)
1872 return
1873
	def _funcWritePCLFile(self, xOutputFile, cDelimiter=None):
		"""
		Write an abundance table object as a PCL file.

		:param xOutputFile: File stream or File path to write the file to.
		:type: String File Path
		:param cDelimiter: Delimiter for the output file.
		:type: Character If cDlimiter is not specified, the internally stored file delimiter is used.
		"""

		#NOTE(review): when a path is given the opened handle is never explicitly closed
		f = csv.writer(open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile, csv.excel_tab, delimiter=cDelimiter)

		# Get Row metadata id info (IDs for column header, keys that line up with the ids)
		lsRowMetadataIDs, lsRowMetadataIDKeys = self.rwmtRowMetadata.funcMakeIDs() if self.rwmtRowMetadata else [[],[]]

		#Write Ids
		f.writerows([[self.funcGetIDMetadataName()]+lsRowMetadataIDs+list(self.funcGetSampleNames())])

		#Write column metadata
		#Exclude the sample ID metadata; the last metadata name is unioned back in below
		lsKeys = list(set(self._dictTableMetadata.keys())-set([self.funcGetIDMetadataName(),self.funcGetLastMetadataName()]))
		lMetadataIterations = list(set(lsKeys+[self.funcGetLastMetadataName()] ))

		#Metadata rows are padded with the empty-data placeholder under the row-metadata ID columns
		f.writerows([[sMetaKey]+([ConstantsBreadCrumbs.c_strEmptyDataMetadata]*len(lsRowMetadataIDs))+self.funcGetMetadata(sMetaKey) for sMetaKey in lMetadataIterations if sMetaKey != self.funcGetIDMetadataName() and not sMetaKey is None])

		#Write abundance
		#NOTE(review): lsOutput is never used below
		lsOutput = list()
		curAbundance = self._npaFeatureAbundance.tolist()

		for curAbundanceRow in curAbundance:
			# Make feature metadata, padding with NA as needed
			lsMetadata = []
			for sMetadataId in lsRowMetadataIDKeys:
				lsMetadata = lsMetadata + self.rwmtRowMetadata.funGetFeatureMetadata( curAbundanceRow[0], sMetadataId )
				#NOTE(review): the padding below compares the expected width of this
				#metadata id against the RUNNING length of lsMetadata across all ids —
				#confirm this is the intended padding behavior
				lsMetadata = lsMetadata + ( [ ConstantsBreadCrumbs.c_strEmptyDataMetadata ] *
					( self.rwmtRowMetadata.dictMetadataIDs.get( sMetadataId, 0 ) - len( lsMetadata ) ) )
			f.writerows([[curAbundanceRow[0]]+lsMetadata+[str(curAbundanceElement) for curAbundanceElement in curAbundanceRow[1:]]])
		return
1911
	def _funcWriteBiomFile(self, xOutputFile):
		"""
		Write an abundance table object as a Biom file.

		:param xOutputFile: File stream or File path to write the file to.
		:type: String File Path
		"""

		#**************************
		# Get Sample Names        *
		#**************************
		lSampNames = list(self.funcGetSampleNames())

		#**************************
		# Metadata Names          *
		#**************************

		#Pivot the column metadata: build one dict per sample mapping
		#metadata name -> value (Python 2 iteritems)
		dictMetadataCopy = self.funcGetMetadataCopy()
		lMetaData = list()
		iKeysCounter = 0
		for lMetadataCopyEntry in dictMetadataCopy.iteritems():
			iKeysCounter +=1
			sMetadataName = lMetadataCopyEntry[0]
			lMetadataEntries = lMetadataCopyEntry[1]
			iMetadataEntryCounter = -1
			for sMetadataEntry in lMetadataEntries:
				iMetadataEntryCounter+=1
				dictMetadataNames = dict()
				dictMetadataNames[sMetadataName ] = sMetadataEntry
				if iKeysCounter == 1:
					#First metadata key seen: create the per-sample dict
					lMetaData.append(dictMetadataNames)
				else:
					#Later keys: merge into the existing per-sample dict
					lMetaData[iMetadataEntryCounter][sMetadataName ] = sMetadataEntry


		#**************************
		# Observation Ids         *
		# and row metadata        *
		#**************************
		bTaxonomyInRowsFlag = False
		if self.rwmtRowMetadata.dictRowMetadata is not None:
			bTaxonomyInRowsFlag = True

		lObservationMetadataTable = list()

		lObservationIds = list()
		lFeatureNamesResultArray = self.funcGetFeatureNames()
		for sFeatureName in lFeatureNamesResultArray:
			lObservationIds.append(sFeatureName)

			if self.rwmtRowMetadata and self.rwmtRowMetadata.dictRowMetadata:
				RowMetadataEntry = self.rwmtRowMetadata.dictRowMetadata[sFeatureName][ConstantsBreadCrumbs.c_metadata_lowercase]
				lObservationMetadataTable.append( RowMetadataEntry )

		#**************************
		# Data                    *
		#**************************

		#Strip the leading feature-ID field from each row and cast values to float
		lData = list()
		lAbundanceCopyResultArray = self.funcGetAbundanceCopy()

		for r in lAbundanceCopyResultArray:
			lr = list(r)
			lr.pop(0) #Remove metadata
			lAbundanceValues = list()
			for AbundanceEntry in lr:
				flAbundanceEntry = float(AbundanceEntry)
				lAbundanceValues.append(flAbundanceEntry)
			lData.append(lAbundanceValues)
		#NOTE(review): "array" is presumably numpy's array imported elsewhere in this file — confirm
		arrData = array(lData) #Convert list to array



		#**************************
		# Invoke the              *
		# biom table factory      *
		#**************************
		if bTaxonomyInRowsFlag == False:
			BiomTable = table_factory(arrData,
							lSampNames,
							lObservationIds,
							lMetaData,
							constructor=SparseOTUTable)
		else: #There was metadata in the rows
			BiomTable = table_factory(arrData,
							lSampNames,
							lObservationIds,
							lMetaData,
							lObservationMetadataTable if len(lObservationMetadataTable) > 0 else None,
							constructor=SparseOTUTable)

		#**************************
		# Generate biom Output    *
		#**************************
		f = open( xOutputFile, "w" ) if isinstance(xOutputFile, str) else xOutputFile
		f.write(BiomTable.getBiomFormatJsonString(ConstantsBreadCrumbs.c_biom_file_generated_by))
		f.close()
		return
2009
2010 #Testing Status: 1 Happy path test
2011 @staticmethod
2012 def funcPairTables(strFileOne, strFileTwo, strIdentifier, cDelimiter, strOutFileOne, strOutFileTwo, lsIgnoreValues=None):
2013 """
2014 This method will read in two files and abridge both files (saved as new files)
2015 to just the samples in common between the two files given a common identifier.
2016 ***If the identifier is not unique in each data set, the first sample with the pairing id is taken so make sure the ID is unique.
2017 Expects the files to have the sample delimiters.
2018
2019 :param strFileOne: Path to file one to be paired.
2020 :type: String File path.
2021 :param strFileTwo: Path to file two to be paired.
2022 :type: String File path.
2023 :param strIdentifier: Metadata ID that is used for pairing.
2024 :type: String Metadata ID.
2025 :param cDelimiter: Character delimiter to read the files.
2026 :type: Character Delimiter.
2027 :param strOutFileOne: The output file for the paired version of the first file.
2028 :type: String File path.
2029 :param strOutFileTwo: The output file for the paired version of the second file.
2030 :type: String File path.
2031 :param lsIgnoreValues: These values are ignored even if common IDs between the two files.
2032 :type: List List of strings.
2033 :return Boolean: Indicator of no errors.
2034 False indicates errors.
2035 """
2036
2037 #Validate parameters
2038 if(not ValidateData.funcIsValidFileName(strFileOne)):
2039 sys.stderr.write( "AbundanceTable:checkRawDataFile::Error, file not valid. File:" + strFileOne + "\n" )
2040 return False
2041 #Validate parameters
2042 if(not ValidateData.funcIsValidFileName(strFileTwo)):
2043 sys.stderr.write( "AbundanceTable:checkRawDataFile::Error, file not valid. File:"+ strFileTwo + "\n" )
2044 return False
2045
2046 #Make file one
2047 #Read in file
2048 istm = csv.reader(open(strFileOne,'rU'), csv.excel_tab, delimiter=cDelimiter)
2049 lsContentsOne = [lsRow for lsRow in istm]
2050
2051 #Get the file identifier for file one
2052 fileOneIdentifier = None
2053 for sLine in lsContentsOne:
2054 if sLine[0] == strIdentifier:
2055 fileOneIdentifier = sLine
2056 break
2057
2058 #Make file two
2059 #Read in file
2060 istm = csv.reader(open(strFileTwo,'rU'), csv.excel_tab, delimiter=cDelimiter)
2061 lsContentsTwo = [lsRow for lsRow in istm]
2062
2063 #Get the file identifier for file two
2064 fileTwoIdentifier = None
2065 for sLine in lsContentsTwo:
2066 if sLine[0] == strIdentifier:
2067 fileTwoIdentifier = sLine
2068 break
2069
2070 #Get what is in common between the identifiers
2071 #And find which columns to keep in the tables based on the common elements
2072 setsCommonIdentifiers = set(fileOneIdentifier) & set(fileTwoIdentifier)
2073 if lsIgnoreValues:
2074 setsCommonIdentifiers = setsCommonIdentifiers - set(lsIgnoreValues)
2075
2076 #Get positions of common identifiers in each data set, if the identifier is not unique in a date set just take the first index
2077 lfFileOneIDIndexes = [fileOneIdentifier.index(sCommonID) for sCommonID in setsCommonIdentifiers]
2078 lfFileTwoIDIndexes = [fileTwoIdentifier.index(sCommonID) for sCommonID in setsCommonIdentifiers]
2079
2080 #Convert index list to list of boolean
2081 lfFileOneElements = [iIndex in lfFileOneIDIndexes for iIndex, sIdentifier in enumerate(fileOneIdentifier)]
2082 lfFileTwoElements = [iIndex in lfFileTwoIDIndexes for iIndex, sIdentifier in enumerate(fileTwoIdentifier)]
2083
2084 #Write out file one
2085 ostm = csv.writer(open(strOutFileOne,'w'), csv.excel_tab, delimiter=cDelimiter)
2086 (ostm.writerows([np.compress(lfFileOneElements,sLine) for sLine in lsContentsOne]))
2087
2088 #Write out file two
2089 ostm = csv.writer(open(strOutFileTwo,'w'), csv.excel_tab, delimiter=cDelimiter)
2090 (ostm.writerows([np.compress(lfFileTwoElements,sLine) for sLine in lsContentsTwo]))
2091
2092 return True
2093
2094 #Testing Status: Light happy path testing
2095 @staticmethod
2096 def funcStratifyAbundanceTableByMetadata(strInputFile = None, strDirectory = "", cDelimiter = ConstantsBreadCrumbs.c_cTab, iStratifyByRow = 1, llsGroupings = []):
2097 """
2098 Splits an abundance table into multiple abundance tables stratified by the metadata
2099
2100 :param strInputFile: String file path to read in and stratify.
2101 :type: String File path.
2102 :param strDirectory: Output directory to write stratified files.
2103 :type: String Output directory path.
2104 :param cDelimiter: The delimiter used in the adundance file.
2105 :type: Character Delimiter.
2106 :param iStratifyByRow: The row which contains the metadata to use in stratification.
2107 :type: Integer Positive integer index.
2108 :param llsGroupings: A list of string lists where each string list holds values that are equal and should be grouped together.
2109 So for example, if you wanted to group metadata "1", "2", and "3" seperately but "4" and "5" together you would
2110 Give the following [["4","5"]].
2111 If you know what "1" and "3" also together you would give [["1","3"],["4","5"]]
2112 :type List List of list of strings
2113 :return Boolean: Indicator of NO error.
2114 False indicates an error.
2115 """
2116
2117 #Validate parameters
2118 if(not ValidateData.funcIsValidFileName(strInputFile)):
2119 sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, file not valid. File:" + strInputFile + "\n" )
2120 return False
2121 if(not ValidateData.funcIsValidStringType(cDelimiter)):
2122 sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, Delimiter is not a valid string/char type. Delimiter =" + cDelimiter + "\n" )
2123 return False
2124 if(not ValidateData.funcIsValidPositiveInteger(iStratifyByRow, tempZero = True) and (not ValidateData.funcIsValidString(iStratifyByRow))):
2125 sys.stderr.write( "AbundanceTable:stratifyAbundanceTableByMetadata::Error, Stratify by row is not a positive integer or string keyword. Row =" +
2126 str(iStratifyByRow) + ".\n" )
2127 return False
2128
2129 #Get the base of the file path
2130 #This is dependent on the given output directory and the prefix of the file name of the input file
2131 #If no output file is given then the input file directory is used.
2132 baseFilePath = strDirectory
2133 lsFilePiecesExt = os.path.splitext(strInputFile)
2134 if baseFilePath:
2135 baseFilePath = baseFilePath + os.path.splitext(os.path.split(strInputFile)[1])[0]
2136 else:
2137 baseFilePath = lsFilePiecesExt[0]
2138
2139 #Read in file
2140 istm = csv.reader(open(strInputFile,'rU'), csv.excel_tab, delimiter=cDelimiter)
2141 sFileContents = [lsRow for lsRow in istm]
2142
2143 #Collect metadata
2144 metadataInformation = dict()
2145
2146 #If the tempStratifyRow is by key word than find the index
2147 if ValidateData.funcIsValidString(iStratifyByRow):
2148 for iLineIndex, strLine in enumerate(sFileContents):
2149 if strLine[0].strip("\"") == iStratifyByRow:
2150 iStratifyByRow = iLineIndex
2151 break
2152
2153 #Stratify by metadata row
2154 #Split metadata row into metadata entries
2155 #And put in a dictionary containing {"variable":[1,2,3,4 column index]}
2156 stratifyByRow = sFileContents[iStratifyByRow]
2157 for metaDataIndex in xrange(1,len(stratifyByRow)):
2158 metadata = stratifyByRow[metaDataIndex]
2159 #Put all wierd categories, none, whitespace, blank space metadata cases into one bin
2160 if not metadata or metadata in string.whitespace:
2161 metadata = "Blank"
2162 #Remove any extraneous formatting
2163 metadata = metadata.strip(string.whitespace)
2164 #Store processed metadata with column occurence in dictionary
2165 if(not metadata in metadataInformation):
2166 metadataInformation[metadata] = []
2167 metadataInformation[metadata].append(metaDataIndex)
2168
2169 #For each of the groupings
2170 #Use the first value as the primary value which the rest of the values in the list are placed into
2171 #Go through the dict holding the indices and extend the list for the primary value with the secondary values
2172 #Then set the secondary value list to empty so that it will be ignored.
2173 if llsGroupings:
2174 for lSKeyGroups in llsGroupings:
2175 if len(lSKeyGroups) > 1:
2176 for sGroup in lSKeyGroups[1:]:
2177 if sGroup in metadataInformation:
2178 metadataInformation[lSKeyGroups[0]].extend(metadataInformation[sGroup])
2179 metadataInformation[sGroup] = []
2180
2181 #Stratify data
2182 stratifiedAbundanceTables = dict()
2183 for tableRow in sFileContents:
2184 if(len(tableRow)> 1):
2185 for metadata in metadataInformation:
2186 #[0] includes the taxa line
2187 columns = metadataInformation[metadata]
2188 if columns:
2189 columns = [0] + columns
2190 lineList = list()
2191 for column in columns:
2192 lineList.append(tableRow[column])
2193 stratifiedAbundanceTables.setdefault(metadata,[]).append(lineList)
2194
2195 #Write to file
2196 lsFilesWritten = []
2197 for metadata in stratifiedAbundanceTables:
2198 sOutputFile = baseFilePath+"-by-"+metadata.strip("\"")+lsFilePiecesExt[1]
2199 f = csv.writer(open(sOutputFile,'w'), csv.excel_tab, delimiter = cDelimiter )
2200 f.writerows(stratifiedAbundanceTables[metadata])
2201 lsFilesWritten.append(sOutputFile)
2202
2203 return lsFilesWritten
2204
2205
2206
2207 #*******************************************
2208 #* biom interface functions: *
2209 #* 1. _funcBiomToStructuredArray *
2210 #* 2. _funcDecodeBiomMetadata *
2211 #*******************************************
2212 @staticmethod
2213 def _funcBiomToStructuredArray(xInputFile = None):
2214 """
2215 Reads the biom input file and builds a "BiomCommonArea" that contains:
2216 1.BiomCommonArea['sLastMetadata'] - This is the name of the last Metadata (String)
2217 2.BiomCommonArea['BiomTaxData']- dict() - going to be used as lcontents[0]==TaxData
2218 3.BiomCommonArea['Metadata'] - dict() - going to be used as lcontents[1]==MetaData
2219 4.BiomCommonArea['BiomFileInfo'] - dict() - going to be used as lcontents[2]==FileInfo (id, format:eg. Biological Observation Matrix 0.9.1) etc.
2220 5.BiomCommonArea['column_metadata_id'] - This is a string which is the name of the column id
2221 :param xInputFile: File path of biom file to read.
2222 :type: String File path.
2223 :return: BiomCommonArea (See description above)
2224 :type: dict()
2225 """
2226
2227 #*******************************************
2228 #* Build the metadata *
2229 #*******************************************
2230 try:
2231 BiomTable = parse_biom_table(open(xInputFile,'U') if isinstance(xInputFile, str) else xInputFile) #Import the biom file
2232 except:
2233 print("Failure decoding biom file - please check your input biom file and rerun")
2234 BiomCommonArea = None
2235 return BiomCommonArea
2236
2237 BiomCommonArea = dict()
2238 dBugNames = list() #Bug Names Table
2239 dRowsMetadata = None #Initialize the np.array of the Rows metadata
2240 BiomElements = BiomTable.getBiomFormatObject('')
2241 for BiomKey, BiomValue in BiomElements.iteritems():
2242 #****************************************************
2243 #* Checking the different keys: format, *
2244 #* rows, columns, date, generated_by *
2245 #****************************************************
2246 if (BiomKey == ConstantsBreadCrumbs.c_strFormatKey
2247 or BiomKey == ConstantsBreadCrumbs.c_strFormatUrl
2248 or BiomKey == ConstantsBreadCrumbs.c_MatrixTtype
2249 or BiomKey == ConstantsBreadCrumbs.c_strTypekey
2250 or BiomKey == ConstantsBreadCrumbs.c_strIDKey #Same as below
2251 or BiomKey == ConstantsBreadCrumbs.c_GeneratedBy #<---Need to follow up with Biom as always BiomValue = "" even though in the file has a value
2252 or BiomKey == ConstantsBreadCrumbs.c_strDateKey): #Same as above
2253 BiomCommonArea = AbundanceTable._funcInsertKeyToCommonArea(BiomCommonArea, BiomKey, BiomValue)
2254
2255
2256 if BiomKey == ConstantsBreadCrumbs.c_rows:
2257 iMaxIdLen = 0
2258 for iIndexRowMetaData in range(0, len(BiomValue)):
2259 if ConstantsBreadCrumbs.c_id_lowercase in BiomValue[iIndexRowMetaData]:
2260 sBugName = BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase]
2261 dBugNames.append(sBugName) #Post to the bug table
2262 if len(sBugName) > iMaxIdLen: #We are calculating dynamically the length of the ID
2263 iMaxIdLen = len(sBugName)
2264
2265 if ConstantsBreadCrumbs.c_metadata_lowercase in BiomValue[0] and BiomValue[0][ConstantsBreadCrumbs.c_metadata_lowercase] != None :
2266 dRowsMetadata = AbundanceTable._funcBiomBuildRowMetadata(BiomValue, iMaxIdLen )
2267
2268
2269 if BiomKey == ConstantsBreadCrumbs.c_columns:
2270 BiomCommonArea = AbundanceTable._funcDecodeBiomMetadata(BiomCommonArea, BiomValue, iMaxIdLen) #Call the subroutine to Build the metadata
2271
2272
2273 #*******************************************
2274 #* Build the TaxData *
2275 #*******************************************
2276
2277 BiomTaxDataWork = list() #Initlialize TaxData
2278 BiomObservations = BiomTable.iterObservations(conv_to_np=True) #Invoke biom method to fetch data from the biom file
2279 for BiomObservationData in BiomObservations:
2280 sBugName = str( BiomObservationData[1])
2281 BiomTaxDataEntry = list()
2282 BiomTaxDataEntry.append(sBugName)
2283 BiomObservationsValues = BiomObservationData[0]
2284 for BiomDataValue in BiomObservationsValues:
2285 BiomTaxDataEntry.append(BiomDataValue)
2286 BiomTaxDataWork.append(tuple(BiomTaxDataEntry))
2287
2288 BiomCommonArea[ConstantsBreadCrumbs.c_BiomTaxData] = np.array(BiomTaxDataWork,dtype=np.dtype(BiomCommonArea[ConstantsBreadCrumbs.c_Dtype]))
2289 BiomCommonArea[ConstantsBreadCrumbs.c_dRowsMetadata] = RowMetadata(dRowsMetadata)
2290 del(BiomCommonArea[ConstantsBreadCrumbs.c_Dtype]) #Not needed anymore
2291
2292 return BiomCommonArea
2293
2294
	@staticmethod
	def _funcDecodeBiomMetadata(BiomCommonArea, BiomValue = None, iMaxIdLen=0 ):
		"""
		Decode the BIOM "columns" metadata and populate the common area with:
		1. BiomCommonArea['Metadata']   - dict of {metadata name: list of per-sample values}
		2. BiomCommonArea['Dtype']      - numpy structured-dtype spec for the abundance table
		3. BiomCommonArea['sLastMetadata'] - name of the last metadata entry seen
		4. BiomCommonArea['column_metadata_id'] - the name of the column id
		These elements are formatted and passed down the line to build the AbundanceTable.

		:param BiomCommonArea: Accumulator dict built by the caller; mutated in place and returned.
		:type: dict()
		:param BiomValue: The "columns" entries from the biom file (list of per-sample dicts,
			each with an "id" entry and a "metadata" entry that is a dict or None).
		:type: dict()
		:param iMaxIdLen: The maximum length of a row ID (used to size the ID field of the dtype).
		:type: Integer
		:return: BiomCommonArea
		:type: dict()
		"""

		BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] = None	#Initialize the LastMetadata element
		BiomCommonArea['dRowsMetadata'] = None #Initialize for cases that there is no metadata in the rows

		strLastMetadata = None   # NOTE(review): assigned here but never read; kept for byte-compatibility
		strIDMetadata = None     # Becomes the column-id name once any id/metadata key is seen

		lenBiomValue = len(BiomValue)
		BiomMetadata = dict()    # {metadata name: list (one slot per sample, None-padded)}
		for cntMetadata in range(0, lenBiomValue):
			BiomMetadataEntry = BiomValue[cntMetadata]

			for key, value in BiomMetadataEntry.iteritems():	#Loop on the entries of one sample
				if key == ConstantsBreadCrumbs.c_id_lowercase:	#If id - record the sample id
					strIDMetadata = ConstantsBreadCrumbs.c_ID
					if ConstantsBreadCrumbs.c_ID  not in BiomMetadata:	#First id seen - pre-fill one slot per sample
						BiomMetadata[ConstantsBreadCrumbs.c_ID] = list()
						for indx in range(0, lenBiomValue):
							BiomMetadata[ConstantsBreadCrumbs.c_ID].append(None)
					# id values are unicode in a JSON BIOM file; force them to ascii bytes
					BiomMetadata[ConstantsBreadCrumbs.c_ID][cntMetadata] = value.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore)

				if  key == ConstantsBreadCrumbs.c_metadata_lowercase:	#If key = metadata
					if  not value is None:	#And value is not empty
						MetadataDict = value	#The per-sample metadata dict
						for MDkey, MDvalue in MetadataDict.iteritems():
							# Normalize unicode keys/values to ascii; leave other types untouched
							if type(MDkey) == unicode :
								MDkeyAscii = MDkey.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore)
							else:
								MDkeyAscii = MDkey
							if type(MDvalue) == unicode:
								MDvalueAscii = MDvalue.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore)
							else:
								MDvalueAscii = MDvalue

							if  len(MDkeyAscii) > 0:	#Skip empty metadata names
								if not strIDMetadata:	#First metadata name seen doubles as the column id name
									strIDMetadata = MDkeyAscii
								# Overwritten every iteration, so after the loops this holds the last metadata name
								BiomCommonArea[ConstantsBreadCrumbs.c_sLastMetadata] = MDkeyAscii
								if MDkeyAscii  not in BiomMetadata:	#New metadata name - pre-fill one slot per sample
									BiomMetadata[MDkeyAscii] = list()
									for indx in range(0, lenBiomValue):
										BiomMetadata[MDkeyAscii].append(None)
								BiomMetadata[MDkeyAscii][cntMetadata] = MDvalueAscii


		BiomCommonArea[ConstantsBreadCrumbs.c_Metadata] = BiomMetadata
		BiomCommonArea[ConstantsBreadCrumbs.c_MetadataID] = strIDMetadata

		#**********************************************
		#*    Build dtype                             *
		#**********************************************
		# One (name, format) pair per column of the eventual numpy structured array:
		# the feature-ID column first, then one float column per sample id.

		BiomDtype = list()
		iMaxIdLen+=10 #Pad the ID width by 10 to leave slack beyond the longest observed ID
		BiomDtypeEntry = list()
		FirstValue = ConstantsBreadCrumbs.c_ID
		SecondValue = "a" + str(iMaxIdLen)	# numpy fixed-width bytes format, e.g. "a25"
		BiomDtypeEntry.append(FirstValue)
		BiomDtypeEntry.append(SecondValue)
		BiomDtype.append(tuple(BiomDtypeEntry))

		# NOTE(review): raises KeyError if the columns carried no "id" entries - confirm upstream guarantees ids
		for a in BiomMetadata[ConstantsBreadCrumbs.c_ID]:
			BiomDtypeEntry = list()
			FirstValue =  a.encode(ConstantsBreadCrumbs.c_ascii,ConstantsBreadCrumbs.c_ignore)
			SecondValue = ConstantsBreadCrumbs.c_f4 	# presumably "f4" (float32) - abundance values
			BiomDtypeEntry.append(FirstValue)
			BiomDtypeEntry.append(SecondValue)
			BiomDtype.append(tuple(BiomDtypeEntry))

		BiomCommonArea[ConstantsBreadCrumbs.c_Dtype] = BiomDtype
		return BiomCommonArea
2382
2383 @staticmethod
2384 def _funcBiomBuildRowMetadata( BiomValue, iMaxIdLen ):
2385 """
2386 Builds the row metadata from a BIOM value
2387
2388 :param BiomValue: BIOM Value from the BIOM JSON parsing
2389 :type: Complex dict of string pairs and dicts
2390 :param iMaxIdLen: Maximum length of all the IDs
2391 :type: int
2392 :return: dictRowsMetadata - np Array containing the rows metadata
2393 :type: {string feature id: {'metadata': {'taxonomy': [list of metadata values]}}}
2394 """
2395 # Build the input dict for RowMetadata from a dict of dicts from a BIOM file
2396 dictRowsMetadata = dict()
2397 for iIndexRowMetaData in range(0, len(BiomValue)):
2398 dictRowsMetadata[str(BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase])] = dict()
2399 RowMetadataEntryFromTable = BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_metadata_lowercase]
2400 dMetadataTempDict = dict()
2401 for key, value in RowMetadataEntryFromTable.iteritems():
2402 dMetadataTempDict[key] = value
2403 dictRowsMetadata[str(BiomValue[iIndexRowMetaData][ConstantsBreadCrumbs.c_id_lowercase])][ConstantsBreadCrumbs.c_metadata_lowercase] = dMetadataTempDict
2404 return dictRowsMetadata
2405
2406 @staticmethod
2407 def _funcInsertKeyToCommonArea(BiomCommonArea, BiomKey, BiomValue):
2408 """
2409 Inserts the keys into the BiomCommonArea["BiomFileInfo"]
2410 :param BiomCommonArea - The common area that has been built before
2411 :type: dict()
2412 :param BiomKey - The current key (eg. format, date, generated by)
2413 :type: str
2414 :param BiomValue - The current value of the key (eg. for format: "Biological Observation Matrix 0.9.1")
2415 :type: str
2416 :return: BiomCommonArea - The updated common area
2417 :type: dict()
2418 """
2419
2420 if ConstantsBreadCrumbs.c_BiomFileInfo not in BiomCommonArea:
2421 BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo] = dict()
2422
2423 strInsertKey = BiomKey #Set Default - But it is now always the same... (eg. URL is not: format_url -->url and others)
2424 PostBiomValue = BiomValue #The default value to be posted
2425 if BiomKey == ConstantsBreadCrumbs.c_strFormatUrl:
2426 strInsertKey = ConstantsBreadCrumbs.c_strURLKey
2427
2428 if BiomKey == ConstantsBreadCrumbs.c_MatrixTtype:
2429 strInsertKey = ConstantsBreadCrumbs.c_strSparsityKey
2430
2431 if BiomKey == ConstantsBreadCrumbs.c_GeneratedBy:
2432 PostBiomValue = None
2433
2434 if BiomKey == ConstantsBreadCrumbs.c_strDateKey:
2435 PostBiomValue = None
2436
2437 BiomCommonArea[ConstantsBreadCrumbs.c_BiomFileInfo][strInsertKey] = PostBiomValue
2438 return BiomCommonArea
2439