0
+ − 1 #!/usr/bin/env python
+ − 2 """
+ − 3 Author: Timothy Tickle
+ − 4 Description: Performs common manipulations on tables
+ − 5 """
+ − 6
+ − 7 __author__ = "Timothy Tickle"
+ − 8 __copyright__ = "Copyright 2012"
+ − 9 __credits__ = ["Timothy Tickle"]
+ − 10 __license__ = ""
+ − 11 __version__ = ""
+ − 12 __maintainer__ = "Timothy Tickle"
+ − 13 __email__ = "ttickle@sph.harvard.edu"
+ − 14 __status__ = "Development"
+ − 15
+ − 16 import argparse
+ − 17 import csv
+ − 18 import sys
+ − 19 import re
+ − 20 import os
+ − 21 import numpy as np
+ − 22 from src.AbundanceTable import AbundanceTable
+ − 23 #from src.PCA import PCA
+ − 24 from src.ValidateData import ValidateData
+ − 25
+ − 26 #Set up arguments reader
+ − 27 argp = argparse.ArgumentParser( prog = "scriptManipulateTable.py",
+ − 28 description = """Performs common manipulations on tables.\nExample: python scriptManipulateTable.py -i TID -l STSite Test.pcl""" )
+ − 29
+ − 30 #Arguments
+ − 31 #Describe table
+ − 32 argp.add_argument("-i","--id", dest="sIDName", default="ID", help="Abundance Table ID")
+ − 33 argp.add_argument("-l","--meta", dest="sLastMetadataName", help="Last metadata name")
+ − 34 argp.add_argument("-d","--fileDelim", dest= "cFileDelimiter", action= "store", default="\t", help="File delimiter, default tab")
+ − 35 argp.add_argument("-f","--featureDelim", dest= "cFeatureDelimiter", action= "store", default="|", help="Feature (eg. bug or function) delimiter, default '|'")
+ − 36
+ − 37 #Checked x 2
+ − 38 argp.add_argument("-n","--doNorm", dest="fNormalize", action="store_true", default=False, help="Flag to turn on normalization")
+ − 39 argp.add_argument("-s","--doSum", dest="fSum", action="store_true", default=False, help="Flag to turn on summation")
+ − 40
+ − 41 #Unsupervised filtering
+ − 42 argp.add_argument("-A","--doFilterAbundance", dest="strFilterAbundance", action="store", default=None, help="Turns on filtering by abundance (remove features that do not have a minimum abundance in a minimum number of samples); Should be a real number and an integer in the form 'minAbundance,minSamples', (should be performed on a normalized file).")
+ − 43 argp.add_argument("-P","--doFilterPercentile", dest="strFilterPercentile", action="store", default=None, help="Turns on filtering by percentile Should be two numbers between 0 and 1 in the form 'percentile,percentage'. (should be performed on a normalized file).")
+ − 44 argp.add_argument("-O","--doFilterOccurrence", dest="strFilterOccurence", action="store", default=None, help="Turns on filtering by occurrence. Should be two integers in the form 'minSequence,minSample' (should NOT be performed on a normalized file).")
+ − 45 #argp.add_argument("-D","--doFilterDeviation", dest="dCuttOff", action="store", type=float, default=None, help="Flag to turn on filtering by standard deviation (should NOT be performed on a normalized file).")
+ − 46
+ − 47 #Change bug membership
+ − 48 argp.add_argument("-t","--makeTerminal", dest="fMakeTerminal", action="store_true", default=False, help="Works reduces the file to teminal features in the original file.")
+ − 49 argp.add_argument("-u","--reduceOTUs", dest="fRemoveOTUs", action="store_true", default=False, help="Remove otu entries from file.")
+ − 50 argp.add_argument("-c","--reduceToClade", dest="iClade", action="store", type=int, default=None, help="Specify a level of clade to reduce to [].")
+ − 51 argp.add_argument("-b","--reduceToFeatures", dest="strFeatures", action="store", default=None, help="Reduce measurements to certain features (bugs or functions). This can be a comma delimited string (of atleast 2 bugs) or a file.")
+ − 52
+ − 53 #Manipulate based on metadata
+ − 54 #Checked
+ − 55 argp.add_argument("-y","--stratifyBy", dest="strStratifyBy", action="store", default=None, help="Metadata to stratify tables by.")
+ − 56 argp.add_argument("-r","--removeMetadata", dest="strRemoveMetadata", action="store", default=None, help="Remove samples of this metadata and value (format comma delimited string with metadata id first and the values to remove after 'id,lvalue1,value2').")
+ − 57
+ − 58 #Manipulate lineage
+ − 59 #Checked
+ − 60 argp.add_argument("-x","--doPrefixClades", dest="fPrefixClades", action="store_true", default=False, help="Flag to turn on adding prefixes to clades to better identify them, for example s__ will be placed infront of each species.")
+ − 61
+ − 62 #Combine tables
+ − 63 #argp.add_argument("-m","--combineIntersect", dest="fCombineIntersect", action="store_true", default=False, help="Combine two tables including only common features/metadata (intersection).")
+ − 64 #argp.add_argument("-e","--combineUnion", dest="fCombineUnion", action="store_true", default=False, help="Combine two tables (union).")
+ − 65
+ − 66 #Dimensionality Reduction
+ − 67 #argp.add_argument("-p","--doPCA", dest="fDoPCA",action="store_true", default=False, help="Flag to turn on adding metabugs and metametadata by performing PCA on each of bug relative abundance and continuous metadata and add the resulting components")
+ − 68
+ − 69 #Checked
+ − 70 argp.add_argument("-o","--output", dest="strOutFile", action="store", default=None, help="Indicate output pcl file.")
+ − 71 argp.add_argument("strFileAbund", help ="Input data file")
+ − 72
+ − 73 args = argp.parse_args( )
+ − 74
+ − 75 # Creat output file if needed.
+ − 76 if not args.strOutFile:
+ − 77 args.strOutFile = os.path.splitext(args.strFileAbund)[0]+"-mod.pcl"
+ − 78 lsPieces = os.path.splitext(args.strOutFile)
+ − 79
+ − 80 #List of abundance tables
+ − 81 lsTables = []
+ − 82
+ − 83 #Read in abundance table
+ − 84 abndTable = AbundanceTable.funcMakeFromFile(xInputFile=args.strFileAbund,
+ − 85 cDelimiter = args.cFileDelimiter,
+ − 86 sMetadataID = args.sIDName,
+ − 87 sLastMetadata = args.sLastMetadataName,
+ − 88 lOccurenceFilter = None,
+ − 89 cFeatureNameDelimiter=args.cFeatureDelimiter,
+ − 90 xOutputFile = args.strOutFile)
+ − 91
+ − 92 #TODO Check filtering, can not have some filtering together
+ − 93
+ − 94 # Make feature list
+ − 95 lsFeatures = []
+ − 96 if args.strFeatures:
+ − 97 print "Get features not completed"
+ − 98 # if "," in args.strFeatures:
+ − 99 # lsFeatures = args.strFeatures.split(",")
+ − 100 # print "ManipulateTable::Reading in feature list "+str(len(lsFeatures))+"."
+ − 101 # else:
+ − 102 # csvr = csv.reader(open(args.strFeatures, "rU"))
+ − 103 # print "ManipulateTable::Reading in feature file "+args.strFeatures+"."
+ − 104 # for lsLine in csvr:
+ − 105 # lsFeatures.extend(lsLine)
+ − 106
+ − 107 lsTables.append(abndTable)
+ − 108
+ − 109 # Do summing
+ − 110 #Sum if need
+ − 111 if args.fSum:
+ − 112 for abndTable in lsTables:
+ − 113 print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before summing."
+ − 114 fResult = abndTable.funcSumClades()
+ − 115 if fResult:
+ − 116 print "ManipulateTable::"+abndTable.funcGetName()+" was summed."
+ − 117 print "ManipulateTable::"+abndTable.funcGetName()+" has "+str(len(abndTable.funcGetFeatureNames()))+" features after summing."
+ − 118 else:
+ − 119 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT summed."
+ − 120
+ − 121 # Filter on counts
+ − 122 if args.strFilterOccurence:
+ − 123 iMinimumSequence,iMinimumSample = args.strFilterOccurence.split(",")
+ − 124 for abndTable in lsTables:
+ − 125 if abndTable.funcIsNormalized():
+ − 126 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" is normalized and can not be filtered by occurence. This filter needs counts."
+ − 127 else:
+ − 128 fResult = abndTable.funcFilterAbundanceBySequenceOccurence(iMinSequence = int(iMinimumSequence), iMinSamples = int(iMinimumSample))
+ − 129 if fResult:
+ − 130 print "ManipulateTable::"+abndTable.funcGetName()+" was filtered by occurence and now has "+str(len(abndTable.funcGetFeatureNames()))+" features."
+ − 131 else:
+ − 132 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT filtered by occurence."
+ − 133
+ − 134 # Change bug membership
+ − 135 if args.fMakeTerminal:
+ − 136 lsTerminalTables = []
+ − 137 for abndTable in lsTables:
+ − 138 print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before making terminal."
+ − 139 abndTable = abndTable.funcGetFeatureAbundanceTable(abndTable.funcGetTerminalNodes())
+ − 140 if abndTable:
+ − 141 print "ManipulateTable::"+abndTable.funcGetName()+" has "+str(len(abndTable.funcGetFeatureNames()))+" terminal features."
+ − 142 lsTerminalTables.append(abndTable)
+ − 143 else:
+ − 144 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was not made terminal."
+ − 145 lsTables = lsTerminalTables
+ − 146
+ − 147 if args.fRemoveOTUs:
+ − 148 lsNotOTUs = []
+ − 149 for abndTable in lsTables:
+ − 150 print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before removing OTUs."
+ − 151 abndTable = abndTable.funcGetWithoutOTUs()
+ − 152 if abndTable:
+ − 153 print "ManipulateTable::"+abndTable.funcGetName()+" had OTUs removed and now has "+str(len(abndTable.funcGetFeatureNames()))+" features."
+ − 154 lsNotOTUs.append(abndTable)
+ − 155 else:
+ − 156 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" OTUs were not removed."
+ − 157 lsTables = lsNotOTUs
+ − 158
+ − 159 if args.iClade:
+ − 160 for abndTable in lsTables:
+ − 161 fResult = abndTable.funcReduceFeaturesToCladeLevel(args.iClade)
+ − 162 if fResult:
+ − 163 print "ManipulateTable::"+abndTable.funcGetName()+" was reduced to clade level "+str(args.iClade)+"."
+ − 164 else:
+ − 165 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT reduced in clade levels."
+ − 166
+ − 167 if args.strFeatures:
+ − 168 for abndTable in lsTables:
+ − 169 fResult = abndTable.funcGetFeatureAbundanceTable(lsFeatures)
+ − 170 if fResult:
+ − 171 print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced to given features and now has "+str(len(abndTable.funcGetFeatureNames()))+" features."
+ − 172 else:
+ − 173 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced to the given list."
+ − 174
+ − 175 if args.strRemoveMetadata:
+ − 176 lsMetadata = args.strRemoveMetadata.split(",")
+ − 177 for abndTable in lsTables:
+ − 178 fResult = abndTable.funcRemoveSamplesByMetadata(sMetadata=lsMetadata[0], lValuesToRemove=lsMetadata[1:])
+ − 179 if fResult:
+ − 180 print "ManipulateTable::"+abndTable.funcGetName()+" has had samples removed and now has "+str(len(abndTable.funcGetSampleNames()))+" samples."
+ − 181 else:
+ − 182 print "ManipulateTable::ERROR. Could not remove samples from "+abndTable.funcGetName()+"."
+ − 183
+ − 184 # Normalize if needed
+ − 185 if args.fNormalize:
+ − 186 for abndTable in lsTables:
+ − 187 fResult = abndTable.funcNormalize()
+ − 188 if fResult:
+ − 189 print "ManipulateTable::"+abndTable.funcGetName()+" was normalized."
+ − 190 else:
+ − 191 print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized."
+ − 192
+ − 193 # Filter on percentile
+ − 194 if args.strFilterPercentile:
+ − 195 dPercentile,dPercentage = args.strFilterPercentile.split(",")
+ − 196 for abndTable in lsTables:
+ − 197 if abndTable.funcIsNormalized():
+ − 198 fResult = abndTable.funcFilterAbundanceByPercentile(dPercentileCutOff = float(dPercentile), dPercentageAbovePercentile = float(dPercentage))
+ − 199 if fResult:
+ − 200 print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by percentile and now has "+str(len(abndTable.funcGetFeatureNames()))+" features."
+ − 201 else:
+ − 202 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by percentile."
+ − 203 else:
+ − 204 print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized and so the percentile filter is invalid, please indicate to normalize the table."
+ − 205
+ − 206 # Filter on abundance (should go after filter on percentile because the filter on percentile
+ − 207 # needs the full distribution of features in a sample
+ − 208 if args.strFilterAbundance:
+ − 209 dAbundance,iMinSamples = args.strFilterAbundance.split(",")
+ − 210 dAbundance = float(dAbundance)
+ − 211 iMinSamples = int(iMinSamples)
+ − 212 for abndTable in lsTables:
+ − 213 if abndTable.funcIsNormalized():
+ − 214 fResult = abndTable.funcFilterAbundanceByMinValue(dMinAbundance=dAbundance,iMinSamples=iMinSamples)
+ − 215 if fResult:
+ − 216 print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by minimum relative abundance value and now has "+str(len(abndTable.funcGetFeatureNames()))+" features."
+ − 217 else:
+ − 218 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by percentile."
+ − 219 else:
+ − 220 print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized and so the abundance filter is invalid, please indicate to normalize the table."
+ − 221
+ − 222 #if args.dCuttOff:
+ − 223 # print "Standard deviation filtering not completed"
+ − 224 # for abndTable in lsTables:
+ − 225 # abndTable.funcFilterFeatureBySD(dMinSDCuttOff=args.dCuttOff)
+ − 226 # if fResult:
+ − 227 # print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by standard deviation and now has "+str(len(abndTable.funcGetFeatureNames()))+" features."
+ − 228 # else:
+ − 229 # print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by standard devation."
+ − 230
+ − 231 # Need to normalize again after abundance data filtering given removing features breaks the normalization
+ − 232 # This happends twice because normalization is required to make the abundance data to filter on ;-)
+ − 233 # Normalize if needed
+ − 234 if args.fNormalize:
+ − 235 for abndTable in lsTables:
+ − 236 fResult = abndTable.funcNormalize()
+ − 237 if fResult:
+ − 238 print "ManipulateTable::"+abndTable.funcGetName()+" was normalized after filtering on abundance data."
+ − 239
+ − 240 #Manipulate lineage
+ − 241 if args.fPrefixClades:
+ − 242 for abndTable in lsTables:
+ − 243 fResult = abndTable.funcAddCladePrefixToFeatures()
+ − 244 if fResult:
+ − 245 print "ManipulateTable::Clade Prefix was added to "+abndTable.funcGetName()
+ − 246 else:
+ − 247 print "ManipulateTable::ERROR. Clade Prefix was NOT added to "+abndTable.funcGetName()
+ − 248
+ − 249 # Under development
+ − 250 # Reduce dimensionality
+ − 251 #if args.fDoPCA:
+ − 252 # pcaCur = PCA()
+ − 253 # for abndTable in lsTables:
+ − 254 #
+ − 255 # # Add data features
+ − 256 # # Make data components and add to abundance table
+ − 257 # pcaCur.loadData(abndTable,True)
+ − 258 # pcaCur.run(fASTransform=True)
+ − 259 # ldVariance = pcaCur.getVariance()
+ − 260 # lldComponents = pcaCur.getComponents()
+ − 261 # # Make Names
+ − 262 # lsNamesData = ["Data_PC"+str((tpleVariance[0]+1))+"_"+re.sub("[\.|-]","_",str(tpleVariance[1])) for tpleVariance in enumerate(ldVariance)]
+ − 263 # abndTable.funcAddDataFeature(lsNamesData,lldComponents)
+ − 264 #
+ − 265 # # Add metadata features
+ − 266 # # Convert metadata to an input for PCA
+ − 267 # pcaCur.loadData(pcaCur.convertMetadataForPCA(abndTable),False)
+ − 268 # fSuccessful = pcaCur.run(fASTransform=False)
+ − 269 # if(fSuccessful):
+ − 270 # ldVariance = pcaCur.getVariance()
+ − 271 # lldComponents = pcaCur.getComponents()
+ − 272 # # Make Names
+ − 273 # lsNamesMetadata = ["Metadata_PC"+str((tpleVariance[0]+1))+"_"+re.sub("[\.|-]","_",str(tpleVariance[1])) for tpleVariance in enumerate(ldVariance)]
+ − 274 # # Make metadata components and add to abundance
+ − 275 # llsMetadata = [list(npdRow) for npdRow in lldComponents]
+ − 276 # abndTable.funcAddMetadataFeature(lsNamesMetadata, llsMetadata)
+ − 277 # else:
+ − 278 # print "ManipulateTable::No metadata to PCA, no PCA components added to file based on metadata"
+ − 279
+ − 280 #Manipulate based on metadata
+ − 281 if args.strStratifyBy:
+ − 282 labndStratifiedTables = []
+ − 283 for abndTable in lsTables:
+ − 284 labndResult = abndTable.funcStratifyByMetadata(strMetadata=args.strStratifyBy)
+ − 285 print "ManipulateTable::"+abndTable.funcGetName()+" was stratified by "+args.strStratifyBy+" in to "+str(len(labndResult))+" tables."
+ − 286 labndStratifiedTables.extend(labndResult)
+ − 287 lsTables = labndStratifiedTables
+ − 288
+ − 289 if len(lsTables) == 1:
+ − 290 lsTables[0].funcWriteToFile(args.strOutFile)
+ − 291 else:
+ − 292 iIndex = 1
+ − 293 for abndManTable in lsTables:
+ − 294 abndManTable.funcWriteToFile(lsPieces[0]+str(iIndex)+lsPieces[1])
+ − 295 iIndex = iIndex + 1