Mercurial > repos > sagun98 > micropita_v2
comparison src/breadcrumbs/scripts/scriptManipulateTable.py @ 0:0de566f21448 draft default tip
v2
| author | sagun98 |
|---|---|
| date | Thu, 03 Jun 2021 18:13:32 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0de566f21448 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Author: Timothy Tickle | |
| 4 Description: Performs common manipulations on tables | |
| 5 """ | |
| 6 | |
| 7 __author__ = "Timothy Tickle" | |
| 8 __copyright__ = "Copyright 2012" | |
| 9 __credits__ = ["Timothy Tickle"] | |
| 10 __license__ = "" | |
| 11 __version__ = "" | |
| 12 __maintainer__ = "Timothy Tickle" | |
| 13 __email__ = "ttickle@sph.harvard.edu" | |
| 14 __status__ = "Development" | |
| 15 | |
| 16 import argparse | |
| 17 import csv | |
| 18 import sys | |
| 19 import re | |
| 20 import os | |
| 21 import numpy as np | |
| 22 from src.AbundanceTable import AbundanceTable | |
| 23 #from src.PCA import PCA | |
| 24 from src.ValidateData import ValidateData | |
| 25 | |
| 26 #Set up arguments reader | |
| 27 argp = argparse.ArgumentParser( prog = "scriptManipulateTable.py", | |
| 28 description = """Performs common manipulations on tables.\nExample: python scriptManipulateTable.py -i TID -l STSite Test.pcl""" ) | |
| 29 | |
| 30 #Arguments | |
| 31 #Describe table | |
| 32 argp.add_argument("-i","--id", dest="sIDName", default="ID", help="Abundance Table ID") | |
| 33 argp.add_argument("-l","--meta", dest="sLastMetadataName", help="Last metadata name") | |
| 34 argp.add_argument("-d","--fileDelim", dest= "cFileDelimiter", action= "store", default="\t", help="File delimiter, default tab") | |
| 35 argp.add_argument("-f","--featureDelim", dest= "cFeatureDelimiter", action= "store", default="|", help="Feature (eg. bug or function) delimiter, default '|'") | |
| 36 | |
| 37 #Checked x 2 | |
| 38 argp.add_argument("-n","--doNorm", dest="fNormalize", action="store_true", default=False, help="Flag to turn on normalization") | |
| 39 argp.add_argument("-s","--doSum", dest="fSum", action="store_true", default=False, help="Flag to turn on summation") | |
| 40 | |
| 41 #Unsupervised filtering | |
| 42 argp.add_argument("-A","--doFilterAbundance", dest="strFilterAbundance", action="store", default=None, help="Turns on filtering by abundance (remove features that do not have a minimum abundance in a minimum number of samples); Should be a real number and an integer in the form 'minAbundance,minSamples', (should be performed on a normalized file).") | |
| 43 argp.add_argument("-P","--doFilterPercentile", dest="strFilterPercentile", action="store", default=None, help="Turns on filtering by percentile Should be two numbers between 0 and 1 in the form 'percentile,percentage'. (should be performed on a normalized file).") | |
| 44 argp.add_argument("-O","--doFilterOccurrence", dest="strFilterOccurence", action="store", default=None, help="Turns on filtering by occurrence. Should be two integers in the form 'minSequence,minSample' (should NOT be performed on a normalized file).") | |
| 45 #argp.add_argument("-D","--doFilterDeviation", dest="dCuttOff", action="store", type=float, default=None, help="Flag to turn on filtering by standard deviation (should NOT be performed on a normalized file).") | |
| 46 | |
| 47 #Change bug membership | |
| 48 argp.add_argument("-t","--makeTerminal", dest="fMakeTerminal", action="store_true", default=False, help="Works reduces the file to teminal features in the original file.") | |
| 49 argp.add_argument("-u","--reduceOTUs", dest="fRemoveOTUs", action="store_true", default=False, help="Remove otu entries from file.") | |
| 50 argp.add_argument("-c","--reduceToClade", dest="iClade", action="store", type=int, default=None, help="Specify a level of clade to reduce to [].") | |
| 51 argp.add_argument("-b","--reduceToFeatures", dest="strFeatures", action="store", default=None, help="Reduce measurements to certain features (bugs or functions). This can be a comma delimited string (of atleast 2 bugs) or a file.") | |
| 52 | |
| 53 #Manipulate based on metadata | |
| 54 #Checked | |
| 55 argp.add_argument("-y","--stratifyBy", dest="strStratifyBy", action="store", default=None, help="Metadata to stratify tables by.") | |
| 56 argp.add_argument("-r","--removeMetadata", dest="strRemoveMetadata", action="store", default=None, help="Remove samples of this metadata and value (format comma delimited string with metadata id first and the values to remove after 'id,lvalue1,value2').") | |
| 57 | |
| 58 #Manipulate lineage | |
| 59 #Checked | |
| 60 argp.add_argument("-x","--doPrefixClades", dest="fPrefixClades", action="store_true", default=False, help="Flag to turn on adding prefixes to clades to better identify them, for example s__ will be placed infront of each species.") | |
| 61 | |
| 62 #Combine tables | |
| 63 #argp.add_argument("-m","--combineIntersect", dest="fCombineIntersect", action="store_true", default=False, help="Combine two tables including only common features/metadata (intersection).") | |
| 64 #argp.add_argument("-e","--combineUnion", dest="fCombineUnion", action="store_true", default=False, help="Combine two tables (union).") | |
| 65 | |
| 66 #Dimensionality Reduction | |
| 67 #argp.add_argument("-p","--doPCA", dest="fDoPCA",action="store_true", default=False, help="Flag to turn on adding metabugs and metametadata by performing PCA on each of bug relative abundance and continuous metadata and add the resulting components") | |
| 68 | |
| 69 #Checked | |
| 70 argp.add_argument("-o","--output", dest="strOutFile", action="store", default=None, help="Indicate output pcl file.") | |
| 71 argp.add_argument("strFileAbund", help ="Input data file") | |
| 72 | |
| 73 args = argp.parse_args( ) | |
| 74 | |
| 75 # Creat output file if needed. | |
| 76 if not args.strOutFile: | |
| 77 args.strOutFile = os.path.splitext(args.strFileAbund)[0]+"-mod.pcl" | |
| 78 lsPieces = os.path.splitext(args.strOutFile) | |
| 79 | |
| 80 #List of abundance tables | |
| 81 lsTables = [] | |
| 82 | |
| 83 #Read in abundance table | |
| 84 abndTable = AbundanceTable.funcMakeFromFile(xInputFile=args.strFileAbund, | |
| 85 cDelimiter = args.cFileDelimiter, | |
| 86 sMetadataID = args.sIDName, | |
| 87 sLastMetadata = args.sLastMetadataName, | |
| 88 lOccurenceFilter = None, | |
| 89 cFeatureNameDelimiter=args.cFeatureDelimiter, | |
| 90 xOutputFile = args.strOutFile) | |
| 91 | |
| 92 #TODO Check filtering, can not have some filtering together | |
| 93 | |
| 94 # Make feature list | |
| 95 lsFeatures = [] | |
| 96 if args.strFeatures: | |
| 97 print "Get features not completed" | |
| 98 # if "," in args.strFeatures: | |
| 99 # lsFeatures = args.strFeatures.split(",") | |
| 100 # print "ManipulateTable::Reading in feature list "+str(len(lsFeatures))+"." | |
| 101 # else: | |
| 102 # csvr = csv.reader(open(args.strFeatures, "rU")) | |
| 103 # print "ManipulateTable::Reading in feature file "+args.strFeatures+"." | |
| 104 # for lsLine in csvr: | |
| 105 # lsFeatures.extend(lsLine) | |
| 106 | |
| 107 lsTables.append(abndTable) | |
| 108 | |
| 109 # Do summing | |
| 110 #Sum if need | |
| 111 if args.fSum: | |
| 112 for abndTable in lsTables: | |
| 113 print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before summing." | |
| 114 fResult = abndTable.funcSumClades() | |
| 115 if fResult: | |
| 116 print "ManipulateTable::"+abndTable.funcGetName()+" was summed." | |
| 117 print "ManipulateTable::"+abndTable.funcGetName()+" has "+str(len(abndTable.funcGetFeatureNames()))+" features after summing." | |
| 118 else: | |
| 119 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT summed." | |
| 120 | |
| 121 # Filter on counts | |
| 122 if args.strFilterOccurence: | |
| 123 iMinimumSequence,iMinimumSample = args.strFilterOccurence.split(",") | |
| 124 for abndTable in lsTables: | |
| 125 if abndTable.funcIsNormalized(): | |
| 126 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" is normalized and can not be filtered by occurence. This filter needs counts." | |
| 127 else: | |
| 128 fResult = abndTable.funcFilterAbundanceBySequenceOccurence(iMinSequence = int(iMinimumSequence), iMinSamples = int(iMinimumSample)) | |
| 129 if fResult: | |
| 130 print "ManipulateTable::"+abndTable.funcGetName()+" was filtered by occurence and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." | |
| 131 else: | |
| 132 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT filtered by occurence." | |
| 133 | |
| 134 # Change bug membership | |
| 135 if args.fMakeTerminal: | |
| 136 lsTerminalTables = [] | |
| 137 for abndTable in lsTables: | |
| 138 print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before making terminal." | |
| 139 abndTable = abndTable.funcGetFeatureAbundanceTable(abndTable.funcGetTerminalNodes()) | |
| 140 if abndTable: | |
| 141 print "ManipulateTable::"+abndTable.funcGetName()+" has "+str(len(abndTable.funcGetFeatureNames()))+" terminal features." | |
| 142 lsTerminalTables.append(abndTable) | |
| 143 else: | |
| 144 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was not made terminal." | |
| 145 lsTables = lsTerminalTables | |
| 146 | |
| 147 if args.fRemoveOTUs: | |
| 148 lsNotOTUs = [] | |
| 149 for abndTable in lsTables: | |
| 150 print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before removing OTUs." | |
| 151 abndTable = abndTable.funcGetWithoutOTUs() | |
| 152 if abndTable: | |
| 153 print "ManipulateTable::"+abndTable.funcGetName()+" had OTUs removed and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." | |
| 154 lsNotOTUs.append(abndTable) | |
| 155 else: | |
| 156 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" OTUs were not removed." | |
| 157 lsTables = lsNotOTUs | |
| 158 | |
| 159 if args.iClade: | |
| 160 for abndTable in lsTables: | |
| 161 fResult = abndTable.funcReduceFeaturesToCladeLevel(args.iClade) | |
| 162 if fResult: | |
| 163 print "ManipulateTable::"+abndTable.funcGetName()+" was reduced to clade level "+str(args.iClade)+"." | |
| 164 else: | |
| 165 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT reduced in clade levels." | |
| 166 | |
| 167 if args.strFeatures: | |
| 168 for abndTable in lsTables: | |
| 169 fResult = abndTable.funcGetFeatureAbundanceTable(lsFeatures) | |
| 170 if fResult: | |
| 171 print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced to given features and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." | |
| 172 else: | |
| 173 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced to the given list." | |
| 174 | |
| 175 if args.strRemoveMetadata: | |
| 176 lsMetadata = args.strRemoveMetadata.split(",") | |
| 177 for abndTable in lsTables: | |
| 178 fResult = abndTable.funcRemoveSamplesByMetadata(sMetadata=lsMetadata[0], lValuesToRemove=lsMetadata[1:]) | |
| 179 if fResult: | |
| 180 print "ManipulateTable::"+abndTable.funcGetName()+" has had samples removed and now has "+str(len(abndTable.funcGetSampleNames()))+" samples." | |
| 181 else: | |
| 182 print "ManipulateTable::ERROR. Could not remove samples from "+abndTable.funcGetName()+"." | |
| 183 | |
| 184 # Normalize if needed | |
| 185 if args.fNormalize: | |
| 186 for abndTable in lsTables: | |
| 187 fResult = abndTable.funcNormalize() | |
| 188 if fResult: | |
| 189 print "ManipulateTable::"+abndTable.funcGetName()+" was normalized." | |
| 190 else: | |
| 191 print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized." | |
| 192 | |
| 193 # Filter on percentile | |
| 194 if args.strFilterPercentile: | |
| 195 dPercentile,dPercentage = args.strFilterPercentile.split(",") | |
| 196 for abndTable in lsTables: | |
| 197 if abndTable.funcIsNormalized(): | |
| 198 fResult = abndTable.funcFilterAbundanceByPercentile(dPercentileCutOff = float(dPercentile), dPercentageAbovePercentile = float(dPercentage)) | |
| 199 if fResult: | |
| 200 print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by percentile and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." | |
| 201 else: | |
| 202 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by percentile." | |
| 203 else: | |
| 204 print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized and so the percentile filter is invalid, please indicate to normalize the table." | |
| 205 | |
| 206 # Filter on abundance (should go after filter on percentile because the filter on percentile | |
| 207 # needs the full distribution of features in a sample | |
| 208 if args.strFilterAbundance: | |
| 209 dAbundance,iMinSamples = args.strFilterAbundance.split(",") | |
| 210 dAbundance = float(dAbundance) | |
| 211 iMinSamples = int(iMinSamples) | |
| 212 for abndTable in lsTables: | |
| 213 if abndTable.funcIsNormalized(): | |
| 214 fResult = abndTable.funcFilterAbundanceByMinValue(dMinAbundance=dAbundance,iMinSamples=iMinSamples) | |
| 215 if fResult: | |
| 216 print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by minimum relative abundance value and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." | |
| 217 else: | |
| 218 print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by percentile." | |
| 219 else: | |
| 220 print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized and so the abundance filter is invalid, please indicate to normalize the table." | |
| 221 | |
| 222 #if args.dCuttOff: | |
| 223 # print "Standard deviation filtering not completed" | |
| 224 # for abndTable in lsTables: | |
| 225 # abndTable.funcFilterFeatureBySD(dMinSDCuttOff=args.dCuttOff) | |
| 226 # if fResult: | |
| 227 # print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by standard deviation and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." | |
| 228 # else: | |
| 229 # print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by standard devation." | |
| 230 | |
| 231 # Need to normalize again after abundance data filtering given removing features breaks the normalization | |
| 232 # This happends twice because normalization is required to make the abundance data to filter on ;-) | |
| 233 # Normalize if needed | |
| 234 if args.fNormalize: | |
| 235 for abndTable in lsTables: | |
| 236 fResult = abndTable.funcNormalize() | |
| 237 if fResult: | |
| 238 print "ManipulateTable::"+abndTable.funcGetName()+" was normalized after filtering on abundance data." | |
| 239 | |
| 240 #Manipulate lineage | |
| 241 if args.fPrefixClades: | |
| 242 for abndTable in lsTables: | |
| 243 fResult = abndTable.funcAddCladePrefixToFeatures() | |
| 244 if fResult: | |
| 245 print "ManipulateTable::Clade Prefix was added to "+abndTable.funcGetName() | |
| 246 else: | |
| 247 print "ManipulateTable::ERROR. Clade Prefix was NOT added to "+abndTable.funcGetName() | |
| 248 | |
| 249 # Under development | |
| 250 # Reduce dimensionality | |
| 251 #if args.fDoPCA: | |
| 252 # pcaCur = PCA() | |
| 253 # for abndTable in lsTables: | |
| 254 # | |
| 255 # # Add data features | |
| 256 # # Make data components and add to abundance table | |
| 257 # pcaCur.loadData(abndTable,True) | |
| 258 # pcaCur.run(fASTransform=True) | |
| 259 # ldVariance = pcaCur.getVariance() | |
| 260 # lldComponents = pcaCur.getComponents() | |
| 261 # # Make Names | |
| 262 # lsNamesData = ["Data_PC"+str((tpleVariance[0]+1))+"_"+re.sub("[\.|-]","_",str(tpleVariance[1])) for tpleVariance in enumerate(ldVariance)] | |
| 263 # abndTable.funcAddDataFeature(lsNamesData,lldComponents) | |
| 264 # | |
| 265 # # Add metadata features | |
| 266 # # Convert metadata to an input for PCA | |
| 267 # pcaCur.loadData(pcaCur.convertMetadataForPCA(abndTable),False) | |
| 268 # fSuccessful = pcaCur.run(fASTransform=False) | |
| 269 # if(fSuccessful): | |
| 270 # ldVariance = pcaCur.getVariance() | |
| 271 # lldComponents = pcaCur.getComponents() | |
| 272 # # Make Names | |
| 273 # lsNamesMetadata = ["Metadata_PC"+str((tpleVariance[0]+1))+"_"+re.sub("[\.|-]","_",str(tpleVariance[1])) for tpleVariance in enumerate(ldVariance)] | |
| 274 # # Make metadata components and add to abundance | |
| 275 # llsMetadata = [list(npdRow) for npdRow in lldComponents] | |
| 276 # abndTable.funcAddMetadataFeature(lsNamesMetadata, llsMetadata) | |
| 277 # else: | |
| 278 # print "ManipulateTable::No metadata to PCA, no PCA components added to file based on metadata" | |
| 279 | |
| 280 #Manipulate based on metadata | |
| 281 if args.strStratifyBy: | |
| 282 labndStratifiedTables = [] | |
| 283 for abndTable in lsTables: | |
| 284 labndResult = abndTable.funcStratifyByMetadata(strMetadata=args.strStratifyBy) | |
| 285 print "ManipulateTable::"+abndTable.funcGetName()+" was stratified by "+args.strStratifyBy+" in to "+str(len(labndResult))+" tables." | |
| 286 labndStratifiedTables.extend(labndResult) | |
| 287 lsTables = labndStratifiedTables | |
| 288 | |
| 289 if len(lsTables) == 1: | |
| 290 lsTables[0].funcWriteToFile(args.strOutFile) | |
| 291 else: | |
| 292 iIndex = 1 | |
| 293 for abndManTable in lsTables: | |
| 294 abndManTable.funcWriteToFile(lsPieces[0]+str(iIndex)+lsPieces[1]) | |
| 295 iIndex = iIndex + 1 |
