Mercurial > repos > sagun98 > micropita
diff galaxy_micropita/src/breadcrumbs/scripts/scriptManipulateTable.py @ 3:8fb4630ab314 draft default tip
Uploaded
author | sagun98 |
---|---|
date | Thu, 03 Jun 2021 17:07:36 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy_micropita/src/breadcrumbs/scripts/scriptManipulateTable.py Thu Jun 03 17:07:36 2021 +0000 @@ -0,0 +1,295 @@ +#!/usr/bin/env python +""" +Author: Timothy Tickle +Description: Performs common manipulations on tables +""" + +__author__ = "Timothy Tickle" +__copyright__ = "Copyright 2012" +__credits__ = ["Timothy Tickle"] +__license__ = "" +__version__ = "" +__maintainer__ = "Timothy Tickle" +__email__ = "ttickle@sph.harvard.edu" +__status__ = "Development" + +import argparse +import csv +import sys +import re +import os +import numpy as np +from src.AbundanceTable import AbundanceTable +#from src.PCA import PCA +from src.ValidateData import ValidateData + +#Set up arguments reader +argp = argparse.ArgumentParser( prog = "scriptManipulateTable.py", + description = """Performs common manipulations on tables.\nExample: python scriptManipulateTable.py -i TID -l STSite Test.pcl""" ) + +#Arguments +#Describe table +argp.add_argument("-i","--id", dest="sIDName", default="ID", help="Abundance Table ID") +argp.add_argument("-l","--meta", dest="sLastMetadataName", help="Last metadata name") +argp.add_argument("-d","--fileDelim", dest= "cFileDelimiter", action= "store", default="\t", help="File delimiter, default tab") +argp.add_argument("-f","--featureDelim", dest= "cFeatureDelimiter", action= "store", default="|", help="Feature (eg. bug or function) delimiter, default '|'") + +#Checked x 2 +argp.add_argument("-n","--doNorm", dest="fNormalize", action="store_true", default=False, help="Flag to turn on normalization") +argp.add_argument("-s","--doSum", dest="fSum", action="store_true", default=False, help="Flag to turn on summation") + +#Unsupervised filtering +argp.add_argument("-A","--doFilterAbundance", dest="strFilterAbundance", action="store", default=None, help="Turns on filtering by abundance (remove features that do not have a minimum abundance in a minimum number of samples); Should be a real number and an integer in the form 'minAbundance,minSamples', (should be performed on a normalized file).") +argp.add_argument("-P","--doFilterPercentile", dest="strFilterPercentile", action="store", default=None, help="Turns on filtering by percentile Should be two numbers between 0 and 1 in the form 'percentile,percentage'. (should be performed on a normalized file).") +argp.add_argument("-O","--doFilterOccurrence", dest="strFilterOccurence", action="store", default=None, help="Turns on filtering by occurrence. Should be two integers in the form 'minSequence,minSample' (should NOT be performed on a normalized file).") +#argp.add_argument("-D","--doFilterDeviation", dest="dCuttOff", action="store", type=float, default=None, help="Flag to turn on filtering by standard deviation (should NOT be performed on a normalized file).") + +#Change bug membership +argp.add_argument("-t","--makeTerminal", dest="fMakeTerminal", action="store_true", default=False, help="Works reduces the file to teminal features in the original file.") +argp.add_argument("-u","--reduceOTUs", dest="fRemoveOTUs", action="store_true", default=False, help="Remove otu entries from file.") +argp.add_argument("-c","--reduceToClade", dest="iClade", action="store", type=int, default=None, help="Specify a level of clade to reduce to [].") +argp.add_argument("-b","--reduceToFeatures", dest="strFeatures", action="store", default=None, help="Reduce measurements to certain features (bugs or functions). This can be a comma delimited string (of atleast 2 bugs) or a file.") + +#Manipulate based on metadata +#Checked +argp.add_argument("-y","--stratifyBy", dest="strStratifyBy", action="store", default=None, help="Metadata to stratify tables by.") +argp.add_argument("-r","--removeMetadata", dest="strRemoveMetadata", action="store", default=None, help="Remove samples of this metadata and value (format comma delimited string with metadata id first and the values to remove after 'id,lvalue1,value2').") + +#Manipulate lineage +#Checked +argp.add_argument("-x","--doPrefixClades", dest="fPrefixClades", action="store_true", default=False, help="Flag to turn on adding prefixes to clades to better identify them, for example s__ will be placed infront of each species.") + +#Combine tables +#argp.add_argument("-m","--combineIntersect", dest="fCombineIntersect", action="store_true", default=False, help="Combine two tables including only common features/metadata (intersection).") +#argp.add_argument("-e","--combineUnion", dest="fCombineUnion", action="store_true", default=False, help="Combine two tables (union).") + +#Dimensionality Reduction +#argp.add_argument("-p","--doPCA", dest="fDoPCA",action="store_true", default=False, help="Flag to turn on adding metabugs and metametadata by performing PCA on each of bug relative abundance and continuous metadata and add the resulting components") + +#Checked +argp.add_argument("-o","--output", dest="strOutFile", action="store", default=None, help="Indicate output pcl file.") +argp.add_argument("strFileAbund", help ="Input data file") + +args = argp.parse_args( ) + +# Creat output file if needed. +if not args.strOutFile: + args.strOutFile = os.path.splitext(args.strFileAbund)[0]+"-mod.pcl" +lsPieces = os.path.splitext(args.strOutFile) + +#List of abundance tables +lsTables = [] + +#Read in abundance table +abndTable = AbundanceTable.funcMakeFromFile(xInputFile=args.strFileAbund, + cDelimiter = args.cFileDelimiter, + sMetadataID = args.sIDName, + sLastMetadata = args.sLastMetadataName, + lOccurenceFilter = None, + cFeatureNameDelimiter=args.cFeatureDelimiter, + xOutputFile = args.strOutFile) + +#TODO Check filtering, can not have some filtering together + +# Make feature list +lsFeatures = [] +if args.strFeatures: + print "Get features not completed" +# if "," in args.strFeatures: +# lsFeatures = args.strFeatures.split(",") +# print "ManipulateTable::Reading in feature list "+str(len(lsFeatures))+"." +# else: +# csvr = csv.reader(open(args.strFeatures, "rU")) +# print "ManipulateTable::Reading in feature file "+args.strFeatures+"." +# for lsLine in csvr: +# lsFeatures.extend(lsLine) + +lsTables.append(abndTable) + +# Do summing +#Sum if need +if args.fSum: + for abndTable in lsTables: + print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before summing." + fResult = abndTable.funcSumClades() + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" was summed." + print "ManipulateTable::"+abndTable.funcGetName()+" has "+str(len(abndTable.funcGetFeatureNames()))+" features after summing." + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT summed." + +# Filter on counts +if args.strFilterOccurence: + iMinimumSequence,iMinimumSample = args.strFilterOccurence.split(",") + for abndTable in lsTables: + if abndTable.funcIsNormalized(): + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" is normalized and can not be filtered by occurence. This filter needs counts." + else: + fResult = abndTable.funcFilterAbundanceBySequenceOccurence(iMinSequence = int(iMinimumSequence), iMinSamples = int(iMinimumSample)) + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" was filtered by occurence and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT filtered by occurence." + +# Change bug membership +if args.fMakeTerminal: + lsTerminalTables = [] + for abndTable in lsTables: + print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before making terminal." + abndTable = abndTable.funcGetFeatureAbundanceTable(abndTable.funcGetTerminalNodes()) + if abndTable: + print "ManipulateTable::"+abndTable.funcGetName()+" has "+str(len(abndTable.funcGetFeatureNames()))+" terminal features." + lsTerminalTables.append(abndTable) + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was not made terminal." + lsTables = lsTerminalTables + +if args.fRemoveOTUs: + lsNotOTUs = [] + for abndTable in lsTables: + print "ManipulateTable::"+abndTable.funcGetName()+" had "+str(len(abndTable.funcGetFeatureNames()))+" features before removing OTUs." + abndTable = abndTable.funcGetWithoutOTUs() + if abndTable: + print "ManipulateTable::"+abndTable.funcGetName()+" had OTUs removed and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." + lsNotOTUs.append(abndTable) + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" OTUs were not removed." + lsTables = lsNotOTUs + +if args.iClade: + for abndTable in lsTables: + fResult = abndTable.funcReduceFeaturesToCladeLevel(args.iClade) + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" was reduced to clade level "+str(args.iClade)+"." + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" was NOT reduced in clade levels." + +if args.strFeatures: + for abndTable in lsTables: + fResult = abndTable.funcGetFeatureAbundanceTable(lsFeatures) + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced to given features and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced to the given list." + +if args.strRemoveMetadata: + lsMetadata = args.strRemoveMetadata.split(",") + for abndTable in lsTables: + fResult = abndTable.funcRemoveSamplesByMetadata(sMetadata=lsMetadata[0], lValuesToRemove=lsMetadata[1:]) + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" has had samples removed and now has "+str(len(abndTable.funcGetSampleNames()))+" samples." + else: + print "ManipulateTable::ERROR. Could not remove samples from "+abndTable.funcGetName()+"." + +# Normalize if needed +if args.fNormalize: + for abndTable in lsTables: + fResult = abndTable.funcNormalize() + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" was normalized." + else: + print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized." + +# Filter on percentile +if args.strFilterPercentile: + dPercentile,dPercentage = args.strFilterPercentile.split(",") + for abndTable in lsTables: + if abndTable.funcIsNormalized(): + fResult = abndTable.funcFilterAbundanceByPercentile(dPercentileCutOff = float(dPercentile), dPercentageAbovePercentile = float(dPercentage)) + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by percentile and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by percentile." + else: + print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized and so the percentile filter is invalid, please indicate to normalize the table." + +# Filter on abundance (should go after filter on percentile because the filter on percentile +# needs the full distribution of features in a sample +if args.strFilterAbundance: + dAbundance,iMinSamples = args.strFilterAbundance.split(",") + dAbundance = float(dAbundance) + iMinSamples = int(iMinSamples) + for abndTable in lsTables: + if abndTable.funcIsNormalized(): + fResult = abndTable.funcFilterAbundanceByMinValue(dMinAbundance=dAbundance,iMinSamples=iMinSamples) + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by minimum relative abundance value and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." + else: + print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by percentile." + else: + print "ManipulateTable::"+abndTable.funcGetName()+" was NOT normalized and so the abundance filter is invalid, please indicate to normalize the table." + +#if args.dCuttOff: +# print "Standard deviation filtering not completed" +# for abndTable in lsTables: +# abndTable.funcFilterFeatureBySD(dMinSDCuttOff=args.dCuttOff) +# if fResult: +# print "ManipulateTable::"+abndTable.funcGetName()+" has been reduced by standard deviation and now has "+str(len(abndTable.funcGetFeatureNames()))+" features." +# else: +# print "ManipulateTable::ERROR. "+abndTable.funcGetName()+" could not be reduced by standard devation." + +# Need to normalize again after abundance data filtering given removing features breaks the normalization +# This happends twice because normalization is required to make the abundance data to filter on ;-) +# Normalize if needed +if args.fNormalize: + for abndTable in lsTables: + fResult = abndTable.funcNormalize() + if fResult: + print "ManipulateTable::"+abndTable.funcGetName()+" was normalized after filtering on abundance data." + +#Manipulate lineage +if args.fPrefixClades: + for abndTable in lsTables: + fResult = abndTable.funcAddCladePrefixToFeatures() + if fResult: + print "ManipulateTable::Clade Prefix was added to "+abndTable.funcGetName() + else: + print "ManipulateTable::ERROR. Clade Prefix was NOT added to "+abndTable.funcGetName() + +# Under development +# Reduce dimensionality +#if args.fDoPCA: +# pcaCur = PCA() +# for abndTable in lsTables: +# +# # Add data features +# # Make data components and add to abundance table +# pcaCur.loadData(abndTable,True) +# pcaCur.run(fASTransform=True) +# ldVariance = pcaCur.getVariance() +# lldComponents = pcaCur.getComponents() +# # Make Names +# lsNamesData = ["Data_PC"+str((tpleVariance[0]+1))+"_"+re.sub("[\.|-]","_",str(tpleVariance[1])) for tpleVariance in enumerate(ldVariance)] +# abndTable.funcAddDataFeature(lsNamesData,lldComponents) +# +# # Add metadata features +# # Convert metadata to an input for PCA +# pcaCur.loadData(pcaCur.convertMetadataForPCA(abndTable),False) +# fSuccessful = pcaCur.run(fASTransform=False) +# if(fSuccessful): +# ldVariance = pcaCur.getVariance() +# lldComponents = pcaCur.getComponents() +# # Make Names +# lsNamesMetadata = ["Metadata_PC"+str((tpleVariance[0]+1))+"_"+re.sub("[\.|-]","_",str(tpleVariance[1])) for tpleVariance in enumerate(ldVariance)] +# # Make metadata components and add to abundance +# llsMetadata = [list(npdRow) for npdRow in lldComponents] +# abndTable.funcAddMetadataFeature(lsNamesMetadata, llsMetadata) +# else: +# print "ManipulateTable::No metadata to PCA, no PCA components added to file based on metadata" + +#Manipulate based on metadata +if args.strStratifyBy: + labndStratifiedTables = [] + for abndTable in lsTables: + labndResult = abndTable.funcStratifyByMetadata(strMetadata=args.strStratifyBy) + print "ManipulateTable::"+abndTable.funcGetName()+" was stratified by "+args.strStratifyBy+" in to "+str(len(labndResult))+" tables." + labndStratifiedTables.extend(labndResult) + lsTables = labndStratifiedTables + +if len(lsTables) == 1: + lsTables[0].funcWriteToFile(args.strOutFile) +else: + iIndex = 1 + for abndManTable in lsTables: + abndManTable.funcWriteToFile(lsPieces[0]+str(iIndex)+lsPieces[1]) + iIndex = iIndex + 1