Mercurial > repos > jjjjia > cpo_prediction
diff cpo_galaxy_prediction.py @ 3:e6027598a35c draft
planemo upload
author | jjjjia |
---|---|
date | Mon, 20 Aug 2018 17:53:59 -0400 |
parents | 29302ffdf137 |
children | bd6f5844d60e |
line wrap: on
line diff
--- a/cpo_galaxy_prediction.py Mon Aug 20 14:19:56 2018 -0400 +++ b/cpo_galaxy_prediction.py Mon Aug 20 17:53:59 2018 -0400 @@ -11,8 +11,7 @@ #$ -m ea #$ -M bja20@sfu.ca -#~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" - +#./prediction.py -i ~/testCases/cpoResults/contigs/BC11-Kpn005_S2.fa -m ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.mlst -c ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/contig_report.txt -f ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.cp -r ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.rgi.txt -e "Klebsiella" import subprocess import pandas import optparse @@ -27,27 +26,23 @@ import numpy -debug = False #True #debug skips the shell scripts and also dump out a ton of debugging messages +debug = True #debug skips the shell scripts and also dump out a ton of debugging messages if not debug: #parses some parameters parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") #required + #MLSTHIT, mobsuite, resfinder, rgi, mlstscheme parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate") - parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta") - parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database") - parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder") + parser.add_option("-m", "--mlst", dest="mlst", type="string", help="absolute file path to mlst result") + parser.add_option("-c", "--mobfinderContig", dest="mobfinderContig", type="string", help="absolute path to mobfinder aggregate result") + parser.add_option("-f", "--mobfinderAggregate", dest="mobfinderAggregate", type="string", help="absolute path to mobfinder plasmid results") + parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results") + parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results") parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") + parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") + parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ") - #optionals - parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder") - parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database") - parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update") - parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST") - #used for parsing - parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") - - #parallelization, useless, these are hard coded to 8cores/64G RAM #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") @@ -56,33 +51,35 @@ #if len(args) != 8: #parser.error("incorrect number of arguments, all 7 is required") curDir = os.getcwd() - outputDir = options.output - expectedSpecies = options.expectedSpecies - mlstScheme = options.mlst - tempDir = outputDir + "/shovillTemp" - scriptDir = options.scriptDir - updateAbName = options.updateAbName - updateAbPath = options.updateAbPath - updateMLST = options.updateMLST - cardDB=options.cardDB - assemblyPath=options.assemblyPath - ID = options.id + ID = str(options.id).lstrip().rstrip() + mlst = str(options.mlst).lstrip().rstrip() + mobfindercontig = str(options.mobfinderContig).lstrip().rstrip() + mobfinderaggregate = str(options.mobfinderAggregate).lstrip().rstrip() + abricate = str(options.abricate).lstrip().rstrip() + rgi = str(options.rgi).lstrip().rstrip() + expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() + mlstScheme = str(options.mlstScheme).lstrip().rstrip() + plasmidfinder = str(options.plasmidfinder).lstrip().rstrip() + outputDir = "./" + print(mlst) + print(mobfindercontig) + print(mobfinderaggregate) + print(abricate) + print(rgi) + print(expectedSpecies) + print(mlstScheme) else: - manifestDir = "" curDir = os.getcwd() - outputDir = "pipelineTest" + ID = "BC11" + mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst" + mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt" + mobfinderaggregate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\mobtyper_aggregate_report.txt" + abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp" + rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt" expectedSpecies = "Escherichia coli" - #threads = 8 - #memory = 64 - mlstScheme = outputDir + "/scheme_species_map.tab" - tempDir= outputDir + "/shovillTemp" - scriptDir = "~/scripts" - updateAbName = "cpo" - updateAbPath = "~/database/bccdcCPO.seq" - updateMLST = True - assemblyPath = "./" - cardDB = "./" - ID = "BC11-Kpn005_S2" + mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" + plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins" + outputDir = "./" #region result objects #define some objects to store values from results @@ -164,6 +161,7 @@ self.mash_neighbor_distance = 0.00 self.mash_neighbor_cluster= 0 self.row = "" + class RGIResult(object): def __init__(self): self.ORF_ID = "" @@ -233,17 +231,18 @@ out.write(gzContent) return True def ToJson(dictObject, outputPath): - outDir = outputDir + '/summary/' + ID + ".json/" - if not (os.path.exists(outDir)): - os.makedirs(outDir) - with open(outDir + outputPath, 'w') as f: - json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) + #outDir = outputDir + '/summary/' + ID + ".json/" + #if not (os.path.exists(outDir)): + #os.makedirs(outDir) + #with open(outputPath, 'w') as f: + #json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) + return "" #endregion #region functions to parse result files -def ParseMLSTResult(pathToMLSTResult): +def ParseMLSTResult(pathToMLSTResult, scheme): _mlstResult = {} - scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0) + scheme = pandas.read_csv(scheme, delimiter='\t', header=0) scheme = scheme.replace(numpy.nan, '', regex=True) taxon = {} @@ -442,47 +441,55 @@ r.source = "likely chromosome" _rgiR[r.Model_ID]=r return _rgiR + +def ParsePlasmidFinderResult(pathToPlasmidFinderResult): + #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918 + #example resfinder: + #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181 + + _pFinder = {} #*********************** + plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0) + + for i in range(len(plasmidFinder.index)): + pf = starFinders() + pf.file = str(plasmidFinder.iloc[i,0]) + pf.sequence = str(plasmidFinder.iloc[i,1]) + pf.start = int(plasmidFinder.iloc[i,2]) + pf.end = int(plasmidFinder.iloc[i,3]) + pf.gene = str(plasmidFinder.iloc[i,4]) + pf.shortGene = pf.gene[:pf.gene.index("_")] + pf.coverage = str(plasmidFinder.iloc[i,5]) + pf.coverage_map = str(plasmidFinder.iloc[i,6]) + pf.gaps = str(plasmidFinder.iloc[i,7]) + pf.pCoverage = float(plasmidFinder.iloc[i,8]) + pf.pIdentity = float(plasmidFinder.iloc[i,9]) + pf.database = str(plasmidFinder.iloc[i,10]) + pf.accession = str(plasmidFinder.iloc[i,11]) + pf.product = str(plasmidFinder.iloc[i,12]) + pf.source = "plasmid" + pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) + _pFinder[pf.gene]=pf + #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) + #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) + #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) + return _pFinder #endregion def Main(): + outputDir = "./" notes = [] #init the output list output = [] jsonOutput = [] - print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) - output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) - - #region update databases if update=true - if not debug: - #update databases if necessary - if not (updateAbPath == "" and updateAbName == "default"): - print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath) - cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName] - update = execute(cmd) - if (updateMLST.lower() == "true"): - print("updating mlst database... ") - cmd = [scriptDir + "/pipeline_updateMLST.sh"] - update = execute(cmd) - #endregion - - print("step 3: parsing the mlst results") - - print("performing MLST") - #region call the mlst script - if not debug: - print("running pipeline_prediction.sh") - #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json - cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB] - result = execute(cmd) - #endregion + print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID) + output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID) #region parse the mlst results print("step 3: parsing mlst, plasmid, and amr results") print("identifying MLST") - pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst" - mlstHit = ParseMLSTResult(pathToMLSTScheme)#*********************** + mlstHit = ParseMLSTResult(mlst, str(mlstScheme))#*********************** ToJson(mlstHit, "mlst.json") #write it to a json output mlstHit = list(mlstHit.values())[0] @@ -496,9 +503,9 @@ origins = [] #parse mobsuite results - mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* + mSuite = ParseMobsuiteResult(mobfindercontig) #outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* ToJson(mSuite, "mobsuite.json") #************* - mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* + mSuitePlasmids = ParseMobsuitePlasmids(mobfinderaggregate)#outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #************* for key in mSuite: @@ -512,10 +519,13 @@ origins.append(mSuite[key].rep_type) #parse resfinder AMR results - rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** + pFinder = ParsePlasmidFinderResult(plasmidfinder) + ToJson(pFinder, "origins.json") + + rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** ToJson(rFinder, "resfinder.json") #************* - rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** + rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** ToJson(rgiAMR, "rgi.json") #************* carbapenamases = [] @@ -580,7 +590,7 @@ #write summary to a file summaryDir = outputDir + "/summary/" + ID - out = open(summaryDir + ".txt", 'w') + out = open("summary.txt", 'w') for item in output: out.write("%s\n" % item) @@ -623,7 +633,7 @@ tsvOut.append(temp) summaryDir = outputDir + "/summary/" + ID - out = open(summaryDir + ".tsv", 'w') + out = open("summary.tsv", 'w') for item in tsvOut: out.write("%s\n" % item) #endregion @@ -635,4 +645,4 @@ Main() end = time.time() -print("Finished!\nThe analysis used: " + str(end-start) + " seconds") +print("Finished!\nThe analysis used: " + str(end-start) + " seconds") \ No newline at end of file