comparison cpo_galaxy_prediction.py @ 3:e6027598a35c draft

planemo upload
author jjjjia
date Mon, 20 Aug 2018 17:53:59 -0400
parents 29302ffdf137
children bd6f5844d60e
comparison
equal deleted inserted replaced
2:29302ffdf137 3:e6027598a35c
9 #$ -e ./logs/$JOB_ID.err 9 #$ -e ./logs/$JOB_ID.err
10 #$ -o ./logs/$JOB_ID.log 10 #$ -o ./logs/$JOB_ID.log
11 #$ -m ea 11 #$ -m ea
12 #$ -M bja20@sfu.ca 12 #$ -M bja20@sfu.ca
13 13
14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" 14 #./prediction.py -i ~/testCases/cpoResults/contigs/BC11-Kpn005_S2.fa -m ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.mlst -c ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/contig_report.txt -f ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.cp -r ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.rgi.txt -e "Klebsiella"
15
16 import subprocess 15 import subprocess
17 import pandas 16 import pandas
18 import optparse 17 import optparse
19 import os 18 import os
20 import datetime 19 import datetime
25 import collections 24 import collections
26 import json 25 import json
27 import numpy 26 import numpy
28 27
29 28
30 debug = False #True #debug skips the shell scripts and also dump out a ton of debugging messages 29 debug = True #debug skips the shell scripts and also dump out a ton of debugging messages
31 30
32 if not debug: 31 if not debug:
33 #parses some parameters 32 #parses some parameters
34 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") 33 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
35 #required 34 #required
35 #MLSTHIT, mobsuite, resfinder, rgi, mlstscheme
36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate") 36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate")
37 parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta") 37 parser.add_option("-m", "--mlst", dest="mlst", type="string", help="absolute file path to mlst result")
38 parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database") 38 parser.add_option("-c", "--mobfinderContig", dest="mobfinderContig", type="string", help="absolute path to mobfinder aggregate result")
39 parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder") 39 parser.add_option("-f", "--mobfinderAggregate", dest="mobfinderAggregate", type="string", help="absolute path to mobfinder plasmid results")
40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results")
41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results")
40 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") 42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
41 43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
42 #optionals 44 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ")
43 parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder") 45
44 parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database")
45 parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update")
46 parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST")
47 #used for parsing
48 parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
49
50
51 #parallelization, useless, these are hard coded to 8cores/64G RAM 46 #parallelization, useless, these are hard coded to 8cores/64G RAM
52 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") 47 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
53 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") 48 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
54 49
55 (options,args) = parser.parse_args() 50 (options,args) = parser.parse_args()
56 #if len(args) != 8: 51 #if len(args) != 8:
57 #parser.error("incorrect number of arguments, all 7 is required") 52 #parser.error("incorrect number of arguments, all 7 is required")
58 curDir = os.getcwd() 53 curDir = os.getcwd()
59 outputDir = options.output 54 ID = str(options.id).lstrip().rstrip()
60 expectedSpecies = options.expectedSpecies 55 mlst = str(options.mlst).lstrip().rstrip()
61 mlstScheme = options.mlst 56 mobfindercontig = str(options.mobfinderContig).lstrip().rstrip()
62 tempDir = outputDir + "/shovillTemp" 57 mobfinderaggregate = str(options.mobfinderAggregate).lstrip().rstrip()
63 scriptDir = options.scriptDir 58 abricate = str(options.abricate).lstrip().rstrip()
64 updateAbName = options.updateAbName 59 rgi = str(options.rgi).lstrip().rstrip()
65 updateAbPath = options.updateAbPath 60 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
66 updateMLST = options.updateMLST 61 mlstScheme = str(options.mlstScheme).lstrip().rstrip()
67 cardDB=options.cardDB 62 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip()
68 assemblyPath=options.assemblyPath 63 outputDir = "./"
69 ID = options.id 64 print(mlst)
65 print(mobfindercontig)
66 print(mobfinderaggregate)
67 print(abricate)
68 print(rgi)
69 print(expectedSpecies)
70 print(mlstScheme)
70 else: 71 else:
71 manifestDir = ""
72 curDir = os.getcwd() 72 curDir = os.getcwd()
73 outputDir = "pipelineTest" 73 ID = "BC11"
74 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst"
75 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt"
76 mobfinderaggregate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\mobtyper_aggregate_report.txt"
77 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp"
78 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt"
74 expectedSpecies = "Escherichia coli" 79 expectedSpecies = "Escherichia coli"
75 #threads = 8 80 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
76 #memory = 64 81 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins"
77 mlstScheme = outputDir + "/scheme_species_map.tab" 82 outputDir = "./"
78 tempDir= outputDir + "/shovillTemp"
79 scriptDir = "~/scripts"
80 updateAbName = "cpo"
81 updateAbPath = "~/database/bccdcCPO.seq"
82 updateMLST = True
83 assemblyPath = "./"
84 cardDB = "./"
85 ID = "BC11-Kpn005_S2"
86 83
87 #region result objects 84 #region result objects
88 #define some objects to store values from results 85 #define some objects to store values from results
89 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). 86 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
90 class starFinders(object): 87 class starFinders(object):
162 self.PredictedMobility = "" 159 self.PredictedMobility = ""
163 self.mash_nearest_neighbor = "" 160 self.mash_nearest_neighbor = ""
164 self.mash_neighbor_distance = 0.00 161 self.mash_neighbor_distance = 0.00
165 self.mash_neighbor_cluster= 0 162 self.mash_neighbor_cluster= 0
166 self.row = "" 163 self.row = ""
164
167 class RGIResult(object): 165 class RGIResult(object):
168 def __init__(self): 166 def __init__(self):
169 self.ORF_ID = "" 167 self.ORF_ID = ""
170 self.Contig = "" 168 self.Contig = ""
171 self.Start = -1 169 self.Start = -1
231 gzContent = f.read() 229 gzContent = f.read()
232 with open(outputpath, 'wb') as out: 230 with open(outputpath, 'wb') as out:
233 out.write(gzContent) 231 out.write(gzContent)
234 return True 232 return True
235 def ToJson(dictObject, outputPath): 233 def ToJson(dictObject, outputPath):
236 outDir = outputDir + '/summary/' + ID + ".json/" 234 #outDir = outputDir + '/summary/' + ID + ".json/"
237 if not (os.path.exists(outDir)): 235 #if not (os.path.exists(outDir)):
238 os.makedirs(outDir) 236 #os.makedirs(outDir)
239 with open(outDir + outputPath, 'w') as f: 237 #with open(outputPath, 'w') as f:
240 json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) 238 #json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
239 return ""
241 #endregion 240 #endregion
242 241
243 #region functions to parse result files 242 #region functions to parse result files
244 def ParseMLSTResult(pathToMLSTResult): 243 def ParseMLSTResult(pathToMLSTResult, scheme):
245 _mlstResult = {} 244 _mlstResult = {}
246 scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0) 245 scheme = pandas.read_csv(scheme, delimiter='\t', header=0)
247 scheme = scheme.replace(numpy.nan, '', regex=True) 246 scheme = scheme.replace(numpy.nan, '', regex=True)
248 247
249 taxon = {} 248 taxon = {}
250 #record the scheme as a dictionary 249 #record the scheme as a dictionary
251 taxon["-"] = "No MLST Match" 250 taxon["-"] = "No MLST Match"
440 r.source = "likely plasmid" 439 r.source = "likely plasmid"
441 else: 440 else:
442 r.source = "likely chromosome" 441 r.source = "likely chromosome"
443 _rgiR[r.Model_ID]=r 442 _rgiR[r.Model_ID]=r
444 return _rgiR 443 return _rgiR
444
445 def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
446 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918
447 #example resfinder:
448 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181
449
450 _pFinder = {} #***********************
451 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
452
453 for i in range(len(plasmidFinder.index)):
454 pf = starFinders()
455 pf.file = str(plasmidFinder.iloc[i,0])
456 pf.sequence = str(plasmidFinder.iloc[i,1])
457 pf.start = int(plasmidFinder.iloc[i,2])
458 pf.end = int(plasmidFinder.iloc[i,3])
459 pf.gene = str(plasmidFinder.iloc[i,4])
460 pf.shortGene = pf.gene[:pf.gene.index("_")]
461 pf.coverage = str(plasmidFinder.iloc[i,5])
462 pf.coverage_map = str(plasmidFinder.iloc[i,6])
463 pf.gaps = str(plasmidFinder.iloc[i,7])
464 pf.pCoverage = float(plasmidFinder.iloc[i,8])
465 pf.pIdentity = float(plasmidFinder.iloc[i,9])
466 pf.database = str(plasmidFinder.iloc[i,10])
467 pf.accession = str(plasmidFinder.iloc[i,11])
468 pf.product = str(plasmidFinder.iloc[i,12])
469 pf.source = "plasmid"
470 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
471 _pFinder[pf.gene]=pf
472 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
473 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
474 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
475 return _pFinder
445 #endregion 476 #endregion
446 477
447 def Main(): 478 def Main():
479 outputDir = "./"
448 notes = [] 480 notes = []
449 #init the output list 481 #init the output list
450 output = [] 482 output = []
451 jsonOutput = [] 483 jsonOutput = []
452 484
453 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) 485 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID)
454 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) 486 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID)
455
456 #region update databases if update=true
457 if not debug:
458 #update databases if necessary
459 if not (updateAbPath == "" and updateAbName == "default"):
460 print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath)
461 cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName]
462 update = execute(cmd)
463 if (updateMLST.lower() == "true"):
464 print("updating mlst database... ")
465 cmd = [scriptDir + "/pipeline_updateMLST.sh"]
466 update = execute(cmd)
467 #endregion
468
469 print("step 3: parsing the mlst results")
470
471 print("performing MLST")
472 #region call the mlst script
473 if not debug:
474 print("running pipeline_prediction.sh")
475 #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json
476 cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB]
477 result = execute(cmd)
478 #endregion
479 487
480 #region parse the mlst results 488 #region parse the mlst results
481 print("step 3: parsing mlst, plasmid, and amr results") 489 print("step 3: parsing mlst, plasmid, and amr results")
482 490
483 print("identifying MLST") 491 print("identifying MLST")
484 pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst" 492 mlstHit = ParseMLSTResult(mlst, str(mlstScheme))#***********************
485 mlstHit = ParseMLSTResult(pathToMLSTScheme)#***********************
486 ToJson(mlstHit, "mlst.json") #write it to a json output 493 ToJson(mlstHit, "mlst.json") #write it to a json output
487 mlstHit = list(mlstHit.values())[0] 494 mlstHit = list(mlstHit.values())[0]
488 495
489 #endregion 496 #endregion
490 497
494 plasmidContigs = [] 501 plasmidContigs = []
495 likelyPlasmidContigs = [] 502 likelyPlasmidContigs = []
496 origins = [] 503 origins = []
497 504
498 #parse mobsuite results 505 #parse mobsuite results
499 mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* 506 mSuite = ParseMobsuiteResult(mobfindercontig) #outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
500 ToJson(mSuite, "mobsuite.json") #************* 507 ToJson(mSuite, "mobsuite.json") #*************
501 mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* 508 mSuitePlasmids = ParseMobsuitePlasmids(mobfinderaggregate)#outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
502 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #************* 509 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #*************
503 510
504 for key in mSuite: 511 for key in mSuite:
505 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs: 512 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs:
506 if not (mSuite[key].rep_type == ''): 513 if not (mSuite[key].rep_type == ''):
510 for key in mSuite: 517 for key in mSuite:
511 if mSuite[key].rep_type not in origins: 518 if mSuite[key].rep_type not in origins:
512 origins.append(mSuite[key].rep_type) 519 origins.append(mSuite[key].rep_type)
513 520
514 #parse resfinder AMR results 521 #parse resfinder AMR results
515 rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** 522 pFinder = ParsePlasmidFinderResult(plasmidfinder)
523 ToJson(pFinder, "origins.json")
524
525 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
516 ToJson(rFinder, "resfinder.json") #************* 526 ToJson(rFinder, "resfinder.json") #*************
517 527
518 rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** 528 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
519 ToJson(rgiAMR, "rgi.json") #************* 529 ToJson(rgiAMR, "rgi.json") #*************
520 530
521 carbapenamases = [] 531 carbapenamases = []
522 amrGenes = [] 532 amrGenes = []
523 for keys in rFinder: 533 for keys in rFinder:
578 for key in rgiAMR: 588 for key in rgiAMR:
579 output.append(rgiAMR[key].row) 589 output.append(rgiAMR[key].row)
580 590
581 #write summary to a file 591 #write summary to a file
582 summaryDir = outputDir + "/summary/" + ID 592 summaryDir = outputDir + "/summary/" + ID
583 out = open(summaryDir + ".txt", 'w') 593 out = open("summary.txt", 'w')
584 for item in output: 594 for item in output:
585 out.write("%s\n" % item) 595 out.write("%s\n" % item)
586 596
587 597
588 #TSV output 598 #TSV output
621 temp += ";".join(plasmidContigs) + "\t" 631 temp += ";".join(plasmidContigs) + "\t"
622 temp += ";".join(likelyPlasmidContigs) 632 temp += ";".join(likelyPlasmidContigs)
623 tsvOut.append(temp) 633 tsvOut.append(temp)
624 634
625 summaryDir = outputDir + "/summary/" + ID 635 summaryDir = outputDir + "/summary/" + ID
626 out = open(summaryDir + ".tsv", 'w') 636 out = open("summary.tsv", 'w')
627 for item in tsvOut: 637 for item in tsvOut:
628 out.write("%s\n" % item) 638 out.write("%s\n" % item)
629 #endregion 639 #endregion
630 640
631 641