comparison cpo_galaxy_prediction.py @ 18:596bf8a792de draft

planemo upload
author jjjjia
date Tue, 28 Aug 2018 15:15:09 -0400
parents a14b12a71a53
children 1543496b2db4
comparison
equal deleted inserted replaced
17:ed3b291693fc 18:596bf8a792de
40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results") 40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results")
41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results") 41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results")
42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") 42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") 43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
44 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ") 44 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ")
45 parser.add_options("-d", "--mash", dest="mash", type="string", help="absolute file path to mash plasmiddb result")
45 46
46 #parallelization, useless, these are hard coded to 8cores/64G RAM 47 #parallelization, useless, these are hard coded to 8cores/64G RAM
47 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") 48 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
48 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") 49 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
49 50
58 abricate = str(options.abricate).lstrip().rstrip() 59 abricate = str(options.abricate).lstrip().rstrip()
59 rgi = str(options.rgi).lstrip().rstrip() 60 rgi = str(options.rgi).lstrip().rstrip()
60 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() 61 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
61 mlstScheme = str(options.mlstScheme).lstrip().rstrip() 62 mlstScheme = str(options.mlstScheme).lstrip().rstrip()
62 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip() 63 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip()
64 mash = str(options.mash).lstrip().rstrip()
63 outputDir = "./" 65 outputDir = "./"
64 print(mlst) 66 print(mlst)
65 print(mobfindercontig) 67 print(mobfindercontig)
66 print(mobfinderaggregate) 68 print(mobfinderaggregate)
67 print(abricate) 69 print(abricate)
68 print(rgi) 70 print(rgi)
69 print(expectedSpecies) 71 print(expectedSpecies)
70 print(mlstScheme) 72 print(mlstScheme)
73 print(mash)
74
71 else: 75 else:
72 curDir = os.getcwd() 76 curDir = os.getcwd()
73 ID = "BC11" 77 ID = "BC11"
74 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst" 78 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst"
75 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt" 79 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt"
77 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp" 81 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp"
78 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt" 82 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt"
79 expectedSpecies = "Escherichia coli" 83 expectedSpecies = "Escherichia coli"
80 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" 84 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
81 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins" 85 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins"
86 mash = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\mash.tsv"
82 outputDir = "./" 87 outputDir = "./"
83 88
84 #region result objects 89 #region result objects
85 #define some objects to store values from results 90 #define some objects to store values from results
86 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). 91 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
187 self.Percentage_Length_of_Reference_Sequence = 0.00 192 self.Percentage_Length_of_Reference_Sequence = 0.00
188 self.ID = "" 193 self.ID = ""
189 self.Model_ID = 0 194 self.Model_ID = 0
190 self.source = "" 195 self.source = ""
191 self.row = "" 196 self.row = ""
197
198 class MashResult(object):
199 def __init__(self):
200 self.size = 0.0
201 self.depth = 0.0
202 self.identity = 0.0
203 self.sharedHashes = ""
204 self.medianMultiplicity = 0
205 self.pvalue = 0.0
206 self.queryID= ""
207 self.queryComment = ""
208 self.species = ""
209 self.row = ""
210 self.accession = ""
211 self.gcf=""
212 self.assembly=""
213
214 def toDict(self): #doesnt actually work
215 return dict((name, getattr(self, name)) for name in dir(self) if not name.startswith('__'))
216
192 217
193 #endregion 218 #endregion
194 219
195 #region useful functions 220 #region useful functions
196 def read(path): 221 def read(path):
474 _pFinder[pf.gene]=pf 499 _pFinder[pf.gene]=pf
475 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) 500 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
476 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) 501 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
477 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) 502 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
478 return _pFinder 503 return _pFinder
504
505 def ParseMashResult(pathToMashScreen):
506 mashScreen = pandas.read_csv(pathToMashScreen, delimiter='\t', header=None)
507
508 _mashPlasmidHits = {} #***********************
509 #parse what the species are.
510 for i in (range(len(mashScreen.index))):
511 mr = MashResult()
512 mr.identity = float(mashScreen.ix[i, 0])
513 mr.sharedHashes = mashScreen.ix[i, 1]
514 mr.medianMultiplicity = int(mashScreen.ix[i, 2])
515 mr.pvalue = float(mashScreen.ix[i, 3])
516 mr.name = mashScreen.ix[i, 4] #accession
517 mr.row = "\t".join(str(x) for x in mashScreen.ix[i].tolist())
518 _mashPlasmidHits[mr.name] = mr
519 return _mashPlasmidHits
479 #endregion 520 #endregion
480 521
481 def Main(): 522 def Main():
482 outputDir = "./" 523 outputDir = "./"
483 notes = [] 524 notes = []
528 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** 569 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
529 ToJson(rFinder, "resfinder.json") #************* 570 ToJson(rFinder, "resfinder.json") #*************
530 571
531 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** 572 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
532 ToJson(rgiAMR, "rgi.json") #************* 573 ToJson(rgiAMR, "rgi.json") #*************
574
575 plasmidFamily = ParseMashResult(mash)
576 ToJson(plasmidFamily, "mash.json")
533 577
534 carbapenamases = [] 578 carbapenamases = []
535 resfinderCarbas = [] #list of rfinder objects for lindaout list 579 resfinderCarbas = [] #list of rfinder objects for lindaout list
536 amrGenes = [] 580 amrGenes = []
537 for keys in rFinder: 581 for keys in rFinder:
611 lindaTemp += str(mlstHit.seqType) + "\t" #seq type 655 lindaTemp += str(mlstHit.seqType) + "\t" #seq type
612 lindaTemp += "\t".join(mlstHit.scheme.split(";")) + "\t"#mlst alleles x 7 656 lindaTemp += "\t".join(mlstHit.scheme.split(";")) + "\t"#mlst alleles x 7
613 lindaTemp += "\t\t" #sero and kcap 657 lindaTemp += "\t\t" #sero and kcap
614 658
615 #resfinderCarbas 659 #resfinderCarbas
660 index = 0
616 for carbs in resfinderCarbas: 661 for carbs in resfinderCarbas:
617 if (carbs.source == "plasmid"): # 662 if (carbs.source == "plasmid"): #
618 lindaTemp += "\t\t\t\t\t" #plasmid 1 rflp plasmid 1 family information. PLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH 663 lindaTemp += "\t"
664 plasmid = plasmidFamily[list(plasmidFamily.keys())[index]]
665 lindaTemp += plasmid.name + "\t"
666 lindaTemp += str(plasmid.identity) + "\t"
667 lindaTemp += plasmid.sharedHashes + "\t"
619 lindaTemp += carbs.shortGene + "\t" #found an carbapenase 668 lindaTemp += carbs.shortGene + "\t" #found an carbapenase
620 contig = carbs.sequence[6:] #this is the contig number 669 contig = carbs.sequence[6:] #this is the contig number
621 for i in mSuite.keys(): 670 for i in mSuite.keys():
622 if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid 671 if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid
623 lindaTemp += mSuite[i].rep_type 672 clusterid = mSuite[i].cluster_id
673 rep_types = mSuitePlasmids["plasmid_" + str(clusterid) + ".fasta"].rep_types
674 lindaTemp += rep_types
624 lindaOut.append(lindaTemp) 675 lindaOut.append(lindaTemp)
625 out = open("summary.linda.tsv", 'w') 676 out = open("summary.linda.tsv", 'w')
626 for item in lindaOut: 677 for item in lindaOut:
627 out.write("%s\n" % item) 678 out.write("%s\n" % item)
628 679
629 tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") 680 tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tPlasmid Best Match\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
630 #start with ID 681 #start with ID
631 temp = "\t" 682 temp = "\t"
632 temp += (ID + "\t") 683 temp += (ID + "\t")
633 temp += expectedSpecies + "\t" 684 temp += expectedSpecies + "\t"
634 685
640 #now onto AMR genes 691 #now onto AMR genes
641 temp += ";".join(carbapenamases) + "\t" 692 temp += ";".join(carbapenamases) + "\t"
642 temp += ";".join(amrGenes) + "\t" 693 temp += ";".join(amrGenes) + "\t"
643 694
644 #lastly plasmids 695 #lastly plasmids
696 temp += str(plasmidFamily[list(plasmidFamily.keys())[0]].name)
645 temp+= str(len(mSuitePlasmids)) + "\t" 697 temp+= str(len(mSuitePlasmids)) + "\t"
646 plasmidID = "" 698 plasmidID = ""
647 contigs = "" 699 contigs = ""
648 lengths = "" 700 lengths = ""
649 rep_type = "" 701 rep_type = ""