Mercurial > repos > jjjjia > cpo_prediction
comparison cpo_galaxy_prediction.py @ 18:596bf8a792de draft
planemo upload
author | jjjjia |
---|---|
date | Tue, 28 Aug 2018 15:15:09 -0400 |
parents | a14b12a71a53 |
children | 1543496b2db4 |
comparison
equal
deleted
inserted
replaced
17:ed3b291693fc | 18:596bf8a792de |
---|---|
40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results") | 40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results") |
41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results") | 41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results") |
42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") | 42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") |
43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") | 43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") |
44 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ") | 44 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ") |
45 parser.add_options("-d", "--mash", dest="mash", type="string", help="absolute file path to mash plasmiddb result") | |
45 | 46 |
46 #parallelization, useless, these are hard coded to 8cores/64G RAM | 47 #parallelization, useless, these are hard coded to 8cores/64G RAM |
47 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") | 48 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") |
48 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") | 49 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") |
49 | 50 |
58 abricate = str(options.abricate).lstrip().rstrip() | 59 abricate = str(options.abricate).lstrip().rstrip() |
59 rgi = str(options.rgi).lstrip().rstrip() | 60 rgi = str(options.rgi).lstrip().rstrip() |
60 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() | 61 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() |
61 mlstScheme = str(options.mlstScheme).lstrip().rstrip() | 62 mlstScheme = str(options.mlstScheme).lstrip().rstrip() |
62 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip() | 63 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip() |
64 mash = str(options.mash).lstrip().rstrip() | |
63 outputDir = "./" | 65 outputDir = "./" |
64 print(mlst) | 66 print(mlst) |
65 print(mobfindercontig) | 67 print(mobfindercontig) |
66 print(mobfinderaggregate) | 68 print(mobfinderaggregate) |
67 print(abricate) | 69 print(abricate) |
68 print(rgi) | 70 print(rgi) |
69 print(expectedSpecies) | 71 print(expectedSpecies) |
70 print(mlstScheme) | 72 print(mlstScheme) |
73 print(mash) | |
74 | |
71 else: | 75 else: |
72 curDir = os.getcwd() | 76 curDir = os.getcwd() |
73 ID = "BC11" | 77 ID = "BC11" |
74 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst" | 78 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst" |
75 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt" | 79 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt" |
77 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp" | 81 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp" |
78 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt" | 82 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt" |
79 expectedSpecies = "Escherichia coli" | 83 expectedSpecies = "Escherichia coli" |
80 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" | 84 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" |
81 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins" | 85 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins" |
86 mash = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\mash.tsv" | |
82 outputDir = "./" | 87 outputDir = "./" |
83 | 88 |
84 #region result objects | 89 #region result objects |
85 #define some objects to store values from results | 90 #define some objects to store values from results |
86 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). | 91 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). |
187 self.Percentage_Length_of_Reference_Sequence = 0.00 | 192 self.Percentage_Length_of_Reference_Sequence = 0.00 |
188 self.ID = "" | 193 self.ID = "" |
189 self.Model_ID = 0 | 194 self.Model_ID = 0 |
190 self.source = "" | 195 self.source = "" |
191 self.row = "" | 196 self.row = "" |
197 | |
198 class MashResult(object): | |
199 def __init__(self): | |
200 self.size = 0.0 | |
201 self.depth = 0.0 | |
202 self.identity = 0.0 | |
203 self.sharedHashes = "" | |
204 self.medianMultiplicity = 0 | |
205 self.pvalue = 0.0 | |
206 self.queryID= "" | |
207 self.queryComment = "" | |
208 self.species = "" | |
209 self.row = "" | |
210 self.accession = "" | |
211 self.gcf="" | |
212 self.assembly="" | |
213 | |
214 def toDict(self): #doesnt actually work | |
215 return dict((name, getattr(self, name)) for name in dir(self) if not name.startswith('__')) | |
216 | |
192 | 217 |
193 #endregion | 218 #endregion |
194 | 219 |
195 #region useful functions | 220 #region useful functions |
196 def read(path): | 221 def read(path): |
474 _pFinder[pf.gene]=pf | 499 _pFinder[pf.gene]=pf |
475 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | 500 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) |
476 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) | 501 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) |
477 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) | 502 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) |
478 return _pFinder | 503 return _pFinder |
504 | |
505 def ParseMashResult(pathToMashScreen): | |
506 mashScreen = pandas.read_csv(pathToMashScreen, delimiter='\t', header=None) | |
507 | |
508 _mashPlasmidHits = {} #*********************** | |
509 #parse what the species are. | |
510 for i in (range(len(mashScreen.index))): | |
511 mr = MashResult() | |
512 mr.identity = float(mashScreen.ix[i, 0]) | |
513 mr.sharedHashes = mashScreen.ix[i, 1] | |
514 mr.medianMultiplicity = int(mashScreen.ix[i, 2]) | |
515 mr.pvalue = float(mashScreen.ix[i, 3]) | |
516 mr.name = mashScreen.ix[i, 4] #accession | |
517 mr.row = "\t".join(str(x) for x in mashScreen.ix[i].tolist()) | |
518 _mashPlasmidHits[mr.name] = mr | |
519 return _mashPlasmidHits | |
479 #endregion | 520 #endregion |
480 | 521 |
481 def Main(): | 522 def Main(): |
482 outputDir = "./" | 523 outputDir = "./" |
483 notes = [] | 524 notes = [] |
528 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** | 569 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** |
529 ToJson(rFinder, "resfinder.json") #************* | 570 ToJson(rFinder, "resfinder.json") #************* |
530 | 571 |
531 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** | 572 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** |
532 ToJson(rgiAMR, "rgi.json") #************* | 573 ToJson(rgiAMR, "rgi.json") #************* |
574 | |
575 plasmidFamily = ParseMashResult(mash) | |
576 ToJson(plasmidFamily, "mash.json") | |
533 | 577 |
534 carbapenamases = [] | 578 carbapenamases = [] |
535 resfinderCarbas = [] #list of rfinder objects for lindaout list | 579 resfinderCarbas = [] #list of rfinder objects for lindaout list |
536 amrGenes = [] | 580 amrGenes = [] |
537 for keys in rFinder: | 581 for keys in rFinder: |
611 lindaTemp += str(mlstHit.seqType) + "\t" #seq type | 655 lindaTemp += str(mlstHit.seqType) + "\t" #seq type |
612 lindaTemp += "\t".join(mlstHit.scheme.split(";")) + "\t"#mlst alleles x 7 | 656 lindaTemp += "\t".join(mlstHit.scheme.split(";")) + "\t"#mlst alleles x 7 |
613 lindaTemp += "\t\t" #sero and kcap | 657 lindaTemp += "\t\t" #sero and kcap |
614 | 658 |
615 #resfinderCarbas | 659 #resfinderCarbas |
660 index = 0 | |
616 for carbs in resfinderCarbas: | 661 for carbs in resfinderCarbas: |
617 if (carbs.source == "plasmid"): # | 662 if (carbs.source == "plasmid"): # |
618 lindaTemp += "\t\t\t\t\t" #plasmid 1 rflp plasmid 1 family information. PLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH | 663 lindaTemp += "\t" |
664 plasmid = plasmidFamily[list(plasmidFamily.keys())[index]] | |
665 lindaTemp += plasmid.name + "\t" | |
666 lindaTemp += str(plasmid.identity) + "\t" | |
667 lindaTemp += plasmid.sharedHashes + "\t" | |
619 lindaTemp += carbs.shortGene + "\t" #found an carbapenase | 668 lindaTemp += carbs.shortGene + "\t" #found an carbapenase |
620 contig = carbs.sequence[6:] #this is the contig number | 669 contig = carbs.sequence[6:] #this is the contig number |
621 for i in mSuite.keys(): | 670 for i in mSuite.keys(): |
622 if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid | 671 if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid |
623 lindaTemp += mSuite[i].rep_type | 672 clusterid = mSuite[i].cluster_id |
673 rep_types = mSuitePlasmids["plasmid_" + str(clusterid) + ".fasta"].rep_types | |
674 lindaTemp += rep_types | |
624 lindaOut.append(lindaTemp) | 675 lindaOut.append(lindaTemp) |
625 out = open("summary.linda.tsv", 'w') | 676 out = open("summary.linda.tsv", 'w') |
626 for item in lindaOut: | 677 for item in lindaOut: |
627 out.write("%s\n" % item) | 678 out.write("%s\n" % item) |
628 | 679 |
629 tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") | 680 tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tPlasmid Best Match\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") |
630 #start with ID | 681 #start with ID |
631 temp = "\t" | 682 temp = "\t" |
632 temp += (ID + "\t") | 683 temp += (ID + "\t") |
633 temp += expectedSpecies + "\t" | 684 temp += expectedSpecies + "\t" |
634 | 685 |
640 #now onto AMR genes | 691 #now onto AMR genes |
641 temp += ";".join(carbapenamases) + "\t" | 692 temp += ";".join(carbapenamases) + "\t" |
642 temp += ";".join(amrGenes) + "\t" | 693 temp += ";".join(amrGenes) + "\t" |
643 | 694 |
644 #lastly plasmids | 695 #lastly plasmids |
696 temp += str(plasmidFamily[list(plasmidFamily.keys())[0]].name) | |
645 temp+= str(len(mSuitePlasmids)) + "\t" | 697 temp+= str(len(mSuitePlasmids)) + "\t" |
646 plasmidID = "" | 698 plasmidID = "" |
647 contigs = "" | 699 contigs = "" |
648 lengths = "" | 700 lengths = "" |
649 rep_type = "" | 701 rep_type = "" |