cpo_prediction: cpo_galaxy_prediction.py comparison

comparison cpo_galaxy_prediction.py @ 18:596bf8a792de draft

planemo upload

author	jjjjia
date	Tue, 28 Aug 2018 15:15:09 -0400
parents	a14b12a71a53
children	1543496b2db4

comparison

equal deleted inserted replaced

-:ed3b291693fc
+:596bf8a792de
 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results")
 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results")
 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ")
+parser.add_options("-d", "--mash", dest="mash", type="string", help="absolute file path to mash plasmiddb result")
 #parallelization, useless, these are hard coded to 8cores/64G RAM
 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
 abricate = str(options.abricate).lstrip().rstrip()
 rgi = str(options.rgi).lstrip().rstrip()
 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
 mlstScheme = str(options.mlstScheme).lstrip().rstrip()
 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip()
+mash = str(options.mash).lstrip().rstrip()
 outputDir = "./"
 print(mlst)
 print(mobfindercontig)
 print(mobfinderaggregate)
 print(abricate)
 print(rgi)
 print(expectedSpecies)
 print(mlstScheme)
+print(mash)
 else:
 curDir = os.getcwd()
 ID = "BC11"
 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst"
 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt"
 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp"
 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt"
 expectedSpecies = "Escherichia coli"
 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins"
+mash = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\mash.tsv"
 outputDir = "./"
 #region result objects
 #define some objects to store values from results
 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
 self.Percentage_Length_of_Reference_Sequence	= 0.00
 self.ID	= ""
 self.Model_ID = 0
 self.source = ""
 self.row = ""
+class MashResult(object):
+def __init__(self):
+self.size = 0.0
+self.depth = 0.0
+self.identity = 0.0
+self.sharedHashes = ""
+self.medianMultiplicity = 0
+self.pvalue = 0.0
+self.queryID= ""
+self.queryComment = ""
+self.species = ""
+self.row = ""
+self.accession = ""
+self.gcf=""
+self.assembly=""
+def toDict(self): #doesnt actually work
+return dict((name, getattr(self, name)) for name in dir(self) if not name.startswith('__'))
 #endregion
 #region useful functions
 def read(path):
 _pFinder[pf.gene]=pf
 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
 return _pFinder
+def ParseMashResult(pathToMashScreen):
+mashScreen = pandas.read_csv(pathToMashScreen, delimiter='\t', header=None)
+_mashPlasmidHits = {} #***********************
+#parse what the species are.
+for i in (range(len(mashScreen.index))):
+mr = MashResult()
+mr.identity = float(mashScreen.ix[i, 0])
+mr.sharedHashes = mashScreen.ix[i, 1]
+mr.medianMultiplicity = int(mashScreen.ix[i, 2])
+mr.pvalue = float(mashScreen.ix[i, 3])
+mr.name = mashScreen.ix[i, 4] #accession
+mr.row = "\t".join(str(x) for x in mashScreen.ix[i].tolist())
+_mashPlasmidHits[mr.name] = mr
+return _mashPlasmidHits
 #endregion
 def Main():
 outputDir = "./"
 notes = []
 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
 ToJson(rFinder, "resfinder.json") #*************
 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
 ToJson(rgiAMR, "rgi.json") #*************
+plasmidFamily = ParseMashResult(mash)
+ToJson(plasmidFamily, "mash.json")
 carbapenamases = []
 resfinderCarbas = [] #list of rfinder objects for lindaout list
 amrGenes = []
 for keys in rFinder:
 lindaTemp += str(mlstHit.seqType)  + "\t" #seq type
 lindaTemp += "\t".join(mlstHit.scheme.split(";")) + "\t"#mlst alleles x 7
 lindaTemp += "\t\t" #sero and kcap
 #resfinderCarbas
+index = 0
 for carbs in resfinderCarbas:
 if (carbs.source == "plasmid"): #
-lindaTemp += "\t\t\t\t\t" #plasmid 1 rflp plasmid 1 family information. PLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH
+lindaTemp += "\t"
+plasmid = plasmidFamily[list(plasmidFamily.keys())[index]]
+lindaTemp += plasmid.name + "\t"
+lindaTemp += str(plasmid.identity) + "\t"
+lindaTemp += plasmid.sharedHashes + "\t"
 lindaTemp += carbs.shortGene + "\t" #found an carbapenase
 contig = carbs.sequence[6:] #this is the contig number
 for i in mSuite.keys():
 if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid
-lindaTemp += mSuite[i].rep_type
+clusterid = mSuite[i].cluster_id
+rep_types = mSuitePlasmids["plasmid_" + str(clusterid) + ".fasta"].rep_types
+lindaTemp += rep_types
 lindaOut.append(lindaTemp)
 out = open("summary.linda.tsv", 'w')
 for item in lindaOut:
 out.write("%s\n" % item)
-tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
+tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tPlasmid Best Match\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
 #start with ID
 temp = "\t"
 temp += (ID + "\t")
 temp += expectedSpecies + "\t"
 #now onto AMR genes
 temp += ";".join(carbapenamases) + "\t"
 temp += ";".join(amrGenes) + "\t"
 #lastly plasmids
+temp += str(plasmidFamily[list(plasmidFamily.keys())[0]].name)
 temp+= str(len(mSuitePlasmids)) + "\t"
 plasmidID = ""
 contigs = ""
 lengths = ""
 rep_type = ""

Mercurial > repos > jjjjia > cpo_prediction

comparison cpo_galaxy_prediction.py @ 18:596bf8a792de draft