Mercurial > repos > jjjjia > cpo_prediction
diff cpo_galaxy_prediction.py @ 18:596bf8a792de draft
planemo upload
author | jjjjia |
---|---|
date | Tue, 28 Aug 2018 15:15:09 -0400 |
parents | a14b12a71a53 |
children | 1543496b2db4 |
line wrap: on
line diff
--- a/cpo_galaxy_prediction.py Mon Aug 27 19:58:24 2018 -0400 +++ b/cpo_galaxy_prediction.py Tue Aug 28 15:15:09 2018 -0400 @@ -42,6 +42,7 @@ parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ") + parser.add_options("-d", "--mash", dest="mash", type="string", help="absolute file path to mash plasmiddb result") #parallelization, useless, these are hard coded to 8cores/64G RAM #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") @@ -60,6 +61,7 @@ expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() mlstScheme = str(options.mlstScheme).lstrip().rstrip() plasmidfinder = str(options.plasmidfinder).lstrip().rstrip() + mash = str(options.mash).lstrip().rstrip() outputDir = "./" print(mlst) print(mobfindercontig) @@ -68,6 +70,8 @@ print(rgi) print(expectedSpecies) print(mlstScheme) + print(mash) + else: curDir = os.getcwd() ID = "BC11" @@ -79,6 +83,7 @@ expectedSpecies = "Escherichia coli" mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins" + mash = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\mash.tsv" outputDir = "./" #region result objects @@ -190,6 +195,26 @@ self.source = "" self.row = "" +class MashResult(object): + def __init__(self): + self.size = 0.0 + self.depth = 0.0 + self.identity = 0.0 + self.sharedHashes = "" + self.medianMultiplicity = 0 + self.pvalue = 0.0 + self.queryID= "" + self.queryComment = "" + self.species = "" + self.row = "" + self.accession = "" + self.gcf="" + self.assembly="" + + def toDict(self): #doesnt actually work + return dict((name, getattr(self, name)) for name in dir(self) if not name.startswith('__')) + + #endregion #region useful functions @@ -476,6 +501,22 @@ #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) return _pFinder + +def ParseMashResult(pathToMashScreen): + mashScreen = pandas.read_csv(pathToMashScreen, delimiter='\t', header=None) + + _mashPlasmidHits = {} #*********************** + #parse what the species are. + for i in (range(len(mashScreen.index))): + mr = MashResult() + mr.identity = float(mashScreen.ix[i, 0]) + mr.sharedHashes = mashScreen.ix[i, 1] + mr.medianMultiplicity = int(mashScreen.ix[i, 2]) + mr.pvalue = float(mashScreen.ix[i, 3]) + mr.name = mashScreen.ix[i, 4] #accession + mr.row = "\t".join(str(x) for x in mashScreen.ix[i].tolist()) + _mashPlasmidHits[mr.name] = mr + return _mashPlasmidHits #endregion def Main(): @@ -531,6 +572,9 @@ rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** ToJson(rgiAMR, "rgi.json") #************* + plasmidFamily = ParseMashResult(mash) + ToJson(plasmidFamily, "mash.json") + carbapenamases = [] resfinderCarbas = [] #list of rfinder objects for lindaout list amrGenes = [] @@ -613,20 +657,27 @@ lindaTemp += "\t\t" #sero and kcap #resfinderCarbas + index = 0 for carbs in resfinderCarbas: if (carbs.source == "plasmid"): # - lindaTemp += "\t\t\t\t\t" #plasmid 1 rflp plasmid 1 family information. PLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH + lindaTemp += "\t" + plasmid = plasmidFamily[list(plasmidFamily.keys())[index]] + lindaTemp += plasmid.name + "\t" + lindaTemp += str(plasmid.identity) + "\t" + lindaTemp += plasmid.sharedHashes + "\t" lindaTemp += carbs.shortGene + "\t" #found an carbapenase contig = carbs.sequence[6:] #this is the contig number for i in mSuite.keys(): if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid - lindaTemp += mSuite[i].rep_type + clusterid = mSuite[i].cluster_id + rep_types = mSuitePlasmids["plasmid_" + str(clusterid) + ".fasta"].rep_types + lindaTemp += rep_types lindaOut.append(lindaTemp) out = open("summary.linda.tsv", 'w') for item in lindaOut: out.write("%s\n" % item) - tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") + tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tPlasmid Best Match\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") #start with ID temp = "\t" temp += (ID + "\t") @@ -642,6 +693,7 @@ temp += ";".join(amrGenes) + "\t" #lastly plasmids + temp += str(plasmidFamily[list(plasmidFamily.keys())[0]].name) temp+= str(len(mSuitePlasmids)) + "\t" plasmidID = "" contigs = ""