diff cpo_galaxy_prediction.py @ 18:596bf8a792de draft

planemo upload
author jjjjia
date Tue, 28 Aug 2018 15:15:09 -0400
parents a14b12a71a53
children 1543496b2db4
line wrap: on
line diff
--- a/cpo_galaxy_prediction.py	Mon Aug 27 19:58:24 2018 -0400
+++ b/cpo_galaxy_prediction.py	Tue Aug 28 15:15:09 2018 -0400
@@ -42,6 +42,7 @@
     parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
     parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
     parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ")
+    parser.add_options("-d", "--mash", dest="mash", type="string", help="absolute file path to mash plasmiddb result")
 
     #parallelization, useless, these are hard coded to 8cores/64G RAM
     #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
@@ -60,6 +61,7 @@
     expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
     mlstScheme = str(options.mlstScheme).lstrip().rstrip()
     plasmidfinder = str(options.plasmidfinder).lstrip().rstrip()
+    mash = str(options.mash).lstrip().rstrip()
     outputDir = "./"
     print(mlst)
     print(mobfindercontig)
@@ -68,6 +70,8 @@
     print(rgi)
     print(expectedSpecies)
     print(mlstScheme)
+    print(mash)
+
 else:
     curDir = os.getcwd()
     ID = "BC11"
@@ -79,6 +83,7 @@
     expectedSpecies = "Escherichia coli"
     mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
     plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins"
+    mash = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\mash.tsv"
     outputDir = "./"
 
 #region result objects
@@ -190,6 +195,26 @@
         self.source = ""
         self.row = ""
 
+class MashResult(object):
+    def __init__(self):
+        self.size = 0.0
+        self.depth = 0.0
+        self.identity = 0.0
+        self.sharedHashes = ""
+        self.medianMultiplicity = 0
+        self.pvalue = 0.0
+        self.queryID= ""
+        self.queryComment = ""
+        self.species = ""
+        self.row = ""
+        self.accession = ""
+        self.gcf=""
+        self.assembly=""
+
+    def toDict(self): #doesnt actually work
+        return dict((name, getattr(self, name)) for name in dir(self) if not name.startswith('__')) 
+
+
 #endregion
 
 #region useful functions
@@ -476,6 +501,22 @@
         #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
         #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
     return _pFinder
+
+def ParseMashResult(pathToMashScreen):
+    mashScreen = pandas.read_csv(pathToMashScreen, delimiter='\t', header=None)
+
+    _mashPlasmidHits = {} #***********************
+    #parse what the species are.
+    for i in (range(len(mashScreen.index))):
+        mr = MashResult()
+        mr.identity = float(mashScreen.ix[i, 0])
+        mr.sharedHashes = mashScreen.ix[i, 1]
+        mr.medianMultiplicity = int(mashScreen.ix[i, 2])
+        mr.pvalue = float(mashScreen.ix[i, 3])
+        mr.name = mashScreen.ix[i, 4] #accession
+        mr.row = "\t".join(str(x) for x in mashScreen.ix[i].tolist())
+        _mashPlasmidHits[mr.name] = mr
+    return _mashPlasmidHits
 #endregion
 
 def Main():
@@ -531,6 +572,9 @@
     rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
     ToJson(rgiAMR, "rgi.json") #*************
 
+    plasmidFamily = ParseMashResult(mash)
+    ToJson(plasmidFamily, "mash.json")
+
     carbapenamases = [] 
     resfinderCarbas = [] #list of rfinder objects for lindaout list
     amrGenes = []
@@ -613,20 +657,27 @@
     lindaTemp += "\t\t" #sero and kcap
     
     #resfinderCarbas
+    index = 0
     for carbs in resfinderCarbas:
         if (carbs.source == "plasmid"): #
-            lindaTemp += "\t\t\t\t\t" #plasmid 1 rflp plasmid 1 family information. PLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH
+            lindaTemp += "\t"
+            plasmid = plasmidFamily[list(plasmidFamily.keys())[index]]
+            lindaTemp += plasmid.name + "\t"
+            lindaTemp += str(plasmid.identity) + "\t"
+            lindaTemp += plasmid.sharedHashes + "\t"
             lindaTemp += carbs.shortGene + "\t" #found an carbapenase
             contig = carbs.sequence[6:] #this is the contig number
             for i in mSuite.keys():
                 if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid
-                    lindaTemp += mSuite[i].rep_type
+                    clusterid = mSuite[i].cluster_id
+                    rep_types = mSuitePlasmids["plasmid_" + str(clusterid) + ".fasta"].rep_types
+                    lindaTemp += rep_types
     lindaOut.append(lindaTemp)
     out = open("summary.linda.tsv", 'w')
     for item in lindaOut:
         out.write("%s\n" % item)
 
-    tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
+    tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tPlasmid Best Match\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
     #start with ID
     temp = "\t"
     temp += (ID + "\t")
@@ -642,6 +693,7 @@
     temp += ";".join(amrGenes) + "\t"
 
     #lastly plasmids
+    temp += str(plasmidFamily[list(plasmidFamily.keys())[0]].name)
     temp+= str(len(mSuitePlasmids)) + "\t"
     plasmidID = ""
     contigs = ""