diff cpo_galaxy_prediction.py @ 3:e6027598a35c draft

planemo upload
author: jjjjia
date: Mon, 20 Aug 2018 17:53:59 -0400
parents: 29302ffdf137
children: bd6f5844d60e
--- a/cpo_galaxy_prediction.py	Mon Aug 20 14:19:56 2018 -0400
+++ b/cpo_galaxy_prediction.py	Mon Aug 20 17:53:59 2018 -0400
@@ -11,8 +11,7 @@
 #$ -m ea
 #$ -M bja20@sfu.ca
 
-#~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" 
-
+#./prediction.py -i ~/testCases/cpoResults/contigs/BC11-Kpn005_S2.fa -m ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.mlst -c ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/contig_report.txt -f ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.cp -r ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.rgi.txt -e "Klebsiella"
 import subprocess
 import pandas
 import optparse
@@ -27,27 +26,23 @@
 import numpy
 
 
-debug = False #True #debug skips the shell scripts and also dump out a ton of debugging messages
+debug = True #debug skips the shell scripts and also dumps out a ton of debugging messages
 
 if not debug:
     #parses some parameters
     parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
     #required
+    #MLSTHIT, mobsuite, resfinder, rgi, mlstscheme
     parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate")    
-    parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta")
-    parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database")
-    parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder")
+    parser.add_option("-m", "--mlst", dest="mlst", type="string", help="absolute file path to mlst result")
+    parser.add_option("-c", "--mobfinderContig", dest="mobfinderContig", type="string", help="absolute path to mobfinder contig report")
+    parser.add_option("-f", "--mobfinderAggregate", dest="mobfinderAggregate", type="string", help="absolute path to mobfinder aggregate (mobtyper) report")
+    parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results")
+    parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results")
     parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
+    parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
+    parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ")
 
-    #optionals
-    parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder")
-    parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database")
-    parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update")
-    parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST")
-    #used for parsing 
-    parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
-
-    
     #parallelization, useless, these are hard coded to 8cores/64G RAM
     #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
     #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
@@ -56,33 +51,35 @@
     #if len(args) != 8:
         #parser.error("incorrect number of arguments, all 7 is required")
     curDir = os.getcwd()
-    outputDir = options.output
-    expectedSpecies = options.expectedSpecies
-    mlstScheme = options.mlst
-    tempDir = outputDir + "/shovillTemp"
-    scriptDir = options.scriptDir
-    updateAbName = options.updateAbName
-    updateAbPath = options.updateAbPath
-    updateMLST = options.updateMLST
-    cardDB=options.cardDB
-    assemblyPath=options.assemblyPath
-    ID = options.id
+    ID = str(options.id).lstrip().rstrip()
+    mlst = str(options.mlst).lstrip().rstrip()
+    mobfindercontig = str(options.mobfinderContig).lstrip().rstrip()
+    mobfinderaggregate = str(options.mobfinderAggregate).lstrip().rstrip()
+    abricate = str(options.abricate).lstrip().rstrip()
+    rgi = str(options.rgi).lstrip().rstrip()
+    expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
+    mlstScheme = str(options.mlstScheme).lstrip().rstrip()
+    plasmidfinder = str(options.plasmidfinder).lstrip().rstrip()
+    outputDir = "./"
+    print(mlst)
+    print(mobfindercontig)
+    print(mobfinderaggregate)
+    print(abricate)
+    print(rgi)
+    print(expectedSpecies)
+    print(mlstScheme)
 else:
-    manifestDir = ""
     curDir = os.getcwd()
-    outputDir = "pipelineTest"
+    ID = "BC11"
+    mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst"
+    mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt"
+    mobfinderaggregate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\mobtyper_aggregate_report.txt"
+    abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp"
+    rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt"
     expectedSpecies = "Escherichia coli"
-    #threads = 8
-    #memory = 64
-    mlstScheme = outputDir + "/scheme_species_map.tab"
-    tempDir= outputDir + "/shovillTemp"    
-    scriptDir = "~/scripts"
-    updateAbName = "cpo"
-    updateAbPath = "~/database/bccdcCPO.seq"
-    updateMLST = True
-    assemblyPath = "./"
-    cardDB = "./"
-    ID = "BC11-Kpn005_S2"
+    mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
+    plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins"
+    outputDir = "./"
 
 #region result objects
 #define some objects to store values from results
@@ -164,6 +161,7 @@
         self.mash_neighbor_distance	= 0.00
         self.mash_neighbor_cluster= 0
         self.row = ""
+
 class RGIResult(object):
     def __init__(self):
         self.ORF_ID	= ""
@@ -233,17 +231,18 @@
             out.write(gzContent)
         return True
 def ToJson(dictObject, outputPath):
-    outDir = outputDir + '/summary/' + ID + ".json/"
-    if not (os.path.exists(outDir)):
-        os.makedirs(outDir)
-    with open(outDir + outputPath, 'w') as f:
-      json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
+    #outDir = outputDir + '/summary/' + ID + ".json/"
+    #if not (os.path.exists(outDir)):
+        #os.makedirs(outDir)
+    #with open(outputPath, 'w') as f:
+      #json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
+    return ""
 #endregion
 
 #region functions to parse result files
-def ParseMLSTResult(pathToMLSTResult):
+def ParseMLSTResult(pathToMLSTResult, scheme):
     _mlstResult = {}
-    scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0)
+    scheme = pandas.read_csv(scheme, delimiter='\t', header=0)
     scheme = scheme.replace(numpy.nan, '', regex=True)
 
     taxon = {}
@@ -442,47 +441,55 @@
             r.source = "likely chromosome"
         _rgiR[r.Model_ID]=r
     return _rgiR
+
+def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
+    #pipelineTest/contigs/BC110-Kpn005.fa	contig00019	45455	45758	IncFIC(FII)_1	8-308/499	========/=.....	8/11	59.52	75.65	plasmidfinder	AP001918	IncFIC(FII)_1__AP001918
+    #example resfinder:
+    #pipelineTest/contigs/BC110-Kpn005.fa	contig00038	256	1053	OXA-181	1-798/798	===============	0/0	100.00	100.00	bccdc	AEP16366.1	  OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181 
+
+    _pFinder = {} #***********************
+    plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
+
+    for i in range(len(plasmidFinder.index)):
+        pf = starFinders()
+        pf.file = str(plasmidFinder.iloc[i,0])
+        pf.sequence = str(plasmidFinder.iloc[i,1])
+        pf.start = int(plasmidFinder.iloc[i,2])
+        pf.end = int(plasmidFinder.iloc[i,3])
+        pf.gene = str(plasmidFinder.iloc[i,4])
+        pf.shortGene = pf.gene[:pf.gene.index("_")]
+        pf.coverage = str(plasmidFinder.iloc[i,5])
+        pf.coverage_map = str(plasmidFinder.iloc[i,6])
+        pf.gaps = str(plasmidFinder.iloc[i,7])
+        pf.pCoverage = float(plasmidFinder.iloc[i,8])
+        pf.pIdentity = float(plasmidFinder.iloc[i,9])
+        pf.database = str(plasmidFinder.iloc[i,10])
+        pf.accession = str(plasmidFinder.iloc[i,11])
+        pf.product = str(plasmidFinder.iloc[i,12])
+        pf.source = "plasmid"
+        pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
+        _pFinder[pf.gene]=pf
+        #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
+        #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
+        #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
+    return _pFinder
 #endregion
 
 def Main():
+    outputDir = "./"
     notes = []
     #init the output list
     output = []
     jsonOutput = []
 
-    print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
-    output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
-
-    #region update databases if update=true
-    if not debug:
-        #update databases if necessary
-        if not (updateAbPath == "" and updateAbName == "default"):
-            print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath)
-            cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName]
-            update = execute(cmd)
-        if (updateMLST.lower() == "true"):
-            print("updating mlst database... ")
-            cmd = [scriptDir + "/pipeline_updateMLST.sh"]
-            update = execute(cmd)
-    #endregion
-
-    print("step 3: parsing the mlst results")
-
-    print("performing MLST")
-    #region call the mlst script
-    if not debug:
-        print("running pipeline_prediction.sh")
-        #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json
-        cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB]
-        result = execute(cmd)
-    #endregion
+    print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID)
+    output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID)
 
     #region parse the mlst results
     print("step 3: parsing mlst, plasmid, and amr results")
     
     print("identifying MLST")    
-    pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst"
-    mlstHit = ParseMLSTResult(pathToMLSTScheme)#***********************
+    mlstHit = ParseMLSTResult(mlst, str(mlstScheme))#***********************
     ToJson(mlstHit, "mlst.json") #write it to a json output
     mlstHit = list(mlstHit.values())[0]
 
@@ -496,9 +503,9 @@
     origins = []
 
     #parse mobsuite results
-    mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
+    mSuite = ParseMobsuiteResult(mobfindercontig) #outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
     ToJson(mSuite, "mobsuite.json") #*************
-    mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
+    mSuitePlasmids = ParseMobsuitePlasmids(mobfinderaggregate)#outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
     ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #*************
 
     for key in mSuite:
@@ -512,10 +519,13 @@
             origins.append(mSuite[key].rep_type)
 
     #parse resfinder AMR results
-    rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
+    pFinder = ParsePlasmidFinderResult(plasmidfinder)
+    ToJson(pFinder, "origins.json")
+
+    rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
     ToJson(rFinder, "resfinder.json") #*************
 
-    rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
+    rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
     ToJson(rgiAMR, "rgi.json") #*************
 
     carbapenamases = []
@@ -580,7 +590,7 @@
 
     #write summary to a file
     summaryDir = outputDir + "/summary/" + ID
-    out = open(summaryDir + ".txt", 'w')
+    out = open("summary.txt", 'w')
     for item in output:
         out.write("%s\n" % item)
 
@@ -623,7 +633,7 @@
     tsvOut.append(temp)
 
     summaryDir = outputDir + "/summary/" + ID
-    out = open(summaryDir + ".tsv", 'w')
+    out = open("summary.tsv", 'w')
     for item in tsvOut:
         out.write("%s\n" % item)
     #endregion
@@ -635,4 +645,4 @@
 Main()
 
 end = time.time()
-print("Finished!\nThe analysis used: " + str(end-start) + " seconds")
+print("Finished!\nThe analysis used: " + str(end-start) + " seconds")
\ No newline at end of file
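
For reference, after this change the script no longer runs assembly, database updates, or the prediction shell pipeline itself; it only parses result files produced upstream and writes summary.txt/summary.tsv to the working directory. A full invocation including the newly added -s (MLST scheme map) and -p (plasmidfinder result) options would look roughly like the line below; the file names are illustrative, following the example and debug paths in the diff, so substitute your own outputs:

    ./prediction.py -i BC11-Kpn005_S2 -m BC11-Kpn005_S2.mlst -c BC11-Kpn005_S2.recon/contig_report.txt -f BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a BC11-Kpn005_S2.cp -r BC11-Kpn005_S2.rgi.txt -p BC11-Kpn005_S2.origins -s scheme_species_map.tab -e "Klebsiella"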