diff cpo_galaxy_prediction.py @ 1:fea89c4d5227 draft

Uploaded
author jjjjia
date Thu, 16 Aug 2018 19:27:05 -0400
parents
children 29302ffdf137
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpo_galaxy_prediction.py	Thu Aug 16 19:27:05 2018 -0400
@@ -0,0 +1,638 @@
+#!/home/jjjjia/.conda/envs/py36/bin/python
+
+#$ -S /home/jjjjia/.conda/envs/py36/bin/python
+#$ -V             # Pass environment variables to the job
+#$ -N CPO_pipeline    # Replace with a more specific job name
+#$ -wd /home/jjjjia/testCases           # Use the current working dir
+#$ -pe smp 8      # Parallel Environment (how many cores)
+#$ -l h_vmem=11G  # Memory (RAM) allocation *per core*
+#$ -e ./logs/$JOB_ID.err
+#$ -o ./logs/$JOB_ID.log
+#$ -m ea
+#$ -M bja20@sfu.ca
+
+#~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" 
+
+import subprocess
+import pandas
+import optparse
+import os
+import datetime
+import sys
+import time
+import urllib.request
+import gzip
+import collections
+import json
+import numpy
+
+
+# Run-time configuration.  When debug is False every parameter is taken from
+# the command line via optparse; when True, hard-coded test values are used
+# and the shell-script stages are skipped.
+debug = True #debug skips the shell scripts and also dump out a ton of debugging messages
+
+# NOTE(review): debug is hard-coded to True, so the optparse branch below is
+# never taken as committed; flip to False for real pipeline runs.
+if not debug:
+    #parses some parameters
+    parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
+    #required
+    parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate")    
+    parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta")
+    parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database")
+    parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder")
+    parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
+
+    #optionals
+    parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder")
+    parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database")
+    parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update")
+    parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST")
+    #used for parsing 
+    parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
+
+    
+    #parallelization, useless, these are hard coded to 8cores/64G RAM
+    #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
+    #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
+
+    (options,args) = parser.parse_args()
+    #if len(args) != 8:
+        #parser.error("incorrect number of arguments, all 7 is required")
+    curDir = os.getcwd()
+    outputDir = options.output
+    expectedSpecies = options.expectedSpecies
+    mlstScheme = options.mlst
+    tempDir = outputDir + "/shovillTemp"
+    scriptDir = options.scriptDir
+    updateAbName = options.updateAbName
+    updateAbPath = options.updateAbPath
+    updateMLST = options.updateMLST
+    cardDB=options.cardDB
+    assemblyPath=options.assemblyPath
+    ID = options.id
+else:
+    # Hard-coded values used while debugging; paths are developer-specific.
+    manifestDir = ""
+    curDir = os.getcwd()
+    outputDir = "pipelineTest"
+    expectedSpecies = "Escherichia coli"
+    #threads = 8
+    #memory = 64
+    mlstScheme = outputDir + "/scheme_species_map.tab"
+    tempDir= outputDir + "/shovillTemp"    
+    scriptDir = "~/scripts"
+    updateAbName = "cpo"
+    updateAbPath = "~/database/bccdcCPO.seq"
+    # NOTE(review): updateMLST is a bool here but a string ("False") in the
+    # optparse branch above; Main() calls updateMLST.lower(), which only works
+    # on the string form — harmless today because that call is guarded by
+    # "if not debug", but a latent type inconsistency.
+    updateMLST = True
+    assemblyPath = "./"
+    cardDB = "./"
+    ID = "BC11-Kpn005_S2"
+
+#region result objects
+#define some objects to store values from results
+#//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
+class starFinders(object):
+    """Holder for one row of an abricate-style hit table; used for both the
+    plasmidfinder and resfinder outputs.  Fields mirror the TSV columns."""
+    def __init__(self):
+        self.file = ""
+        self.sequence = ""
+        self.start = 0
+        self.end = 0
+        self.gene = ""
+        self.shortGene = ""
+        self.coverage = ""
+        self.coverage_map = ""
+        self.gaps = ""
+        self.pCoverage = 100.00
+        self.pIdentity = 100.00
+        self.database = ""
+        self.accession = ""
+        self.product = ""
+        self.source = "chromosome"  # parsers overwrite with "plasmid"/"likely plasmid"/"likely chromosome"
+        self.row = ""  # raw tab-joined copy of the source row
+
+class PlasFlowResult(object):
+    """Holder for one PlasFlow contig-classification row.
+
+    NOTE(review): nothing in this file populates or reads it; presumably used
+    by a sibling script or kept for a planned parser — confirm before removal.
+    """
+    def __init__(self):
+        self.sequence = ""
+        self.length = 0
+        self.label = ""
+        self.confidence = 0
+        self.usefulRow = ""
+        self.row = ""
+
+class MlstResult(object):
+    """Holder for the single row of a `mlst` tool result file."""
+    def __init__(self):
+        self.file = ""
+        self.speciesID = ""
+        self.seqType = 0
+        self.scheme = ""  # allele calls joined with ';' by ParseMLSTResult
+        self.species = ""
+        self.row=""  # raw tab-joined copy of the source row
+
+class mobsuiteResult(object):
+    """Holder for one row of mob-suite's per-contig report (contig_report.txt);
+    fields mirror the report columns."""
+    def __init__(self):
+        self.file_id = ""
+        self.cluster_id	= ""
+        self.contig_id	= ""
+        self.contig_num = 0  # NOTE(review): ParseMobsuiteResult stores a str slice here, not an int
+        self.contig_length	= 0
+        self.circularity_status	= ""
+        self.rep_type	= ""
+        self.rep_type_accession = ""	
+        self.relaxase_type	= ""
+        self.relaxase_type_accession = ""	
+        self.mash_nearest_neighbor	 = ""
+        self.mash_neighbor_distance	= 0.00
+        self.repetitive_dna_id	= ""
+        self.match_type	= ""
+        self.score	= 0
+        self.contig_match_start	= 0
+        self.contig_match_end = 0
+        self.row = ""  # raw tab-joined copy of the source row
+
+class mobsuitePlasmids(object):
+    """Holder for one row of mob-suite's aggregate plasmid report
+    (mobtyper_aggregate_report.txt); fields mirror the report columns."""
+    def __init__(self):
+        self.file_id = ""
+        self.num_contigs = 0
+        self.total_length = 0
+        self.gc = ""  # NOTE(review): ParseMobsuitePlasmids stores int() here despite the str default
+        self.rep_types = ""
+        self.rep_typeAccession = ""
+        self.relaxase_type= ""
+        self.relaxase_type_accession	= ""
+        self.mpf_type	= ""
+        self.mpf_type_accession= ""	
+        self.orit_type	= ""
+        self.orit_accession	= ""
+        self.PredictedMobility	= ""
+        self.mash_nearest_neighbor	= ""
+        self.mash_neighbor_distance	= 0.00
+        self.mash_neighbor_cluster= 0
+        self.row = ""  # raw tab-joined copy of the source row
+class RGIResult(object):
+    """Holder for one row of an RGI (CARD) AMR prediction TSV.
+
+    NOTE(review): ParseRGIResult also assigns a Contig_Num attribute that is
+    not initialised here — add it for consistency with the other holders.
+    """
+    def __init__(self):
+        self.ORF_ID	= ""
+        self.Contig	= ""
+        self.Start	= -1
+        self.Stop	= -1
+        self.Orientation = ""	
+        self.Cut_Off	= ""
+        self.Pass_Bitscore	= 100000
+        self.Best_Hit_Bitscore	= 0.00
+        self.Best_Hit_ARO	= ""
+        self.Best_Identities	= 0.00
+        self.ARO = 0
+        self.Model_type	= ""
+        self.SNPs_in_Best_Hit_ARO	= ""
+        self.Other_SNPs	= ""
+        self.Drug_Class	= ""
+        self.Resistance_Mechanism	= ""
+        self.AMR_Gene_Family	= ""
+        self.Predicted_DNA	= ""
+        self.Predicted_Protein	= ""
+        self.CARD_Protein_Sequence	= ""
+        self.Percentage_Length_of_Reference_Sequence	= 0.00
+        self.ID	= ""
+        self.Model_ID = 0
+        self.source = ""  # set to "plasmid"/"likely plasmid"/"likely chromosome" by ParseRGIResult
+        self.row = ""  # raw tab-joined copy of the source row
+
+#endregion
+
+#region useful functions
+def read(path):
+    """Return the file at *path* as a list of lines, trailing newlines stripped.
+
+    NOTE(review): the open() handle is never closed explicitly; it is only
+    reclaimed by the garbage collector.
+    """
+    return [line.rstrip('\n') for line in open(path)]
+def execute(command):
+    """Run *command* (an argv list, shell=False) in the global curDir, echoing
+    its combined stdout/stderr line-by-line to our stdout as it is produced.
+
+    Returns whatever communicate() yields after the stream has been drained
+    (normally an empty string) and raises subprocess.CalledProcessError on a
+    non-zero exit code.
+    """
+    process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    # Poll process for new output until finished
+    while True:
+        nextline = process.stdout.readline()
+        if nextline == '' and process.poll() is not None:
+            break
+        sys.stdout.write(nextline)
+        sys.stdout.flush()
+
+    output = process.communicate()[0]
+    exitCode = process.returncode
+
+    if (exitCode == 0):
+        return output
+    else:
+        raise subprocess.CalledProcessError(exitCode, command)
+def httpGetFile(url, filepath=""):
+    """Download *url*: with no *filepath*, return urlretrieve()'s
+    (temp-file path, headers) tuple; otherwise save to *filepath* and
+    return True."""
+    if (filepath == ""):
+        return urllib.request.urlretrieve(url)
+    else:
+        urllib.request.urlretrieve(url, filepath)
+        return True
+def gunzip(inputpath="", outputpath=""):
+    """Decompress a gzip file: return the raw decompressed bytes when no
+    *outputpath* is given, otherwise write them to *outputpath* and
+    return True.  The whole file is read into memory either way."""
+    if (outputpath == ""):
+        with gzip.open(inputpath, 'rb') as f:
+            gzContent = f.read()
+        return gzContent
+    else:
+        with gzip.open(inputpath, 'rb') as f:
+            gzContent = f.read()
+        with open(outputpath, 'wb') as out:
+            out.write(gzContent)
+        return True
+def ToJson(dictObject, outputPath):
+    """Dump the result objects in *dictObject* (a dict of parser result
+    objects) as a JSON list of their __dict__s to
+    <outputDir>/summary/<ID>.json/<outputPath>, creating the directory
+    on demand.
+
+    NOTE(review): *outputPath* is only the file name; the directory comes
+    from the module-level outputDir and ID globals.
+    """
+    outDir = outputDir + '/summary/' + ID + ".json/"
+    if not (os.path.exists(outDir)):
+        os.makedirs(outDir)
+    with open(outDir + outputPath, 'w') as f:
+      json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
+#endregion
+
+#region functions to parse result files
+def ParseMLSTResult(pathToMLSTResult):
+    """Parse a `mlst` tool result file into a one-entry dict keyed by the
+    scheme/species ID.
+
+    Builds a schemeID -> species-name lookup from the global mlstScheme TSV,
+    then reads the first (only) row of the mlst output into an MlstResult.
+    """
+    _mlstResult = {}
+    scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0)
+    scheme = scheme.replace(numpy.nan, '', regex=True)
+
+    taxon = {}
+    #record the scheme as a dictionary
+    taxon["-"] = "No MLST Match"
+    for i in range(len(scheme.index)):
+        key = scheme.iloc[i,0]
+        # NOTE(review): NaNs were already replaced with '' above, so this
+        # "nan" comparison never matches; the else branch always runs and
+        # appends a trailing space when column 2 is empty.
+        if (str(scheme.iloc[i,2]) == "nan"):
+            value = str(scheme.iloc[i,1])
+        else:
+            value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2])
+        
+        # A scheme ID seen more than once accumulates its names ';'-separated.
+        if (key in taxon.keys()):
+            taxon[key] = taxon.get(key) + ";" + value
+        else:
+            taxon[key] = value
+    #read in the mlst result
+    mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None)
+    _mlstHit = MlstResult()
+
+    _mlstHit.file = mlst.iloc[0,0]
+    _mlstHit.speciesID = (mlst.iloc[0,1])
+    _mlstHit.seqType = str(mlst.iloc[0,2])
+    for i in range(3, len(mlst.columns)):
+        _mlstHit.scheme += mlst.iloc[0,i] + ";"
+    _mlstHit.species = taxon[_mlstHit.speciesID]
+    # NOTE(review): DataFrame.ix is deprecated and removed in pandas >= 1.0;
+    # .iloc is the modern equivalent here (same throughout this file).
+    _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist())
+    _mlstResult[_mlstHit.speciesID]=_mlstHit
+
+    return _mlstResult
+
+def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
+    """Parse an abricate plasmidfinder TSV into a dict of starFinders keyed by
+    the full gene name; every hit is tagged source="plasmid"."""
+    #pipelineTest/contigs/BC110-Kpn005.fa	contig00019	45455	45758	IncFIC(FII)_1	8-308/499	========/=.....	8/11	59.52	75.65	plasmidfinder	AP001918	IncFIC(FII)_1__AP001918
+    #example resfinder:
+    #pipelineTest/contigs/BC110-Kpn005.fa	contig00038	256	1053	OXA-181	1-798/798	===============	0/0	100.00	100.00	bccdc	AEP16366.1	  OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181 
+
+    _pFinder = {} #***********************
+    plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
+    plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True)
+
+
+    for i in range(len(plasmidFinder.index)):
+        pf = starFinders()
+        pf.file = str(plasmidFinder.iloc[i,0])
+        pf.sequence = str(plasmidFinder.iloc[i,1])
+        pf.start = int(plasmidFinder.iloc[i,2])
+        pf.end = int(plasmidFinder.iloc[i,3])
+        pf.gene = str(plasmidFinder.iloc[i,4])
+        # NOTE(review): assumes every gene name contains "_" (as in the
+        # examples above); str.index raises ValueError otherwise.
+        pf.shortGene = pf.gene[:pf.gene.index("_")]
+        pf.coverage = str(plasmidFinder.iloc[i,5])
+        pf.coverage_map = str(plasmidFinder.iloc[i,6])
+        pf.gaps = str(plasmidFinder.iloc[i,7])
+        pf.pCoverage = float(plasmidFinder.iloc[i,8])
+        pf.pIdentity = float(plasmidFinder.iloc[i,9])
+        pf.database = str(plasmidFinder.iloc[i,10])
+        pf.accession = str(plasmidFinder.iloc[i,11])
+        pf.product = str(plasmidFinder.iloc[i,12])
+        pf.source = "plasmid"
+        pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())  # NOTE(review): .ix removed in pandas >= 1.0
+        _pFinder[pf.gene]=pf
+        #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
+        #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
+        #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
+    return _pFinder
+
+def ParseMobsuiteResult(pathToMobsuiteResult):
+    """Parse mob-suite's contig_report.txt into a dict of mobsuiteResult keyed
+    by contig_id, keeping only rows whose cluster is not "chromosome".
+
+    NOTE(review): the `break` on the first "chromosome" row assumes all
+    plasmid rows precede chromosome rows in the report — confirm against
+    mob_recon's output ordering; `continue` would be the safer choice.
+    """
+    _mobsuite = {}
+    mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
+    mResult = mResult.replace(numpy.nan, '', regex=True)
+
+    for i in range(len(mResult.index)):
+        mr = mobsuiteResult()
+        mr.file_id = str(mResult.iloc[i,0])
+        mr.cluster_id = str(mResult.iloc[i,1])
+        if (mr.cluster_id == "chromosome"):
+            break
+        mr.contig_id = str(mResult.iloc[i,2])
+        # Slice the number out of "...contigNNN_len=..."; result stays a str.
+        mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")]
+        mr.contig_length = int(mResult.iloc[i,3])
+        mr.circularity_status = str(mResult.iloc[i,4])
+        mr.rep_type = str(mResult.iloc[i,5])
+        mr.rep_type_accession = str(mResult.iloc[i,6])
+        mr.relaxase_type = str(mResult.iloc[i,7])
+        mr.relaxase_type_accession = str(mResult.iloc[i,8])
+        mr.mash_nearest_neighbor = str(mResult.iloc[i,9])
+        mr.mash_neighbor_distance = float(mResult.iloc[i,10])
+        mr.repetitive_dna_id = str(mResult.iloc[i,11])
+        mr.match_type = str(mResult.iloc[i,12])
+        # Empty match_type means no repetitive-DNA match; sentinel -1s used.
+        if (mr.match_type == ""):
+            mr.score = -1
+            mr.contig_match_start = -1
+            mr.contig_match_end = -1
+        else:
+            mr.score = int(mResult.iloc[i,13])
+            mr.contig_match_start = int(mResult.iloc[i,14])
+            mr.contig_match_end = int(mResult.iloc[i,15])
+        mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist())  # NOTE(review): .ix removed in pandas >= 1.0
+        _mobsuite[mr.contig_id]=(mr)
+    return _mobsuite
+
+def ParseMobsuitePlasmids(pathToMobsuiteResult):
+    """Parse mob-suite's mobtyper_aggregate_report.txt into a dict of
+    mobsuitePlasmids keyed by file_id (one entry per reconstructed plasmid)."""
+    _mobsuite = {}
+    mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
+    mResults = mResults.replace(numpy.nan, '', regex=True)
+
+    for i in range(len(mResults.index)):
+        mr = mobsuitePlasmids()
+        mr.file_id = str(mResults.iloc[i,0])
+        mr.num_contigs = int(mResults.iloc[i,1])
+        mr.total_length = int(mResults.iloc[i,2])
+        mr.gc = int(mResults.iloc[i,3])
+        mr.rep_types = str(mResults.iloc[i,4])
+        mr.rep_typeAccession = str(mResults.iloc[i,5])
+        mr.relaxase_type = str(mResults.iloc[i,6])
+        mr.relaxase_type_accession = str(mResults.iloc[i,7])
+        mr.mpf_type = str(mResults.iloc[i,8])
+        mr.mpf_type_accession = str(mResults.iloc[i,9])
+        mr.orit_type = str(mResults.iloc[i,10])
+        mr.orit_accession = str(mResults.iloc[i,11])
+        mr.PredictedMobility = str(mResults.iloc[i,12])
+        mr.mash_nearest_neighbor = str(mResults.iloc[i,13])
+        mr.mash_neighbor_distance = float(mResults.iloc[i,14])
+        mr.mash_neighbor_cluster = int(mResults.iloc[i,15])
+        mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist())  # NOTE(review): .ix removed in pandas >= 1.0
+        _mobsuite[mr.file_id] = mr
+    return _mobsuite
+
+def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs):
+    """Parse an abricate resfinder TSV into a dict of starFinders keyed by
+    gene name, labelling each hit's source ("plasmid"/"likely plasmid"/
+    "likely chromosome") by whether its contig number appears in
+    *plasmidContigs* or *likelyPlasmidContigs*."""
+    _rFinder = {}
+    resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0)
+    resFinder = resFinder.replace(numpy.nan, '', regex=True)
+
+    for i in range(len(resFinder.index)):
+        rf = starFinders()
+        rf.file = str(resFinder.iloc[i,0])
+        rf.sequence = str(resFinder.iloc[i,1])
+        rf.start = int(resFinder.iloc[i,2])
+        rf.end = int(resFinder.iloc[i,3])
+        rf.gene = str(resFinder.iloc[i,4])
+        rf.shortGene = rf.gene
+        rf.coverage = str(resFinder.iloc[i,5])
+        rf.coverage_map = str(resFinder.iloc[i,6])
+        rf.gaps = str(resFinder.iloc[i,7])
+        rf.pCoverage = float(resFinder.iloc[i,8])
+        rf.pIdentity = float(resFinder.iloc[i,9])
+        rf.database = str(resFinder.iloc[i,10])
+        rf.accession = str(resFinder.iloc[i,11])
+        rf.product = str(resFinder.iloc[i,12])
+        rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist())  # NOTE(review): .ix removed in pandas >= 1.0
+        # sequence[6:] drops a fixed 6-char prefix — presumably "contig" — to
+        # leave the contig number; TODO confirm the assembler's naming scheme.
+        if (rf.sequence[6:] in plasmidContigs):
+            rf.source = "plasmid"
+        elif (rf.sequence[6:] in likelyPlasmidContigs):
+            rf.source = "likely plasmid"
+        else:
+            rf.source = "likely chromosome"
+        _rFinder[rf.gene]=rf
+    return _rFinder
+
+def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs):
+    """Parse an RGI (CARD) prediction TSV into a dict of RGIResult keyed by
+    Model_ID, labelling each hit's source by whether its contig number is in
+    *plasmidContigs* / *likelyPlasmidContigs*.
+
+    NOTE(review): keying on Model_ID means two hits sharing a CARD model
+    silently overwrite each other — confirm this is intended.
+    """
+    _rgiR = {}
+    RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0)
+    RGI = RGI.replace(numpy.nan, '', regex=True)
+
+    for i in range(len(RGI.index)):
+        r = RGIResult()
+        r.ORF_ID = str(RGI.iloc[i,0])
+        r.Contig = str(RGI.iloc[i,1])
+        # Contig[6:...] drops a fixed 6-char prefix (presumably "contig") up
+        # to the first "_"; this attribute is not declared in RGIResult.
+        r.Contig_Num = r.Contig[6:r.Contig.find("_")]
+        r.Start = int(RGI.iloc[i,2])
+        r.Stop = int(RGI.iloc[i,3])
+        r.Orientation = str(RGI.iloc[i,4])
+        r.Cut_Off = str(RGI.iloc[i,5])
+        r.Pass_Bitscore = int(RGI.iloc[i,6])
+        r.Best_Hit_Bitscore = float(RGI.iloc[i,7])
+        r.Best_Hit_ARO = str(RGI.iloc[i,8])
+        r.Best_Identities = float(RGI.iloc[i,9])
+        r.ARO = int(RGI.iloc[i,10])
+        r.Model_type = str(RGI.iloc[i,11])
+        r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12])
+        r.Other_SNPs = str(RGI.iloc[i,13])
+        r.Drug_Class = str(RGI.iloc[i,14])
+        r.Resistance_Mechanism = str(RGI.iloc[i,15])
+        r.AMR_Gene_Family = str(RGI.iloc[i,16])
+        r.Predicted_DNA = str(RGI.iloc[i,17])
+        r.Predicted_Protein = str(RGI.iloc[i,18])
+        r.CARD_Protein_Sequence = str(RGI.iloc[i,19])
+        r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20])
+        r.ID = str(RGI.iloc[i,21])
+        r.Model_ID = int(RGI.iloc[i,22])
+        r.row = "\t".join(str(x) for x in RGI.ix[i].tolist())  # NOTE(review): .ix removed in pandas >= 1.0
+        if (r.Contig_Num in plasmidContigs):
+            r.source = "plasmid"
+        elif (r.Contig_Num in likelyPlasmidContigs):
+            r.source = "likely plasmid"
+        else:
+            r.source = "likely chromosome"
+        _rgiR[r.Model_ID]=r
+    return _rgiR
+#endregion
+
+def Main():
+    """Drive the prediction stage for one isolate: optionally refresh the
+    abricate/MLST databases and run the shell pipeline, then parse the MLST,
+    mob-suite, resfinder and RGI outputs and write per-isolate .txt and .tsv
+    summaries (plus JSON dumps via ToJson) under <outputDir>/summary/.
+
+    Reads the module-level configuration globals (debug, ID, outputDir, etc.).
+    """
+    notes = []
+    #init the output list
+    output = []
+    jsonOutput = []
+
+    print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
+    output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
+
+    #region update databases if update=true
+    if not debug:
+        #update databases if necessary
+        if not (updateAbPath == "" and updateAbName == "default"):
+            print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath)
+            cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName]
+            update = execute(cmd)
+        if (updateMLST.lower() == "true"):
+            print("updating mlst database... ")
+            cmd = [scriptDir + "/pipeline_updateMLST.sh"]
+            update = execute(cmd)
+    #endregion
+
+    # NOTE(review): "step 3" is printed twice (here and below) — stale label?
+    print("step 3: parsing the mlst results")
+
+    print("performing MLST")
+    #region call the mlst script
+    if not debug:
+        print("running pipeline_prediction.sh")
+        #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json
+        cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB]
+        result = execute(cmd)
+    #endregion
+
+    #region parse the mlst results
+    print("step 3: parsing mlst, plasmid, and amr results")
+    
+    print("identifying MLST")    
+    pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst"
+    mlstHit = ParseMLSTResult(pathToMLSTScheme)#***********************
+    ToJson(mlstHit, "mlst.json") #write it to a json output
+    # Collapse the one-entry dict to its single MlstResult object.
+    mlstHit = list(mlstHit.values())[0]
+
+    #endregion
+
+    #region parse mobsuite, resfinder and rgi results
+    print("identifying plasmid contigs and amr genes")
+
+    plasmidContigs = []
+    likelyPlasmidContigs = []
+    origins = []
+
+    #parse mobsuite results
+    mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
+    ToJson(mSuite, "mobsuite.json") #*************
+    mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
+    ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #*************
+
+    # Contigs with a replicon type are "definitely plasmid"; the rest of the
+    # mob-suite rows are "likely plasmid".
+    for key in mSuite:
+        if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs:
+            if not (mSuite[key].rep_type == ''):
+                plasmidContigs.append(mSuite[key].contig_num)
+            else:
+                likelyPlasmidContigs.append(mSuite[key].contig_num)
+    for key in mSuite:
+        if mSuite[key].rep_type not in origins:
+            origins.append(mSuite[key].rep_type)
+
+    #parse resfinder AMR results
+    rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
+    ToJson(rFinder, "resfinder.json") #*************
+
+    rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
+    ToJson(rgiAMR, "rgi.json") #*************
+
+    # Split AMR hits into carbapenemase genes vs everything else; resfinder
+    # hits (the .cp file) are all treated as carbapenemases.
+    carbapenamases = []
+    amrGenes = []
+    for keys in rFinder:
+        carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")")
+    for keys in rgiAMR:
+        if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1):
+            if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases):
+                carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
+        else:
+            if (rgiAMR[keys].Best_Hit_ARO not in amrGenes):
+                amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
+    #endregion
+
+    #region output parsed mlst information
+    print("formatting mlst outputs")
+    output.append("\n\n\n~~~~~~~MLST summary~~~~~~~")
+    output.append("MLST determined species: " + mlstHit.species)
+    output.append("\nMLST Details: ")
+    output.append(mlstHit.row)
+
+    output.append("\nMLST information: ")
+    if (mlstHit.species == expectedSpecies):
+        output.append("MLST determined species is the same as expected species")
+        #notes.append("MLST determined species is the same as expected species")
+    else:
+        output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?")
+        notes.append("MLST: Not expected species. Possible contamination or mislabeling")
+
+    #endregion
+
+    #region output the parsed plasmid/amr results
+    output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n")
+    
+    output.append("predicted plasmid origins: ")
+    output.append(";".join(origins))
+
+    output.append("\ndefinitely plasmid contigs")
+    output.append(";".join(plasmidContigs))
+    
+    output.append("\nlikely plasmid contigs")
+    output.append(";".join(likelyPlasmidContigs))
+
+    output.append("\nmob-suite prediction details: ")
+    for key in mSuite:
+        output.append(mSuite[key].row)
+
+    output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n")
+    output.append("predicted carbapenamase Genes: ")
+    output.append(",".join(carbapenamases))
+    output.append("other RGI AMR Genes: ")
+    for key in rgiAMR:
+        output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")")
+
+    output.append("\nDetails about the carbapenamase Genes: ")
+    for key in rFinder:
+        output.append(rFinder[key].row)
+    output.append("\nDetails about the RGI AMR Genes: ")
+    for key in rgiAMR:
+        output.append(rgiAMR[key].row)
+
+    #write summary to a file
+    summaryDir = outputDir + "/summary/" + ID
+    # NOTE(review): the handle is never closed; flushing relies on interpreter
+    # exit.  A "with open(...)" block would be safer (same for the .tsv below).
+    out = open(summaryDir + ".txt", 'w')
+    for item in output:
+        out.write("%s\n" % item)
+
+
+    #TSV output: one header row plus one data row for this isolate.
+    tsvOut = []
+    tsvOut.append("ID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
+    #start with ID
+    temp = ""
+    temp += (ID + "\t")
+    temp += expectedSpecies + "\t"
+
+    #move into MLST
+    temp += mlstHit.species + "\t"
+    temp += str(mlstHit.seqType) + "\t"
+    temp += mlstHit.scheme + "\t"
+    
+    #now onto AMR genes
+    temp += ";".join(carbapenamases) + "\t"
+    temp += ";".join(amrGenes) + "\t"
+
+    #lastly plasmids
+    temp+= str(len(mSuitePlasmids)) + "\t"
+    plasmidID = ""
+    contigs = ""
+    lengths = ""
+    rep_type = ""
+    mobility = ""
+    neighbour = ""
+    # Each per-plasmid column is a ';'-terminated concatenation over plasmids.
+    for keys in mSuitePlasmids:
+        plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";"
+        contigs += str(mSuitePlasmids[keys].num_contigs) + ";"
+        lengths += str(mSuitePlasmids[keys].total_length) + ";"
+        rep_type += str(mSuitePlasmids[keys].rep_types) + ";"
+        mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";"
+        neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";"
+    temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t"
+    temp += ";".join(plasmidContigs) + "\t"
+    temp += ";".join(likelyPlasmidContigs)
+    tsvOut.append(temp)
+
+    summaryDir = outputDir + "/summary/" + ID
+    out = open(summaryDir + ".tsv", 'w')
+    for item in tsvOut:
+        out.write("%s\n" % item)
+    #endregion
+
+
+# Script entry: time the full analysis and run Main().
+# NOTE(review): no `if __name__ == "__main__":` guard — Main() also runs if
+# this module is ever imported.
+start = time.time()#time the analysis
+print("Starting workflow...")
+#analysis time
+Main()
+
+end = time.time()
+print("Finished!\nThe analysis used: " + str(end-start) + " seconds")
\ No newline at end of file