diff cpo_galaxy_tree.py @ 1:fea89c4d5227 draft

Uploaded
author jjjjia
date Thu, 16 Aug 2018 19:27:05 -0400
parents
children cabceaa239e4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpo_galaxy_tree.py	Thu Aug 16 19:27:05 2018 -0400
@@ -0,0 +1,359 @@
+#!/home/jjjjia/.conda/envs/py36/bin/python
+
+#$ -S /home/jjjjia/.conda/envs/py36/bin/python
+#$ -V             # Pass environment variables to the job
+#$ -N CPO_pipeline    # Replace with a more specific job name
+#$ -wd /home/jjjjia/testCases           # Use the current working dir
+#$ -pe smp 8      # Parallel Environment (how many cores)
+#$ -l h_vmem=11G  # Memory (RAM) allocation *per core*
+#$ -e ./logs/$JOB_ID.err
+#$ -o ./logs/$JOB_ID.log
+#$ -m ea
+#$ -M bja20@sfu.ca
+
+#~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" 
+
+import subprocess
+import pandas
+import optparse
+import os
+import datetime
+import sys
+import time
+import urllib.request
+import gzip
+import collections
+import json
+import ete3
+import numpy
+
+#parses some parameters
+parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
+parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")    
+parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
+parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)")
+(options,args) = parser.parse_args()
+treePath = str(options.treePath).lstrip().rstrip()
+distancePath = str(options.distancePath).lstrip().rstrip()
+metadataPath = str(options.metadataPath).lstrip().rstrip()
+
+
+#region result objects
+#define some objects to store values from results
+#//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
+class workflowResult(object):
+    def __init__(self):
+        self.new = False
+        self.ID	= "" 
+        self.ExpectedSpecies = ""
+        self.MLSTSpecies = ""
+        self.SequenceType = ""
+        self.MLSTScheme = ""
+        self.CarbapenemResistanceGenes =""
+        self.OtherAMRGenes=""
+        self.TotalPlasmids = 0
+        self.plasmids = []
+        self.DefinitelyPlasmidContigs =""
+        self.LikelyPlasmidContigs=""
+        self.row = ""
+class plasmidObj(object):
+    def __init__(self):
+        self.PlasmidsID = 0
+        self.Num_Contigs = 0
+        self.PlasmidLength	= 0
+        self.PlasmidRepType	= ""
+        self.PlasmidMobility = ""
+        self.NearestReference = ""
+
+#endregion
+
+#region useful functions
+def read(path):
+    return [line.rstrip('\n') for line in open(path)]
+def execute(command):
+    process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    # Poll process for new output until finished
+    while True:
+        nextline = process.stdout.readline()
+        if nextline == '' and process.poll() is not None:
+            break
+        sys.stdout.write(nextline)
+        sys.stdout.flush()
+
+    output = process.communicate()[0]
+    exitCode = process.returncode
+
+    if (exitCode == 0):
+        return output
+    else:
+        raise subprocess.CalledProcessError(exitCode, command)
+def httpGetFile(url, filepath=""):
+    if (filepath == ""):
+        return urllib.request.urlretrieve(url)
+    else:
+        urllib.request.urlretrieve(url, filepath)
+        return True
+def gunzip(inputpath="", outputpath=""):
+    if (outputpath == ""):
+        with gzip.open(inputpath, 'rb') as f:
+            gzContent = f.read()
+        return gzContent
+    else:
+        with gzip.open(inputpath, 'rb') as f:
+            gzContent = f.read()
+        with open(outputpath, 'wb') as out:
+            out.write(gzContent)
+        return True
+#endregion
+
+#region functions to parse result files
+def ParseWorkflowResults(pathToResult):
+    _worflowResult = {}
+    r = pandas.read_csv(pathToResult, delimiter='\t', header=None) #read  the kraken2report.tsv
+    r = r.replace(numpy.nan, '', regex=True)
+    for i in range(len(r.index)):  
+        _results = workflowResult()
+        if(str(r.iloc[i,0]).lower() == "new"):
+            _results.new = True
+        else:
+            _results.new = False        
+        _results.ID = str(r.iloc[i,1])
+        _results.ExpectedSpecies = str(r.iloc[i,2])
+        _results.MLSTSpecies = str(r.iloc[i,3])
+        _results.SequenceType = str(r.iloc[i,4])
+        _results.MLSTScheme = (str(r.iloc[i,5]))
+        _results.CarbapenemResistanceGenes = (str(r.iloc[i,6]))
+        _results.OtherAMRGenes = (str(r.iloc[i,7]))
+        _results.TotalPlasmids = int(r.iloc[i,8])
+        for j in range(0,_results.TotalPlasmids):
+            _plasmid = plasmidObj()
+            _plasmid.PlasmidsID =(((str(r.iloc[i,9])).split(";"))[j])
+            _plasmid.Num_Contigs = (((str(r.iloc[i,10])).split(";"))[j])
+            _plasmid.PlasmidLength	= (((str(r.iloc[i,11])).split(";"))[j])
+            _plasmid.PlasmidRepType	= (((str(r.iloc[i,12])).split(";"))[j])
+            _plasmid.PlasmidMobility = ((str(r.iloc[i,13])).split(";"))[j]
+            _plasmid.NearestReference = ((str(r.iloc[i,14])).split(";"))[j]
+            _results.plasmids.append(_plasmid)
+        _results.DefinitelyPlasmidContigs = (str(r.iloc[i,15]))
+        _results.LikelyPlasmidContigs = (str(r.iloc[i,16]))
+        _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
+        _worflowResult[_results.ID] = _results
+    return _worflowResult
+
+#endregion
+
+def Main():
+    metadata = ParseWorkflowResults(metadataPath)
+    distance = read(distancePath)
+    treeFile = "".join(read(treePath))
+
+    distanceDict = {}
+    for i in range(len(distance)):
+        temp = distance[i].split("\t")
+        distanceDict[temp[0]] = temp[1:]
+    #region step5: tree construction
+    t = ete3.Tree(treeFile)
+    t.set_outgroup(t&"Reference")
+
+    ts = ete3.TreeStyle()
+    ts.show_leaf_name = True
+    ts.show_branch_length = True
+    ts.scale = 2000 #pixel per branch length unit
+    ts.branch_vertical_margin = 15 #pixel between branches
+    style2 = ete3.NodeStyle()
+    style2["fgcolor"] = "#000000"
+    style2["shape"] = "circle"
+    style2["vt_line_color"] = "#0000aa"
+    style2["hz_line_color"] = "#0000aa"
+    style2["vt_line_width"] = 2
+    style2["hz_line_width"] = 2
+    style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
+    style2["hz_line_type"] = 0
+    for n in t.traverse():
+        n.set_style(style2)
+    '''
+    #region create detailed tree
+    
+    plasmidCount = 0
+    for n in t.traverse():
+        if (n.is_leaf() and not n.name == "Reference"):
+            mData = metadata[n.name.replace(".fa","")]
+            face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_left = 10
+            face.margin_right = 10
+            n.add_face(face, 0, "aligned")
+            face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 10
+            n.add_face(face, 1, "aligned")
+            face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 10
+            n.add_face(face, 2, "aligned")
+            index = 3
+            if (mData.TotalPlasmids > plasmidCount):
+                plasmidCount = mData.TotalPlasmids
+            for i in range(0, mData.TotalPlasmids):
+                face = ete3.faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 10
+                n.add_face(face, index, "aligned")
+                index+=1
+                face = ete3.faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 10
+                n.add_face(face, index, "aligned")
+                index+=1
+
+    face = ete3.faces.TextFace("Species",fsize=10,tight_text=True)
+    face.border.margin = 5
+    face.margin_right = 10
+    face.margin_left = 10
+    (t&"Reference").add_face(face, 0, "aligned")
+    face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
+    face.border.margin = 5
+    face.margin_right = 10
+    (t&"Reference").add_face(face, 1, "aligned")
+    face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
+    face.border.margin = 5
+    face.margin_right = 10
+    (t&"Reference").add_face(face, 2, "aligned")
+    index = 3
+    for i in range(0, plasmidCount):
+        face = ete3.faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
+        face.border.margin = 5
+        face.margin_right = 10
+        (t&"Reference").add_face(face, index, "aligned")
+        index+=1
+        face = ete3.faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
+        face.border.margin = 5
+        face.margin_right = 10
+        (t&"Reference").add_face(face, index, "aligned")
+        index+=1
+
+    t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
+    
+    #endregion
+    '''
+    #region create box tree
+     #region step5: tree construction
+    treeFile = "".join(read("./pipelineTest/tree.txt"))
+    t = ete3.Tree(treeFile)
+    t.set_outgroup(t&"Reference")
+
+    ts = ete3.TreeStyle()
+    ts.show_leaf_name = True
+    ts.show_branch_length = True
+    ts.scale = 2000 #pixel per branch length unit
+    ts.branch_vertical_margin = 15 #pixel between branches
+    style2 = ete3.NodeStyle()
+    style2["fgcolor"] = "#000000"
+    style2["shape"] = "circle"
+    style2["vt_line_color"] = "#0000aa"
+    style2["hz_line_color"] = "#0000aa"
+    style2["vt_line_width"] = 2
+    style2["hz_line_width"] = 2
+    style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
+    style2["hz_line_type"] = 0
+    for n in t.traverse():
+        n.set_style(style2)
+    plasmidIncs = {}
+    for key in metadata:
+        for plasmid in metadata[key].plasmids:
+            for inc in plasmid.PlasmidRepType.split(","):
+                if (inc.lower().find("inc") > -1):
+                    if not (inc in plasmidIncs):
+                        plasmidIncs[inc] = [metadata[key].ID]
+                    else:
+                        if metadata[key].ID not in plasmidIncs[inc]:
+                            plasmidIncs[inc].append(metadata[key].ID)
+    #plasmidIncs = sorted(plasmidIncs)
+    for n in t.traverse():
+        if (n.is_leaf() and n.name == "Reference"):
+            face = ete3.faces.TextFace("New?",fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 5
+            face.margin_left = 5
+            (t&"Reference").add_face(face, 0, "aligned")
+            for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
+                face = ete3.faces.TextFace(list(plasmidIncs.keys())[i],fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 5
+                face.margin_left = 5
+                (t&"Reference").add_face(face, i + 1, "aligned")
+            face = ete3.faces.TextFace("MLSTScheme",fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 5
+            face.margin_left = 5
+            (t&"Reference").add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
+            face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 5
+            face.margin_left = 5
+            (t&"Reference").add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
+            face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 5
+            face.margin_left = 5
+            (t&"Reference").add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
+            for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the columns (aka the incs) to the reference node
+                face = ete3.faces.TextFace(distanceDict[list(distanceDict.keys())[0]][i],fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 5
+                face.margin_left = 5
+                (t&"Reference").add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
+        elif (n.is_leaf() and not n.name == "Reference"):
+            if (metadata[n.name.replace(".fa","")].new == True):
+                face = ete3.faces.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 5
+                face.margin_left = 5
+                face.vt_align = 1
+                face.ht_align = 1
+                n.add_face(face, 0, "aligned")
+            for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
+                if (n.name.replace(".fa","") in plasmidIncs[incs]):
+                    face = ete3.faces.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
+                    face.border.margin = 5
+                    face.margin_right = 5
+                    face.margin_left = 5
+                    face.vt_align = 1
+                    face.ht_align = 1
+                    n.add_face(face, list(plasmidIncs.keys()).index(incs) + 1, "aligned")
+            mData = metadata[n.name.replace(".fa","")]
+            face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 5
+            face.margin_left = 5
+            n.add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
+            face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 5
+            face.margin_left = 5
+            n.add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
+            face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
+            face.margin_right = 5
+            face.margin_left = 5
+            n.add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
+            for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
+                face = ete3.faces.TextFace(list(distanceDict[n.name])[i],fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 5
+                face.margin_left = 5
+                n.add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
+
+    t.render("./tree.png", w=5000,units="mm", tree_style=ts)
+
+    #endregion
+#endregion
+
+
+start = time.time()#time the analysis
+
+#analysis time
+Main()
+
+end = time.time()
+print("Finished!\nThe analysis used: " + str(end-start) + " seconds")
\ No newline at end of file