view cpo_galaxy_tree.py @ 6:cabceaa239e4 draft

planemo upload
author jjjjia
date Thu, 23 Aug 2018 12:21:15 -0400
parents fea89c4d5227
children 4d2777aa99db
line wrap: on
line source

#!/home/jjjjia/.conda/envs/py36/bin/python

#$ -S /home/jjjjia/.conda/envs/py36/bin/python
#$ -V             # Pass environment variables to the job
#$ -N CPO_pipeline    # Replace with a more specific job name
#$ -wd /home/jjjjia/testCases           # Use the current working dir
#$ -pe smp 8      # Parallel Environment (how many cores)
#$ -l h_vmem=11G  # Memory (RAM) allocation *per core*
#$ -e ./logs/$JOB_ID.err
#$ -o ./logs/$JOB_ID.log
#$ -m ea
#$ -M bja20@sfu.ca

#~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" 

import subprocess
import pandas
import optparse
import os
import datetime
import sys
import time
import urllib.request
import gzip
import collections
import json
import numpy
import ete3 as e


#parses some parameters
parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")    
parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)")
(options,args) = parser.parse_args()
treePath = str(options.treePath).lstrip().rstrip()
distancePath = str(options.distancePath).lstrip().rstrip()
metadataPath = str(options.metadataPath).lstrip().rstrip()


#region result objects
#define some objects to store values from results
#//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
class workflowResult(object):
    def __init__(self):
        self.new = False
        self.ID	= "" 
        self.ExpectedSpecies = ""
        self.MLSTSpecies = ""
        self.SequenceType = ""
        self.MLSTScheme = ""
        self.CarbapenemResistanceGenes =""
        self.OtherAMRGenes=""
        self.TotalPlasmids = 0
        self.plasmids = []
        self.DefinitelyPlasmidContigs =""
        self.LikelyPlasmidContigs=""
        self.row = ""
class plasmidObj(object):
    def __init__(self):
        self.PlasmidsID = 0
        self.Num_Contigs = 0
        self.PlasmidLength	= 0
        self.PlasmidRepType	= ""
        self.PlasmidMobility = ""
        self.NearestReference = ""

#endregion

#region useful functions
def read(path):
    return [line.rstrip('\n') for line in open(path)]
def execute(command):
    process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    # Poll process for new output until finished
    while True:
        nextline = process.stdout.readline()
        if nextline == '' and process.poll() is not None:
            break
        sys.stdout.write(nextline)
        sys.stdout.flush()

    output = process.communicate()[0]
    exitCode = process.returncode

    if (exitCode == 0):
        return output
    else:
        raise subprocess.CalledProcessError(exitCode, command)
def httpGetFile(url, filepath=""):
    if (filepath == ""):
        return urllib.request.urlretrieve(url)
    else:
        urllib.request.urlretrieve(url, filepath)
        return True
def gunzip(inputpath="", outputpath=""):
    if (outputpath == ""):
        with gzip.open(inputpath, 'rb') as f:
            gzContent = f.read()
        return gzContent
    else:
        with gzip.open(inputpath, 'rb') as f:
            gzContent = f.read()
        with open(outputpath, 'wb') as out:
            out.write(gzContent)
        return True
def addFace(name):
    #if its the reference branch, populate the faces with column headers
    face = e.faces.TextFace(name,fsize=10,tight_text=True)
    face.border.margin = 5
    face.margin_right = 5
    face.margin_left = 5
    return face
#endregion

#region functions to parse result files
def ParseWorkflowResults(pathToResult):
    _worflowResult = {}
    r = pandas.read_csv(pathToResult, delimiter='\t', header=0) 
    r = r.replace(numpy.nan, '', regex=True)
    for i in range(len(r.index)):  
        _results = workflowResult()
        if(str(r.loc[r.index[i], 'new']).lower() == "new"):
            _results.new = True
        else:
            _results.new = False        
        _results.ID = str(r.loc[r.index[i], 'ID'])
        _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species'])
        _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species'])
        _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type'])
        _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme']))
        _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes']))
        _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes']))
        _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
        for j in range(0,_results.TotalPlasmids):
            _plasmid = plasmidObj()
            _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j])
            _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j])
            _plasmid.PlasmidLength	= (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j])
            _plasmid.PlasmidRepType	= (((str(r.loc[r.index[i], 'Plasmid RepType'])).split(";"))[j])
            _plasmid.PlasmidMobility = ((str(r.loc[r.index[i], 'Plasmid Mobility'])).split(";"))[j]
            _plasmid.NearestReference = ((str(r.loc[r.index[i], 'Nearest Reference'])).split(";"))[j]
            _results.plasmids.append(_plasmid)
        _results.DefinitelyPlasmidContigs = (str(r.loc[r.index[i], 'Definitely Plasmid Contigs']))
        _results.LikelyPlasmidContigs = (str(r.loc[r.index[i], 'Likely Plasmid Contigs']))
        _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
        _worflowResult[_results.ID] = _results
    return _worflowResult

#endregion

def Main():
    metadata = ParseWorkflowResults(metadataPath)
    distance = read(distancePath)
    treeFile = "".join(read(treePath))

    distanceDict = {} #store the distance matrix as rowname:list<string>
    for i in range(len(distance)):
        temp = distance[i].split("\t")
        distanceDict[temp[0]] = temp[1:]
    #region step5: tree construction

    '''
    #region create detailed tree
    
    plasmidCount = 0
    for n in t.traverse():
        if (n.is_leaf() and not n.name == "Reference"):
            mData = metadata[n.name.replace(".fa","")]
            face = faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
            face.border.margin = 5
            face.margin_left = 10
            face.margin_right = 10
            n.add_face(face, 0, "aligned")
            face = faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
            face.border.margin = 5
            face.margin_right = 10
            n.add_face(face, 1, "aligned")
            face = faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
            face.border.margin = 5
            face.margin_right = 10
            n.add_face(face, 2, "aligned")
            index = 3
            if (mData.TotalPlasmids > plasmidCount):
                plasmidCount = mData.TotalPlasmids
            for i in range(0, mData.TotalPlasmids):
                face = faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 10
                n.add_face(face, index, "aligned")
                index+=1
                face = faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 10
                n.add_face(face, index, "aligned")
                index+=1

    face = faces.TextFace("Species",fsize=10,tight_text=True)
    face.border.margin = 5
    face.margin_right = 10
    face.margin_left = 10
    (t&"Reference").add_face(face, 0, "aligned")
    face = faces.TextFace("Sequence Type",fsize=10,tight_text=True)
    face.border.margin = 5
    face.margin_right = 10
    (t&"Reference").add_face(face, 1, "aligned")
    face = faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
    face.border.margin = 5
    face.margin_right = 10
    (t&"Reference").add_face(face, 2, "aligned")
    index = 3
    for i in range(0, plasmidCount):
        face = faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
        face.border.margin = 5
        face.margin_right = 10
        (t&"Reference").add_face(face, index, "aligned")
        index+=1
        face = faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
        face.border.margin = 5
        face.margin_right = 10
        (t&"Reference").add_face(face, index, "aligned")
        index+=1

    t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
    
    #endregion
    '''
    #region create box tree
    #region step5: tree construction
    treeFile = "".join(read(treePath))
    t = e.Tree(treeFile)
    t.set_outgroup(t&"Reference")

    #set the tree style
    ts = e.TreeStyle()
    ts.show_leaf_name = False
    ts.show_branch_length = True
    ts.scale = 2000 #pixel per branch length unit
    ts.branch_vertical_margin = 15 #pixel between branches
    style2 = e.NodeStyle()
    style2["fgcolor"] = "#000000"
    style2["shape"] = "circle"
    style2["vt_line_color"] = "#0000aa"
    style2["hz_line_color"] = "#0000aa"
    style2["vt_line_width"] = 2
    style2["hz_line_width"] = 2
    style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
    style2["hz_line_type"] = 0
    for n in t.traverse():
        n.set_style(style2)

    #find the plasmid origins
    plasmidIncs = {}
    for key in metadata:
        for plasmid in metadata[key].plasmids:
            for inc in plasmid.PlasmidRepType.split(","):
                if (inc.lower().find("inc") > -1):
                    if not (inc in plasmidIncs):
                        plasmidIncs[inc] = [metadata[key].ID]
                    else:
                        if metadata[key].ID not in plasmidIncs[inc]:
                            plasmidIncs[inc].append(metadata[key].ID)
    #plasmidIncs = sorted(plasmidIncs)
    for n in t.traverse(): #loop through the nodes of a tree
        if (n.is_leaf() and n.name == "Reference"):
            #if its the reference branch, populate the faces with column headers
            index = 0
            (t&"Reference").add_face(addFace("SampleID"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("New?"), index, "aligned")
            index = index + 1
            for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
                (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
            index = index + len(plasmidIncs)
            (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned")
            index = index + 1 
            (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned")
            index = index + 1
            for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix
                (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
            index = index + len(distanceDict[list(distanceDict.keys())[0]])
        elif (n.is_leaf() and not n.name == "Reference"): 
            #not reference branches, populate with metadata
            index = 0
            mData = metadata[n.name.replace(".fa","")]
            n.add_face(addFace(mData.ID), index, "aligned")
            index = index + 1
            if (metadata[n.name.replace(".fa","")].new == True): #new column
                face = e.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 5
                face.margin_left = 5
                face.vt_align = 1
                face.ht_align = 1
                n.add_face(face, index, "aligned")
            index = index + 1
            for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
                if (n.name.replace(".fa","") in plasmidIncs[incs]):
                    face = e.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
                    face.border.margin = 5
                    face.margin_right = 5
                    face.margin_left = 5
                    face.vt_align = 1
                    face.ht_align = 1
                    n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned")
            index = index + len(plasmidIncs)
            n.add_face(addFace(mData.MLSTSpecies), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.SequenceType), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
            index = index + 1
            for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
                n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")
                
    t.render("./tree.png", w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml

    #endregion
#endregion


start = time.time()#time the analysis

#analysis time
Main()

end = time.time()
print("Finished!\nThe analysis used: " + str(end-start) + " seconds")