diff cpo_galaxy_tree.py @ 6:cabceaa239e4 draft

planemo upload
author jjjjia
date Thu, 23 Aug 2018 12:21:15 -0400
parents fea89c4d5227
children 4d2777aa99db
line wrap: on
line diff
--- a/cpo_galaxy_tree.py	Tue Aug 21 17:53:08 2018 -0400
+++ b/cpo_galaxy_tree.py	Thu Aug 23 12:21:15 2018 -0400
@@ -24,8 +24,9 @@
 import gzip
 import collections
 import json
-import ete3
 import numpy
+import ete3 as e
+
 
 #parses some parameters
 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
@@ -105,38 +106,45 @@
         with open(outputpath, 'wb') as out:
             out.write(gzContent)
         return True
+def addFace(name):
+    #if its the reference branch, populate the faces with column headers
+    face = e.faces.TextFace(name,fsize=10,tight_text=True)
+    face.border.margin = 5
+    face.margin_right = 5
+    face.margin_left = 5
+    return face
 #endregion
 
 #region functions to parse result files
 def ParseWorkflowResults(pathToResult):
     _worflowResult = {}
-    r = pandas.read_csv(pathToResult, delimiter='\t', header=None) #read  the kraken2report.tsv
+    r = pandas.read_csv(pathToResult, delimiter='\t', header=0) 
     r = r.replace(numpy.nan, '', regex=True)
     for i in range(len(r.index)):  
         _results = workflowResult()
-        if(str(r.iloc[i,0]).lower() == "new"):
+        if(str(r.loc[r.index[i], 'new']).lower() == "new"):
             _results.new = True
         else:
             _results.new = False        
-        _results.ID = str(r.iloc[i,1])
-        _results.ExpectedSpecies = str(r.iloc[i,2])
-        _results.MLSTSpecies = str(r.iloc[i,3])
-        _results.SequenceType = str(r.iloc[i,4])
-        _results.MLSTScheme = (str(r.iloc[i,5]))
-        _results.CarbapenemResistanceGenes = (str(r.iloc[i,6]))
-        _results.OtherAMRGenes = (str(r.iloc[i,7]))
-        _results.TotalPlasmids = int(r.iloc[i,8])
+        _results.ID = str(r.loc[r.index[i], 'ID'])
+        _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species'])
+        _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species'])
+        _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type'])
+        _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme']))
+        _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes']))
+        _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes']))
+        _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
         for j in range(0,_results.TotalPlasmids):
             _plasmid = plasmidObj()
-            _plasmid.PlasmidsID =(((str(r.iloc[i,9])).split(";"))[j])
-            _plasmid.Num_Contigs = (((str(r.iloc[i,10])).split(";"))[j])
-            _plasmid.PlasmidLength	= (((str(r.iloc[i,11])).split(";"))[j])
-            _plasmid.PlasmidRepType	= (((str(r.iloc[i,12])).split(";"))[j])
-            _plasmid.PlasmidMobility = ((str(r.iloc[i,13])).split(";"))[j]
-            _plasmid.NearestReference = ((str(r.iloc[i,14])).split(";"))[j]
+            _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j])
+            _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j])
+            _plasmid.PlasmidLength	= (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j])
+            _plasmid.PlasmidRepType	= (((str(r.loc[r.index[i], 'Plasmid RepType'])).split(";"))[j])
+            _plasmid.PlasmidMobility = ((str(r.loc[r.index[i], 'Plasmid Mobility'])).split(";"))[j]
+            _plasmid.NearestReference = ((str(r.loc[r.index[i], 'Nearest Reference'])).split(";"))[j]
             _results.plasmids.append(_plasmid)
-        _results.DefinitelyPlasmidContigs = (str(r.iloc[i,15]))
-        _results.LikelyPlasmidContigs = (str(r.iloc[i,16]))
+        _results.DefinitelyPlasmidContigs = (str(r.loc[r.index[i], 'Definitely Plasmid Contigs']))
+        _results.LikelyPlasmidContigs = (str(r.loc[r.index[i], 'Likely Plasmid Contigs']))
         _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
         _worflowResult[_results.ID] = _results
     return _worflowResult
@@ -148,20 +156,90 @@
     distance = read(distancePath)
     treeFile = "".join(read(treePath))
 
-    distanceDict = {}
+    distanceDict = {} #store the distance matrix as rowname:list<string>
     for i in range(len(distance)):
         temp = distance[i].split("\t")
         distanceDict[temp[0]] = temp[1:]
     #region step5: tree construction
-    t = ete3.Tree(treeFile)
+
+    '''
+    #region create detailed tree
+    
+    plasmidCount = 0
+    for n in t.traverse():
+        if (n.is_leaf() and not n.name == "Reference"):
+            mData = metadata[n.name.replace(".fa","")]
+            face = faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_left = 10
+            face.margin_right = 10
+            n.add_face(face, 0, "aligned")
+            face = faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 10
+            n.add_face(face, 1, "aligned")
+            face = faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
+            face.border.margin = 5
+            face.margin_right = 10
+            n.add_face(face, 2, "aligned")
+            index = 3
+            if (mData.TotalPlasmids > plasmidCount):
+                plasmidCount = mData.TotalPlasmids
+            for i in range(0, mData.TotalPlasmids):
+                face = faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 10
+                n.add_face(face, index, "aligned")
+                index+=1
+                face = faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
+                face.border.margin = 5
+                face.margin_right = 10
+                n.add_face(face, index, "aligned")
+                index+=1
+
+    face = faces.TextFace("Species",fsize=10,tight_text=True)
+    face.border.margin = 5
+    face.margin_right = 10
+    face.margin_left = 10
+    (t&"Reference").add_face(face, 0, "aligned")
+    face = faces.TextFace("Sequence Type",fsize=10,tight_text=True)
+    face.border.margin = 5
+    face.margin_right = 10
+    (t&"Reference").add_face(face, 1, "aligned")
+    face = faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
+    face.border.margin = 5
+    face.margin_right = 10
+    (t&"Reference").add_face(face, 2, "aligned")
+    index = 3
+    for i in range(0, plasmidCount):
+        face = faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
+        face.border.margin = 5
+        face.margin_right = 10
+        (t&"Reference").add_face(face, index, "aligned")
+        index+=1
+        face = faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
+        face.border.margin = 5
+        face.margin_right = 10
+        (t&"Reference").add_face(face, index, "aligned")
+        index+=1
+
+    t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
+    
+    #endregion
+    '''
+    #region create box tree
+    #region step5: tree construction
+    treeFile = "".join(read(treePath))
+    t = e.Tree(treeFile)
     t.set_outgroup(t&"Reference")
 
-    ts = ete3.TreeStyle()
-    ts.show_leaf_name = True
+    #set the tree style
+    ts = e.TreeStyle()
+    ts.show_leaf_name = False
     ts.show_branch_length = True
     ts.scale = 2000 #pixel per branch length unit
     ts.branch_vertical_margin = 15 #pixel between branches
-    style2 = ete3.NodeStyle()
+    style2 = e.NodeStyle()
     style2["fgcolor"] = "#000000"
     style2["shape"] = "circle"
     style2["vt_line_color"] = "#0000aa"
@@ -172,93 +250,8 @@
     style2["hz_line_type"] = 0
     for n in t.traverse():
         n.set_style(style2)
-    '''
-    #region create detailed tree
-    
-    plasmidCount = 0
-    for n in t.traverse():
-        if (n.is_leaf() and not n.name == "Reference"):
-            mData = metadata[n.name.replace(".fa","")]
-            face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_left = 10
-            face.margin_right = 10
-            n.add_face(face, 0, "aligned")
-            face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 10
-            n.add_face(face, 1, "aligned")
-            face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 10
-            n.add_face(face, 2, "aligned")
-            index = 3
-            if (mData.TotalPlasmids > plasmidCount):
-                plasmidCount = mData.TotalPlasmids
-            for i in range(0, mData.TotalPlasmids):
-                face = ete3.faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
-                face.border.margin = 5
-                face.margin_right = 10
-                n.add_face(face, index, "aligned")
-                index+=1
-                face = ete3.faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
-                face.border.margin = 5
-                face.margin_right = 10
-                n.add_face(face, index, "aligned")
-                index+=1
 
-    face = ete3.faces.TextFace("Species",fsize=10,tight_text=True)
-    face.border.margin = 5
-    face.margin_right = 10
-    face.margin_left = 10
-    (t&"Reference").add_face(face, 0, "aligned")
-    face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
-    face.border.margin = 5
-    face.margin_right = 10
-    (t&"Reference").add_face(face, 1, "aligned")
-    face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
-    face.border.margin = 5
-    face.margin_right = 10
-    (t&"Reference").add_face(face, 2, "aligned")
-    index = 3
-    for i in range(0, plasmidCount):
-        face = ete3.faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
-        face.border.margin = 5
-        face.margin_right = 10
-        (t&"Reference").add_face(face, index, "aligned")
-        index+=1
-        face = ete3.faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
-        face.border.margin = 5
-        face.margin_right = 10
-        (t&"Reference").add_face(face, index, "aligned")
-        index+=1
-
-    t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
-    
-    #endregion
-    '''
-    #region create box tree
-     #region step5: tree construction
-    treeFile = "".join(read("./pipelineTest/tree.txt"))
-    t = ete3.Tree(treeFile)
-    t.set_outgroup(t&"Reference")
-
-    ts = ete3.TreeStyle()
-    ts.show_leaf_name = True
-    ts.show_branch_length = True
-    ts.scale = 2000 #pixel per branch length unit
-    ts.branch_vertical_margin = 15 #pixel between branches
-    style2 = ete3.NodeStyle()
-    style2["fgcolor"] = "#000000"
-    style2["shape"] = "circle"
-    style2["vt_line_color"] = "#0000aa"
-    style2["hz_line_color"] = "#0000aa"
-    style2["vt_line_width"] = 2
-    style2["hz_line_width"] = 2
-    style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
-    style2["hz_line_type"] = 0
-    for n in t.traverse():
-        n.set_style(style2)
+    #find the plasmid origins
     plasmidIncs = {}
     for key in metadata:
         for plasmid in metadata[key].plasmids:
@@ -270,81 +263,61 @@
                         if metadata[key].ID not in plasmidIncs[inc]:
                             plasmidIncs[inc].append(metadata[key].ID)
     #plasmidIncs = sorted(plasmidIncs)
-    for n in t.traverse():
+    for n in t.traverse(): #loop through the nodes of a tree
         if (n.is_leaf() and n.name == "Reference"):
-            face = ete3.faces.TextFace("New?",fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 5
-            face.margin_left = 5
-            (t&"Reference").add_face(face, 0, "aligned")
+            #if its the reference branch, populate the faces with column headers
+            index = 0
+            (t&"Reference").add_face(addFace("SampleID"), index, "aligned")
+            index = index + 1
+            (t&"Reference").add_face(addFace("New?"), index, "aligned")
+            index = index + 1
             for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
-                face = ete3.faces.TextFace(list(plasmidIncs.keys())[i],fsize=10,tight_text=True)
-                face.border.margin = 5
-                face.margin_right = 5
-                face.margin_left = 5
-                (t&"Reference").add_face(face, i + 1, "aligned")
-            face = ete3.faces.TextFace("MLSTScheme",fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 5
-            face.margin_left = 5
-            (t&"Reference").add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
-            face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 5
-            face.margin_left = 5
-            (t&"Reference").add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
-            face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 5
-            face.margin_left = 5
-            (t&"Reference").add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
-            for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the columns (aka the incs) to the reference node
-                face = ete3.faces.TextFace(distanceDict[list(distanceDict.keys())[0]][i],fsize=10,tight_text=True)
-                face.border.margin = 5
-                face.margin_right = 5
-                face.margin_left = 5
-                (t&"Reference").add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
-        elif (n.is_leaf() and not n.name == "Reference"):
-            if (metadata[n.name.replace(".fa","")].new == True):
-                face = ete3.faces.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
+                (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
+            index = index + len(plasmidIncs)
+            (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned")
+            index = index + 1
+            (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned")
+            index = index + 1 
+            (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned")
+            index = index + 1
+            for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix
+                (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
+            index = index + len(distanceDict[list(distanceDict.keys())[0]])
+        elif (n.is_leaf() and not n.name == "Reference"): 
+            #not reference branches, populate with metadata
+            index = 0
+            mData = metadata[n.name.replace(".fa","")]
+            n.add_face(addFace(mData.ID), index, "aligned")
+            index = index + 1
+            if (metadata[n.name.replace(".fa","")].new == True): #new column
+                face = e.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
                 face.border.margin = 5
                 face.margin_right = 5
                 face.margin_left = 5
                 face.vt_align = 1
                 face.ht_align = 1
-                n.add_face(face, 0, "aligned")
+                n.add_face(face, index, "aligned")
+            index = index + 1
             for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
                 if (n.name.replace(".fa","") in plasmidIncs[incs]):
-                    face = ete3.faces.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
+                    face = e.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
                     face.border.margin = 5
                     face.margin_right = 5
                     face.margin_left = 5
                     face.vt_align = 1
                     face.ht_align = 1
-                    n.add_face(face, list(plasmidIncs.keys()).index(incs) + 1, "aligned")
-            mData = metadata[n.name.replace(".fa","")]
-            face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 5
-            face.margin_left = 5
-            n.add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
-            face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
-            face.border.margin = 5
-            face.margin_right = 5
-            face.margin_left = 5
-            n.add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
-            face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
-            face.margin_right = 5
-            face.margin_left = 5
-            n.add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
+                    n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned")
+            index = index + len(plasmidIncs)
+            n.add_face(addFace(mData.MLSTSpecies), index, "aligned")
+            index = index + 1
+            n.add_face(addFace(mData.SequenceType), index, "aligned")
+            index = index + 1
+            n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
+            index = index + 1
             for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
-                face = ete3.faces.TextFace(list(distanceDict[n.name])[i],fsize=10,tight_text=True)
-                face.border.margin = 5
-                face.margin_right = 5
-                face.margin_left = 5
-                n.add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
-
-    t.render("./tree.png", w=5000,units="mm", tree_style=ts)
+                n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")
+                
+    t.render("./tree.png", w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml
 
     #endregion
 #endregion