comparison cpo_galaxy_tree.py @ 1:fea89c4d5227 draft

Uploaded
author jjjjia
date Thu, 16 Aug 2018 19:27:05 -0400
parents
children cabceaa239e4
comparison
equal deleted inserted replaced
0:917a05a03ac9 1:fea89c4d5227
1 #!/home/jjjjia/.conda/envs/py36/bin/python
2
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
4 #$ -V # Pass environment variables to the job
5 #$ -N CPO_pipeline # Replace with a more specific job name
6 #$ -wd /home/jjjjia/testCases # Use the current working dir
7 #$ -pe smp 8 # Parallel Environment (how many cores)
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
9 #$ -e ./logs/$JOB_ID.err
10 #$ -o ./logs/$JOB_ID.log
11 #$ -m ea
12 #$ -M bja20@sfu.ca
13
14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae"
15
16 import subprocess
17 import pandas
18 import optparse
19 import os
20 import datetime
21 import sys
22 import time
23 import urllib.request
24 import gzip
25 import collections
26 import json
27 import ete3
28 import numpy
29
30 #parses some parameters
31 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
32 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")
33 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
34 parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)")
35 (options,args) = parser.parse_args()
36 treePath = str(options.treePath).lstrip().rstrip()
37 distancePath = str(options.distancePath).lstrip().rstrip()
38 metadataPath = str(options.metadataPath).lstrip().rstrip()
39
40
41 #region result objects
42 #define some objects to store values from results
43 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
44 class workflowResult(object):
45 def __init__(self):
46 self.new = False
47 self.ID = ""
48 self.ExpectedSpecies = ""
49 self.MLSTSpecies = ""
50 self.SequenceType = ""
51 self.MLSTScheme = ""
52 self.CarbapenemResistanceGenes =""
53 self.OtherAMRGenes=""
54 self.TotalPlasmids = 0
55 self.plasmids = []
56 self.DefinitelyPlasmidContigs =""
57 self.LikelyPlasmidContigs=""
58 self.row = ""
59 class plasmidObj(object):
60 def __init__(self):
61 self.PlasmidsID = 0
62 self.Num_Contigs = 0
63 self.PlasmidLength = 0
64 self.PlasmidRepType = ""
65 self.PlasmidMobility = ""
66 self.NearestReference = ""
67
68 #endregion
69
70 #region useful functions
71 def read(path):
72 return [line.rstrip('\n') for line in open(path)]
73 def execute(command):
74 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
75
76 # Poll process for new output until finished
77 while True:
78 nextline = process.stdout.readline()
79 if nextline == '' and process.poll() is not None:
80 break
81 sys.stdout.write(nextline)
82 sys.stdout.flush()
83
84 output = process.communicate()[0]
85 exitCode = process.returncode
86
87 if (exitCode == 0):
88 return output
89 else:
90 raise subprocess.CalledProcessError(exitCode, command)
91 def httpGetFile(url, filepath=""):
92 if (filepath == ""):
93 return urllib.request.urlretrieve(url)
94 else:
95 urllib.request.urlretrieve(url, filepath)
96 return True
97 def gunzip(inputpath="", outputpath=""):
98 if (outputpath == ""):
99 with gzip.open(inputpath, 'rb') as f:
100 gzContent = f.read()
101 return gzContent
102 else:
103 with gzip.open(inputpath, 'rb') as f:
104 gzContent = f.read()
105 with open(outputpath, 'wb') as out:
106 out.write(gzContent)
107 return True
108 #endregion
109
110 #region functions to parse result files
111 def ParseWorkflowResults(pathToResult):
112 _worflowResult = {}
113 r = pandas.read_csv(pathToResult, delimiter='\t', header=None) #read the kraken2report.tsv
114 r = r.replace(numpy.nan, '', regex=True)
115 for i in range(len(r.index)):
116 _results = workflowResult()
117 if(str(r.iloc[i,0]).lower() == "new"):
118 _results.new = True
119 else:
120 _results.new = False
121 _results.ID = str(r.iloc[i,1])
122 _results.ExpectedSpecies = str(r.iloc[i,2])
123 _results.MLSTSpecies = str(r.iloc[i,3])
124 _results.SequenceType = str(r.iloc[i,4])
125 _results.MLSTScheme = (str(r.iloc[i,5]))
126 _results.CarbapenemResistanceGenes = (str(r.iloc[i,6]))
127 _results.OtherAMRGenes = (str(r.iloc[i,7]))
128 _results.TotalPlasmids = int(r.iloc[i,8])
129 for j in range(0,_results.TotalPlasmids):
130 _plasmid = plasmidObj()
131 _plasmid.PlasmidsID =(((str(r.iloc[i,9])).split(";"))[j])
132 _plasmid.Num_Contigs = (((str(r.iloc[i,10])).split(";"))[j])
133 _plasmid.PlasmidLength = (((str(r.iloc[i,11])).split(";"))[j])
134 _plasmid.PlasmidRepType = (((str(r.iloc[i,12])).split(";"))[j])
135 _plasmid.PlasmidMobility = ((str(r.iloc[i,13])).split(";"))[j]
136 _plasmid.NearestReference = ((str(r.iloc[i,14])).split(";"))[j]
137 _results.plasmids.append(_plasmid)
138 _results.DefinitelyPlasmidContigs = (str(r.iloc[i,15]))
139 _results.LikelyPlasmidContigs = (str(r.iloc[i,16]))
140 _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
141 _worflowResult[_results.ID] = _results
142 return _worflowResult
143
144 #endregion
145
146 def Main():
147 metadata = ParseWorkflowResults(metadataPath)
148 distance = read(distancePath)
149 treeFile = "".join(read(treePath))
150
151 distanceDict = {}
152 for i in range(len(distance)):
153 temp = distance[i].split("\t")
154 distanceDict[temp[0]] = temp[1:]
155 #region step5: tree construction
156 t = ete3.Tree(treeFile)
157 t.set_outgroup(t&"Reference")
158
159 ts = ete3.TreeStyle()
160 ts.show_leaf_name = True
161 ts.show_branch_length = True
162 ts.scale = 2000 #pixel per branch length unit
163 ts.branch_vertical_margin = 15 #pixel between branches
164 style2 = ete3.NodeStyle()
165 style2["fgcolor"] = "#000000"
166 style2["shape"] = "circle"
167 style2["vt_line_color"] = "#0000aa"
168 style2["hz_line_color"] = "#0000aa"
169 style2["vt_line_width"] = 2
170 style2["hz_line_width"] = 2
171 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
172 style2["hz_line_type"] = 0
173 for n in t.traverse():
174 n.set_style(style2)
175 '''
176 #region create detailed tree
177
178 plasmidCount = 0
179 for n in t.traverse():
180 if (n.is_leaf() and not n.name == "Reference"):
181 mData = metadata[n.name.replace(".fa","")]
182 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
183 face.border.margin = 5
184 face.margin_left = 10
185 face.margin_right = 10
186 n.add_face(face, 0, "aligned")
187 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
188 face.border.margin = 5
189 face.margin_right = 10
190 n.add_face(face, 1, "aligned")
191 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
192 face.border.margin = 5
193 face.margin_right = 10
194 n.add_face(face, 2, "aligned")
195 index = 3
196 if (mData.TotalPlasmids > plasmidCount):
197 plasmidCount = mData.TotalPlasmids
198 for i in range(0, mData.TotalPlasmids):
199 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
200 face.border.margin = 5
201 face.margin_right = 10
202 n.add_face(face, index, "aligned")
203 index+=1
204 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
205 face.border.margin = 5
206 face.margin_right = 10
207 n.add_face(face, index, "aligned")
208 index+=1
209
210 face = ete3.faces.TextFace("Species",fsize=10,tight_text=True)
211 face.border.margin = 5
212 face.margin_right = 10
213 face.margin_left = 10
214 (t&"Reference").add_face(face, 0, "aligned")
215 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
216 face.border.margin = 5
217 face.margin_right = 10
218 (t&"Reference").add_face(face, 1, "aligned")
219 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
220 face.border.margin = 5
221 face.margin_right = 10
222 (t&"Reference").add_face(face, 2, "aligned")
223 index = 3
224 for i in range(0, plasmidCount):
225 face = ete3.faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
226 face.border.margin = 5
227 face.margin_right = 10
228 (t&"Reference").add_face(face, index, "aligned")
229 index+=1
230 face = ete3.faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
231 face.border.margin = 5
232 face.margin_right = 10
233 (t&"Reference").add_face(face, index, "aligned")
234 index+=1
235
236 t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
237
238 #endregion
239 '''
240 #region create box tree
241 #region step5: tree construction
242 treeFile = "".join(read("./pipelineTest/tree.txt"))
243 t = ete3.Tree(treeFile)
244 t.set_outgroup(t&"Reference")
245
246 ts = ete3.TreeStyle()
247 ts.show_leaf_name = True
248 ts.show_branch_length = True
249 ts.scale = 2000 #pixel per branch length unit
250 ts.branch_vertical_margin = 15 #pixel between branches
251 style2 = ete3.NodeStyle()
252 style2["fgcolor"] = "#000000"
253 style2["shape"] = "circle"
254 style2["vt_line_color"] = "#0000aa"
255 style2["hz_line_color"] = "#0000aa"
256 style2["vt_line_width"] = 2
257 style2["hz_line_width"] = 2
258 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
259 style2["hz_line_type"] = 0
260 for n in t.traverse():
261 n.set_style(style2)
262 plasmidIncs = {}
263 for key in metadata:
264 for plasmid in metadata[key].plasmids:
265 for inc in plasmid.PlasmidRepType.split(","):
266 if (inc.lower().find("inc") > -1):
267 if not (inc in plasmidIncs):
268 plasmidIncs[inc] = [metadata[key].ID]
269 else:
270 if metadata[key].ID not in plasmidIncs[inc]:
271 plasmidIncs[inc].append(metadata[key].ID)
272 #plasmidIncs = sorted(plasmidIncs)
273 for n in t.traverse():
274 if (n.is_leaf() and n.name == "Reference"):
275 face = ete3.faces.TextFace("New?",fsize=10,tight_text=True)
276 face.border.margin = 5
277 face.margin_right = 5
278 face.margin_left = 5
279 (t&"Reference").add_face(face, 0, "aligned")
280 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
281 face = ete3.faces.TextFace(list(plasmidIncs.keys())[i],fsize=10,tight_text=True)
282 face.border.margin = 5
283 face.margin_right = 5
284 face.margin_left = 5
285 (t&"Reference").add_face(face, i + 1, "aligned")
286 face = ete3.faces.TextFace("MLSTScheme",fsize=10,tight_text=True)
287 face.border.margin = 5
288 face.margin_right = 5
289 face.margin_left = 5
290 (t&"Reference").add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
291 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
292 face.border.margin = 5
293 face.margin_right = 5
294 face.margin_left = 5
295 (t&"Reference").add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
296 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
297 face.border.margin = 5
298 face.margin_right = 5
299 face.margin_left = 5
300 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
301 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the columns (aka the incs) to the reference node
302 face = ete3.faces.TextFace(distanceDict[list(distanceDict.keys())[0]][i],fsize=10,tight_text=True)
303 face.border.margin = 5
304 face.margin_right = 5
305 face.margin_left = 5
306 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
307 elif (n.is_leaf() and not n.name == "Reference"):
308 if (metadata[n.name.replace(".fa","")].new == True):
309 face = ete3.faces.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
310 face.border.margin = 5
311 face.margin_right = 5
312 face.margin_left = 5
313 face.vt_align = 1
314 face.ht_align = 1
315 n.add_face(face, 0, "aligned")
316 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
317 if (n.name.replace(".fa","") in plasmidIncs[incs]):
318 face = ete3.faces.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
319 face.border.margin = 5
320 face.margin_right = 5
321 face.margin_left = 5
322 face.vt_align = 1
323 face.ht_align = 1
324 n.add_face(face, list(plasmidIncs.keys()).index(incs) + 1, "aligned")
325 mData = metadata[n.name.replace(".fa","")]
326 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
327 face.border.margin = 5
328 face.margin_right = 5
329 face.margin_left = 5
330 n.add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
331 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
332 face.border.margin = 5
333 face.margin_right = 5
334 face.margin_left = 5
335 n.add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
336 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
337 face.margin_right = 5
338 face.margin_left = 5
339 n.add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
340 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
341 face = ete3.faces.TextFace(list(distanceDict[n.name])[i],fsize=10,tight_text=True)
342 face.border.margin = 5
343 face.margin_right = 5
344 face.margin_left = 5
345 n.add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
346
347 t.render("./tree.png", w=5000,units="mm", tree_style=ts)
348
349 #endregion
350 #endregion
351
352
353 start = time.time()#time the analysis
354
355 #analysis time
356 Main()
357
358 end = time.time()
359 print("Finished!\nThe analysis used: " + str(end-start) + " seconds")