1
|
1 #!/home/jjjjia/.conda/envs/py36/bin/python
|
|
2
|
|
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
|
|
4 #$ -V # Pass environment variables to the job
|
|
5 #$ -N CPO_pipeline # Replace with a more specific job name
|
|
6 #$ -wd /home/jjjjia/testCases # Use the current working dir
|
|
7 #$ -pe smp 8 # Parallel Environment (how many cores)
|
|
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
|
|
9 #$ -e ./logs/$JOB_ID.err
|
|
10 #$ -o ./logs/$JOB_ID.log
|
|
11 #$ -m ea
|
|
12 #$ -M bja20@sfu.ca
|
|
13
|
|
14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae"
|
|
15
|
|
16 import subprocess
|
|
17 import pandas
|
|
18 import optparse
|
|
19 import os
|
|
20 import datetime
|
|
21 import sys
|
|
22 import time
|
|
23 import urllib.request
|
|
24 import gzip
|
|
25 import collections
|
|
26 import json
|
|
27 import ete3
|
|
28 import numpy
|
|
29
|
|
30 #parses some parameters
|
|
31 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
|
|
32 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")
|
|
33 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
|
|
34 parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)")
|
|
35 (options,args) = parser.parse_args()
|
|
36 treePath = str(options.treePath).lstrip().rstrip()
|
|
37 distancePath = str(options.distancePath).lstrip().rstrip()
|
|
38 metadataPath = str(options.metadataPath).lstrip().rstrip()
|
|
39
|
|
40
|
|
41 #region result objects
|
|
42 #define some objects to store values from results
|
|
43 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
|
|
44 class workflowResult(object):
|
|
45 def __init__(self):
|
|
46 self.new = False
|
|
47 self.ID = ""
|
|
48 self.ExpectedSpecies = ""
|
|
49 self.MLSTSpecies = ""
|
|
50 self.SequenceType = ""
|
|
51 self.MLSTScheme = ""
|
|
52 self.CarbapenemResistanceGenes =""
|
|
53 self.OtherAMRGenes=""
|
|
54 self.TotalPlasmids = 0
|
|
55 self.plasmids = []
|
|
56 self.DefinitelyPlasmidContigs =""
|
|
57 self.LikelyPlasmidContigs=""
|
|
58 self.row = ""
|
|
59 class plasmidObj(object):
|
|
60 def __init__(self):
|
|
61 self.PlasmidsID = 0
|
|
62 self.Num_Contigs = 0
|
|
63 self.PlasmidLength = 0
|
|
64 self.PlasmidRepType = ""
|
|
65 self.PlasmidMobility = ""
|
|
66 self.NearestReference = ""
|
|
67
|
|
68 #endregion
|
|
69
|
|
70 #region useful functions
|
|
71 def read(path):
|
|
72 return [line.rstrip('\n') for line in open(path)]
|
|
73 def execute(command):
|
|
74 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
|
75
|
|
76 # Poll process for new output until finished
|
|
77 while True:
|
|
78 nextline = process.stdout.readline()
|
|
79 if nextline == '' and process.poll() is not None:
|
|
80 break
|
|
81 sys.stdout.write(nextline)
|
|
82 sys.stdout.flush()
|
|
83
|
|
84 output = process.communicate()[0]
|
|
85 exitCode = process.returncode
|
|
86
|
|
87 if (exitCode == 0):
|
|
88 return output
|
|
89 else:
|
|
90 raise subprocess.CalledProcessError(exitCode, command)
|
|
91 def httpGetFile(url, filepath=""):
|
|
92 if (filepath == ""):
|
|
93 return urllib.request.urlretrieve(url)
|
|
94 else:
|
|
95 urllib.request.urlretrieve(url, filepath)
|
|
96 return True
|
|
97 def gunzip(inputpath="", outputpath=""):
|
|
98 if (outputpath == ""):
|
|
99 with gzip.open(inputpath, 'rb') as f:
|
|
100 gzContent = f.read()
|
|
101 return gzContent
|
|
102 else:
|
|
103 with gzip.open(inputpath, 'rb') as f:
|
|
104 gzContent = f.read()
|
|
105 with open(outputpath, 'wb') as out:
|
|
106 out.write(gzContent)
|
|
107 return True
|
|
108 #endregion
|
|
109
|
|
110 #region functions to parse result files
|
|
111 def ParseWorkflowResults(pathToResult):
|
|
112 _worflowResult = {}
|
|
113 r = pandas.read_csv(pathToResult, delimiter='\t', header=None) #read the kraken2report.tsv
|
|
114 r = r.replace(numpy.nan, '', regex=True)
|
|
115 for i in range(len(r.index)):
|
|
116 _results = workflowResult()
|
|
117 if(str(r.iloc[i,0]).lower() == "new"):
|
|
118 _results.new = True
|
|
119 else:
|
|
120 _results.new = False
|
|
121 _results.ID = str(r.iloc[i,1])
|
|
122 _results.ExpectedSpecies = str(r.iloc[i,2])
|
|
123 _results.MLSTSpecies = str(r.iloc[i,3])
|
|
124 _results.SequenceType = str(r.iloc[i,4])
|
|
125 _results.MLSTScheme = (str(r.iloc[i,5]))
|
|
126 _results.CarbapenemResistanceGenes = (str(r.iloc[i,6]))
|
|
127 _results.OtherAMRGenes = (str(r.iloc[i,7]))
|
|
128 _results.TotalPlasmids = int(r.iloc[i,8])
|
|
129 for j in range(0,_results.TotalPlasmids):
|
|
130 _plasmid = plasmidObj()
|
|
131 _plasmid.PlasmidsID =(((str(r.iloc[i,9])).split(";"))[j])
|
|
132 _plasmid.Num_Contigs = (((str(r.iloc[i,10])).split(";"))[j])
|
|
133 _plasmid.PlasmidLength = (((str(r.iloc[i,11])).split(";"))[j])
|
|
134 _plasmid.PlasmidRepType = (((str(r.iloc[i,12])).split(";"))[j])
|
|
135 _plasmid.PlasmidMobility = ((str(r.iloc[i,13])).split(";"))[j]
|
|
136 _plasmid.NearestReference = ((str(r.iloc[i,14])).split(";"))[j]
|
|
137 _results.plasmids.append(_plasmid)
|
|
138 _results.DefinitelyPlasmidContigs = (str(r.iloc[i,15]))
|
|
139 _results.LikelyPlasmidContigs = (str(r.iloc[i,16]))
|
|
140 _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
|
|
141 _worflowResult[_results.ID] = _results
|
|
142 return _worflowResult
|
|
143
|
|
144 #endregion
|
|
145
|
|
146 def Main():
|
|
147 metadata = ParseWorkflowResults(metadataPath)
|
|
148 distance = read(distancePath)
|
|
149 treeFile = "".join(read(treePath))
|
|
150
|
|
151 distanceDict = {}
|
|
152 for i in range(len(distance)):
|
|
153 temp = distance[i].split("\t")
|
|
154 distanceDict[temp[0]] = temp[1:]
|
|
155 #region step5: tree construction
|
|
156 t = ete3.Tree(treeFile)
|
|
157 t.set_outgroup(t&"Reference")
|
|
158
|
|
159 ts = ete3.TreeStyle()
|
|
160 ts.show_leaf_name = True
|
|
161 ts.show_branch_length = True
|
|
162 ts.scale = 2000 #pixel per branch length unit
|
|
163 ts.branch_vertical_margin = 15 #pixel between branches
|
|
164 style2 = ete3.NodeStyle()
|
|
165 style2["fgcolor"] = "#000000"
|
|
166 style2["shape"] = "circle"
|
|
167 style2["vt_line_color"] = "#0000aa"
|
|
168 style2["hz_line_color"] = "#0000aa"
|
|
169 style2["vt_line_width"] = 2
|
|
170 style2["hz_line_width"] = 2
|
|
171 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
|
|
172 style2["hz_line_type"] = 0
|
|
173 for n in t.traverse():
|
|
174 n.set_style(style2)
|
|
175 '''
|
|
176 #region create detailed tree
|
|
177
|
|
178 plasmidCount = 0
|
|
179 for n in t.traverse():
|
|
180 if (n.is_leaf() and not n.name == "Reference"):
|
|
181 mData = metadata[n.name.replace(".fa","")]
|
|
182 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
|
|
183 face.border.margin = 5
|
|
184 face.margin_left = 10
|
|
185 face.margin_right = 10
|
|
186 n.add_face(face, 0, "aligned")
|
|
187 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
|
|
188 face.border.margin = 5
|
|
189 face.margin_right = 10
|
|
190 n.add_face(face, 1, "aligned")
|
|
191 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
|
|
192 face.border.margin = 5
|
|
193 face.margin_right = 10
|
|
194 n.add_face(face, 2, "aligned")
|
|
195 index = 3
|
|
196 if (mData.TotalPlasmids > plasmidCount):
|
|
197 plasmidCount = mData.TotalPlasmids
|
|
198 for i in range(0, mData.TotalPlasmids):
|
|
199 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
|
|
200 face.border.margin = 5
|
|
201 face.margin_right = 10
|
|
202 n.add_face(face, index, "aligned")
|
|
203 index+=1
|
|
204 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
|
|
205 face.border.margin = 5
|
|
206 face.margin_right = 10
|
|
207 n.add_face(face, index, "aligned")
|
|
208 index+=1
|
|
209
|
|
210 face = ete3.faces.TextFace("Species",fsize=10,tight_text=True)
|
|
211 face.border.margin = 5
|
|
212 face.margin_right = 10
|
|
213 face.margin_left = 10
|
|
214 (t&"Reference").add_face(face, 0, "aligned")
|
|
215 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
|
|
216 face.border.margin = 5
|
|
217 face.margin_right = 10
|
|
218 (t&"Reference").add_face(face, 1, "aligned")
|
|
219 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
|
|
220 face.border.margin = 5
|
|
221 face.margin_right = 10
|
|
222 (t&"Reference").add_face(face, 2, "aligned")
|
|
223 index = 3
|
|
224 for i in range(0, plasmidCount):
|
|
225 face = ete3.faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
|
|
226 face.border.margin = 5
|
|
227 face.margin_right = 10
|
|
228 (t&"Reference").add_face(face, index, "aligned")
|
|
229 index+=1
|
|
230 face = ete3.faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
|
|
231 face.border.margin = 5
|
|
232 face.margin_right = 10
|
|
233 (t&"Reference").add_face(face, index, "aligned")
|
|
234 index+=1
|
|
235
|
|
236 t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
|
|
237
|
|
238 #endregion
|
|
239 '''
|
|
240 #region create box tree
|
|
241 #region step5: tree construction
|
|
242 treeFile = "".join(read("./pipelineTest/tree.txt"))
|
|
243 t = ete3.Tree(treeFile)
|
|
244 t.set_outgroup(t&"Reference")
|
|
245
|
|
246 ts = ete3.TreeStyle()
|
|
247 ts.show_leaf_name = True
|
|
248 ts.show_branch_length = True
|
|
249 ts.scale = 2000 #pixel per branch length unit
|
|
250 ts.branch_vertical_margin = 15 #pixel between branches
|
|
251 style2 = ete3.NodeStyle()
|
|
252 style2["fgcolor"] = "#000000"
|
|
253 style2["shape"] = "circle"
|
|
254 style2["vt_line_color"] = "#0000aa"
|
|
255 style2["hz_line_color"] = "#0000aa"
|
|
256 style2["vt_line_width"] = 2
|
|
257 style2["hz_line_width"] = 2
|
|
258 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
|
|
259 style2["hz_line_type"] = 0
|
|
260 for n in t.traverse():
|
|
261 n.set_style(style2)
|
|
262 plasmidIncs = {}
|
|
263 for key in metadata:
|
|
264 for plasmid in metadata[key].plasmids:
|
|
265 for inc in plasmid.PlasmidRepType.split(","):
|
|
266 if (inc.lower().find("inc") > -1):
|
|
267 if not (inc in plasmidIncs):
|
|
268 plasmidIncs[inc] = [metadata[key].ID]
|
|
269 else:
|
|
270 if metadata[key].ID not in plasmidIncs[inc]:
|
|
271 plasmidIncs[inc].append(metadata[key].ID)
|
|
272 #plasmidIncs = sorted(plasmidIncs)
|
|
273 for n in t.traverse():
|
|
274 if (n.is_leaf() and n.name == "Reference"):
|
|
275 face = ete3.faces.TextFace("New?",fsize=10,tight_text=True)
|
|
276 face.border.margin = 5
|
|
277 face.margin_right = 5
|
|
278 face.margin_left = 5
|
|
279 (t&"Reference").add_face(face, 0, "aligned")
|
|
280 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
|
|
281 face = ete3.faces.TextFace(list(plasmidIncs.keys())[i],fsize=10,tight_text=True)
|
|
282 face.border.margin = 5
|
|
283 face.margin_right = 5
|
|
284 face.margin_left = 5
|
|
285 (t&"Reference").add_face(face, i + 1, "aligned")
|
|
286 face = ete3.faces.TextFace("MLSTScheme",fsize=10,tight_text=True)
|
|
287 face.border.margin = 5
|
|
288 face.margin_right = 5
|
|
289 face.margin_left = 5
|
|
290 (t&"Reference").add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
|
|
291 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
|
|
292 face.border.margin = 5
|
|
293 face.margin_right = 5
|
|
294 face.margin_left = 5
|
|
295 (t&"Reference").add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
|
|
296 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
|
|
297 face.border.margin = 5
|
|
298 face.margin_right = 5
|
|
299 face.margin_left = 5
|
|
300 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
|
|
301 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the columns (aka the incs) to the reference node
|
|
302 face = ete3.faces.TextFace(distanceDict[list(distanceDict.keys())[0]][i],fsize=10,tight_text=True)
|
|
303 face.border.margin = 5
|
|
304 face.margin_right = 5
|
|
305 face.margin_left = 5
|
|
306 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
|
|
307 elif (n.is_leaf() and not n.name == "Reference"):
|
|
308 if (metadata[n.name.replace(".fa","")].new == True):
|
|
309 face = ete3.faces.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
|
|
310 face.border.margin = 5
|
|
311 face.margin_right = 5
|
|
312 face.margin_left = 5
|
|
313 face.vt_align = 1
|
|
314 face.ht_align = 1
|
|
315 n.add_face(face, 0, "aligned")
|
|
316 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
|
|
317 if (n.name.replace(".fa","") in plasmidIncs[incs]):
|
|
318 face = ete3.faces.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
|
|
319 face.border.margin = 5
|
|
320 face.margin_right = 5
|
|
321 face.margin_left = 5
|
|
322 face.vt_align = 1
|
|
323 face.ht_align = 1
|
|
324 n.add_face(face, list(plasmidIncs.keys()).index(incs) + 1, "aligned")
|
|
325 mData = metadata[n.name.replace(".fa","")]
|
|
326 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
|
|
327 face.border.margin = 5
|
|
328 face.margin_right = 5
|
|
329 face.margin_left = 5
|
|
330 n.add_face(face, len(plasmidIncs) + 0 + 1, "aligned")
|
|
331 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
|
|
332 face.border.margin = 5
|
|
333 face.margin_right = 5
|
|
334 face.margin_left = 5
|
|
335 n.add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
|
|
336 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
|
|
337 face.margin_right = 5
|
|
338 face.margin_left = 5
|
|
339 n.add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
|
|
340 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
|
|
341 face = ete3.faces.TextFace(list(distanceDict[n.name])[i],fsize=10,tight_text=True)
|
|
342 face.border.margin = 5
|
|
343 face.margin_right = 5
|
|
344 face.margin_left = 5
|
|
345 n.add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
|
|
346
|
|
347 t.render("./tree.png", w=5000,units="mm", tree_style=ts)
|
|
348
|
|
349 #endregion
|
|
350 #endregion
|
|
351
|
|
352
|
|
353 start = time.time()#time the analysis
|
|
354
|
|
355 #analysis time
|
|
356 Main()
|
|
357
|
|
358 end = time.time()
|
|
359 print("Finished!\nThe analysis used: " + str(end-start) + " seconds") |