Mercurial > repos > jjjjia > cpo_prediction
comparison cpo_galaxy_tree.py @ 1:fea89c4d5227 draft
Uploaded
author | jjjjia |
---|---|
date | Thu, 16 Aug 2018 19:27:05 -0400 |
parents | |
children | cabceaa239e4 |
comparison
equal
deleted
inserted
replaced
0:917a05a03ac9 | 1:fea89c4d5227 |
---|---|
1 #!/home/jjjjia/.conda/envs/py36/bin/python | |
2 | |
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python | |
4 #$ -V # Pass environment variables to the job | |
5 #$ -N CPO_pipeline # Replace with a more specific job name | |
6 #$ -wd /home/jjjjia/testCases # Use the current working dir | |
7 #$ -pe smp 8 # Parallel Environment (how many cores) | |
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core* | |
9 #$ -e ./logs/$JOB_ID.err | |
10 #$ -o ./logs/$JOB_ID.log | |
11 #$ -m ea | |
12 #$ -M bja20@sfu.ca | |
13 | |
14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" | |
15 | |
16 import subprocess | |
17 import pandas | |
18 import optparse | |
19 import os | |
20 import datetime | |
21 import sys | |
22 import time | |
23 import urllib.request | |
24 import gzip | |
25 import collections | |
26 import json | |
27 import ete3 | |
28 import numpy | |
29 | |
30 #parses some parameters | |
31 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") | |
32 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate") | |
33 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)") | |
34 parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)") | |
35 (options,args) = parser.parse_args() | |
36 treePath = str(options.treePath).lstrip().rstrip() | |
37 distancePath = str(options.distancePath).lstrip().rstrip() | |
38 metadataPath = str(options.metadataPath).lstrip().rstrip() | |
39 | |
40 | |
41 #region result objects | |
42 #define some objects to store values from results | |
43 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). | |
44 class workflowResult(object): | |
45 def __init__(self): | |
46 self.new = False | |
47 self.ID = "" | |
48 self.ExpectedSpecies = "" | |
49 self.MLSTSpecies = "" | |
50 self.SequenceType = "" | |
51 self.MLSTScheme = "" | |
52 self.CarbapenemResistanceGenes ="" | |
53 self.OtherAMRGenes="" | |
54 self.TotalPlasmids = 0 | |
55 self.plasmids = [] | |
56 self.DefinitelyPlasmidContigs ="" | |
57 self.LikelyPlasmidContigs="" | |
58 self.row = "" | |
59 class plasmidObj(object): | |
60 def __init__(self): | |
61 self.PlasmidsID = 0 | |
62 self.Num_Contigs = 0 | |
63 self.PlasmidLength = 0 | |
64 self.PlasmidRepType = "" | |
65 self.PlasmidMobility = "" | |
66 self.NearestReference = "" | |
67 | |
68 #endregion | |
69 | |
70 #region useful functions | |
71 def read(path): | |
72 return [line.rstrip('\n') for line in open(path)] | |
73 def execute(command): | |
74 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
75 | |
76 # Poll process for new output until finished | |
77 while True: | |
78 nextline = process.stdout.readline() | |
79 if nextline == '' and process.poll() is not None: | |
80 break | |
81 sys.stdout.write(nextline) | |
82 sys.stdout.flush() | |
83 | |
84 output = process.communicate()[0] | |
85 exitCode = process.returncode | |
86 | |
87 if (exitCode == 0): | |
88 return output | |
89 else: | |
90 raise subprocess.CalledProcessError(exitCode, command) | |
91 def httpGetFile(url, filepath=""): | |
92 if (filepath == ""): | |
93 return urllib.request.urlretrieve(url) | |
94 else: | |
95 urllib.request.urlretrieve(url, filepath) | |
96 return True | |
97 def gunzip(inputpath="", outputpath=""): | |
98 if (outputpath == ""): | |
99 with gzip.open(inputpath, 'rb') as f: | |
100 gzContent = f.read() | |
101 return gzContent | |
102 else: | |
103 with gzip.open(inputpath, 'rb') as f: | |
104 gzContent = f.read() | |
105 with open(outputpath, 'wb') as out: | |
106 out.write(gzContent) | |
107 return True | |
108 #endregion | |
109 | |
110 #region functions to parse result files | |
111 def ParseWorkflowResults(pathToResult): | |
112 _worflowResult = {} | |
113 r = pandas.read_csv(pathToResult, delimiter='\t', header=None) #read the kraken2report.tsv | |
114 r = r.replace(numpy.nan, '', regex=True) | |
115 for i in range(len(r.index)): | |
116 _results = workflowResult() | |
117 if(str(r.iloc[i,0]).lower() == "new"): | |
118 _results.new = True | |
119 else: | |
120 _results.new = False | |
121 _results.ID = str(r.iloc[i,1]) | |
122 _results.ExpectedSpecies = str(r.iloc[i,2]) | |
123 _results.MLSTSpecies = str(r.iloc[i,3]) | |
124 _results.SequenceType = str(r.iloc[i,4]) | |
125 _results.MLSTScheme = (str(r.iloc[i,5])) | |
126 _results.CarbapenemResistanceGenes = (str(r.iloc[i,6])) | |
127 _results.OtherAMRGenes = (str(r.iloc[i,7])) | |
128 _results.TotalPlasmids = int(r.iloc[i,8]) | |
129 for j in range(0,_results.TotalPlasmids): | |
130 _plasmid = plasmidObj() | |
131 _plasmid.PlasmidsID =(((str(r.iloc[i,9])).split(";"))[j]) | |
132 _plasmid.Num_Contigs = (((str(r.iloc[i,10])).split(";"))[j]) | |
133 _plasmid.PlasmidLength = (((str(r.iloc[i,11])).split(";"))[j]) | |
134 _plasmid.PlasmidRepType = (((str(r.iloc[i,12])).split(";"))[j]) | |
135 _plasmid.PlasmidMobility = ((str(r.iloc[i,13])).split(";"))[j] | |
136 _plasmid.NearestReference = ((str(r.iloc[i,14])).split(";"))[j] | |
137 _results.plasmids.append(_plasmid) | |
138 _results.DefinitelyPlasmidContigs = (str(r.iloc[i,15])) | |
139 _results.LikelyPlasmidContigs = (str(r.iloc[i,16])) | |
140 _results.row = "\t".join(str(x) for x in r.ix[i].tolist()) | |
141 _worflowResult[_results.ID] = _results | |
142 return _worflowResult | |
143 | |
144 #endregion | |
145 | |
146 def Main(): | |
147 metadata = ParseWorkflowResults(metadataPath) | |
148 distance = read(distancePath) | |
149 treeFile = "".join(read(treePath)) | |
150 | |
151 distanceDict = {} | |
152 for i in range(len(distance)): | |
153 temp = distance[i].split("\t") | |
154 distanceDict[temp[0]] = temp[1:] | |
155 #region step5: tree construction | |
156 t = ete3.Tree(treeFile) | |
157 t.set_outgroup(t&"Reference") | |
158 | |
159 ts = ete3.TreeStyle() | |
160 ts.show_leaf_name = True | |
161 ts.show_branch_length = True | |
162 ts.scale = 2000 #pixel per branch length unit | |
163 ts.branch_vertical_margin = 15 #pixel between branches | |
164 style2 = ete3.NodeStyle() | |
165 style2["fgcolor"] = "#000000" | |
166 style2["shape"] = "circle" | |
167 style2["vt_line_color"] = "#0000aa" | |
168 style2["hz_line_color"] = "#0000aa" | |
169 style2["vt_line_width"] = 2 | |
170 style2["hz_line_width"] = 2 | |
171 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted | |
172 style2["hz_line_type"] = 0 | |
173 for n in t.traverse(): | |
174 n.set_style(style2) | |
175 ''' | |
176 #region create detailed tree | |
177 | |
178 plasmidCount = 0 | |
179 for n in t.traverse(): | |
180 if (n.is_leaf() and not n.name == "Reference"): | |
181 mData = metadata[n.name.replace(".fa","")] | |
182 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True) | |
183 face.border.margin = 5 | |
184 face.margin_left = 10 | |
185 face.margin_right = 10 | |
186 n.add_face(face, 0, "aligned") | |
187 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True) | |
188 face.border.margin = 5 | |
189 face.margin_right = 10 | |
190 n.add_face(face, 1, "aligned") | |
191 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True) | |
192 face.border.margin = 5 | |
193 face.margin_right = 10 | |
194 n.add_face(face, 2, "aligned") | |
195 index = 3 | |
196 if (mData.TotalPlasmids > plasmidCount): | |
197 plasmidCount = mData.TotalPlasmids | |
198 for i in range(0, mData.TotalPlasmids): | |
199 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True) | |
200 face.border.margin = 5 | |
201 face.margin_right = 10 | |
202 n.add_face(face, index, "aligned") | |
203 index+=1 | |
204 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True) | |
205 face.border.margin = 5 | |
206 face.margin_right = 10 | |
207 n.add_face(face, index, "aligned") | |
208 index+=1 | |
209 | |
210 face = ete3.faces.TextFace("Species",fsize=10,tight_text=True) | |
211 face.border.margin = 5 | |
212 face.margin_right = 10 | |
213 face.margin_left = 10 | |
214 (t&"Reference").add_face(face, 0, "aligned") | |
215 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True) | |
216 face.border.margin = 5 | |
217 face.margin_right = 10 | |
218 (t&"Reference").add_face(face, 1, "aligned") | |
219 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True) | |
220 face.border.margin = 5 | |
221 face.margin_right = 10 | |
222 (t&"Reference").add_face(face, 2, "aligned") | |
223 index = 3 | |
224 for i in range(0, plasmidCount): | |
225 face = ete3.faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True) | |
226 face.border.margin = 5 | |
227 face.margin_right = 10 | |
228 (t&"Reference").add_face(face, index, "aligned") | |
229 index+=1 | |
230 face = ete3.faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True) | |
231 face.border.margin = 5 | |
232 face.margin_right = 10 | |
233 (t&"Reference").add_face(face, index, "aligned") | |
234 index+=1 | |
235 | |
236 t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts) | |
237 | |
238 #endregion | |
239 ''' | |
240 #region create box tree | |
241 #region step5: tree construction | |
242 treeFile = "".join(read("./pipelineTest/tree.txt")) | |
243 t = ete3.Tree(treeFile) | |
244 t.set_outgroup(t&"Reference") | |
245 | |
246 ts = ete3.TreeStyle() | |
247 ts.show_leaf_name = True | |
248 ts.show_branch_length = True | |
249 ts.scale = 2000 #pixel per branch length unit | |
250 ts.branch_vertical_margin = 15 #pixel between branches | |
251 style2 = ete3.NodeStyle() | |
252 style2["fgcolor"] = "#000000" | |
253 style2["shape"] = "circle" | |
254 style2["vt_line_color"] = "#0000aa" | |
255 style2["hz_line_color"] = "#0000aa" | |
256 style2["vt_line_width"] = 2 | |
257 style2["hz_line_width"] = 2 | |
258 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted | |
259 style2["hz_line_type"] = 0 | |
260 for n in t.traverse(): | |
261 n.set_style(style2) | |
262 plasmidIncs = {} | |
263 for key in metadata: | |
264 for plasmid in metadata[key].plasmids: | |
265 for inc in plasmid.PlasmidRepType.split(","): | |
266 if (inc.lower().find("inc") > -1): | |
267 if not (inc in plasmidIncs): | |
268 plasmidIncs[inc] = [metadata[key].ID] | |
269 else: | |
270 if metadata[key].ID not in plasmidIncs[inc]: | |
271 plasmidIncs[inc].append(metadata[key].ID) | |
272 #plasmidIncs = sorted(plasmidIncs) | |
273 for n in t.traverse(): | |
274 if (n.is_leaf() and n.name == "Reference"): | |
275 face = ete3.faces.TextFace("New?",fsize=10,tight_text=True) | |
276 face.border.margin = 5 | |
277 face.margin_right = 5 | |
278 face.margin_left = 5 | |
279 (t&"Reference").add_face(face, 0, "aligned") | |
280 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node | |
281 face = ete3.faces.TextFace(list(plasmidIncs.keys())[i],fsize=10,tight_text=True) | |
282 face.border.margin = 5 | |
283 face.margin_right = 5 | |
284 face.margin_left = 5 | |
285 (t&"Reference").add_face(face, i + 1, "aligned") | |
286 face = ete3.faces.TextFace("MLSTScheme",fsize=10,tight_text=True) | |
287 face.border.margin = 5 | |
288 face.margin_right = 5 | |
289 face.margin_left = 5 | |
290 (t&"Reference").add_face(face, len(plasmidIncs) + 0 + 1, "aligned") | |
291 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True) | |
292 face.border.margin = 5 | |
293 face.margin_right = 5 | |
294 face.margin_left = 5 | |
295 (t&"Reference").add_face(face, len(plasmidIncs) + 1 + 1, "aligned") | |
296 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True) | |
297 face.border.margin = 5 | |
298 face.margin_right = 5 | |
299 face.margin_left = 5 | |
300 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + 1, "aligned") | |
301 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the columns (aka the incs) to the reference node | |
302 face = ete3.faces.TextFace(distanceDict[list(distanceDict.keys())[0]][i],fsize=10,tight_text=True) | |
303 face.border.margin = 5 | |
304 face.margin_right = 5 | |
305 face.margin_left = 5 | |
306 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned") | |
307 elif (n.is_leaf() and not n.name == "Reference"): | |
308 if (metadata[n.name.replace(".fa","")].new == True): | |
309 face = ete3.faces.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True) | |
310 face.border.margin = 5 | |
311 face.margin_right = 5 | |
312 face.margin_left = 5 | |
313 face.vt_align = 1 | |
314 face.ht_align = 1 | |
315 n.add_face(face, 0, "aligned") | |
316 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes | |
317 if (n.name.replace(".fa","") in plasmidIncs[incs]): | |
318 face = ete3.faces.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True) | |
319 face.border.margin = 5 | |
320 face.margin_right = 5 | |
321 face.margin_left = 5 | |
322 face.vt_align = 1 | |
323 face.ht_align = 1 | |
324 n.add_face(face, list(plasmidIncs.keys()).index(incs) + 1, "aligned") | |
325 mData = metadata[n.name.replace(".fa","")] | |
326 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True) | |
327 face.border.margin = 5 | |
328 face.margin_right = 5 | |
329 face.margin_left = 5 | |
330 n.add_face(face, len(plasmidIncs) + 0 + 1, "aligned") | |
331 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True) | |
332 face.border.margin = 5 | |
333 face.margin_right = 5 | |
334 face.margin_left = 5 | |
335 n.add_face(face, len(plasmidIncs) + 1 + 1, "aligned") | |
336 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True) | |
337 face.margin_right = 5 | |
338 face.margin_left = 5 | |
339 n.add_face(face, len(plasmidIncs) + 2 + 1, "aligned") | |
340 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix | |
341 face = ete3.faces.TextFace(list(distanceDict[n.name])[i],fsize=10,tight_text=True) | |
342 face.border.margin = 5 | |
343 face.margin_right = 5 | |
344 face.margin_left = 5 | |
345 n.add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned") | |
346 | |
347 t.render("./tree.png", w=5000,units="mm", tree_style=ts) | |
348 | |
349 #endregion | |
350 #endregion | |
351 | |
352 | |
353 start = time.time()#time the analysis | |
354 | |
355 #analysis time | |
356 Main() | |
357 | |
358 end = time.time() | |
359 print("Finished!\nThe analysis used: " + str(end-start) + " seconds") |