Mercurial repository: jjjjia / cpo_prediction
cpo_galaxy_tree_sensitive.py @ changeset 12:4b2738bc81ed (draft)
commit message: planemo upload
author: jjjjia
date: Fri, 24 Aug 2018 19:10:42 -0400
parent: 11:8aaa4001383b
#!/home/jjjjia/.conda/envs/py36/bin/python

#$ -S /home/jjjjia/.conda/envs/py36/bin/python
#$ -V                            # Pass environment variables to the job
#$ -N CPO_pipeline               # Replace with a more specific job name
#$ -wd /home/jjjjia/testCases    # Use the current working dir
#$ -pe smp 1                     # Parallel Environment (how many cores)
#$ -l h_vmem=11G                 # Memory (RAM) allocation *per core*
#$ -e ./logs/$JOB_ID.err
#$ -o ./logs/$JOB_ID.log
#$ -m ea
#$ -M bja20@sfu.ca

# Usage:
#   python cpo_galaxy_tree_sensitive.py -t /path/to/tree.ph -d /path/to/distance/matrix -m /path/to/metadata
#   python cpo_galaxy_tree_sensitive.py -t tree.txt -d ./dist.tabular -m ./metadata.tsv
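# Illustrative full invocation for this sensitive variant (the extra options are
# defined in the option parser below; the file names and column names here are
# only examples, taken from the commented-out test values further down in this script):
#   python cpo_galaxy_tree_sensitive.py -t tree.txt -d ./dist.tabular -m ./metadata.tsv \
#       -p ./sensitive_metadata.csv -c "Name,Care facility" -b BCID -o newtree_test.png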

# <requirements>
#     <requirement type="package" version="0.23.4">pandas</requirement>
#     <requirement type="package" version="3.6">python</requirement>
#     <requirement type="package" version="3.1.1">ete3</requirement>
#     <requirement type="package" version="5.9.3">pyqt</requirement>
# </requirements>

import subprocess
import pandas        #conda pandas
import optparse
import os
import datetime
import sys
import time
import urllib.request
import gzip
import collections
import json
import numpy         #conda numpy
import ete3 as e     #conda ete3 3.1.1 (requires pyqt5)
import csv

# parse command line parameters
parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="file path to the phylogenetic tree")
parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="file path to the distance matrix")
parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv", help="file path to the workflow results metadata (TSV)")
parser.add_option("-p", "--sensitive_data", dest="sensitivePath", type="string", default="", help="Spreadsheet (CSV) with sensitive metadata")
parser.add_option("-c", "--sensitive_cols", dest="sensitiveCols", type="string", default="", help="CSV list of column names from sensitive metadata spreadsheet to use as labels on dendrogram")
parser.add_option("-o", "--output_file", dest="outputFile", type="string", default="tree.png", help="Output graphics file. Use ending 'png', 'pdf' or 'svg' to specify file format.")
parser.add_option("-b", "--bcid_column", dest="bcidCol", type="string", default="BCID", help="Column name of BCID in sensitive metadata file")
parser.add_option("-n", "--missing_value", dest="naValue", type="string", default="NA", help="Value to write for missing data.")

(options, args) = parser.parse_args()
treePath = str(options.treePath).strip()
distancePath = str(options.distancePath).strip()
metadataPath = str(options.metadataPath).strip()
sensitivePath = str(options.sensitivePath).strip()
sensitiveCols = str(options.sensitiveCols).strip()
outputFile = str(options.outputFile).strip()
bcidCol = str(options.bcidCol).strip()
naValue = str(options.naValue).strip()

if len(sensitivePath) == 0:
    print("Must give a file with sensitive metadata. Option -p, or --sensitive_data")
    sys.exit(1)

### test values to uncomment
# sensitivePath = "./sensitive_metadata.csv"
# sensitiveCols = "Name,Care facility"
# outputFile = "newtree_test.png"
# bcidCol = "BCID"


class SensitiveMetadata(object):
    def __init__(self):
        x = pandas.read_csv(sensitivePath)
        col_names = [s for s in sensitiveCols.split(',')]  # label columns requested on the command line
        if not bcidCol in col_names:
            col_names.append(bcidCol)
        all_cols = [str(col) for col in x.columns]
        col_idxs = [all_cols.index(col) for col in col_names]
        self.sensitive_data = x.iloc[:, col_idxs]
    def get_columns(self):
        cols = [str(x) for x in self.sensitive_data.columns]
        return cols
    def get_value(self, bcid, column_name):  # might be nice to get them all in a single call via an input list of bcids ... for later
        bcids = list(self.sensitive_data.loc[:, bcidCol])  # get the list of all BCIDs in the sensitive metadata
        if not bcid in bcids:
            return naValue
        else:
            row_idx = bcids.index(bcid)  # look up the row for this BCID
            return self.sensitive_data.loc[row_idx, column_name]  # return the one value for this row and column

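# Illustrative layout for the sensitive metadata CSV (an assumption inferred from
# the defaults and the commented-out test values in this script, not a documented
# format): a header row containing the BCID column (-b/--bcid_column) plus the
# label columns requested via -c/--sensitive_cols, e.g.
#   BCID,Name,Care facility
# get_value() then looks up a single cell by BCID and column name, returning
# naValue when the BCID is not present in the spreadsheet.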

#region result objects
# define some objects to store values from results
# //TODO this is not the proper way to get/set private object variables. Every value has a manually assigned default instead of being specified in __init__(). Also, use property(getVar, setVar).
class workflowResult(object):
    def __init__(self):
        self.new = False
        self.ID = ""
        self.ExpectedSpecies = ""
        self.MLSTSpecies = ""
        self.SequenceType = ""
        self.MLSTScheme = ""
        self.CarbapenemResistanceGenes = ""
        self.OtherAMRGenes = ""
        self.TotalPlasmids = 0
        self.plasmids = []
        self.DefinitelyPlasmidContigs = ""
        self.LikelyPlasmidContigs = ""
        self.row = ""
class plasmidObj(object):
    def __init__(self):
        self.PlasmidsID = 0
        self.Num_Contigs = 0
        self.PlasmidLength = 0
        self.PlasmidRepType = ""
        self.PlasmidMobility = ""
        self.NearestReference = ""

#endregion

#region useful functions
def read(path):  # read in a text file to a list
    return [line.rstrip('\n') for line in open(path)]
def execute(command):  # run a command via subprocess.Popen and stream its output
    process = subprocess.Popen(command, shell=False, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    # Poll process for new output until finished
    while True:
        nextline = process.stdout.readline()
        if nextline == '' and process.poll() is not None:
            break
        sys.stdout.write(nextline)
        sys.stdout.flush()

    output = process.communicate()[0]
    exitCode = process.returncode

    if (exitCode == 0):
        return output
    else:
        raise subprocess.CalledProcessError(exitCode, command)
def httpGetFile(url, filepath=""):  #download a file from the web
    if (filepath == ""):
        return urllib.request.urlretrieve(url)
    else:
        urllib.request.urlretrieve(url, filepath)
        return True
def gunzip(inputpath="", outputpath=""):  #gunzip
    if (outputpath == ""):
        with gzip.open(inputpath, 'rb') as f:
            gzContent = f.read()
        return gzContent
    else:
        with gzip.open(inputpath, 'rb') as f:
            gzContent = f.read()
        with open(outputpath, 'wb') as out:
            out.write(gzContent)
        return True
def addFace(name):  # build one text cell (face) of the annotation table for a tree node
    face = e.faces.TextFace(name, fsize=10, tight_text=True)
    face.border.margin = 5
    face.margin_right = 5
    face.margin_left = 5
    return face
#endregion

#region functions to parse result files
def ParseWorkflowResults(pathToResult):
    _workflowResult = {}
    r = pandas.read_csv(pathToResult, delimiter='\t', header=0)
    r = r.replace(numpy.nan, '', regex=True)
    for i in range(len(r.index)):
        _results = workflowResult()
        if (str(r.loc[r.index[i], 'new']).lower() == "new"):
            _results.new = True
        else:
            _results.new = False
        _results.ID = str(r.loc[r.index[i], 'ID'])
        _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species'])
        _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species'])
        _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type'])
        _results.MLSTScheme = str(r.loc[r.index[i], 'MLST Scheme'])
        _results.CarbapenemResistanceGenes = str(r.loc[r.index[i], 'Carbapenem Resistance Genes'])
        _results.OtherAMRGenes = str(r.loc[r.index[i], 'Other AMR Genes'])
        _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
        for j in range(0, _results.TotalPlasmids):
            _plasmid = plasmidObj()
            _plasmid.PlasmidsID = str(r.loc[r.index[i], 'Plasmids ID']).split(";")[j]
            _plasmid.Num_Contigs = str(r.loc[r.index[i], 'Num_Contigs']).split(";")[j]
            _plasmid.PlasmidLength = str(r.loc[r.index[i], 'Plasmid Length']).split(";")[j]
            _plasmid.PlasmidRepType = str(r.loc[r.index[i], 'Plasmid RepType']).split(";")[j]
            _plasmid.PlasmidMobility = str(r.loc[r.index[i], 'Plasmid Mobility']).split(";")[j]
            _plasmid.NearestReference = str(r.loc[r.index[i], 'Nearest Reference']).split(";")[j]
            _results.plasmids.append(_plasmid)
        _results.DefinitelyPlasmidContigs = str(r.loc[r.index[i], 'Definitely Plasmid Contigs'])
        _results.LikelyPlasmidContigs = str(r.loc[r.index[i], 'Likely Plasmid Contigs'])
        _results.row = "\t".join(str(x) for x in r.iloc[i].tolist())
        _workflowResult[_results.ID] = _results
    return _workflowResult

#endregion

def Main():
    sensitive_meta_data = SensitiveMetadata()
    # print( sensitive_meta_data.get_columns() )
    metadata = ParseWorkflowResults(metadataPath)
    distance = read(distancePath)
    treeFile = "".join(read(treePath))

    distanceDict = {}  #store the distance matrix as rowname:list<string>
    for i in range(len(distance)):
        temp = distance[i].split("\t")
        distanceDict[temp[0]] = temp[1:]
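    # Each line of the distance file is assumed to be tab-delimited, with a row
    # name in the first field and the distances after it; the first row is later
    # reused as the column headers of the distance section of the figure.
    # (Inferred from the parsing above and the rendering code below, not from a
    # documented format.)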
    #region step5: tree construction

    '''
    #region create detailed tree

    plasmidCount = 0
    for n in t.traverse():
        if (n.is_leaf() and not n.name == "Reference"):
            mData = metadata[n.name.replace(".fa","")]
            face = faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
            face.border.margin = 5
            face.margin_left = 10
            face.margin_right = 10
            n.add_face(face, 0, "aligned")
            face = faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
            face.border.margin = 5
            face.margin_right = 10
            n.add_face(face, 1, "aligned")
            face = faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
            face.border.margin = 5
            face.margin_right = 10
            n.add_face(face, 2, "aligned")
            index = 3
            if (mData.TotalPlasmids > plasmidCount):
                plasmidCount = mData.TotalPlasmids
            for i in range(0, mData.TotalPlasmids):
                face = faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 10
                n.add_face(face, index, "aligned")
                index+=1
                face = faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 10
                n.add_face(face, index, "aligned")
                index+=1

    face = faces.TextFace("Species",fsize=10,tight_text=True)
    face.border.margin = 5
    face.margin_right = 10
    face.margin_left = 10
    (t&"Reference").add_face(face, 0, "aligned")
    face = faces.TextFace("Sequence Type",fsize=10,tight_text=True)
    face.border.margin = 5
    face.margin_right = 10
    (t&"Reference").add_face(face, 1, "aligned")
    face = faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
    face.border.margin = 5
    face.margin_right = 10
    (t&"Reference").add_face(face, 2, "aligned")
    index = 3
    for i in range(0, plasmidCount):
        face = faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
        face.border.margin = 5
        face.margin_right = 10
        (t&"Reference").add_face(face, index, "aligned")
        index+=1
        face = faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
        face.border.margin = 5
        face.margin_right = 10
        (t&"Reference").add_face(face, index, "aligned")
        index+=1

    t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)

    #endregion
    '''
    #region create box tree
    #region step5: tree construction
    treeFile = "".join(read(treePath))
    t = e.Tree(treeFile)
    t.set_outgroup(t&"Reference")

    #set the tree style
    ts = e.TreeStyle()
    ts.show_leaf_name = False
    ts.show_branch_length = True
    ts.scale = 2000  #pixel per branch length unit
    ts.branch_vertical_margin = 15  #pixel between branches
    style2 = e.NodeStyle()
    style2["fgcolor"] = "#000000"
    style2["shape"] = "circle"
    style2["vt_line_color"] = "#0000aa"
    style2["hz_line_color"] = "#0000aa"
    style2["vt_line_width"] = 2
    style2["hz_line_width"] = 2
    style2["vt_line_type"] = 0  # 0 solid, 1 dashed, 2 dotted
    style2["hz_line_type"] = 0
    for n in t.traverse():
        n.set_style(style2)

    #find the plasmid origins
    plasmidIncs = {}
    for key in metadata:
        for plasmid in metadata[key].plasmids:
            for inc in plasmid.PlasmidRepType.split(","):
                if (inc.lower().find("inc") > -1):
                    if not (inc in plasmidIncs):
                        plasmidIncs[inc] = [metadata[key].ID]
                    else:
                        if metadata[key].ID not in plasmidIncs[inc]:
                            plasmidIncs[inc].append(metadata[key].ID)
    #plasmidIncs = sorted(plasmidIncs)
    for n in t.traverse():  #loop through the nodes of the tree
        if (n.is_leaf() and n.name == "Reference"):
            #if it's the reference branch, populate the faces with column headers
            index = 0

            for sensitive_data_column in sensitive_meta_data.get_columns():
                (t&"Reference").add_face(addFace(sensitive_data_column), index, "aligned")
                index = index + 1

            (t&"Reference").add_face(addFace("SampleID"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("New?"), index, "aligned")
            index = index + 1
            for i in range(len(plasmidIncs)):  #this loop adds the columns (aka the incs) to the reference node
                (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
            index = index + len(plasmidIncs)
            (t&"Reference").add_face(addFace("MLST Species"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("Carbapenemases"), index, "aligned")
            index = index + 1
            for i in range(len(distanceDict[list(distanceDict.keys())[0]])):  #this loop adds the distance matrix header row
                (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
            index = index + len(distanceDict[list(distanceDict.keys())[0]])
        elif (n.is_leaf() and not n.name == "Reference"):
            #not reference branches, populate with metadata
            index = 0
            mData = metadata[n.name.replace(".fa","")]

            # pushing in sensitive data
            for sensitive_data_column in sensitive_meta_data.get_columns():
                sens_col_val = sensitive_meta_data.get_value(bcid=mData.ID, column_name=sensitive_data_column)
                n.add_face(addFace(sens_col_val), index, "aligned")
                index = index + 1

            n.add_face(addFace(mData.ID), index, "aligned")
            index = index + 1
            if (mData.new == True):  #new column
                face = e.RectFace(30, 30, "green", "green")  # TextFace("Y",fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 5
                face.margin_left = 5
                face.vt_align = 1
                face.ht_align = 1
                n.add_face(face, index, "aligned")
            index = index + 1
            for incs in plasmidIncs:  #this loop adds presence/absence to the sample nodes
                if (n.name.replace(".fa","") in plasmidIncs[incs]):
                    face = e.RectFace(30, 30, "black", "black")  # TextFace("Y",fsize=10,tight_text=True)
                    face.border.margin = 5
                    face.margin_right = 5
                    face.margin_left = 5
                    face.vt_align = 1
                    face.ht_align = 1
                    n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned")
            index = index + len(plasmidIncs)
            n.add_face(addFace(mData.MLSTSpecies), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.SequenceType), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
            index = index + 1
            for i in range(len(distanceDict[list(distanceDict.keys())[0]])):  #this loop adds the distance matrix row
                n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")

    t.render(outputFile, w=5000, units="mm", tree_style=ts)  #save it as a png, or a phyloxml

    #endregion
    #endregion


start = time.time()  #time the analysis

#analysis time
Main()

end = time.time()
print("Finished!\nThe analysis used: " + str(end-start) + " seconds")