comparison cpo_galaxy_tree_sensitive.py @ 12:4b2738bc81ed draft

planemo upload
author jjjjia
date Fri, 24 Aug 2018 19:10:42 -0400
parents
children
comparison
equal deleted inserted replaced
11:8aaa4001383b 12:4b2738bc81ed
1 #!/home/jjjjia/.conda/envs/py36/bin/python
2
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
4 #$ -V # Pass environment variables to the job
5 #$ -N CPO_pipeline # Replace with a more specific job name
6 #$ -wd /home/jjjjia/testCases # Use the current working dir
7 #$ -pe smp 1 # Parallel Environment (how many cores)
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
9 #$ -e ./logs/$JOB_ID.err
10 #$ -o ./logs/$JOB_ID.log
11 #$ -m ea
12 #$ -M bja20@sfu.ca
13
14 # >python cpo_galaxy_tree.py -t /path/to/tree.ph -d /path/to/distance/matrix -m /path/to/metadata
15 # python cpo_galaxy_tree.py -t tree.txt -d ./dist.tabular -m ./metadata.tsv
16
17 # <requirements>
18 # <requirement type="package" version="0.23.4">pandas</requirement>
19 # <requirement type="package" version="3.6">python</requirement>
20 # <requirement type="package" version="3.1.1">ete3</requirement>
21 # <requirement type="package" version="5.9.3">pyqt</requirement>
22 # </requirements>
23
24 import subprocess
25 import pandas #conda pandas
26 import optparse
27 import os
28 import datetime
29 import sys
30 import time
31 import urllib.request
32 import gzip
33 import collections
34 import json
35 import numpy #conda numpy
36 import ete3 as e #conda ete3 3.1.1**** >requires pyqt5
37 import csv
38
39 #parses some parameters
40 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
41 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")
42 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
43 parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)")
44 parser.add_option("-p", "--sensitive_data", dest="sensitivePath", type="string", default="", help="Spreadsheet (CSV) with sensitive metadata")
45 parser.add_option("-c", "--sensitive_cols", dest="sensitiveCols", type="string", default="", help="CSV list of column names from sensitive metadata spreadsheet to use as labels on dendrogram")
46 parser.add_option("-o", "--output_file", dest="outputFile", type="string", default="tree.png", help="Output graphics file. Use ending 'png', 'pdf' or 'svg' to specify file format.")
47 parser.add_option("-b", "--bcid_column", dest="bcidCol", type="string", default="BCID", help="Column name of BCID in sensitive metadata file")
48 parser.add_option("-n", "--missing_value", dest="naValue", type="string", default="NA", help="Value to write for missing data.")
49
50 (options,args) = parser.parse_args()
51 treePath = str(options.treePath).lstrip().rstrip()
52 distancePath = str(options.distancePath).lstrip().rstrip()
53 metadataPath = str(options.metadataPath).lstrip().rstrip()
54 sensitivePath = str(options.sensitivePath).lstrip().rstrip()
55 sensitiveCols = str(options.sensitiveCols).lstrip().rstrip()
56 outputFile = str(options.outputFile).lstrip().rstrip()
57 bcidCol = str( str(options.bcidCol).lstrip().rstrip() )
58 naValue = str( str(options.naValue).lstrip().rstrip() )
59
60 if len(sensitivePath) == 0:
61 print("Must give a file with sensitive meta data. Option -p, or --sensitive_data")
62
63 ### test values to uncomment
64 # sensitivePath = "./sensitive_metadata.csv"
65 # sensitiveCols = "Name,Care facility"
66 # outputFile = "newtree_test.png"
67 # bcidCol = "BCID"
68
69
70
71 import pandas
72 class SensitiveMetadata(object):
73 def __init__(self):
74 x = pandas.read_csv( sensitivePath )
75 col_names = [ s for s in sensitiveCols.split(',')] # convert to 0 offset
76 if not bcidCol in col_names:
77 col_names.append( bcidCol )
78 all_cols = [ str(col) for col in x.columns ]
79 col_idxs = [ all_cols.index(col) for col in col_names ]
80 self.sensitive_data = x.iloc[:, col_idxs]
81 def get_columns(self):
82 cols = [ str(x) for x in self.sensitive_data.columns ]
83 return cols
84 def get_value( self, bcid, column_name ): # might be nice to get them all in single call via an input list of bcids ... for later
85 bcids= list( self.sensitive_data.loc[:, bcidCol ] ) # get the list of all BCIDs in sensitive metadata
86 if not bcid in bcids:
87 return naValue
88 else:
89 row_idx = bcids.index( bcid ) # lookup the row for this BCID
90 return self.sensitive_data.loc[ row_idx, column_name ] # return the one value based on the column (col_idx) and this row
91
92
93 #region result objects
94 #define some objects to store values from results
95 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
96 class workflowResult(object):
97 def __init__(self):
98 self.new = False
99 self.ID = ""
100 self.ExpectedSpecies = ""
101 self.MLSTSpecies = ""
102 self.SequenceType = ""
103 self.MLSTScheme = ""
104 self.CarbapenemResistanceGenes =""
105 self.OtherAMRGenes=""
106 self.TotalPlasmids = 0
107 self.plasmids = []
108 self.DefinitelyPlasmidContigs =""
109 self.LikelyPlasmidContigs=""
110 self.row = ""
111 class plasmidObj(object):
112 def __init__(self):
113 self.PlasmidsID = 0
114 self.Num_Contigs = 0
115 self.PlasmidLength = 0
116 self.PlasmidRepType = ""
117 self.PlasmidMobility = ""
118 self.NearestReference = ""
119
120 #endregion
121
122 #region useful functions
123 def read(path): #read in a text file to a list
124 return [line.rstrip('\n') for line in open(path)]
125 def execute(command): #subprocess.popen call bash command
126 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
127
128 # Poll process for new output until finished
129 while True:
130 nextline = process.stdout.readline()
131 if nextline == '' and process.poll() is not None:
132 break
133 sys.stdout.write(nextline)
134 sys.stdout.flush()
135
136 output = process.communicate()[0]
137 exitCode = process.returncode
138
139 if (exitCode == 0):
140 return output
141 else:
142 raise subprocess.CalledProcessError(exitCode, command)
143 def httpGetFile(url, filepath=""): #download a file from the web
144 if (filepath == ""):
145 return urllib.request.urlretrieve(url)
146 else:
147 urllib.request.urlretrieve(url, filepath)
148 return True
149 def gunzip(inputpath="", outputpath=""): #gunzip
150 if (outputpath == ""):
151 with gzip.open(inputpath, 'rb') as f:
152 gzContent = f.read()
153 return gzContent
154 else:
155 with gzip.open(inputpath, 'rb') as f:
156 gzContent = f.read()
157 with open(outputpath, 'wb') as out:
158 out.write(gzContent)
159 return True
160 def addFace(name): #function to add a facet to a tree
161 #if its the reference branch, populate the faces with column headers
162 face = e.faces.TextFace(name,fsize=10,tight_text=True)
163 face.border.margin = 5
164 face.margin_right = 5
165 face.margin_left = 5
166 return face
167 #endregion
168
169 #region functions to parse result files
170 def ParseWorkflowResults(pathToResult):
171 _worflowResult = {}
172 r = pandas.read_csv(pathToResult, delimiter='\t', header=0)
173 r = r.replace(numpy.nan, '', regex=True)
174 for i in range(len(r.index)):
175 _results = workflowResult()
176 if(str(r.loc[r.index[i], 'new']).lower() == "new"):
177 _results.new = True
178 else:
179 _results.new = False
180 _results.ID = str(r.loc[r.index[i], 'ID'])
181 _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species'])
182 _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species'])
183 _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type'])
184 _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme']))
185 _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes']))
186 _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes']))
187 _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
188 for j in range(0,_results.TotalPlasmids):
189 _plasmid = plasmidObj()
190 _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j])
191 _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j])
192 _plasmid.PlasmidLength = (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j])
193 _plasmid.PlasmidRepType = (((str(r.loc[r.index[i], 'Plasmid RepType'])).split(";"))[j])
194 _plasmid.PlasmidMobility = ((str(r.loc[r.index[i], 'Plasmid Mobility'])).split(";"))[j]
195 _plasmid.NearestReference = ((str(r.loc[r.index[i], 'Nearest Reference'])).split(";"))[j]
196 _results.plasmids.append(_plasmid)
197 _results.DefinitelyPlasmidContigs = (str(r.loc[r.index[i], 'Definitely Plasmid Contigs']))
198 _results.LikelyPlasmidContigs = (str(r.loc[r.index[i], 'Likely Plasmid Contigs']))
199 _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
200 _worflowResult[_results.ID] = _results
201 return _worflowResult
202
203 #endregion
204
205 def Main():
206 sensitive_meta_data = SensitiveMetadata()
207 # print( sensitive_meta_data.get_columns() )
208 metadata = ParseWorkflowResults(metadataPath)
209 distance = read(distancePath)
210 treeFile = "".join(read(treePath))
211
212 distanceDict = {} #store the distance matrix as rowname:list<string>
213 for i in range(len(distance)):
214 temp = distance[i].split("\t")
215 distanceDict[temp[0]] = temp[1:]
216 #region step5: tree construction
217
218 '''
219 #region create detailed tree
220
221 plasmidCount = 0
222 for n in t.traverse():
223 if (n.is_leaf() and not n.name == "Reference"):
224 mData = metadata[n.name.replace(".fa","")]
225 face = faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
226 face.border.margin = 5
227 face.margin_left = 10
228 face.margin_right = 10
229 n.add_face(face, 0, "aligned")
230 face = faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
231 face.border.margin = 5
232 face.margin_right = 10
233 n.add_face(face, 1, "aligned")
234 face = faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
235 face.border.margin = 5
236 face.margin_right = 10
237 n.add_face(face, 2, "aligned")
238 index = 3
239 if (mData.TotalPlasmids > plasmidCount):
240 plasmidCount = mData.TotalPlasmids
241 for i in range(0, mData.TotalPlasmids):
242 face = faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
243 face.border.margin = 5
244 face.margin_right = 10
245 n.add_face(face, index, "aligned")
246 index+=1
247 face = faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
248 face.border.margin = 5
249 face.margin_right = 10
250 n.add_face(face, index, "aligned")
251 index+=1
252
253 face = faces.TextFace("Species",fsize=10,tight_text=True)
254 face.border.margin = 5
255 face.margin_right = 10
256 face.margin_left = 10
257 (t&"Reference").add_face(face, 0, "aligned")
258 face = faces.TextFace("Sequence Type",fsize=10,tight_text=True)
259 face.border.margin = 5
260 face.margin_right = 10
261 (t&"Reference").add_face(face, 1, "aligned")
262 face = faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
263 face.border.margin = 5
264 face.margin_right = 10
265 (t&"Reference").add_face(face, 2, "aligned")
266 index = 3
267 for i in range(0, plasmidCount):
268 face = faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
269 face.border.margin = 5
270 face.margin_right = 10
271 (t&"Reference").add_face(face, index, "aligned")
272 index+=1
273 face = faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
274 face.border.margin = 5
275 face.margin_right = 10
276 (t&"Reference").add_face(face, index, "aligned")
277 index+=1
278
279 t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
280
281 #endregion
282 '''
283 #region create box tree
284 #region step5: tree construction
285 treeFile = "".join(read(treePath))
286 t = e.Tree(treeFile)
287 t.set_outgroup(t&"Reference")
288
289 #set the tree style
290 ts = e.TreeStyle()
291 ts.show_leaf_name = False
292 ts.show_branch_length = True
293 ts.scale = 2000 #pixel per branch length unit
294 ts.branch_vertical_margin = 15 #pixel between branches
295 style2 = e.NodeStyle()
296 style2["fgcolor"] = "#000000"
297 style2["shape"] = "circle"
298 style2["vt_line_color"] = "#0000aa"
299 style2["hz_line_color"] = "#0000aa"
300 style2["vt_line_width"] = 2
301 style2["hz_line_width"] = 2
302 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
303 style2["hz_line_type"] = 0
304 for n in t.traverse():
305 n.set_style(style2)
306
307 #find the plasmid origins
308 plasmidIncs = {}
309 for key in metadata:
310 for plasmid in metadata[key].plasmids:
311 for inc in plasmid.PlasmidRepType.split(","):
312 if (inc.lower().find("inc") > -1):
313 if not (inc in plasmidIncs):
314 plasmidIncs[inc] = [metadata[key].ID]
315 else:
316 if metadata[key].ID not in plasmidIncs[inc]:
317 plasmidIncs[inc].append(metadata[key].ID)
318 #plasmidIncs = sorted(plasmidIncs)
319 for n in t.traverse(): #loop through the nodes of a tree
320 if (n.is_leaf() and n.name == "Reference"):
321 #if its the reference branch, populate the faces with column headers
322 index = 0
323
324 for sensitive_data_column in sensitive_meta_data.get_columns():
325 (t&"Reference").add_face(addFace(sensitive_data_column), index, "aligned")
326 index = index + 1
327
328 (t&"Reference").add_face(addFace("SampleID"), index, "aligned")
329 index = index + 1
330 (t&"Reference").add_face(addFace("New?"), index, "aligned")
331 index = index + 1
332 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
333 (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
334 index = index + len(plasmidIncs)
335 (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned")
336 index = index + 1
337 (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned")
338 index = index + 1
339 (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned")
340 index = index + 1
341 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix
342 (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
343 index = index + len(distanceDict[list(distanceDict.keys())[0]])
344 elif (n.is_leaf() and not n.name == "Reference"):
345 #not reference branches, populate with metadata
346 index = 0
347 mData = metadata[n.name.replace(".fa","")]
348
349 # pushing in sensitive data
350 for sensitive_data_column in sensitive_meta_data.get_columns():
351 sens_col_val = sensitive_meta_data.get_value(bcid=mData.ID, column_name=sensitive_data_column )
352 n.add_face(addFace(sens_col_val), index, "aligned")
353 index = index + 1
354
355 n.add_face(addFace(mData.ID), index, "aligned")
356 index = index + 1
357 if (metadata[n.name.replace(".fa","")].new == True): #new column
358 face = e.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
359 face.border.margin = 5
360 face.margin_right = 5
361 face.margin_left = 5
362 face.vt_align = 1
363 face.ht_align = 1
364 n.add_face(face, index, "aligned")
365 index = index + 1
366 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
367 if (n.name.replace(".fa","") in plasmidIncs[incs]):
368 face = e.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
369 face.border.margin = 5
370 face.margin_right = 5
371 face.margin_left = 5
372 face.vt_align = 1
373 face.ht_align = 1
374 n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned")
375 index = index + len(plasmidIncs)
376 n.add_face(addFace(mData.MLSTSpecies), index, "aligned")
377 index = index + 1
378 n.add_face(addFace(mData.SequenceType), index, "aligned")
379 index = index + 1
380 n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
381 index = index + 1
382 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
383 n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")
384
385 t.render( outputFile, w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml
386
387 #endregion
388 #endregion
389
390
391 start = time.time()#time the analysis
392
393 #analysis time
394 Main()
395
396 end = time.time()
397 print("Finished!\nThe analysis used: " + str(end-start) + " seconds")