1
|
1 #!/home/jjjjia/.conda/envs/py36/bin/python
|
|
2
|
|
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
|
|
4 #$ -V # Pass environment variables to the job
|
|
5 #$ -N CPO_pipeline # Replace with a more specific job name
|
|
6 #$ -wd /home/jjjjia/testCases # Use the current working dir
|
7
|
7 #$ -pe smp 1 # Parallel Environment (how many cores)
|
1
|
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
|
|
9 #$ -e ./logs/$JOB_ID.err
|
|
10 #$ -o ./logs/$JOB_ID.log
|
|
11 #$ -m ea
|
|
12 #$ -M bja20@sfu.ca
|
|
13
|
7
|
14 # >python cpo_galaxy_tree.py -t /path/to/tree.ph -d /path/to/distance/matrix -m /path/to/metadata
|
|
15
|
|
16 # <requirements>
|
|
17 # <requirement type="package" version="0.23.4">pandas</requirement>
|
|
18 # <requirement type="package" version="3.6">python</requirement>
|
|
19 # <requirement type="package" version="3.1.1">ete3</requirement>
|
8
|
20 # <requirement type="package" version="5.6.0">pyqt</requirement>
|
12
|
21 # <requirement type="package" version="5.6.2">qt</requirement>
|
7
|
22 # </requirements>
|
1
|
23
|
|
24 import subprocess
|
7
|
25 import pandas #conda pandas
|
1
|
26 import optparse
|
|
27 import os
|
27
|
28 os.environ['QT_QPA_PLATFORM']='offscreen'
|
1
|
29 import datetime
|
|
30 import sys
|
|
31 import time
|
|
32 import urllib.request
|
|
33 import gzip
|
|
34 import collections
|
|
35 import json
|
7
|
36 import numpy #conda numpy
|
|
37 import ete3 as e #conda ete3 3.1.1**** >requires pyqt5
|
6
|
38
|
1
|
39
|
|
# parse the command-line parameters
parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="absolute file path to phylip tree")
parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/dist.tabular", help="absolute file path to distance matrix")
parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tabular",help="absolute file path to metadata file")
parser.add_option("-o", "--output_file", dest="outputFile", type="string", default="tree.png", help="Output graphics file. Use ending 'png', 'pdf' or 'svg' to specify file format.")

# sensitive data adder
parser.add_option("-p", "--sensitive_data", dest="sensitivePath", type="string", default="", help="Spreadsheet (CSV) with sensitive metadata")
parser.add_option("-c", "--sensitive_cols", dest="sensitiveCols", type="string", default="", help="CSV list of column names from sensitive metadata spreadsheet to use as labels on dendrogram")
parser.add_option("-b", "--bcid_column", dest="bcidCol", type="string", default="BCID", help="Column name of BCID in sensitive metadata file")
parser.add_option("-n", "--missing_value", dest="naValue", type="string", default="NA", help="Value to write for missing data.")

(options, args) = parser.parse_args()

# Normalize every option by trimming surrounding whitespace.
# .strip() is equivalent to the original .lstrip().rstrip(); the double
# str(str(...)) wrapping on bcidCol/naValue was redundant and is dropped.
treePath = str(options.treePath).strip()
distancePath = str(options.distancePath).strip()
metadataPath = str(options.metadataPath).strip()

sensitivePath = str(options.sensitivePath).strip()
sensitiveCols = str(options.sensitiveCols).strip()
outputFile = str(options.outputFile).strip()
bcidCol = str(options.bcidCol).strip()
naValue = str(options.naValue).strip()
|
|
63
|
1
|
64
|
|
65 #region result objects
|
|
66 #define some objects to store values from results
|
|
67 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
|
18
|
68
|
|
class SensitiveMetadata(object):
    """Loads the sensitive-metadata CSV (module-level ``sensitivePath``) and
    exposes per-BCID value lookups for the columns named in the module-level
    ``sensitiveCols`` list (the ``bcidCol`` column is always retained).
    """

    def __init__(self):
        # Read the whole spreadsheet, then keep only the requested columns.
        x = pandas.read_csv(sensitivePath)
        col_names = sensitiveCols.split(',')
        if bcidCol not in col_names:  # BCID column is required for lookups
            col_names.append(bcidCol)
        all_cols = [str(col) for col in x.columns]
        col_idxs = [all_cols.index(col) for col in col_names]
        self.sensitive_data = x.iloc[:, col_idxs]

    def get_columns(self):
        """Return the retained column names as plain strings."""
        return [str(x) for x in self.sensitive_data.columns]

    def get_value(self, bcid, column_name):
        """Return the ``column_name`` value for the row whose BCID equals
        ``bcid``, or the module-level ``naValue`` if the BCID is absent.
        """
        # might be nice to get them all in a single call via an input list of bcids ... for later
        bcids = list(self.sensitive_data.loc[:, bcidCol])  # all BCIDs in the sheet
        if bcid not in bcids:
            return naValue
        row_idx = bcids.index(bcid)  # lookup the row for this BCID
        return self.sensitive_data.loc[row_idx, column_name]
|
|
88
|
1
|
class workflowResult(object):
    """One row of the CPO workflow summary table, with placeholder defaults
    ("?" for strings, -1 for counts) used when no metadata row exists."""

    # fields whose unknown/default state is the "?" placeholder
    _PLACEHOLDER_FIELDS = (
        "ID", "ExpectedSpecies", "MLSTSpecies", "SequenceType", "MLSTScheme",
        "CarbapenemResistanceGenes", "plasmidBestMatch", "plasmidsharedhashes",
        "OtherAMRGenes", "DefinitelyPlasmidContigs", "LikelyPlasmidContigs",
    )

    def __init__(self):
        self.new = False                 # sample flagged as "new" in the sheet
        for field in self._PLACEHOLDER_FIELDS:
            setattr(self, field, "?")
        self.plasmididentity = -1        # sentinel: identity not parsed yet
        self.TotalPlasmids = -1          # sentinel: plasmid count not parsed yet
        self.plasmids = []               # list of plasmidObj, one per plasmid
        self.row = ""                    # the raw tab-joined source row
|
|
class plasmidObj(object):
    """A single plasmid record parsed from one semicolon-separated field set
    of the workflow summary table."""

    def __init__(self):
        # numeric fields default to 0
        self.PlasmidsID, self.Num_Contigs, self.PlasmidLength = 0, 0, 0
        # descriptive fields default to the empty string
        self.PlasmidRepType, self.PlasmidMobility, self.NearestReference = "", "", ""
|
|
115
|
|
116 #endregion
|
|
117
|
|
118 #region useful functions
|
7
|
def read(path):
    """Read a text file and return its lines with trailing newlines stripped.

    BUG FIX: the original iterated ``open(path)`` without closing it, leaking
    the file handle; a ``with`` block now guarantees closure.
    """
    with open(path) as f:
        return [line.rstrip('\n') for line in f]
|
7
|
def execute(command):
    """Run *command* (a list, shell=False) via subprocess.Popen, streaming its
    combined stdout/stderr to this process's stdout as it is produced.

    Returns whatever remains unread from the pipe at exit (normally "").
    Raises subprocess.CalledProcessError on a non-zero exit code.

    BUG FIX: the original passed ``cwd=curDir``, but ``curDir`` is never
    defined anywhere in this script, so any call raised NameError. The
    current working directory is used instead.
    """
    process = subprocess.Popen(command, shell=False, cwd=os.getcwd(),
                               universal_newlines=True,
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    # Poll process for new output until finished
    while True:
        nextline = process.stdout.readline()
        if nextline == '' and process.poll() is not None:
            break
        sys.stdout.write(nextline)
        sys.stdout.flush()

    output = process.communicate()[0]
    exitCode = process.returncode

    if exitCode == 0:
        return output
    else:
        raise subprocess.CalledProcessError(exitCode, command)
|
7
|
def httpGetFile(url, filepath=""):
    """Download *url*; with no *filepath* return urlretrieve's result tuple,
    otherwise save to *filepath* and return True."""
    if filepath:
        urllib.request.urlretrieve(url, filepath)
        return True
    return urllib.request.urlretrieve(url)
|
7
|
def gunzip(inputpath="", outputpath=""):
    """Decompress the gzip file at *inputpath*.

    With no *outputpath*, return the decompressed bytes; otherwise write them
    to *outputpath* and return True.

    The original duplicated the decompression read in both branches; it is
    now performed once up front (behavior unchanged).
    """
    with gzip.open(inputpath, 'rb') as f:
        gzContent = f.read()
    if outputpath == "":
        return gzContent
    with open(outputpath, 'wb') as out:
        out.write(gzContent)
    return True
|
7
|
def addFace(name):
    """Build a uniformly styled ete3 text facet for the dendrogram columns."""
    txt = e.faces.TextFace(name, fsize=10, tight_text=True)
    txt.border.margin = 5
    # identical 5px padding on both sides
    for side in ("margin_right", "margin_left"):
        setattr(txt, side, 5)
    return txt
|
1
|
163 #endregion
|
|
164
|
|
165 #region functions to parse result files
|
|
def ParseWorkflowResults(pathToResult):
    """Parse the tab-separated CPO workflow summary into workflowResult objects.

    Returns a dict keyed by sample ID, with an extra "na" placeholder entry
    used for tree leaves that have no metadata row.

    Fixes over the original: the misspelled local ``_worflowResult`` is
    renamed, and the removed ``DataFrame.ix`` accessor is replaced by the
    positional equivalent ``iloc``.
    """
    _workflowResults = {}
    r = pandas.read_csv(pathToResult, delimiter='\t', header=0)
    r = r.replace(numpy.nan, '', regex=True)  # blank out missing cells
    _naResult = workflowResult()
    _workflowResults["na"] = _naResult
    for i in range(len(r.index)):
        _results = workflowResult()
        # the 'new' column literally contains the word "new" for new samples
        _results.new = (str(r.loc[r.index[i], 'new']).lower() == "new")
        _results.ID = str(r.loc[r.index[i], 'ID']).replace(".fa", "")
        _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species'])
        _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species'])
        _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type'])
        _results.MLSTScheme = str(r.loc[r.index[i], 'MLST Scheme'])
        _results.CarbapenemResistanceGenes = str(r.loc[r.index[i], 'Carbapenem Resistance Genes'])
        _results.OtherAMRGenes = str(r.loc[r.index[i], 'Other AMR Genes'])
        _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
        _results.plasmidBestMatch = str(r.loc[r.index[i], 'Plasmid Best Match'])
        _results.plasmididentity = str(r.loc[r.index[i], 'Plasmid Identity'])
        _results.plasmidsharedhashes = str(r.loc[r.index[i], 'Plasmid Shared Hash'])
        # per-plasmid fields are ";"-separated lists, one element per plasmid
        for j in range(0, _results.TotalPlasmids):
            _plasmid = plasmidObj()
            _plasmid.PlasmidsID = str(r.loc[r.index[i], 'Plasmids ID']).split(";")[j]
            _plasmid.Num_Contigs = str(r.loc[r.index[i], 'Num_Contigs']).split(";")[j]
            _plasmid.PlasmidLength = str(r.loc[r.index[i], 'Plasmid Length']).split(";")[j]
            _plasmid.PlasmidRepType = str(r.loc[r.index[i], 'Plasmid RepType']).split(";")[j]
            _plasmid.PlasmidMobility = str(r.loc[r.index[i], 'Plasmid Mobility']).split(";")[j]
            _plasmid.NearestReference = str(r.loc[r.index[i], 'Nearest Reference']).split(";")[j]
            _results.plasmids.append(_plasmid)
        _results.DefinitelyPlasmidContigs = str(r.loc[r.index[i], 'Definitely Plasmid Contigs'])
        _results.LikelyPlasmidContigs = str(r.loc[r.index[i], 'Likely Plasmid Contigs'])
        # BUG FIX: DataFrame.ix was deprecated in pandas 0.20 and later removed;
        # iloc is the positional equivalent for an integer row offset
        _results.row = "\t".join(str(x) for x in r.iloc[i].tolist())
        _workflowResults[_results.ID] = _results
    return _workflowResults
|
|
203
|
|
204 #endregion
|
|
205
|
|
def Main():
    """Render the annotated dendrogram.

    Reads the phylip tree, distance matrix and workflow metadata named by the
    module-level paths, decorates every leaf with aligned metadata columns
    (optionally prefixed with sensitive-metadata columns), and writes the
    image to ``outputFile``.
    """
    if len(sensitivePath) > 0:
        sensitive_meta_data = SensitiveMetadata()

    metadata = ParseWorkflowResults(metadataPath)
    distance = read(distancePath)
    treeFile = "".join(read(treePath))  # original re-read this file twice; once suffices

    distanceDict = {}  # store the distance matrix as rowname:list<string>
    for i in range(len(distance)):
        temp = distance[i].split("\t")
        distanceDict[temp[0]] = temp[1:]

    # region create box tree
    # region step5: tree construction
    t = e.Tree(treeFile)
    t.set_outgroup(t&"Reference")

    # set the tree style
    ts = e.TreeStyle()
    ts.show_leaf_name = True
    ts.show_branch_length = True
    ts.scale = 2000  # pixel per branch length unit
    ts.branch_vertical_margin = 15  # pixel between branches
    style2 = e.NodeStyle()
    style2["fgcolor"] = "#000000"
    style2["shape"] = "circle"
    style2["vt_line_color"] = "#0000aa"
    style2["hz_line_color"] = "#0000aa"
    style2["vt_line_width"] = 2
    style2["hz_line_width"] = 2
    style2["vt_line_type"] = 0  # 0 solid, 1 dashed, 2 dotted
    style2["hz_line_type"] = 0
    for n in t.traverse():
        n.set_style(style2)

    # find the plasmid origins: inc-family -> list of sample IDs carrying it
    plasmidIncs = {}
    for key in metadata:
        for plasmid in metadata[key].plasmids:
            for inc in plasmid.PlasmidRepType.split(","):
                if inc.lower().find("inc") > -1:
                    if inc not in plasmidIncs:
                        plasmidIncs[inc] = [metadata[key].ID]
                    elif metadata[key].ID not in plasmidIncs[inc]:
                        plasmidIncs[inc].append(metadata[key].ID)

    for n in t.traverse():  # loop through the nodes of the tree
        if n.is_leaf() and n.name == "Reference":
            # reference branch: populate the aligned faces with column headers
            index = 0

            if len(sensitivePath) > 0:  # sensitive metadata @ chris
                for sensitive_data_column in sensitive_meta_data.get_columns():
                    (t&"Reference").add_face(addFace(sensitive_data_column), index, "aligned")
                    index = index + 1

            (t&"Reference").add_face(addFace("SampleID"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("New?"), index, "aligned")
            index = index + 1
            for i in range(len(plasmidIncs)):  # adds the inc columns to the reference node
                (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
            index = index + len(plasmidIncs)
            (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("Plasmid Best Match"), index, "aligned")
            index = index + 1
            (t&"Reference").add_face(addFace("Best Match Identity"), index, "aligned")
            index = index + 1
            for i in range(len(distanceDict[list(distanceDict.keys())[0]])):  # adds the distance matrix headers
                (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
            index = index + len(distanceDict[list(distanceDict.keys())[0]])
        elif n.is_leaf() and not n.name == "Reference":
            # sample branches: populate with metadata
            index = 0

            # BUG FIX: resolve this sample's metadata row BEFORE the
            # sensitive-data loop below. The original referenced mData inside
            # that loop before assigning it, so the sensitive labels came from
            # the previously visited node (or raised NameError on the first).
            if n.name.replace(".fa", "") in metadata.keys():
                mData = metadata[n.name.replace(".fa", "")]
            else:
                mData = metadata["na"]

            if len(sensitivePath) > 0:  # sensitive metadata @ chris
                # pushing in sensitive data
                for sensitive_data_column in sensitive_meta_data.get_columns():
                    # tree uses bcids like BC18A021A_S12 while sensitive
                    # metadata uses BC18A021A: trim the "_S.*" if present
                    bcid = str(mData.ID)
                    if bcid.find("_S") != -1:
                        bcid = bcid[0:bcid.find("_S")]
                    sens_col_val = sensitive_meta_data.get_value(bcid=bcid, column_name=sensitive_data_column)
                    n.add_face(addFace(sens_col_val), index, "aligned")
                    index = index + 1

            n.add_face(addFace(mData.ID), index, "aligned")
            index = index + 1
            if mData.new == True:  # "new" column: green box when the sample is new
                face = e.RectFace(30, 30, "green", "green")  # TextFace("Y",fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 5
                face.margin_left = 5
                face.vt_align = 1
                face.ht_align = 1
                n.add_face(face, index, "aligned")
            index = index + 1
            for incs in plasmidIncs:  # adds inc presence/absence to the sample nodes
                if n.name.replace(".fa", "") in plasmidIncs[incs]:
                    face = e.RectFace(30, 30, "black", "black")  # TextFace("Y",fsize=10,tight_text=True)
                    face.border.margin = 5
                    face.margin_right = 5
                    face.margin_left = 5
                    face.vt_align = 1
                    face.ht_align = 1
                    n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned")
            index = index + len(plasmidIncs)
            n.add_face(addFace(mData.MLSTSpecies), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.SequenceType), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.plasmidBestMatch), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData.plasmididentity), index, "aligned")
            index = index + 1
            for i in range(len(distanceDict[list(distanceDict.keys())[0]])):  # adds the distance matrix row
                if n.name in distanceDict:  # make sure the column is in the distance matrix
                    n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")

    t.render(outputFile, w=5000, units="mm", tree_style=ts)  # save it as a png, pdf, svg or a phyloxml
    # endregion
    # endregion
|
1
|
342
|
|
343 #endregion
|
|
344 #endregion
|
|
345
|
|
346
|
|
# ---- script entry: run the analysis and report the wall-clock time ----
start = time.time()

Main()

end = time.time()
print(f"Finished!\nThe analysis used: {end - start} seconds")