Mercurial > repos > jjjjia > cpo_prediction
comparison cpo_galaxy_prediction.py @ 1:fea89c4d5227 draft
Uploaded
author | jjjjia |
---|---|
date | Thu, 16 Aug 2018 19:27:05 -0400 |
parents | |
children | 29302ffdf137 |
comparison
equal
deleted
inserted
replaced
0:917a05a03ac9 | 1:fea89c4d5227 |
---|---|
1 #!/home/jjjjia/.conda/envs/py36/bin/python | |
2 | |
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python | |
4 #$ -V # Pass environment variables to the job | |
5 #$ -N CPO_pipeline # Replace with a more specific job name | |
6 #$ -wd /home/jjjjia/testCases # Use the current working dir | |
7 #$ -pe smp 8 # Parallel Environment (how many cores) | |
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core* | |
9 #$ -e ./logs/$JOB_ID.err | |
10 #$ -o ./logs/$JOB_ID.log | |
11 #$ -m ea | |
12 #$ -M bja20@sfu.ca | |
13 | |
14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" | |
15 | |
16 import subprocess | |
17 import pandas | |
18 import optparse | |
19 import os | |
20 import datetime | |
21 import sys | |
22 import time | |
23 import urllib.request | |
24 import gzip | |
25 import collections | |
26 import json | |
27 import numpy | |
28 | |
29 | |
30 debug = True #debug skips the shell scripts and also dump out a ton of debugging messages | |
31 | |
32 if not debug: | |
33 #parses some parameters | |
34 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") | |
35 #required | |
36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate") | |
37 parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta") | |
38 parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database") | |
39 parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder") | |
40 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") | |
41 | |
42 #optionals | |
43 parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder") | |
44 parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database") | |
45 parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update") | |
46 parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST") | |
47 #used for parsing | |
48 parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") | |
49 | |
50 | |
51 #parallelization, useless, these are hard coded to 8cores/64G RAM | |
52 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") | |
53 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") | |
54 | |
55 (options,args) = parser.parse_args() | |
56 #if len(args) != 8: | |
57 #parser.error("incorrect number of arguments, all 7 is required") | |
58 curDir = os.getcwd() | |
59 outputDir = options.output | |
60 expectedSpecies = options.expectedSpecies | |
61 mlstScheme = options.mlst | |
62 tempDir = outputDir + "/shovillTemp" | |
63 scriptDir = options.scriptDir | |
64 updateAbName = options.updateAbName | |
65 updateAbPath = options.updateAbPath | |
66 updateMLST = options.updateMLST | |
67 cardDB=options.cardDB | |
68 assemblyPath=options.assemblyPath | |
69 ID = options.id | |
70 else: | |
71 manifestDir = "" | |
72 curDir = os.getcwd() | |
73 outputDir = "pipelineTest" | |
74 expectedSpecies = "Escherichia coli" | |
75 #threads = 8 | |
76 #memory = 64 | |
77 mlstScheme = outputDir + "/scheme_species_map.tab" | |
78 tempDir= outputDir + "/shovillTemp" | |
79 scriptDir = "~/scripts" | |
80 updateAbName = "cpo" | |
81 updateAbPath = "~/database/bccdcCPO.seq" | |
82 updateMLST = True | |
83 assemblyPath = "./" | |
84 cardDB = "./" | |
85 ID = "BC11-Kpn005_S2" | |
86 | |
87 #region result objects | |
88 #define some objects to store values from results | |
89 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). | |
90 class starFinders(object): | |
91 def __init__(self): | |
92 self.file = "" | |
93 self.sequence = "" | |
94 self.start = 0 | |
95 self.end = 0 | |
96 self.gene = "" | |
97 self.shortGene = "" | |
98 self.coverage = "" | |
99 self.coverage_map = "" | |
100 self.gaps = "" | |
101 self.pCoverage = 100.00 | |
102 self.pIdentity = 100.00 | |
103 self.database = "" | |
104 self.accession = "" | |
105 self.product = "" | |
106 self.source = "chromosome" | |
107 self.row = "" | |
108 | |
109 class PlasFlowResult(object): | |
110 def __init__(self): | |
111 self.sequence = "" | |
112 self.length = 0 | |
113 self.label = "" | |
114 self.confidence = 0 | |
115 self.usefulRow = "" | |
116 self.row = "" | |
117 | |
118 class MlstResult(object): | |
119 def __init__(self): | |
120 self.file = "" | |
121 self.speciesID = "" | |
122 self.seqType = 0 | |
123 self.scheme = "" | |
124 self.species = "" | |
125 self.row="" | |
126 | |
127 class mobsuiteResult(object): | |
128 def __init__(self): | |
129 self.file_id = "" | |
130 self.cluster_id = "" | |
131 self.contig_id = "" | |
132 self.contig_num = 0 | |
133 self.contig_length = 0 | |
134 self.circularity_status = "" | |
135 self.rep_type = "" | |
136 self.rep_type_accession = "" | |
137 self.relaxase_type = "" | |
138 self.relaxase_type_accession = "" | |
139 self.mash_nearest_neighbor = "" | |
140 self.mash_neighbor_distance = 0.00 | |
141 self.repetitive_dna_id = "" | |
142 self.match_type = "" | |
143 self.score = 0 | |
144 self.contig_match_start = 0 | |
145 self.contig_match_end = 0 | |
146 self.row = "" | |
147 | |
148 class mobsuitePlasmids(object): | |
149 def __init__(self): | |
150 self.file_id = "" | |
151 self.num_contigs = 0 | |
152 self.total_length = 0 | |
153 self.gc = "" | |
154 self.rep_types = "" | |
155 self.rep_typeAccession = "" | |
156 self.relaxase_type= "" | |
157 self.relaxase_type_accession = "" | |
158 self.mpf_type = "" | |
159 self.mpf_type_accession= "" | |
160 self.orit_type = "" | |
161 self.orit_accession = "" | |
162 self.PredictedMobility = "" | |
163 self.mash_nearest_neighbor = "" | |
164 self.mash_neighbor_distance = 0.00 | |
165 self.mash_neighbor_cluster= 0 | |
166 self.row = "" | |
167 class RGIResult(object): | |
168 def __init__(self): | |
169 self.ORF_ID = "" | |
170 self.Contig = "" | |
171 self.Start = -1 | |
172 self.Stop = -1 | |
173 self.Orientation = "" | |
174 self.Cut_Off = "" | |
175 self.Pass_Bitscore = 100000 | |
176 self.Best_Hit_Bitscore = 0.00 | |
177 self.Best_Hit_ARO = "" | |
178 self.Best_Identities = 0.00 | |
179 self.ARO = 0 | |
180 self.Model_type = "" | |
181 self.SNPs_in_Best_Hit_ARO = "" | |
182 self.Other_SNPs = "" | |
183 self.Drug_Class = "" | |
184 self.Resistance_Mechanism = "" | |
185 self.AMR_Gene_Family = "" | |
186 self.Predicted_DNA = "" | |
187 self.Predicted_Protein = "" | |
188 self.CARD_Protein_Sequence = "" | |
189 self.Percentage_Length_of_Reference_Sequence = 0.00 | |
190 self.ID = "" | |
191 self.Model_ID = 0 | |
192 self.source = "" | |
193 self.row = "" | |
194 | |
195 #endregion | |
196 | |
197 #region useful functions | |
198 def read(path): | |
199 return [line.rstrip('\n') for line in open(path)] | |
200 def execute(command): | |
201 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
202 | |
203 # Poll process for new output until finished | |
204 while True: | |
205 nextline = process.stdout.readline() | |
206 if nextline == '' and process.poll() is not None: | |
207 break | |
208 sys.stdout.write(nextline) | |
209 sys.stdout.flush() | |
210 | |
211 output = process.communicate()[0] | |
212 exitCode = process.returncode | |
213 | |
214 if (exitCode == 0): | |
215 return output | |
216 else: | |
217 raise subprocess.CalledProcessError(exitCode, command) | |
218 def httpGetFile(url, filepath=""): | |
219 if (filepath == ""): | |
220 return urllib.request.urlretrieve(url) | |
221 else: | |
222 urllib.request.urlretrieve(url, filepath) | |
223 return True | |
224 def gunzip(inputpath="", outputpath=""): | |
225 if (outputpath == ""): | |
226 with gzip.open(inputpath, 'rb') as f: | |
227 gzContent = f.read() | |
228 return gzContent | |
229 else: | |
230 with gzip.open(inputpath, 'rb') as f: | |
231 gzContent = f.read() | |
232 with open(outputpath, 'wb') as out: | |
233 out.write(gzContent) | |
234 return True | |
235 def ToJson(dictObject, outputPath): | |
236 outDir = outputDir + '/summary/' + ID + ".json/" | |
237 if not (os.path.exists(outDir)): | |
238 os.makedirs(outDir) | |
239 with open(outDir + outputPath, 'w') as f: | |
240 json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) | |
241 #endregion | |
242 | |
243 #region functions to parse result files | |
244 def ParseMLSTResult(pathToMLSTResult): | |
245 _mlstResult = {} | |
246 scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0) | |
247 scheme = scheme.replace(numpy.nan, '', regex=True) | |
248 | |
249 taxon = {} | |
250 #record the scheme as a dictionary | |
251 taxon["-"] = "No MLST Match" | |
252 for i in range(len(scheme.index)): | |
253 key = scheme.iloc[i,0] | |
254 if (str(scheme.iloc[i,2]) == "nan"): | |
255 value = str(scheme.iloc[i,1]) | |
256 else: | |
257 value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2]) | |
258 | |
259 if (key in taxon.keys()): | |
260 taxon[key] = taxon.get(key) + ";" + value | |
261 else: | |
262 taxon[key] = value | |
263 #read in the mlst result | |
264 mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None) | |
265 _mlstHit = MlstResult() | |
266 | |
267 _mlstHit.file = mlst.iloc[0,0] | |
268 _mlstHit.speciesID = (mlst.iloc[0,1]) | |
269 _mlstHit.seqType = str(mlst.iloc[0,2]) | |
270 for i in range(3, len(mlst.columns)): | |
271 _mlstHit.scheme += mlst.iloc[0,i] + ";" | |
272 _mlstHit.species = taxon[_mlstHit.speciesID] | |
273 _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist()) | |
274 _mlstResult[_mlstHit.speciesID]=_mlstHit | |
275 | |
276 return _mlstResult | |
277 | |
278 def ParsePlasmidFinderResult(pathToPlasmidFinderResult): | |
279 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918 | |
280 #example resfinder: | |
281 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181 | |
282 | |
283 _pFinder = {} #*********************** | |
284 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0) | |
285 plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True) | |
286 | |
287 | |
288 for i in range(len(plasmidFinder.index)): | |
289 pf = starFinders() | |
290 pf.file = str(plasmidFinder.iloc[i,0]) | |
291 pf.sequence = str(plasmidFinder.iloc[i,1]) | |
292 pf.start = int(plasmidFinder.iloc[i,2]) | |
293 pf.end = int(plasmidFinder.iloc[i,3]) | |
294 pf.gene = str(plasmidFinder.iloc[i,4]) | |
295 pf.shortGene = pf.gene[:pf.gene.index("_")] | |
296 pf.coverage = str(plasmidFinder.iloc[i,5]) | |
297 pf.coverage_map = str(plasmidFinder.iloc[i,6]) | |
298 pf.gaps = str(plasmidFinder.iloc[i,7]) | |
299 pf.pCoverage = float(plasmidFinder.iloc[i,8]) | |
300 pf.pIdentity = float(plasmidFinder.iloc[i,9]) | |
301 pf.database = str(plasmidFinder.iloc[i,10]) | |
302 pf.accession = str(plasmidFinder.iloc[i,11]) | |
303 pf.product = str(plasmidFinder.iloc[i,12]) | |
304 pf.source = "plasmid" | |
305 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
306 _pFinder[pf.gene]=pf | |
307 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
308 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) | |
309 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) | |
310 return _pFinder | |
311 | |
312 def ParseMobsuiteResult(pathToMobsuiteResult): | |
313 _mobsuite = {} | |
314 mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0) | |
315 mResult = mResult.replace(numpy.nan, '', regex=True) | |
316 | |
317 for i in range(len(mResult.index)): | |
318 mr = mobsuiteResult() | |
319 mr.file_id = str(mResult.iloc[i,0]) | |
320 mr.cluster_id = str(mResult.iloc[i,1]) | |
321 if (mr.cluster_id == "chromosome"): | |
322 break | |
323 mr.contig_id = str(mResult.iloc[i,2]) | |
324 mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")] | |
325 mr.contig_length = int(mResult.iloc[i,3]) | |
326 mr.circularity_status = str(mResult.iloc[i,4]) | |
327 mr.rep_type = str(mResult.iloc[i,5]) | |
328 mr.rep_type_accession = str(mResult.iloc[i,6]) | |
329 mr.relaxase_type = str(mResult.iloc[i,7]) | |
330 mr.relaxase_type_accession = str(mResult.iloc[i,8]) | |
331 mr.mash_nearest_neighbor = str(mResult.iloc[i,9]) | |
332 mr.mash_neighbor_distance = float(mResult.iloc[i,10]) | |
333 mr.repetitive_dna_id = str(mResult.iloc[i,11]) | |
334 mr.match_type = str(mResult.iloc[i,12]) | |
335 if (mr.match_type == ""): | |
336 mr.score = -1 | |
337 mr.contig_match_start = -1 | |
338 mr.contig_match_end = -1 | |
339 else: | |
340 mr.score = int(mResult.iloc[i,13]) | |
341 mr.contig_match_start = int(mResult.iloc[i,14]) | |
342 mr.contig_match_end = int(mResult.iloc[i,15]) | |
343 mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist()) | |
344 _mobsuite[mr.contig_id]=(mr) | |
345 return _mobsuite | |
346 | |
347 def ParseMobsuitePlasmids(pathToMobsuiteResult): | |
348 _mobsuite = {} | |
349 mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0) | |
350 mResults = mResults.replace(numpy.nan, '', regex=True) | |
351 | |
352 for i in range(len(mResults.index)): | |
353 mr = mobsuitePlasmids() | |
354 mr.file_id = str(mResults.iloc[i,0]) | |
355 mr.num_contigs = int(mResults.iloc[i,1]) | |
356 mr.total_length = int(mResults.iloc[i,2]) | |
357 mr.gc = int(mResults.iloc[i,3]) | |
358 mr.rep_types = str(mResults.iloc[i,4]) | |
359 mr.rep_typeAccession = str(mResults.iloc[i,5]) | |
360 mr.relaxase_type = str(mResults.iloc[i,6]) | |
361 mr.relaxase_type_accession = str(mResults.iloc[i,7]) | |
362 mr.mpf_type = str(mResults.iloc[i,8]) | |
363 mr.mpf_type_accession = str(mResults.iloc[i,9]) | |
364 mr.orit_type = str(mResults.iloc[i,10]) | |
365 mr.orit_accession = str(mResults.iloc[i,11]) | |
366 mr.PredictedMobility = str(mResults.iloc[i,12]) | |
367 mr.mash_nearest_neighbor = str(mResults.iloc[i,13]) | |
368 mr.mash_neighbor_distance = float(mResults.iloc[i,14]) | |
369 mr.mash_neighbor_cluster = int(mResults.iloc[i,15]) | |
370 mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist()) | |
371 _mobsuite[mr.file_id] = mr | |
372 return _mobsuite | |
373 | |
374 def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs): | |
375 _rFinder = {} | |
376 resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0) | |
377 resFinder = resFinder.replace(numpy.nan, '', regex=True) | |
378 | |
379 for i in range(len(resFinder.index)): | |
380 rf = starFinders() | |
381 rf.file = str(resFinder.iloc[i,0]) | |
382 rf.sequence = str(resFinder.iloc[i,1]) | |
383 rf.start = int(resFinder.iloc[i,2]) | |
384 rf.end = int(resFinder.iloc[i,3]) | |
385 rf.gene = str(resFinder.iloc[i,4]) | |
386 rf.shortGene = rf.gene | |
387 rf.coverage = str(resFinder.iloc[i,5]) | |
388 rf.coverage_map = str(resFinder.iloc[i,6]) | |
389 rf.gaps = str(resFinder.iloc[i,7]) | |
390 rf.pCoverage = float(resFinder.iloc[i,8]) | |
391 rf.pIdentity = float(resFinder.iloc[i,9]) | |
392 rf.database = str(resFinder.iloc[i,10]) | |
393 rf.accession = str(resFinder.iloc[i,11]) | |
394 rf.product = str(resFinder.iloc[i,12]) | |
395 rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist()) | |
396 if (rf.sequence[6:] in plasmidContigs): | |
397 rf.source = "plasmid" | |
398 elif (rf.sequence[6:] in likelyPlasmidContigs): | |
399 rf.source = "likely plasmid" | |
400 else: | |
401 rf.source = "likely chromosome" | |
402 _rFinder[rf.gene]=rf | |
403 return _rFinder | |
404 | |
405 def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs): | |
406 _rgiR = {} | |
407 RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0) | |
408 RGI = RGI.replace(numpy.nan, '', regex=True) | |
409 | |
410 for i in range(len(RGI.index)): | |
411 r = RGIResult() | |
412 r.ORF_ID = str(RGI.iloc[i,0]) | |
413 r.Contig = str(RGI.iloc[i,1]) | |
414 r.Contig_Num = r.Contig[6:r.Contig.find("_")] | |
415 r.Start = int(RGI.iloc[i,2]) | |
416 r.Stop = int(RGI.iloc[i,3]) | |
417 r.Orientation = str(RGI.iloc[i,4]) | |
418 r.Cut_Off = str(RGI.iloc[i,5]) | |
419 r.Pass_Bitscore = int(RGI.iloc[i,6]) | |
420 r.Best_Hit_Bitscore = float(RGI.iloc[i,7]) | |
421 r.Best_Hit_ARO = str(RGI.iloc[i,8]) | |
422 r.Best_Identities = float(RGI.iloc[i,9]) | |
423 r.ARO = int(RGI.iloc[i,10]) | |
424 r.Model_type = str(RGI.iloc[i,11]) | |
425 r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12]) | |
426 r.Other_SNPs = str(RGI.iloc[i,13]) | |
427 r.Drug_Class = str(RGI.iloc[i,14]) | |
428 r.Resistance_Mechanism = str(RGI.iloc[i,15]) | |
429 r.AMR_Gene_Family = str(RGI.iloc[i,16]) | |
430 r.Predicted_DNA = str(RGI.iloc[i,17]) | |
431 r.Predicted_Protein = str(RGI.iloc[i,18]) | |
432 r.CARD_Protein_Sequence = str(RGI.iloc[i,19]) | |
433 r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20]) | |
434 r.ID = str(RGI.iloc[i,21]) | |
435 r.Model_ID = int(RGI.iloc[i,22]) | |
436 r.row = "\t".join(str(x) for x in RGI.ix[i].tolist()) | |
437 if (r.Contig_Num in plasmidContigs): | |
438 r.source = "plasmid" | |
439 elif (r.Contig_Num in likelyPlasmidContigs): | |
440 r.source = "likely plasmid" | |
441 else: | |
442 r.source = "likely chromosome" | |
443 _rgiR[r.Model_ID]=r | |
444 return _rgiR | |
445 #endregion | |
446 | |
447 def Main(): | |
448 notes = [] | |
449 #init the output list | |
450 output = [] | |
451 jsonOutput = [] | |
452 | |
453 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) | |
454 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) | |
455 | |
456 #region update databases if update=true | |
457 if not debug: | |
458 #update databases if necessary | |
459 if not (updateAbPath == "" and updateAbName == "default"): | |
460 print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath) | |
461 cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName] | |
462 update = execute(cmd) | |
463 if (updateMLST.lower() == "true"): | |
464 print("updating mlst database... ") | |
465 cmd = [scriptDir + "/pipeline_updateMLST.sh"] | |
466 update = execute(cmd) | |
467 #endregion | |
468 | |
469 print("step 3: parsing the mlst results") | |
470 | |
471 print("performing MLST") | |
472 #region call the mlst script | |
473 if not debug: | |
474 print("running pipeline_prediction.sh") | |
475 #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json | |
476 cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB] | |
477 result = execute(cmd) | |
478 #endregion | |
479 | |
480 #region parse the mlst results | |
481 print("step 3: parsing mlst, plasmid, and amr results") | |
482 | |
483 print("identifying MLST") | |
484 pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst" | |
485 mlstHit = ParseMLSTResult(pathToMLSTScheme)#*********************** | |
486 ToJson(mlstHit, "mlst.json") #write it to a json output | |
487 mlstHit = list(mlstHit.values())[0] | |
488 | |
489 #endregion | |
490 | |
491 #region parse mobsuite, resfinder and rgi results | |
492 print("identifying plasmid contigs and amr genes") | |
493 | |
494 plasmidContigs = [] | |
495 likelyPlasmidContigs = [] | |
496 origins = [] | |
497 | |
498 #parse mobsuite results | |
499 mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* | |
500 ToJson(mSuite, "mobsuite.json") #************* | |
501 mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* | |
502 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #************* | |
503 | |
504 for key in mSuite: | |
505 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs: | |
506 if not (mSuite[key].rep_type == ''): | |
507 plasmidContigs.append(mSuite[key].contig_num) | |
508 else: | |
509 likelyPlasmidContigs.append(mSuite[key].contig_num) | |
510 for key in mSuite: | |
511 if mSuite[key].rep_type not in origins: | |
512 origins.append(mSuite[key].rep_type) | |
513 | |
514 #parse resfinder AMR results | |
515 rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** | |
516 ToJson(rFinder, "resfinder.json") #************* | |
517 | |
518 rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** | |
519 ToJson(rgiAMR, "rgi.json") #************* | |
520 | |
521 carbapenamases = [] | |
522 amrGenes = [] | |
523 for keys in rFinder: | |
524 carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")") | |
525 for keys in rgiAMR: | |
526 if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1): | |
527 if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases): | |
528 carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")") | |
529 else: | |
530 if (rgiAMR[keys].Best_Hit_ARO not in amrGenes): | |
531 amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")") | |
532 #endregion | |
533 | |
534 #region output parsed mlst information | |
535 print("formatting mlst outputs") | |
536 output.append("\n\n\n~~~~~~~MLST summary~~~~~~~") | |
537 output.append("MLST determined species: " + mlstHit.species) | |
538 output.append("\nMLST Details: ") | |
539 output.append(mlstHit.row) | |
540 | |
541 output.append("\nMLST information: ") | |
542 if (mlstHit.species == expectedSpecies): | |
543 output.append("MLST determined species is the same as expected species") | |
544 #notes.append("MLST determined species is the same as expected species") | |
545 else: | |
546 output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?") | |
547 notes.append("MLST: Not expected species. Possible contamination or mislabeling") | |
548 | |
549 #endregion | |
550 | |
551 #region output the parsed plasmid/amr results | |
552 output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n") | |
553 | |
554 output.append("predicted plasmid origins: ") | |
555 output.append(";".join(origins)) | |
556 | |
557 output.append("\ndefinitely plasmid contigs") | |
558 output.append(";".join(plasmidContigs)) | |
559 | |
560 output.append("\nlikely plasmid contigs") | |
561 output.append(";".join(likelyPlasmidContigs)) | |
562 | |
563 output.append("\nmob-suite prediction details: ") | |
564 for key in mSuite: | |
565 output.append(mSuite[key].row) | |
566 | |
567 output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n") | |
568 output.append("predicted carbapenamase Genes: ") | |
569 output.append(",".join(carbapenamases)) | |
570 output.append("other RGI AMR Genes: ") | |
571 for key in rgiAMR: | |
572 output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")") | |
573 | |
574 output.append("\nDetails about the carbapenamase Genes: ") | |
575 for key in rFinder: | |
576 output.append(rFinder[key].row) | |
577 output.append("\nDetails about the RGI AMR Genes: ") | |
578 for key in rgiAMR: | |
579 output.append(rgiAMR[key].row) | |
580 | |
581 #write summary to a file | |
582 summaryDir = outputDir + "/summary/" + ID | |
583 out = open(summaryDir + ".txt", 'w') | |
584 for item in output: | |
585 out.write("%s\n" % item) | |
586 | |
587 | |
588 #TSV output | |
589 tsvOut = [] | |
590 tsvOut.append("ID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") | |
591 #start with ID | |
592 temp = "" | |
593 temp += (ID + "\t") | |
594 temp += expectedSpecies + "\t" | |
595 | |
596 #move into MLST | |
597 temp += mlstHit.species + "\t" | |
598 temp += str(mlstHit.seqType) + "\t" | |
599 temp += mlstHit.scheme + "\t" | |
600 | |
601 #now onto AMR genes | |
602 temp += ";".join(carbapenamases) + "\t" | |
603 temp += ";".join(amrGenes) + "\t" | |
604 | |
605 #lastly plasmids | |
606 temp+= str(len(mSuitePlasmids)) + "\t" | |
607 plasmidID = "" | |
608 contigs = "" | |
609 lengths = "" | |
610 rep_type = "" | |
611 mobility = "" | |
612 neighbour = "" | |
613 for keys in mSuitePlasmids: | |
614 plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";" | |
615 contigs += str(mSuitePlasmids[keys].num_contigs) + ";" | |
616 lengths += str(mSuitePlasmids[keys].total_length) + ";" | |
617 rep_type += str(mSuitePlasmids[keys].rep_types) + ";" | |
618 mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";" | |
619 neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";" | |
620 temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t" | |
621 temp += ";".join(plasmidContigs) + "\t" | |
622 temp += ";".join(likelyPlasmidContigs) | |
623 tsvOut.append(temp) | |
624 | |
625 summaryDir = outputDir + "/summary/" + ID | |
626 out = open(summaryDir + ".tsv", 'w') | |
627 for item in tsvOut: | |
628 out.write("%s\n" % item) | |
629 #endregion | |
630 | |
631 | |
632 start = time.time()#time the analysis | |
633 print("Starting workflow...") | |
634 #analysis time | |
635 Main() | |
636 | |
637 end = time.time() | |
638 print("Finished!\nThe analysis used: " + str(end-start) + " seconds") |