comparison galaxy_prediction.py @ 0:917a05a03ac9 draft

Uploaded
author jjjjia
date Tue, 14 Aug 2018 17:18:49 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:917a05a03ac9
1 #!/home/jjjjia/.conda/envs/py36/bin/python
2
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
4 #$ -V # Pass environment variables to the job
5 #$ -N CPO_pipeline # Replace with a more specific job name
6 #$ -wd /home/jjjjia/testCases # Use the current working dir
7 #$ -pe smp 8 # Parallel Environment (how many cores)
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
9 #$ -e ./logs/$JOB_ID.err
10 #$ -o ./logs/$JOB_ID.log
11 #$ -m ea
12 #$ -M bja20@sfu.ca
13
14 #./prediction.py -i ~/testCases/cpoResults/contigs/BC11-Kpn005_S2.fa -m ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.mlst -c ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/contig_report.txt -f ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.cp -r ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.rgi.txt -e "Klebsiella"
15 import subprocess
16 import pandas
17 import optparse
18 import os
19 import datetime
20 import sys
21 import time
22 import urllib.request
23 import gzip
24 import collections
25 import json
26 import numpy
27
28
29 debug = False #debug skips the shell scripts and also dump out a ton of debugging messages
30
31 if not debug:
32 #parses some parameters
33 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
34 #required
35 #MLSTHIT, mobsuite, resfinder, rgi, mlstscheme
36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate")
37 parser.add_option("-m", "--mlst", dest="mlst", type="string", help="absolute file path to mlst result")
38 parser.add_option("-c", "--mobfinderContig", dest="mobfinderContig", type="string", help="absolute path to mobfinder aggregate result")
39 parser.add_option("-f", "--mobfinderAggregate", dest="mobfinderAggregate", type="string", help="absolute path to mobfinder plasmid results")
40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results")
41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results")
42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
44
45 #parallelization, useless, these are hard coded to 8cores/64G RAM
46 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
47 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
48
49 (options,args) = parser.parse_args()
50 #if len(args) != 8:
51 #parser.error("incorrect number of arguments, all 7 is required")
52 curDir = os.getcwd()
53 ID = str(options.id).lstrip().rstrip()
54 mlst = str(options.mlst).lstrip().rstrip()
55 mobfindercontig = str(options.mobfinderContig).lstrip().rstrip()
56 mobfinderaggregate = str(options.mobfinderAggregate).lstrip().rstrip()
57 abricate = str(options.abricate).lstrip().rstrip()
58 rgi = str(options.rgi).lstrip().rstrip()
59 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
60 mlstScheme = str(options.mlstScheme).lstrip().rstrip()
61 outputDir = "./"
62 print(mlst)
63 print(mobfindercontig)
64 print(mobfinderaggregate)
65 print(abricate)
66 print(rgi)
67 print(expectedSpecies)
68 print(mlstScheme)
69 else:
70 curDir = os.getcwd()
71 ID = "BC11"
72 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst"
73 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt"
74 mobfinderaggregate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\mobtyper_aggregate_report.txt"
75 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp"
76 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt"
77 expectedSpecies = "Escherichia coli"
78 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
79 outputDir = "./"
80
81 #region result objects
82 #define some objects to store values from results
83 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
84 class starFinders(object):
85 def __init__(self):
86 self.file = ""
87 self.sequence = ""
88 self.start = 0
89 self.end = 0
90 self.gene = ""
91 self.shortGene = ""
92 self.coverage = ""
93 self.coverage_map = ""
94 self.gaps = ""
95 self.pCoverage = 100.00
96 self.pIdentity = 100.00
97 self.database = ""
98 self.accession = ""
99 self.product = ""
100 self.source = "chromosome"
101 self.row = ""
102
103 class PlasFlowResult(object):
104 def __init__(self):
105 self.sequence = ""
106 self.length = 0
107 self.label = ""
108 self.confidence = 0
109 self.usefulRow = ""
110 self.row = ""
111
112 class MlstResult(object):
113 def __init__(self):
114 self.file = ""
115 self.speciesID = ""
116 self.seqType = 0
117 self.scheme = ""
118 self.species = ""
119 self.row=""
120
121 class mobsuiteResult(object):
122 def __init__(self):
123 self.file_id = ""
124 self.cluster_id = ""
125 self.contig_id = ""
126 self.contig_num = 0
127 self.contig_length = 0
128 self.circularity_status = ""
129 self.rep_type = ""
130 self.rep_type_accession = ""
131 self.relaxase_type = ""
132 self.relaxase_type_accession = ""
133 self.mash_nearest_neighbor = ""
134 self.mash_neighbor_distance = 0.00
135 self.repetitive_dna_id = ""
136 self.match_type = ""
137 self.score = 0
138 self.contig_match_start = 0
139 self.contig_match_end = 0
140 self.row = ""
141
142 class mobsuitePlasmids(object):
143 def __init__(self):
144 self.file_id = ""
145 self.num_contigs = 0
146 self.total_length = 0
147 self.gc = ""
148 self.rep_types = ""
149 self.rep_typeAccession = ""
150 self.relaxase_type= ""
151 self.relaxase_type_accession = ""
152 self.mpf_type = ""
153 self.mpf_type_accession= ""
154 self.orit_type = ""
155 self.orit_accession = ""
156 self.PredictedMobility = ""
157 self.mash_nearest_neighbor = ""
158 self.mash_neighbor_distance = 0.00
159 self.mash_neighbor_cluster= 0
160 self.row = ""
161 class RGIResult(object):
162 def __init__(self):
163 self.ORF_ID = ""
164 self.Contig = ""
165 self.Start = -1
166 self.Stop = -1
167 self.Orientation = ""
168 self.Cut_Off = ""
169 self.Pass_Bitscore = 100000
170 self.Best_Hit_Bitscore = 0.00
171 self.Best_Hit_ARO = ""
172 self.Best_Identities = 0.00
173 self.ARO = 0
174 self.Model_type = ""
175 self.SNPs_in_Best_Hit_ARO = ""
176 self.Other_SNPs = ""
177 self.Drug_Class = ""
178 self.Resistance_Mechanism = ""
179 self.AMR_Gene_Family = ""
180 self.Predicted_DNA = ""
181 self.Predicted_Protein = ""
182 self.CARD_Protein_Sequence = ""
183 self.Percentage_Length_of_Reference_Sequence = 0.00
184 self.ID = ""
185 self.Model_ID = 0
186 self.source = ""
187 self.row = ""
188
189 #endregion
190
191 #region useful functions
192 def read(path):
193 return [line.rstrip('\n') for line in open(path)]
194 def execute(command):
195 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
196
197 # Poll process for new output until finished
198 while True:
199 nextline = process.stdout.readline()
200 if nextline == '' and process.poll() is not None:
201 break
202 sys.stdout.write(nextline)
203 sys.stdout.flush()
204
205 output = process.communicate()[0]
206 exitCode = process.returncode
207
208 if (exitCode == 0):
209 return output
210 else:
211 raise subprocess.CalledProcessError(exitCode, command)
212 def httpGetFile(url, filepath=""):
213 if (filepath == ""):
214 return urllib.request.urlretrieve(url)
215 else:
216 urllib.request.urlretrieve(url, filepath)
217 return True
218 def gunzip(inputpath="", outputpath=""):
219 if (outputpath == ""):
220 with gzip.open(inputpath, 'rb') as f:
221 gzContent = f.read()
222 return gzContent
223 else:
224 with gzip.open(inputpath, 'rb') as f:
225 gzContent = f.read()
226 with open(outputpath, 'wb') as out:
227 out.write(gzContent)
228 return True
229 def ToJson(dictObject, outputPath):
230 #outDir = outputDir + '/summary/' + ID + ".json/"
231 #if not (os.path.exists(outDir)):
232 #os.makedirs(outDir)
233 #with open(outputPath, 'w') as f:
234 #json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
235 return ""
236 #endregion
237
238 #region functions to parse result files
239 def ParseMLSTResult(pathToMLSTResult, scheme):
240 _mlstResult = {}
241 scheme = pandas.read_csv(scheme, delimiter='\t', header=0)
242 scheme = scheme.replace(numpy.nan, '', regex=True)
243
244 taxon = {}
245 #record the scheme as a dictionary
246 taxon["-"] = "No MLST Match"
247 for i in range(len(scheme.index)):
248 key = scheme.iloc[i,0]
249 if (str(scheme.iloc[i,2]) == "nan"):
250 value = str(scheme.iloc[i,1])
251 else:
252 value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2])
253
254 if (key in taxon.keys()):
255 taxon[key] = taxon.get(key) + ";" + value
256 else:
257 taxon[key] = value
258 #read in the mlst result
259 mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None)
260 _mlstHit = MlstResult()
261
262 _mlstHit.file = mlst.iloc[0,0]
263 _mlstHit.speciesID = (mlst.iloc[0,1])
264 _mlstHit.seqType = str(mlst.iloc[0,2])
265 for i in range(3, len(mlst.columns)):
266 _mlstHit.scheme += mlst.iloc[0,i] + ";"
267 _mlstHit.species = taxon[_mlstHit.speciesID]
268 _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist())
269 _mlstResult[_mlstHit.speciesID]=_mlstHit
270
271 return _mlstResult
272
273 def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
274 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918
275 #example resfinder:
276 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181
277
278 _pFinder = {} #***********************
279 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
280 plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True)
281
282
283 for i in range(len(plasmidFinder.index)):
284 pf = starFinders()
285 pf.file = str(plasmidFinder.iloc[i,0])
286 pf.sequence = str(plasmidFinder.iloc[i,1])
287 pf.start = int(plasmidFinder.iloc[i,2])
288 pf.end = int(plasmidFinder.iloc[i,3])
289 pf.gene = str(plasmidFinder.iloc[i,4])
290 pf.shortGene = pf.gene[:pf.gene.index("_")]
291 pf.coverage = str(plasmidFinder.iloc[i,5])
292 pf.coverage_map = str(plasmidFinder.iloc[i,6])
293 pf.gaps = str(plasmidFinder.iloc[i,7])
294 pf.pCoverage = float(plasmidFinder.iloc[i,8])
295 pf.pIdentity = float(plasmidFinder.iloc[i,9])
296 pf.database = str(plasmidFinder.iloc[i,10])
297 pf.accession = str(plasmidFinder.iloc[i,11])
298 pf.product = str(plasmidFinder.iloc[i,12])
299 pf.source = "plasmid"
300 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
301 _pFinder[pf.gene]=pf
302 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
303 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
304 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
305 return _pFinder
306
307 def ParseMobsuiteResult(pathToMobsuiteResult):
308 _mobsuite = {}
309 mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
310 mResult = mResult.replace(numpy.nan, '', regex=True)
311
312 for i in range(len(mResult.index)):
313 mr = mobsuiteResult()
314 mr.file_id = str(mResult.iloc[i,0])
315 mr.cluster_id = str(mResult.iloc[i,1])
316 if (mr.cluster_id == "chromosome"):
317 break
318 mr.contig_id = str(mResult.iloc[i,2])
319 mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")]
320 mr.contig_length = int(mResult.iloc[i,3])
321 mr.circularity_status = str(mResult.iloc[i,4])
322 mr.rep_type = str(mResult.iloc[i,5])
323 mr.rep_type_accession = str(mResult.iloc[i,6])
324 mr.relaxase_type = str(mResult.iloc[i,7])
325 mr.relaxase_type_accession = str(mResult.iloc[i,8])
326 mr.mash_nearest_neighbor = str(mResult.iloc[i,9])
327 mr.mash_neighbor_distance = float(mResult.iloc[i,10])
328 mr.repetitive_dna_id = str(mResult.iloc[i,11])
329 mr.match_type = str(mResult.iloc[i,12])
330 if (mr.match_type == ""):
331 mr.score = -1
332 mr.contig_match_start = -1
333 mr.contig_match_end = -1
334 else:
335 mr.score = int(mResult.iloc[i,13])
336 mr.contig_match_start = int(mResult.iloc[i,14])
337 mr.contig_match_end = int(mResult.iloc[i,15])
338 mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist())
339 _mobsuite[mr.contig_id]=(mr)
340 return _mobsuite
341
342 def ParseMobsuitePlasmids(pathToMobsuiteResult):
343 _mobsuite = {}
344 mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
345 mResults = mResults.replace(numpy.nan, '', regex=True)
346
347 for i in range(len(mResults.index)):
348 mr = mobsuitePlasmids()
349 mr.file_id = str(mResults.iloc[i,0])
350 mr.num_contigs = int(mResults.iloc[i,1])
351 mr.total_length = int(mResults.iloc[i,2])
352 mr.gc = int(mResults.iloc[i,3])
353 mr.rep_types = str(mResults.iloc[i,4])
354 mr.rep_typeAccession = str(mResults.iloc[i,5])
355 mr.relaxase_type = str(mResults.iloc[i,6])
356 mr.relaxase_type_accession = str(mResults.iloc[i,7])
357 mr.mpf_type = str(mResults.iloc[i,8])
358 mr.mpf_type_accession = str(mResults.iloc[i,9])
359 mr.orit_type = str(mResults.iloc[i,10])
360 mr.orit_accession = str(mResults.iloc[i,11])
361 mr.PredictedMobility = str(mResults.iloc[i,12])
362 mr.mash_nearest_neighbor = str(mResults.iloc[i,13])
363 mr.mash_neighbor_distance = float(mResults.iloc[i,14])
364 mr.mash_neighbor_cluster = int(mResults.iloc[i,15])
365 mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist())
366 _mobsuite[mr.file_id] = mr
367 return _mobsuite
368
369 def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs):
370 _rFinder = {}
371 resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0)
372 resFinder = resFinder.replace(numpy.nan, '', regex=True)
373
374 for i in range(len(resFinder.index)):
375 rf = starFinders()
376 rf.file = str(resFinder.iloc[i,0])
377 rf.sequence = str(resFinder.iloc[i,1])
378 rf.start = int(resFinder.iloc[i,2])
379 rf.end = int(resFinder.iloc[i,3])
380 rf.gene = str(resFinder.iloc[i,4])
381 rf.shortGene = rf.gene
382 rf.coverage = str(resFinder.iloc[i,5])
383 rf.coverage_map = str(resFinder.iloc[i,6])
384 rf.gaps = str(resFinder.iloc[i,7])
385 rf.pCoverage = float(resFinder.iloc[i,8])
386 rf.pIdentity = float(resFinder.iloc[i,9])
387 rf.database = str(resFinder.iloc[i,10])
388 rf.accession = str(resFinder.iloc[i,11])
389 rf.product = str(resFinder.iloc[i,12])
390 rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist())
391 if (rf.sequence[6:] in plasmidContigs):
392 rf.source = "plasmid"
393 elif (rf.sequence[6:] in likelyPlasmidContigs):
394 rf.source = "likely plasmid"
395 else:
396 rf.source = "likely chromosome"
397 _rFinder[rf.gene]=rf
398 return _rFinder
399
400 def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs):
401 _rgiR = {}
402 RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0)
403 RGI = RGI.replace(numpy.nan, '', regex=True)
404
405 for i in range(len(RGI.index)):
406 r = RGIResult()
407 r.ORF_ID = str(RGI.iloc[i,0])
408 r.Contig = str(RGI.iloc[i,1])
409 r.Contig_Num = r.Contig[6:r.Contig.find("_")]
410 r.Start = int(RGI.iloc[i,2])
411 r.Stop = int(RGI.iloc[i,3])
412 r.Orientation = str(RGI.iloc[i,4])
413 r.Cut_Off = str(RGI.iloc[i,5])
414 r.Pass_Bitscore = int(RGI.iloc[i,6])
415 r.Best_Hit_Bitscore = float(RGI.iloc[i,7])
416 r.Best_Hit_ARO = str(RGI.iloc[i,8])
417 r.Best_Identities = float(RGI.iloc[i,9])
418 r.ARO = int(RGI.iloc[i,10])
419 r.Model_type = str(RGI.iloc[i,11])
420 r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12])
421 r.Other_SNPs = str(RGI.iloc[i,13])
422 r.Drug_Class = str(RGI.iloc[i,14])
423 r.Resistance_Mechanism = str(RGI.iloc[i,15])
424 r.AMR_Gene_Family = str(RGI.iloc[i,16])
425 r.Predicted_DNA = str(RGI.iloc[i,17])
426 r.Predicted_Protein = str(RGI.iloc[i,18])
427 r.CARD_Protein_Sequence = str(RGI.iloc[i,19])
428 r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20])
429 r.ID = str(RGI.iloc[i,21])
430 r.Model_ID = int(RGI.iloc[i,22])
431 r.row = "\t".join(str(x) for x in RGI.ix[i].tolist())
432 if (r.Contig_Num in plasmidContigs):
433 r.source = "plasmid"
434 elif (r.Contig_Num in likelyPlasmidContigs):
435 r.source = "likely plasmid"
436 else:
437 r.source = "likely chromosome"
438 _rgiR[r.Model_ID]=r
439 return _rgiR
440 #endregion
441
442 def Main():
443 outputDir = "./"
444 notes = []
445 #init the output list
446 output = []
447 jsonOutput = []
448
449 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + options.id)
450 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + options.id)
451
452 #region parse the mlst results
453 print("step 3: parsing mlst, plasmid, and amr results")
454
455 print("identifying MLST")
456 mlstHit = ParseMLSTResult(mlst, str(mlstScheme))#***********************
457 ToJson(mlstHit, "mlst.json") #write it to a json output
458 mlstHit = list(mlstHit.values())[0]
459
460 #endregion
461
462 #region parse mobsuite, resfinder and rgi results
463 print("identifying plasmid contigs and amr genes")
464
465 plasmidContigs = []
466 likelyPlasmidContigs = []
467 origins = []
468
469 #parse mobsuite results
470 mSuite = ParseMobsuiteResult(mobfindercontig) #outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
471 ToJson(mSuite, "mobsuite.json") #*************
472 mSuitePlasmids = ParseMobsuitePlasmids(mobfinderaggregate)#outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
473 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #*************
474
475 for key in mSuite:
476 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs:
477 if not (mSuite[key].rep_type == ''):
478 plasmidContigs.append(mSuite[key].contig_num)
479 else:
480 likelyPlasmidContigs.append(mSuite[key].contig_num)
481 for key in mSuite:
482 if mSuite[key].rep_type not in origins:
483 origins.append(mSuite[key].rep_type)
484
485 #parse resfinder AMR results
486 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
487 ToJson(rFinder, "resfinder.json") #*************
488
489 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
490 ToJson(rgiAMR, "rgi.json") #*************
491
492 carbapenamases = []
493 amrGenes = []
494 for keys in rFinder:
495 carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")")
496 for keys in rgiAMR:
497 if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1):
498 if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases):
499 carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
500 else:
501 if (rgiAMR[keys].Best_Hit_ARO not in amrGenes):
502 amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
503 #endregion
504
505 #region output parsed mlst information
506 print("formatting mlst outputs")
507 output.append("\n\n\n~~~~~~~MLST summary~~~~~~~")
508 output.append("MLST determined species: " + mlstHit.species)
509 output.append("\nMLST Details: ")
510 output.append(mlstHit.row)
511
512 output.append("\nMLST information: ")
513 if (mlstHit.species == expectedSpecies):
514 output.append("MLST determined species is the same as expected species")
515 #notes.append("MLST determined species is the same as expected species")
516 else:
517 output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?")
518 notes.append("MLST: Not expected species. Possible contamination or mislabeling")
519
520 #endregion
521
522 #region output the parsed plasmid/amr results
523 output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n")
524
525 output.append("predicted plasmid origins: ")
526 output.append(";".join(origins))
527
528 output.append("\ndefinitely plasmid contigs")
529 output.append(";".join(plasmidContigs))
530
531 output.append("\nlikely plasmid contigs")
532 output.append(";".join(likelyPlasmidContigs))
533
534 output.append("\nmob-suite prediction details: ")
535 for key in mSuite:
536 output.append(mSuite[key].row)
537
538 output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n")
539 output.append("predicted carbapenamase Genes: ")
540 output.append(",".join(carbapenamases))
541 output.append("other RGI AMR Genes: ")
542 for key in rgiAMR:
543 output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")")
544
545 output.append("\nDetails about the carbapenamase Genes: ")
546 for key in rFinder:
547 output.append(rFinder[key].row)
548 output.append("\nDetails about the RGI AMR Genes: ")
549 for key in rgiAMR:
550 output.append(rgiAMR[key].row)
551
552 #write summary to a file
553 summaryDir = outputDir + "/summary/" + ID
554 out = open("summary.txt", 'w')
555 for item in output:
556 out.write("%s\n" % item)
557
558
559 #TSV output
560 tsvOut = []
561 tsvOut.append("ID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
562 #start with ID
563 temp = ""
564 temp += (ID + "\t")
565 temp += expectedSpecies + "\t"
566
567 #move into MLST
568 temp += mlstHit.species + "\t"
569 temp += str(mlstHit.seqType) + "\t"
570 temp += mlstHit.scheme + "\t"
571
572 #now onto AMR genes
573 temp += ";".join(carbapenamases) + "\t"
574 temp += ";".join(amrGenes) + "\t"
575
576 #lastly plasmids
577 temp+= str(len(mSuitePlasmids)) + "\t"
578 plasmidID = ""
579 contigs = ""
580 lengths = ""
581 rep_type = ""
582 mobility = ""
583 neighbour = ""
584 for keys in mSuitePlasmids:
585 plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";"
586 contigs += str(mSuitePlasmids[keys].num_contigs) + ";"
587 lengths += str(mSuitePlasmids[keys].total_length) + ";"
588 rep_type += str(mSuitePlasmids[keys].rep_types) + ";"
589 mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";"
590 neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";"
591 temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t"
592 temp += ";".join(plasmidContigs) + "\t"
593 temp += ";".join(likelyPlasmidContigs)
594 tsvOut.append(temp)
595
596 summaryDir = outputDir + "/summary/" + ID
597 out = open("summary.tsv", 'w')
598 for item in tsvOut:
599 out.write("%s\n" % item)
600 #endregion
601
602
603 start = time.time()#time the analysis
604 print("Starting workflow...")
605 #analysis time
606 Main()
607
608 end = time.time()
609 print("Finished!\nThe analysis used: " + str(end-start) + " seconds")