comparison cpo_galaxy_prediction.py @ 1:fea89c4d5227 draft

Uploaded
author jjjjia
date Thu, 16 Aug 2018 19:27:05 -0400
parents
children 29302ffdf137
comparison
equal deleted inserted replaced
0:917a05a03ac9 1:fea89c4d5227
1 #!/home/jjjjia/.conda/envs/py36/bin/python
2
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
4 #$ -V # Pass environment variables to the job
5 #$ -N CPO_pipeline # Replace with a more specific job name
6 #$ -wd /home/jjjjia/testCases # Use the current working dir
7 #$ -pe smp 8 # Parallel Environment (how many cores)
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
9 #$ -e ./logs/$JOB_ID.err
10 #$ -o ./logs/$JOB_ID.log
11 #$ -m ea
12 #$ -M bja20@sfu.ca
13
14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae"
15
16 import subprocess
17 import pandas
18 import optparse
19 import os
20 import datetime
21 import sys
22 import time
23 import urllib.request
24 import gzip
25 import collections
26 import json
27 import numpy
28
29
30 debug = True #debug skips the shell scripts and also dump out a ton of debugging messages
31
32 if not debug:
33 #parses some parameters
34 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
35 #required
36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate")
37 parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta")
38 parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database")
39 parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder")
40 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
41
42 #optionals
43 parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder")
44 parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database")
45 parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update")
46 parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST")
47 #used for parsing
48 parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
49
50
51 #parallelization, useless, these are hard coded to 8cores/64G RAM
52 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
53 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
54
55 (options,args) = parser.parse_args()
56 #if len(args) != 8:
57 #parser.error("incorrect number of arguments, all 7 is required")
58 curDir = os.getcwd()
59 outputDir = options.output
60 expectedSpecies = options.expectedSpecies
61 mlstScheme = options.mlst
62 tempDir = outputDir + "/shovillTemp"
63 scriptDir = options.scriptDir
64 updateAbName = options.updateAbName
65 updateAbPath = options.updateAbPath
66 updateMLST = options.updateMLST
67 cardDB=options.cardDB
68 assemblyPath=options.assemblyPath
69 ID = options.id
70 else:
71 manifestDir = ""
72 curDir = os.getcwd()
73 outputDir = "pipelineTest"
74 expectedSpecies = "Escherichia coli"
75 #threads = 8
76 #memory = 64
77 mlstScheme = outputDir + "/scheme_species_map.tab"
78 tempDir= outputDir + "/shovillTemp"
79 scriptDir = "~/scripts"
80 updateAbName = "cpo"
81 updateAbPath = "~/database/bccdcCPO.seq"
82 updateMLST = True
83 assemblyPath = "./"
84 cardDB = "./"
85 ID = "BC11-Kpn005_S2"
86
87 #region result objects
88 #define some objects to store values from results
89 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
90 class starFinders(object):
91 def __init__(self):
92 self.file = ""
93 self.sequence = ""
94 self.start = 0
95 self.end = 0
96 self.gene = ""
97 self.shortGene = ""
98 self.coverage = ""
99 self.coverage_map = ""
100 self.gaps = ""
101 self.pCoverage = 100.00
102 self.pIdentity = 100.00
103 self.database = ""
104 self.accession = ""
105 self.product = ""
106 self.source = "chromosome"
107 self.row = ""
108
109 class PlasFlowResult(object):
110 def __init__(self):
111 self.sequence = ""
112 self.length = 0
113 self.label = ""
114 self.confidence = 0
115 self.usefulRow = ""
116 self.row = ""
117
118 class MlstResult(object):
119 def __init__(self):
120 self.file = ""
121 self.speciesID = ""
122 self.seqType = 0
123 self.scheme = ""
124 self.species = ""
125 self.row=""
126
127 class mobsuiteResult(object):
128 def __init__(self):
129 self.file_id = ""
130 self.cluster_id = ""
131 self.contig_id = ""
132 self.contig_num = 0
133 self.contig_length = 0
134 self.circularity_status = ""
135 self.rep_type = ""
136 self.rep_type_accession = ""
137 self.relaxase_type = ""
138 self.relaxase_type_accession = ""
139 self.mash_nearest_neighbor = ""
140 self.mash_neighbor_distance = 0.00
141 self.repetitive_dna_id = ""
142 self.match_type = ""
143 self.score = 0
144 self.contig_match_start = 0
145 self.contig_match_end = 0
146 self.row = ""
147
148 class mobsuitePlasmids(object):
149 def __init__(self):
150 self.file_id = ""
151 self.num_contigs = 0
152 self.total_length = 0
153 self.gc = ""
154 self.rep_types = ""
155 self.rep_typeAccession = ""
156 self.relaxase_type= ""
157 self.relaxase_type_accession = ""
158 self.mpf_type = ""
159 self.mpf_type_accession= ""
160 self.orit_type = ""
161 self.orit_accession = ""
162 self.PredictedMobility = ""
163 self.mash_nearest_neighbor = ""
164 self.mash_neighbor_distance = 0.00
165 self.mash_neighbor_cluster= 0
166 self.row = ""
167 class RGIResult(object):
168 def __init__(self):
169 self.ORF_ID = ""
170 self.Contig = ""
171 self.Start = -1
172 self.Stop = -1
173 self.Orientation = ""
174 self.Cut_Off = ""
175 self.Pass_Bitscore = 100000
176 self.Best_Hit_Bitscore = 0.00
177 self.Best_Hit_ARO = ""
178 self.Best_Identities = 0.00
179 self.ARO = 0
180 self.Model_type = ""
181 self.SNPs_in_Best_Hit_ARO = ""
182 self.Other_SNPs = ""
183 self.Drug_Class = ""
184 self.Resistance_Mechanism = ""
185 self.AMR_Gene_Family = ""
186 self.Predicted_DNA = ""
187 self.Predicted_Protein = ""
188 self.CARD_Protein_Sequence = ""
189 self.Percentage_Length_of_Reference_Sequence = 0.00
190 self.ID = ""
191 self.Model_ID = 0
192 self.source = ""
193 self.row = ""
194
195 #endregion
196
197 #region useful functions
198 def read(path):
199 return [line.rstrip('\n') for line in open(path)]
200 def execute(command):
201 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
202
203 # Poll process for new output until finished
204 while True:
205 nextline = process.stdout.readline()
206 if nextline == '' and process.poll() is not None:
207 break
208 sys.stdout.write(nextline)
209 sys.stdout.flush()
210
211 output = process.communicate()[0]
212 exitCode = process.returncode
213
214 if (exitCode == 0):
215 return output
216 else:
217 raise subprocess.CalledProcessError(exitCode, command)
218 def httpGetFile(url, filepath=""):
219 if (filepath == ""):
220 return urllib.request.urlretrieve(url)
221 else:
222 urllib.request.urlretrieve(url, filepath)
223 return True
224 def gunzip(inputpath="", outputpath=""):
225 if (outputpath == ""):
226 with gzip.open(inputpath, 'rb') as f:
227 gzContent = f.read()
228 return gzContent
229 else:
230 with gzip.open(inputpath, 'rb') as f:
231 gzContent = f.read()
232 with open(outputpath, 'wb') as out:
233 out.write(gzContent)
234 return True
235 def ToJson(dictObject, outputPath):
236 outDir = outputDir + '/summary/' + ID + ".json/"
237 if not (os.path.exists(outDir)):
238 os.makedirs(outDir)
239 with open(outDir + outputPath, 'w') as f:
240 json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
241 #endregion
242
243 #region functions to parse result files
244 def ParseMLSTResult(pathToMLSTResult):
245 _mlstResult = {}
246 scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0)
247 scheme = scheme.replace(numpy.nan, '', regex=True)
248
249 taxon = {}
250 #record the scheme as a dictionary
251 taxon["-"] = "No MLST Match"
252 for i in range(len(scheme.index)):
253 key = scheme.iloc[i,0]
254 if (str(scheme.iloc[i,2]) == "nan"):
255 value = str(scheme.iloc[i,1])
256 else:
257 value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2])
258
259 if (key in taxon.keys()):
260 taxon[key] = taxon.get(key) + ";" + value
261 else:
262 taxon[key] = value
263 #read in the mlst result
264 mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None)
265 _mlstHit = MlstResult()
266
267 _mlstHit.file = mlst.iloc[0,0]
268 _mlstHit.speciesID = (mlst.iloc[0,1])
269 _mlstHit.seqType = str(mlst.iloc[0,2])
270 for i in range(3, len(mlst.columns)):
271 _mlstHit.scheme += mlst.iloc[0,i] + ";"
272 _mlstHit.species = taxon[_mlstHit.speciesID]
273 _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist())
274 _mlstResult[_mlstHit.speciesID]=_mlstHit
275
276 return _mlstResult
277
278 def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
279 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918
280 #example resfinder:
281 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181
282
283 _pFinder = {} #***********************
284 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
285 plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True)
286
287
288 for i in range(len(plasmidFinder.index)):
289 pf = starFinders()
290 pf.file = str(plasmidFinder.iloc[i,0])
291 pf.sequence = str(plasmidFinder.iloc[i,1])
292 pf.start = int(plasmidFinder.iloc[i,2])
293 pf.end = int(plasmidFinder.iloc[i,3])
294 pf.gene = str(plasmidFinder.iloc[i,4])
295 pf.shortGene = pf.gene[:pf.gene.index("_")]
296 pf.coverage = str(plasmidFinder.iloc[i,5])
297 pf.coverage_map = str(plasmidFinder.iloc[i,6])
298 pf.gaps = str(plasmidFinder.iloc[i,7])
299 pf.pCoverage = float(plasmidFinder.iloc[i,8])
300 pf.pIdentity = float(plasmidFinder.iloc[i,9])
301 pf.database = str(plasmidFinder.iloc[i,10])
302 pf.accession = str(plasmidFinder.iloc[i,11])
303 pf.product = str(plasmidFinder.iloc[i,12])
304 pf.source = "plasmid"
305 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
306 _pFinder[pf.gene]=pf
307 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
308 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
309 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
310 return _pFinder
311
312 def ParseMobsuiteResult(pathToMobsuiteResult):
313 _mobsuite = {}
314 mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
315 mResult = mResult.replace(numpy.nan, '', regex=True)
316
317 for i in range(len(mResult.index)):
318 mr = mobsuiteResult()
319 mr.file_id = str(mResult.iloc[i,0])
320 mr.cluster_id = str(mResult.iloc[i,1])
321 if (mr.cluster_id == "chromosome"):
322 break
323 mr.contig_id = str(mResult.iloc[i,2])
324 mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")]
325 mr.contig_length = int(mResult.iloc[i,3])
326 mr.circularity_status = str(mResult.iloc[i,4])
327 mr.rep_type = str(mResult.iloc[i,5])
328 mr.rep_type_accession = str(mResult.iloc[i,6])
329 mr.relaxase_type = str(mResult.iloc[i,7])
330 mr.relaxase_type_accession = str(mResult.iloc[i,8])
331 mr.mash_nearest_neighbor = str(mResult.iloc[i,9])
332 mr.mash_neighbor_distance = float(mResult.iloc[i,10])
333 mr.repetitive_dna_id = str(mResult.iloc[i,11])
334 mr.match_type = str(mResult.iloc[i,12])
335 if (mr.match_type == ""):
336 mr.score = -1
337 mr.contig_match_start = -1
338 mr.contig_match_end = -1
339 else:
340 mr.score = int(mResult.iloc[i,13])
341 mr.contig_match_start = int(mResult.iloc[i,14])
342 mr.contig_match_end = int(mResult.iloc[i,15])
343 mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist())
344 _mobsuite[mr.contig_id]=(mr)
345 return _mobsuite
346
347 def ParseMobsuitePlasmids(pathToMobsuiteResult):
348 _mobsuite = {}
349 mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
350 mResults = mResults.replace(numpy.nan, '', regex=True)
351
352 for i in range(len(mResults.index)):
353 mr = mobsuitePlasmids()
354 mr.file_id = str(mResults.iloc[i,0])
355 mr.num_contigs = int(mResults.iloc[i,1])
356 mr.total_length = int(mResults.iloc[i,2])
357 mr.gc = int(mResults.iloc[i,3])
358 mr.rep_types = str(mResults.iloc[i,4])
359 mr.rep_typeAccession = str(mResults.iloc[i,5])
360 mr.relaxase_type = str(mResults.iloc[i,6])
361 mr.relaxase_type_accession = str(mResults.iloc[i,7])
362 mr.mpf_type = str(mResults.iloc[i,8])
363 mr.mpf_type_accession = str(mResults.iloc[i,9])
364 mr.orit_type = str(mResults.iloc[i,10])
365 mr.orit_accession = str(mResults.iloc[i,11])
366 mr.PredictedMobility = str(mResults.iloc[i,12])
367 mr.mash_nearest_neighbor = str(mResults.iloc[i,13])
368 mr.mash_neighbor_distance = float(mResults.iloc[i,14])
369 mr.mash_neighbor_cluster = int(mResults.iloc[i,15])
370 mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist())
371 _mobsuite[mr.file_id] = mr
372 return _mobsuite
373
374 def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs):
375 _rFinder = {}
376 resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0)
377 resFinder = resFinder.replace(numpy.nan, '', regex=True)
378
379 for i in range(len(resFinder.index)):
380 rf = starFinders()
381 rf.file = str(resFinder.iloc[i,0])
382 rf.sequence = str(resFinder.iloc[i,1])
383 rf.start = int(resFinder.iloc[i,2])
384 rf.end = int(resFinder.iloc[i,3])
385 rf.gene = str(resFinder.iloc[i,4])
386 rf.shortGene = rf.gene
387 rf.coverage = str(resFinder.iloc[i,5])
388 rf.coverage_map = str(resFinder.iloc[i,6])
389 rf.gaps = str(resFinder.iloc[i,7])
390 rf.pCoverage = float(resFinder.iloc[i,8])
391 rf.pIdentity = float(resFinder.iloc[i,9])
392 rf.database = str(resFinder.iloc[i,10])
393 rf.accession = str(resFinder.iloc[i,11])
394 rf.product = str(resFinder.iloc[i,12])
395 rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist())
396 if (rf.sequence[6:] in plasmidContigs):
397 rf.source = "plasmid"
398 elif (rf.sequence[6:] in likelyPlasmidContigs):
399 rf.source = "likely plasmid"
400 else:
401 rf.source = "likely chromosome"
402 _rFinder[rf.gene]=rf
403 return _rFinder
404
405 def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs):
406 _rgiR = {}
407 RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0)
408 RGI = RGI.replace(numpy.nan, '', regex=True)
409
410 for i in range(len(RGI.index)):
411 r = RGIResult()
412 r.ORF_ID = str(RGI.iloc[i,0])
413 r.Contig = str(RGI.iloc[i,1])
414 r.Contig_Num = r.Contig[6:r.Contig.find("_")]
415 r.Start = int(RGI.iloc[i,2])
416 r.Stop = int(RGI.iloc[i,3])
417 r.Orientation = str(RGI.iloc[i,4])
418 r.Cut_Off = str(RGI.iloc[i,5])
419 r.Pass_Bitscore = int(RGI.iloc[i,6])
420 r.Best_Hit_Bitscore = float(RGI.iloc[i,7])
421 r.Best_Hit_ARO = str(RGI.iloc[i,8])
422 r.Best_Identities = float(RGI.iloc[i,9])
423 r.ARO = int(RGI.iloc[i,10])
424 r.Model_type = str(RGI.iloc[i,11])
425 r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12])
426 r.Other_SNPs = str(RGI.iloc[i,13])
427 r.Drug_Class = str(RGI.iloc[i,14])
428 r.Resistance_Mechanism = str(RGI.iloc[i,15])
429 r.AMR_Gene_Family = str(RGI.iloc[i,16])
430 r.Predicted_DNA = str(RGI.iloc[i,17])
431 r.Predicted_Protein = str(RGI.iloc[i,18])
432 r.CARD_Protein_Sequence = str(RGI.iloc[i,19])
433 r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20])
434 r.ID = str(RGI.iloc[i,21])
435 r.Model_ID = int(RGI.iloc[i,22])
436 r.row = "\t".join(str(x) for x in RGI.ix[i].tolist())
437 if (r.Contig_Num in plasmidContigs):
438 r.source = "plasmid"
439 elif (r.Contig_Num in likelyPlasmidContigs):
440 r.source = "likely plasmid"
441 else:
442 r.source = "likely chromosome"
443 _rgiR[r.Model_ID]=r
444 return _rgiR
445 #endregion
446
447 def Main():
448 notes = []
449 #init the output list
450 output = []
451 jsonOutput = []
452
453 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
454 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
455
456 #region update databases if update=true
457 if not debug:
458 #update databases if necessary
459 if not (updateAbPath == "" and updateAbName == "default"):
460 print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath)
461 cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName]
462 update = execute(cmd)
463 if (updateMLST.lower() == "true"):
464 print("updating mlst database... ")
465 cmd = [scriptDir + "/pipeline_updateMLST.sh"]
466 update = execute(cmd)
467 #endregion
468
469 print("step 3: parsing the mlst results")
470
471 print("performing MLST")
472 #region call the mlst script
473 if not debug:
474 print("running pipeline_prediction.sh")
475 #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json
476 cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB]
477 result = execute(cmd)
478 #endregion
479
480 #region parse the mlst results
481 print("step 3: parsing mlst, plasmid, and amr results")
482
483 print("identifying MLST")
484 pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst"
485 mlstHit = ParseMLSTResult(pathToMLSTScheme)#***********************
486 ToJson(mlstHit, "mlst.json") #write it to a json output
487 mlstHit = list(mlstHit.values())[0]
488
489 #endregion
490
491 #region parse mobsuite, resfinder and rgi results
492 print("identifying plasmid contigs and amr genes")
493
494 plasmidContigs = []
495 likelyPlasmidContigs = []
496 origins = []
497
498 #parse mobsuite results
499 mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
500 ToJson(mSuite, "mobsuite.json") #*************
501 mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
502 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #*************
503
504 for key in mSuite:
505 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs:
506 if not (mSuite[key].rep_type == ''):
507 plasmidContigs.append(mSuite[key].contig_num)
508 else:
509 likelyPlasmidContigs.append(mSuite[key].contig_num)
510 for key in mSuite:
511 if mSuite[key].rep_type not in origins:
512 origins.append(mSuite[key].rep_type)
513
514 #parse resfinder AMR results
515 rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
516 ToJson(rFinder, "resfinder.json") #*************
517
518 rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
519 ToJson(rgiAMR, "rgi.json") #*************
520
521 carbapenamases = []
522 amrGenes = []
523 for keys in rFinder:
524 carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")")
525 for keys in rgiAMR:
526 if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1):
527 if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases):
528 carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
529 else:
530 if (rgiAMR[keys].Best_Hit_ARO not in amrGenes):
531 amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
532 #endregion
533
534 #region output parsed mlst information
535 print("formatting mlst outputs")
536 output.append("\n\n\n~~~~~~~MLST summary~~~~~~~")
537 output.append("MLST determined species: " + mlstHit.species)
538 output.append("\nMLST Details: ")
539 output.append(mlstHit.row)
540
541 output.append("\nMLST information: ")
542 if (mlstHit.species == expectedSpecies):
543 output.append("MLST determined species is the same as expected species")
544 #notes.append("MLST determined species is the same as expected species")
545 else:
546 output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?")
547 notes.append("MLST: Not expected species. Possible contamination or mislabeling")
548
549 #endregion
550
551 #region output the parsed plasmid/amr results
552 output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n")
553
554 output.append("predicted plasmid origins: ")
555 output.append(";".join(origins))
556
557 output.append("\ndefinitely plasmid contigs")
558 output.append(";".join(plasmidContigs))
559
560 output.append("\nlikely plasmid contigs")
561 output.append(";".join(likelyPlasmidContigs))
562
563 output.append("\nmob-suite prediction details: ")
564 for key in mSuite:
565 output.append(mSuite[key].row)
566
567 output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n")
568 output.append("predicted carbapenamase Genes: ")
569 output.append(",".join(carbapenamases))
570 output.append("other RGI AMR Genes: ")
571 for key in rgiAMR:
572 output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")")
573
574 output.append("\nDetails about the carbapenamase Genes: ")
575 for key in rFinder:
576 output.append(rFinder[key].row)
577 output.append("\nDetails about the RGI AMR Genes: ")
578 for key in rgiAMR:
579 output.append(rgiAMR[key].row)
580
581 #write summary to a file
582 summaryDir = outputDir + "/summary/" + ID
583 out = open(summaryDir + ".txt", 'w')
584 for item in output:
585 out.write("%s\n" % item)
586
587
588 #TSV output
589 tsvOut = []
590 tsvOut.append("ID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
591 #start with ID
592 temp = ""
593 temp += (ID + "\t")
594 temp += expectedSpecies + "\t"
595
596 #move into MLST
597 temp += mlstHit.species + "\t"
598 temp += str(mlstHit.seqType) + "\t"
599 temp += mlstHit.scheme + "\t"
600
601 #now onto AMR genes
602 temp += ";".join(carbapenamases) + "\t"
603 temp += ";".join(amrGenes) + "\t"
604
605 #lastly plasmids
606 temp+= str(len(mSuitePlasmids)) + "\t"
607 plasmidID = ""
608 contigs = ""
609 lengths = ""
610 rep_type = ""
611 mobility = ""
612 neighbour = ""
613 for keys in mSuitePlasmids:
614 plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";"
615 contigs += str(mSuitePlasmids[keys].num_contigs) + ";"
616 lengths += str(mSuitePlasmids[keys].total_length) + ";"
617 rep_type += str(mSuitePlasmids[keys].rep_types) + ";"
618 mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";"
619 neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";"
620 temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t"
621 temp += ";".join(plasmidContigs) + "\t"
622 temp += ";".join(likelyPlasmidContigs)
623 tsvOut.append(temp)
624
625 summaryDir = outputDir + "/summary/" + ID
626 out = open(summaryDir + ".tsv", 'w')
627 for item in tsvOut:
628 out.write("%s\n" % item)
629 #endregion
630
631
632 start = time.time()#time the analysis
633 print("Starting workflow...")
634 #analysis time
635 Main()
636
637 end = time.time()
638 print("Finished!\nThe analysis used: " + str(end-start) + " seconds")