Mercurial > repos > jjjjia > cpo_prediction
comparison galaxy_prediction.py @ 0:917a05a03ac9 draft
Uploaded
author | jjjjia |
---|---|
date | Tue, 14 Aug 2018 17:18:49 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:917a05a03ac9 |
---|---|
1 #!/home/jjjjia/.conda/envs/py36/bin/python | |
2 | |
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python | |
4 #$ -V # Pass environment variables to the job | |
5 #$ -N CPO_pipeline # Replace with a more specific job name | |
6 #$ -wd /home/jjjjia/testCases # Use the current working dir | |
7 #$ -pe smp 8 # Parallel Environment (how many cores) | |
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core* | |
9 #$ -e ./logs/$JOB_ID.err | |
10 #$ -o ./logs/$JOB_ID.log | |
11 #$ -m ea | |
12 #$ -M bja20@sfu.ca | |
13 | |
14 #./prediction.py -i ~/testCases/cpoResults/contigs/BC11-Kpn005_S2.fa -m ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.mlst -c ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/contig_report.txt -f ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.cp -r ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.rgi.txt -e "Klebsiella" | |
15 import subprocess | |
16 import pandas | |
17 import optparse | |
18 import os | |
19 import datetime | |
20 import sys | |
21 import time | |
22 import urllib.request | |
23 import gzip | |
24 import collections | |
25 import json | |
26 import numpy | |
27 | |
28 | |
29 debug = False #debug skips the shell scripts and also dump out a ton of debugging messages | |
30 | |
31 if not debug: | |
32 #parses some parameters | |
33 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") | |
34 #required | |
35 #MLSTHIT, mobsuite, resfinder, rgi, mlstscheme | |
36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate") | |
37 parser.add_option("-m", "--mlst", dest="mlst", type="string", help="absolute file path to mlst result") | |
38 parser.add_option("-c", "--mobfinderContig", dest="mobfinderContig", type="string", help="absolute path to mobfinder aggregate result") | |
39 parser.add_option("-f", "--mobfinderAggregate", dest="mobfinderAggregate", type="string", help="absolute path to mobfinder plasmid results") | |
40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results") | |
41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results") | |
42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") | |
43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") | |
44 | |
45 #parallelization, useless, these are hard coded to 8cores/64G RAM | |
46 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") | |
47 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") | |
48 | |
49 (options,args) = parser.parse_args() | |
50 #if len(args) != 8: | |
51 #parser.error("incorrect number of arguments, all 7 is required") | |
52 curDir = os.getcwd() | |
53 ID = str(options.id).lstrip().rstrip() | |
54 mlst = str(options.mlst).lstrip().rstrip() | |
55 mobfindercontig = str(options.mobfinderContig).lstrip().rstrip() | |
56 mobfinderaggregate = str(options.mobfinderAggregate).lstrip().rstrip() | |
57 abricate = str(options.abricate).lstrip().rstrip() | |
58 rgi = str(options.rgi).lstrip().rstrip() | |
59 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() | |
60 mlstScheme = str(options.mlstScheme).lstrip().rstrip() | |
61 outputDir = "./" | |
62 print(mlst) | |
63 print(mobfindercontig) | |
64 print(mobfinderaggregate) | |
65 print(abricate) | |
66 print(rgi) | |
67 print(expectedSpecies) | |
68 print(mlstScheme) | |
69 else: | |
70 curDir = os.getcwd() | |
71 ID = "BC11" | |
72 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst" | |
73 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt" | |
74 mobfinderaggregate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\mobtyper_aggregate_report.txt" | |
75 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp" | |
76 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt" | |
77 expectedSpecies = "Escherichia coli" | |
78 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" | |
79 outputDir = "./" | |
80 | |
81 #region result objects | |
82 #define some objects to store values from results | |
83 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). | |
84 class starFinders(object): | |
85 def __init__(self): | |
86 self.file = "" | |
87 self.sequence = "" | |
88 self.start = 0 | |
89 self.end = 0 | |
90 self.gene = "" | |
91 self.shortGene = "" | |
92 self.coverage = "" | |
93 self.coverage_map = "" | |
94 self.gaps = "" | |
95 self.pCoverage = 100.00 | |
96 self.pIdentity = 100.00 | |
97 self.database = "" | |
98 self.accession = "" | |
99 self.product = "" | |
100 self.source = "chromosome" | |
101 self.row = "" | |
102 | |
103 class PlasFlowResult(object): | |
104 def __init__(self): | |
105 self.sequence = "" | |
106 self.length = 0 | |
107 self.label = "" | |
108 self.confidence = 0 | |
109 self.usefulRow = "" | |
110 self.row = "" | |
111 | |
112 class MlstResult(object): | |
113 def __init__(self): | |
114 self.file = "" | |
115 self.speciesID = "" | |
116 self.seqType = 0 | |
117 self.scheme = "" | |
118 self.species = "" | |
119 self.row="" | |
120 | |
121 class mobsuiteResult(object): | |
122 def __init__(self): | |
123 self.file_id = "" | |
124 self.cluster_id = "" | |
125 self.contig_id = "" | |
126 self.contig_num = 0 | |
127 self.contig_length = 0 | |
128 self.circularity_status = "" | |
129 self.rep_type = "" | |
130 self.rep_type_accession = "" | |
131 self.relaxase_type = "" | |
132 self.relaxase_type_accession = "" | |
133 self.mash_nearest_neighbor = "" | |
134 self.mash_neighbor_distance = 0.00 | |
135 self.repetitive_dna_id = "" | |
136 self.match_type = "" | |
137 self.score = 0 | |
138 self.contig_match_start = 0 | |
139 self.contig_match_end = 0 | |
140 self.row = "" | |
141 | |
142 class mobsuitePlasmids(object): | |
143 def __init__(self): | |
144 self.file_id = "" | |
145 self.num_contigs = 0 | |
146 self.total_length = 0 | |
147 self.gc = "" | |
148 self.rep_types = "" | |
149 self.rep_typeAccession = "" | |
150 self.relaxase_type= "" | |
151 self.relaxase_type_accession = "" | |
152 self.mpf_type = "" | |
153 self.mpf_type_accession= "" | |
154 self.orit_type = "" | |
155 self.orit_accession = "" | |
156 self.PredictedMobility = "" | |
157 self.mash_nearest_neighbor = "" | |
158 self.mash_neighbor_distance = 0.00 | |
159 self.mash_neighbor_cluster= 0 | |
160 self.row = "" | |
161 class RGIResult(object): | |
162 def __init__(self): | |
163 self.ORF_ID = "" | |
164 self.Contig = "" | |
165 self.Start = -1 | |
166 self.Stop = -1 | |
167 self.Orientation = "" | |
168 self.Cut_Off = "" | |
169 self.Pass_Bitscore = 100000 | |
170 self.Best_Hit_Bitscore = 0.00 | |
171 self.Best_Hit_ARO = "" | |
172 self.Best_Identities = 0.00 | |
173 self.ARO = 0 | |
174 self.Model_type = "" | |
175 self.SNPs_in_Best_Hit_ARO = "" | |
176 self.Other_SNPs = "" | |
177 self.Drug_Class = "" | |
178 self.Resistance_Mechanism = "" | |
179 self.AMR_Gene_Family = "" | |
180 self.Predicted_DNA = "" | |
181 self.Predicted_Protein = "" | |
182 self.CARD_Protein_Sequence = "" | |
183 self.Percentage_Length_of_Reference_Sequence = 0.00 | |
184 self.ID = "" | |
185 self.Model_ID = 0 | |
186 self.source = "" | |
187 self.row = "" | |
188 | |
189 #endregion | |
190 | |
191 #region useful functions | |
192 def read(path): | |
193 return [line.rstrip('\n') for line in open(path)] | |
194 def execute(command): | |
195 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
196 | |
197 # Poll process for new output until finished | |
198 while True: | |
199 nextline = process.stdout.readline() | |
200 if nextline == '' and process.poll() is not None: | |
201 break | |
202 sys.stdout.write(nextline) | |
203 sys.stdout.flush() | |
204 | |
205 output = process.communicate()[0] | |
206 exitCode = process.returncode | |
207 | |
208 if (exitCode == 0): | |
209 return output | |
210 else: | |
211 raise subprocess.CalledProcessError(exitCode, command) | |
212 def httpGetFile(url, filepath=""): | |
213 if (filepath == ""): | |
214 return urllib.request.urlretrieve(url) | |
215 else: | |
216 urllib.request.urlretrieve(url, filepath) | |
217 return True | |
218 def gunzip(inputpath="", outputpath=""): | |
219 if (outputpath == ""): | |
220 with gzip.open(inputpath, 'rb') as f: | |
221 gzContent = f.read() | |
222 return gzContent | |
223 else: | |
224 with gzip.open(inputpath, 'rb') as f: | |
225 gzContent = f.read() | |
226 with open(outputpath, 'wb') as out: | |
227 out.write(gzContent) | |
228 return True | |
229 def ToJson(dictObject, outputPath): | |
230 #outDir = outputDir + '/summary/' + ID + ".json/" | |
231 #if not (os.path.exists(outDir)): | |
232 #os.makedirs(outDir) | |
233 #with open(outputPath, 'w') as f: | |
234 #json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) | |
235 return "" | |
236 #endregion | |
237 | |
238 #region functions to parse result files | |
239 def ParseMLSTResult(pathToMLSTResult, scheme): | |
240 _mlstResult = {} | |
241 scheme = pandas.read_csv(scheme, delimiter='\t', header=0) | |
242 scheme = scheme.replace(numpy.nan, '', regex=True) | |
243 | |
244 taxon = {} | |
245 #record the scheme as a dictionary | |
246 taxon["-"] = "No MLST Match" | |
247 for i in range(len(scheme.index)): | |
248 key = scheme.iloc[i,0] | |
249 if (str(scheme.iloc[i,2]) == "nan"): | |
250 value = str(scheme.iloc[i,1]) | |
251 else: | |
252 value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2]) | |
253 | |
254 if (key in taxon.keys()): | |
255 taxon[key] = taxon.get(key) + ";" + value | |
256 else: | |
257 taxon[key] = value | |
258 #read in the mlst result | |
259 mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None) | |
260 _mlstHit = MlstResult() | |
261 | |
262 _mlstHit.file = mlst.iloc[0,0] | |
263 _mlstHit.speciesID = (mlst.iloc[0,1]) | |
264 _mlstHit.seqType = str(mlst.iloc[0,2]) | |
265 for i in range(3, len(mlst.columns)): | |
266 _mlstHit.scheme += mlst.iloc[0,i] + ";" | |
267 _mlstHit.species = taxon[_mlstHit.speciesID] | |
268 _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist()) | |
269 _mlstResult[_mlstHit.speciesID]=_mlstHit | |
270 | |
271 return _mlstResult | |
272 | |
273 def ParsePlasmidFinderResult(pathToPlasmidFinderResult): | |
274 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918 | |
275 #example resfinder: | |
276 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181 | |
277 | |
278 _pFinder = {} #*********************** | |
279 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0) | |
280 plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True) | |
281 | |
282 | |
283 for i in range(len(plasmidFinder.index)): | |
284 pf = starFinders() | |
285 pf.file = str(plasmidFinder.iloc[i,0]) | |
286 pf.sequence = str(plasmidFinder.iloc[i,1]) | |
287 pf.start = int(plasmidFinder.iloc[i,2]) | |
288 pf.end = int(plasmidFinder.iloc[i,3]) | |
289 pf.gene = str(plasmidFinder.iloc[i,4]) | |
290 pf.shortGene = pf.gene[:pf.gene.index("_")] | |
291 pf.coverage = str(plasmidFinder.iloc[i,5]) | |
292 pf.coverage_map = str(plasmidFinder.iloc[i,6]) | |
293 pf.gaps = str(plasmidFinder.iloc[i,7]) | |
294 pf.pCoverage = float(plasmidFinder.iloc[i,8]) | |
295 pf.pIdentity = float(plasmidFinder.iloc[i,9]) | |
296 pf.database = str(plasmidFinder.iloc[i,10]) | |
297 pf.accession = str(plasmidFinder.iloc[i,11]) | |
298 pf.product = str(plasmidFinder.iloc[i,12]) | |
299 pf.source = "plasmid" | |
300 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
301 _pFinder[pf.gene]=pf | |
302 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
303 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) | |
304 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) | |
305 return _pFinder | |
306 | |
307 def ParseMobsuiteResult(pathToMobsuiteResult): | |
308 _mobsuite = {} | |
309 mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0) | |
310 mResult = mResult.replace(numpy.nan, '', regex=True) | |
311 | |
312 for i in range(len(mResult.index)): | |
313 mr = mobsuiteResult() | |
314 mr.file_id = str(mResult.iloc[i,0]) | |
315 mr.cluster_id = str(mResult.iloc[i,1]) | |
316 if (mr.cluster_id == "chromosome"): | |
317 break | |
318 mr.contig_id = str(mResult.iloc[i,2]) | |
319 mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")] | |
320 mr.contig_length = int(mResult.iloc[i,3]) | |
321 mr.circularity_status = str(mResult.iloc[i,4]) | |
322 mr.rep_type = str(mResult.iloc[i,5]) | |
323 mr.rep_type_accession = str(mResult.iloc[i,6]) | |
324 mr.relaxase_type = str(mResult.iloc[i,7]) | |
325 mr.relaxase_type_accession = str(mResult.iloc[i,8]) | |
326 mr.mash_nearest_neighbor = str(mResult.iloc[i,9]) | |
327 mr.mash_neighbor_distance = float(mResult.iloc[i,10]) | |
328 mr.repetitive_dna_id = str(mResult.iloc[i,11]) | |
329 mr.match_type = str(mResult.iloc[i,12]) | |
330 if (mr.match_type == ""): | |
331 mr.score = -1 | |
332 mr.contig_match_start = -1 | |
333 mr.contig_match_end = -1 | |
334 else: | |
335 mr.score = int(mResult.iloc[i,13]) | |
336 mr.contig_match_start = int(mResult.iloc[i,14]) | |
337 mr.contig_match_end = int(mResult.iloc[i,15]) | |
338 mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist()) | |
339 _mobsuite[mr.contig_id]=(mr) | |
340 return _mobsuite | |
341 | |
342 def ParseMobsuitePlasmids(pathToMobsuiteResult): | |
343 _mobsuite = {} | |
344 mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0) | |
345 mResults = mResults.replace(numpy.nan, '', regex=True) | |
346 | |
347 for i in range(len(mResults.index)): | |
348 mr = mobsuitePlasmids() | |
349 mr.file_id = str(mResults.iloc[i,0]) | |
350 mr.num_contigs = int(mResults.iloc[i,1]) | |
351 mr.total_length = int(mResults.iloc[i,2]) | |
352 mr.gc = int(mResults.iloc[i,3]) | |
353 mr.rep_types = str(mResults.iloc[i,4]) | |
354 mr.rep_typeAccession = str(mResults.iloc[i,5]) | |
355 mr.relaxase_type = str(mResults.iloc[i,6]) | |
356 mr.relaxase_type_accession = str(mResults.iloc[i,7]) | |
357 mr.mpf_type = str(mResults.iloc[i,8]) | |
358 mr.mpf_type_accession = str(mResults.iloc[i,9]) | |
359 mr.orit_type = str(mResults.iloc[i,10]) | |
360 mr.orit_accession = str(mResults.iloc[i,11]) | |
361 mr.PredictedMobility = str(mResults.iloc[i,12]) | |
362 mr.mash_nearest_neighbor = str(mResults.iloc[i,13]) | |
363 mr.mash_neighbor_distance = float(mResults.iloc[i,14]) | |
364 mr.mash_neighbor_cluster = int(mResults.iloc[i,15]) | |
365 mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist()) | |
366 _mobsuite[mr.file_id] = mr | |
367 return _mobsuite | |
368 | |
369 def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs): | |
370 _rFinder = {} | |
371 resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0) | |
372 resFinder = resFinder.replace(numpy.nan, '', regex=True) | |
373 | |
374 for i in range(len(resFinder.index)): | |
375 rf = starFinders() | |
376 rf.file = str(resFinder.iloc[i,0]) | |
377 rf.sequence = str(resFinder.iloc[i,1]) | |
378 rf.start = int(resFinder.iloc[i,2]) | |
379 rf.end = int(resFinder.iloc[i,3]) | |
380 rf.gene = str(resFinder.iloc[i,4]) | |
381 rf.shortGene = rf.gene | |
382 rf.coverage = str(resFinder.iloc[i,5]) | |
383 rf.coverage_map = str(resFinder.iloc[i,6]) | |
384 rf.gaps = str(resFinder.iloc[i,7]) | |
385 rf.pCoverage = float(resFinder.iloc[i,8]) | |
386 rf.pIdentity = float(resFinder.iloc[i,9]) | |
387 rf.database = str(resFinder.iloc[i,10]) | |
388 rf.accession = str(resFinder.iloc[i,11]) | |
389 rf.product = str(resFinder.iloc[i,12]) | |
390 rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist()) | |
391 if (rf.sequence[6:] in plasmidContigs): | |
392 rf.source = "plasmid" | |
393 elif (rf.sequence[6:] in likelyPlasmidContigs): | |
394 rf.source = "likely plasmid" | |
395 else: | |
396 rf.source = "likely chromosome" | |
397 _rFinder[rf.gene]=rf | |
398 return _rFinder | |
399 | |
400 def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs): | |
401 _rgiR = {} | |
402 RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0) | |
403 RGI = RGI.replace(numpy.nan, '', regex=True) | |
404 | |
405 for i in range(len(RGI.index)): | |
406 r = RGIResult() | |
407 r.ORF_ID = str(RGI.iloc[i,0]) | |
408 r.Contig = str(RGI.iloc[i,1]) | |
409 r.Contig_Num = r.Contig[6:r.Contig.find("_")] | |
410 r.Start = int(RGI.iloc[i,2]) | |
411 r.Stop = int(RGI.iloc[i,3]) | |
412 r.Orientation = str(RGI.iloc[i,4]) | |
413 r.Cut_Off = str(RGI.iloc[i,5]) | |
414 r.Pass_Bitscore = int(RGI.iloc[i,6]) | |
415 r.Best_Hit_Bitscore = float(RGI.iloc[i,7]) | |
416 r.Best_Hit_ARO = str(RGI.iloc[i,8]) | |
417 r.Best_Identities = float(RGI.iloc[i,9]) | |
418 r.ARO = int(RGI.iloc[i,10]) | |
419 r.Model_type = str(RGI.iloc[i,11]) | |
420 r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12]) | |
421 r.Other_SNPs = str(RGI.iloc[i,13]) | |
422 r.Drug_Class = str(RGI.iloc[i,14]) | |
423 r.Resistance_Mechanism = str(RGI.iloc[i,15]) | |
424 r.AMR_Gene_Family = str(RGI.iloc[i,16]) | |
425 r.Predicted_DNA = str(RGI.iloc[i,17]) | |
426 r.Predicted_Protein = str(RGI.iloc[i,18]) | |
427 r.CARD_Protein_Sequence = str(RGI.iloc[i,19]) | |
428 r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20]) | |
429 r.ID = str(RGI.iloc[i,21]) | |
430 r.Model_ID = int(RGI.iloc[i,22]) | |
431 r.row = "\t".join(str(x) for x in RGI.ix[i].tolist()) | |
432 if (r.Contig_Num in plasmidContigs): | |
433 r.source = "plasmid" | |
434 elif (r.Contig_Num in likelyPlasmidContigs): | |
435 r.source = "likely plasmid" | |
436 else: | |
437 r.source = "likely chromosome" | |
438 _rgiR[r.Model_ID]=r | |
439 return _rgiR | |
440 #endregion | |
441 | |
442 def Main(): | |
443 outputDir = "./" | |
444 notes = [] | |
445 #init the output list | |
446 output = [] | |
447 jsonOutput = [] | |
448 | |
449 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + options.id) | |
450 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + options.id) | |
451 | |
452 #region parse the mlst results | |
453 print("step 3: parsing mlst, plasmid, and amr results") | |
454 | |
455 print("identifying MLST") | |
456 mlstHit = ParseMLSTResult(mlst, str(mlstScheme))#*********************** | |
457 ToJson(mlstHit, "mlst.json") #write it to a json output | |
458 mlstHit = list(mlstHit.values())[0] | |
459 | |
460 #endregion | |
461 | |
462 #region parse mobsuite, resfinder and rgi results | |
463 print("identifying plasmid contigs and amr genes") | |
464 | |
465 plasmidContigs = [] | |
466 likelyPlasmidContigs = [] | |
467 origins = [] | |
468 | |
469 #parse mobsuite results | |
470 mSuite = ParseMobsuiteResult(mobfindercontig) #outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* | |
471 ToJson(mSuite, "mobsuite.json") #************* | |
472 mSuitePlasmids = ParseMobsuitePlasmids(mobfinderaggregate)#outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* | |
473 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #************* | |
474 | |
475 for key in mSuite: | |
476 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs: | |
477 if not (mSuite[key].rep_type == ''): | |
478 plasmidContigs.append(mSuite[key].contig_num) | |
479 else: | |
480 likelyPlasmidContigs.append(mSuite[key].contig_num) | |
481 for key in mSuite: | |
482 if mSuite[key].rep_type not in origins: | |
483 origins.append(mSuite[key].rep_type) | |
484 | |
485 #parse resfinder AMR results | |
486 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** | |
487 ToJson(rFinder, "resfinder.json") #************* | |
488 | |
489 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** | |
490 ToJson(rgiAMR, "rgi.json") #************* | |
491 | |
492 carbapenamases = [] | |
493 amrGenes = [] | |
494 for keys in rFinder: | |
495 carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")") | |
496 for keys in rgiAMR: | |
497 if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1): | |
498 if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases): | |
499 carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")") | |
500 else: | |
501 if (rgiAMR[keys].Best_Hit_ARO not in amrGenes): | |
502 amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")") | |
503 #endregion | |
504 | |
505 #region output parsed mlst information | |
506 print("formatting mlst outputs") | |
507 output.append("\n\n\n~~~~~~~MLST summary~~~~~~~") | |
508 output.append("MLST determined species: " + mlstHit.species) | |
509 output.append("\nMLST Details: ") | |
510 output.append(mlstHit.row) | |
511 | |
512 output.append("\nMLST information: ") | |
513 if (mlstHit.species == expectedSpecies): | |
514 output.append("MLST determined species is the same as expected species") | |
515 #notes.append("MLST determined species is the same as expected species") | |
516 else: | |
517 output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?") | |
518 notes.append("MLST: Not expected species. Possible contamination or mislabeling") | |
519 | |
520 #endregion | |
521 | |
522 #region output the parsed plasmid/amr results | |
523 output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n") | |
524 | |
525 output.append("predicted plasmid origins: ") | |
526 output.append(";".join(origins)) | |
527 | |
528 output.append("\ndefinitely plasmid contigs") | |
529 output.append(";".join(plasmidContigs)) | |
530 | |
531 output.append("\nlikely plasmid contigs") | |
532 output.append(";".join(likelyPlasmidContigs)) | |
533 | |
534 output.append("\nmob-suite prediction details: ") | |
535 for key in mSuite: | |
536 output.append(mSuite[key].row) | |
537 | |
538 output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n") | |
539 output.append("predicted carbapenamase Genes: ") | |
540 output.append(",".join(carbapenamases)) | |
541 output.append("other RGI AMR Genes: ") | |
542 for key in rgiAMR: | |
543 output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")") | |
544 | |
545 output.append("\nDetails about the carbapenamase Genes: ") | |
546 for key in rFinder: | |
547 output.append(rFinder[key].row) | |
548 output.append("\nDetails about the RGI AMR Genes: ") | |
549 for key in rgiAMR: | |
550 output.append(rgiAMR[key].row) | |
551 | |
552 #write summary to a file | |
553 summaryDir = outputDir + "/summary/" + ID | |
554 out = open("summary.txt", 'w') | |
555 for item in output: | |
556 out.write("%s\n" % item) | |
557 | |
558 | |
559 #TSV output | |
560 tsvOut = [] | |
561 tsvOut.append("ID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") | |
562 #start with ID | |
563 temp = "" | |
564 temp += (ID + "\t") | |
565 temp += expectedSpecies + "\t" | |
566 | |
567 #move into MLST | |
568 temp += mlstHit.species + "\t" | |
569 temp += str(mlstHit.seqType) + "\t" | |
570 temp += mlstHit.scheme + "\t" | |
571 | |
572 #now onto AMR genes | |
573 temp += ";".join(carbapenamases) + "\t" | |
574 temp += ";".join(amrGenes) + "\t" | |
575 | |
576 #lastly plasmids | |
577 temp+= str(len(mSuitePlasmids)) + "\t" | |
578 plasmidID = "" | |
579 contigs = "" | |
580 lengths = "" | |
581 rep_type = "" | |
582 mobility = "" | |
583 neighbour = "" | |
584 for keys in mSuitePlasmids: | |
585 plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";" | |
586 contigs += str(mSuitePlasmids[keys].num_contigs) + ";" | |
587 lengths += str(mSuitePlasmids[keys].total_length) + ";" | |
588 rep_type += str(mSuitePlasmids[keys].rep_types) + ";" | |
589 mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";" | |
590 neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";" | |
591 temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t" | |
592 temp += ";".join(plasmidContigs) + "\t" | |
593 temp += ";".join(likelyPlasmidContigs) | |
594 tsvOut.append(temp) | |
595 | |
596 summaryDir = outputDir + "/summary/" + ID | |
597 out = open("summary.tsv", 'w') | |
598 for item in tsvOut: | |
599 out.write("%s\n" % item) | |
600 #endregion | |
601 | |
602 | |
603 start = time.time()#time the analysis | |
604 print("Starting workflow...") | |
605 #analysis time | |
606 Main() | |
607 | |
608 end = time.time() | |
609 print("Finished!\nThe analysis used: " + str(end-start) + " seconds") |