1
|
1 #!/home/jjjjia/.conda/envs/py36/bin/python
|
|
2
|
|
3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
|
|
4 #$ -V # Pass environment variables to the job
|
|
5 #$ -N CPO_pipeline # Replace with a more specific job name
|
|
6 #$ -wd /home/jjjjia/testCases # Use the current working dir
|
|
7 #$ -pe smp 8 # Parallel Environment (how many cores)
|
|
8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
|
|
9 #$ -e ./logs/$JOB_ID.err
|
|
10 #$ -o ./logs/$JOB_ID.log
|
|
11 #$ -m ea
|
|
12 #$ -M bja20@sfu.ca
|
|
13
|
3
|
14 #./prediction.py -i ~/testCases/cpoResults/contigs/BC11-Kpn005_S2.fa -m ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.mlst -c ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/contig_report.txt -f ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.cp -r ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.rgi.txt -e "Klebsiella"
|
1
|
15 import subprocess
|
|
16 import pandas
|
|
17 import optparse
|
|
18 import os
|
|
19 import datetime
|
|
20 import sys
|
|
21 import time
|
|
22 import urllib.request
|
|
23 import gzip
|
|
24 import collections
|
|
25 import json
|
|
26 import numpy
|
|
27
|
|
28
|
6
|
29 debug = False #debug skips the shell scripts and also dump out a ton of debugging messages
|
1
|
30
|
|
31 if not debug:
|
|
32 #parses some parameters
|
|
33 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
|
|
34 #required
|
3
|
35 #MLSTHIT, mobsuite, resfinder, rgi, mlstscheme
|
1
|
36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate")
|
3
|
37 parser.add_option("-m", "--mlst", dest="mlst", type="string", help="absolute file path to mlst result")
|
|
38 parser.add_option("-c", "--mobfinderContig", dest="mobfinderContig", type="string", help="absolute path to mobfinder aggregate result")
|
|
39 parser.add_option("-f", "--mobfinderAggregate", dest="mobfinderAggregate", type="string", help="absolute path to mobfinder plasmid results")
|
|
40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results")
|
|
41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results")
|
1
|
42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
|
3
|
43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
|
|
44 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ")
|
1
|
45
|
|
46 #parallelization, useless, these are hard coded to 8cores/64G RAM
|
|
47 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
|
|
48 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
|
|
49
|
|
50 (options,args) = parser.parse_args()
|
|
51 #if len(args) != 8:
|
|
52 #parser.error("incorrect number of arguments, all 7 is required")
|
|
53 curDir = os.getcwd()
|
3
|
54 ID = str(options.id).lstrip().rstrip()
|
|
55 mlst = str(options.mlst).lstrip().rstrip()
|
|
56 mobfindercontig = str(options.mobfinderContig).lstrip().rstrip()
|
|
57 mobfinderaggregate = str(options.mobfinderAggregate).lstrip().rstrip()
|
|
58 abricate = str(options.abricate).lstrip().rstrip()
|
|
59 rgi = str(options.rgi).lstrip().rstrip()
|
|
60 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
|
|
61 mlstScheme = str(options.mlstScheme).lstrip().rstrip()
|
|
62 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip()
|
|
63 outputDir = "./"
|
|
64 print(mlst)
|
|
65 print(mobfindercontig)
|
|
66 print(mobfinderaggregate)
|
|
67 print(abricate)
|
|
68 print(rgi)
|
|
69 print(expectedSpecies)
|
|
70 print(mlstScheme)
|
1
|
71 else:
|
|
72 curDir = os.getcwd()
|
3
|
73 ID = "BC11"
|
|
74 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst"
|
|
75 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt"
|
|
76 mobfinderaggregate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\mobtyper_aggregate_report.txt"
|
|
77 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp"
|
|
78 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt"
|
1
|
79 expectedSpecies = "Escherichia coli"
|
3
|
80 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
|
|
81 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins"
|
|
82 outputDir = "./"
|
1
|
83
|
|
84 #region result objects
|
|
85 #define some objects to store values from results
|
|
86 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
|
|
87 class starFinders(object):
|
|
88 def __init__(self):
|
|
89 self.file = ""
|
|
90 self.sequence = ""
|
|
91 self.start = 0
|
|
92 self.end = 0
|
|
93 self.gene = ""
|
|
94 self.shortGene = ""
|
|
95 self.coverage = ""
|
|
96 self.coverage_map = ""
|
|
97 self.gaps = ""
|
|
98 self.pCoverage = 100.00
|
|
99 self.pIdentity = 100.00
|
|
100 self.database = ""
|
|
101 self.accession = ""
|
|
102 self.product = ""
|
|
103 self.source = "chromosome"
|
|
104 self.row = ""
|
|
105
|
|
106 class PlasFlowResult(object):
|
|
107 def __init__(self):
|
|
108 self.sequence = ""
|
|
109 self.length = 0
|
|
110 self.label = ""
|
|
111 self.confidence = 0
|
|
112 self.usefulRow = ""
|
|
113 self.row = ""
|
|
114
|
|
115 class MlstResult(object):
|
|
116 def __init__(self):
|
|
117 self.file = ""
|
|
118 self.speciesID = ""
|
|
119 self.seqType = 0
|
|
120 self.scheme = ""
|
|
121 self.species = ""
|
|
122 self.row=""
|
|
123
|
|
124 class mobsuiteResult(object):
|
|
125 def __init__(self):
|
|
126 self.file_id = ""
|
|
127 self.cluster_id = ""
|
|
128 self.contig_id = ""
|
|
129 self.contig_num = 0
|
|
130 self.contig_length = 0
|
|
131 self.circularity_status = ""
|
|
132 self.rep_type = ""
|
|
133 self.rep_type_accession = ""
|
|
134 self.relaxase_type = ""
|
|
135 self.relaxase_type_accession = ""
|
|
136 self.mash_nearest_neighbor = ""
|
|
137 self.mash_neighbor_distance = 0.00
|
|
138 self.repetitive_dna_id = ""
|
|
139 self.match_type = ""
|
|
140 self.score = 0
|
|
141 self.contig_match_start = 0
|
|
142 self.contig_match_end = 0
|
|
143 self.row = ""
|
|
144
|
|
145 class mobsuitePlasmids(object):
|
|
146 def __init__(self):
|
|
147 self.file_id = ""
|
|
148 self.num_contigs = 0
|
|
149 self.total_length = 0
|
|
150 self.gc = ""
|
|
151 self.rep_types = ""
|
|
152 self.rep_typeAccession = ""
|
|
153 self.relaxase_type= ""
|
|
154 self.relaxase_type_accession = ""
|
|
155 self.mpf_type = ""
|
|
156 self.mpf_type_accession= ""
|
|
157 self.orit_type = ""
|
|
158 self.orit_accession = ""
|
|
159 self.PredictedMobility = ""
|
|
160 self.mash_nearest_neighbor = ""
|
|
161 self.mash_neighbor_distance = 0.00
|
|
162 self.mash_neighbor_cluster= 0
|
|
163 self.row = ""
|
3
|
164
|
1
|
165 class RGIResult(object):
|
|
166 def __init__(self):
|
|
167 self.ORF_ID = ""
|
|
168 self.Contig = ""
|
|
169 self.Start = -1
|
|
170 self.Stop = -1
|
|
171 self.Orientation = ""
|
|
172 self.Cut_Off = ""
|
|
173 self.Pass_Bitscore = 100000
|
|
174 self.Best_Hit_Bitscore = 0.00
|
|
175 self.Best_Hit_ARO = ""
|
|
176 self.Best_Identities = 0.00
|
|
177 self.ARO = 0
|
|
178 self.Model_type = ""
|
|
179 self.SNPs_in_Best_Hit_ARO = ""
|
|
180 self.Other_SNPs = ""
|
|
181 self.Drug_Class = ""
|
|
182 self.Resistance_Mechanism = ""
|
|
183 self.AMR_Gene_Family = ""
|
|
184 self.Predicted_DNA = ""
|
|
185 self.Predicted_Protein = ""
|
|
186 self.CARD_Protein_Sequence = ""
|
|
187 self.Percentage_Length_of_Reference_Sequence = 0.00
|
|
188 self.ID = ""
|
|
189 self.Model_ID = 0
|
|
190 self.source = ""
|
|
191 self.row = ""
|
|
192
|
|
193 #endregion
|
|
194
|
|
195 #region useful functions
|
|
196 def read(path):
|
|
197 return [line.rstrip('\n') for line in open(path)]
|
|
198 def execute(command):
|
|
199 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
|
200
|
|
201 # Poll process for new output until finished
|
|
202 while True:
|
|
203 nextline = process.stdout.readline()
|
|
204 if nextline == '' and process.poll() is not None:
|
|
205 break
|
|
206 sys.stdout.write(nextline)
|
|
207 sys.stdout.flush()
|
|
208
|
|
209 output = process.communicate()[0]
|
|
210 exitCode = process.returncode
|
|
211
|
|
212 if (exitCode == 0):
|
|
213 return output
|
|
214 else:
|
|
215 raise subprocess.CalledProcessError(exitCode, command)
|
|
216 def httpGetFile(url, filepath=""):
|
|
217 if (filepath == ""):
|
|
218 return urllib.request.urlretrieve(url)
|
|
219 else:
|
|
220 urllib.request.urlretrieve(url, filepath)
|
|
221 return True
|
|
222 def gunzip(inputpath="", outputpath=""):
|
|
223 if (outputpath == ""):
|
|
224 with gzip.open(inputpath, 'rb') as f:
|
|
225 gzContent = f.read()
|
|
226 return gzContent
|
|
227 else:
|
|
228 with gzip.open(inputpath, 'rb') as f:
|
|
229 gzContent = f.read()
|
|
230 with open(outputpath, 'wb') as out:
|
|
231 out.write(gzContent)
|
|
232 return True
|
|
233 def ToJson(dictObject, outputPath):
|
3
|
234 #outDir = outputDir + '/summary/' + ID + ".json/"
|
|
235 #if not (os.path.exists(outDir)):
|
|
236 #os.makedirs(outDir)
|
|
237 #with open(outputPath, 'w') as f:
|
|
238 #json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
|
|
239 return ""
|
1
|
240 #endregion
|
|
241
|
|
242 #region functions to parse result files
|
3
|
243 def ParseMLSTResult(pathToMLSTResult, scheme):
|
1
|
244 _mlstResult = {}
|
3
|
245 scheme = pandas.read_csv(scheme, delimiter='\t', header=0)
|
1
|
246 scheme = scheme.replace(numpy.nan, '', regex=True)
|
|
247
|
|
248 taxon = {}
|
|
249 #record the scheme as a dictionary
|
|
250 taxon["-"] = "No MLST Match"
|
|
251 for i in range(len(scheme.index)):
|
|
252 key = scheme.iloc[i,0]
|
|
253 if (str(scheme.iloc[i,2]) == "nan"):
|
|
254 value = str(scheme.iloc[i,1])
|
|
255 else:
|
|
256 value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2])
|
|
257
|
|
258 if (key in taxon.keys()):
|
|
259 taxon[key] = taxon.get(key) + ";" + value
|
|
260 else:
|
|
261 taxon[key] = value
|
|
262 #read in the mlst result
|
|
263 mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None)
|
|
264 _mlstHit = MlstResult()
|
|
265
|
|
266 _mlstHit.file = mlst.iloc[0,0]
|
|
267 _mlstHit.speciesID = (mlst.iloc[0,1])
|
|
268 _mlstHit.seqType = str(mlst.iloc[0,2])
|
|
269 for i in range(3, len(mlst.columns)):
|
|
270 _mlstHit.scheme += mlst.iloc[0,i] + ";"
|
|
271 _mlstHit.species = taxon[_mlstHit.speciesID]
|
|
272 _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist())
|
|
273 _mlstResult[_mlstHit.speciesID]=_mlstHit
|
|
274
|
|
275 return _mlstResult
|
|
276
|
|
277 def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
|
|
278 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918
|
|
279 #example resfinder:
|
|
280 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181
|
|
281
|
|
282 _pFinder = {} #***********************
|
|
283 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
|
|
284 plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True)
|
|
285
|
|
286
|
|
287 for i in range(len(plasmidFinder.index)):
|
|
288 pf = starFinders()
|
|
289 pf.file = str(plasmidFinder.iloc[i,0])
|
|
290 pf.sequence = str(plasmidFinder.iloc[i,1])
|
|
291 pf.start = int(plasmidFinder.iloc[i,2])
|
|
292 pf.end = int(plasmidFinder.iloc[i,3])
|
|
293 pf.gene = str(plasmidFinder.iloc[i,4])
|
|
294 pf.shortGene = pf.gene[:pf.gene.index("_")]
|
|
295 pf.coverage = str(plasmidFinder.iloc[i,5])
|
|
296 pf.coverage_map = str(plasmidFinder.iloc[i,6])
|
|
297 pf.gaps = str(plasmidFinder.iloc[i,7])
|
|
298 pf.pCoverage = float(plasmidFinder.iloc[i,8])
|
|
299 pf.pIdentity = float(plasmidFinder.iloc[i,9])
|
|
300 pf.database = str(plasmidFinder.iloc[i,10])
|
|
301 pf.accession = str(plasmidFinder.iloc[i,11])
|
|
302 pf.product = str(plasmidFinder.iloc[i,12])
|
|
303 pf.source = "plasmid"
|
|
304 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
|
|
305 _pFinder[pf.gene]=pf
|
|
306 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
|
|
307 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
|
|
308 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
|
|
309 return _pFinder
|
|
310
|
|
311 def ParseMobsuiteResult(pathToMobsuiteResult):
|
|
312 _mobsuite = {}
|
|
313 mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
|
|
314 mResult = mResult.replace(numpy.nan, '', regex=True)
|
|
315
|
|
316 for i in range(len(mResult.index)):
|
|
317 mr = mobsuiteResult()
|
|
318 mr.file_id = str(mResult.iloc[i,0])
|
|
319 mr.cluster_id = str(mResult.iloc[i,1])
|
|
320 if (mr.cluster_id == "chromosome"):
|
|
321 break
|
|
322 mr.contig_id = str(mResult.iloc[i,2])
|
|
323 mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")]
|
|
324 mr.contig_length = int(mResult.iloc[i,3])
|
|
325 mr.circularity_status = str(mResult.iloc[i,4])
|
|
326 mr.rep_type = str(mResult.iloc[i,5])
|
|
327 mr.rep_type_accession = str(mResult.iloc[i,6])
|
|
328 mr.relaxase_type = str(mResult.iloc[i,7])
|
|
329 mr.relaxase_type_accession = str(mResult.iloc[i,8])
|
|
330 mr.mash_nearest_neighbor = str(mResult.iloc[i,9])
|
|
331 mr.mash_neighbor_distance = float(mResult.iloc[i,10])
|
|
332 mr.repetitive_dna_id = str(mResult.iloc[i,11])
|
|
333 mr.match_type = str(mResult.iloc[i,12])
|
|
334 if (mr.match_type == ""):
|
|
335 mr.score = -1
|
|
336 mr.contig_match_start = -1
|
|
337 mr.contig_match_end = -1
|
|
338 else:
|
|
339 mr.score = int(mResult.iloc[i,13])
|
|
340 mr.contig_match_start = int(mResult.iloc[i,14])
|
|
341 mr.contig_match_end = int(mResult.iloc[i,15])
|
|
342 mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist())
|
|
343 _mobsuite[mr.contig_id]=(mr)
|
|
344 return _mobsuite
|
|
345
|
|
346 def ParseMobsuitePlasmids(pathToMobsuiteResult):
|
|
347 _mobsuite = {}
|
|
348 mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
|
|
349 mResults = mResults.replace(numpy.nan, '', regex=True)
|
|
350
|
|
351 for i in range(len(mResults.index)):
|
|
352 mr = mobsuitePlasmids()
|
|
353 mr.file_id = str(mResults.iloc[i,0])
|
|
354 mr.num_contigs = int(mResults.iloc[i,1])
|
|
355 mr.total_length = int(mResults.iloc[i,2])
|
|
356 mr.gc = int(mResults.iloc[i,3])
|
|
357 mr.rep_types = str(mResults.iloc[i,4])
|
|
358 mr.rep_typeAccession = str(mResults.iloc[i,5])
|
|
359 mr.relaxase_type = str(mResults.iloc[i,6])
|
|
360 mr.relaxase_type_accession = str(mResults.iloc[i,7])
|
|
361 mr.mpf_type = str(mResults.iloc[i,8])
|
|
362 mr.mpf_type_accession = str(mResults.iloc[i,9])
|
|
363 mr.orit_type = str(mResults.iloc[i,10])
|
|
364 mr.orit_accession = str(mResults.iloc[i,11])
|
|
365 mr.PredictedMobility = str(mResults.iloc[i,12])
|
|
366 mr.mash_nearest_neighbor = str(mResults.iloc[i,13])
|
|
367 mr.mash_neighbor_distance = float(mResults.iloc[i,14])
|
|
368 mr.mash_neighbor_cluster = int(mResults.iloc[i,15])
|
|
369 mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist())
|
|
370 _mobsuite[mr.file_id] = mr
|
|
371 return _mobsuite
|
|
372
|
|
373 def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs):
|
|
374 _rFinder = {}
|
|
375 resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0)
|
|
376 resFinder = resFinder.replace(numpy.nan, '', regex=True)
|
|
377
|
|
378 for i in range(len(resFinder.index)):
|
|
379 rf = starFinders()
|
|
380 rf.file = str(resFinder.iloc[i,0])
|
|
381 rf.sequence = str(resFinder.iloc[i,1])
|
|
382 rf.start = int(resFinder.iloc[i,2])
|
|
383 rf.end = int(resFinder.iloc[i,3])
|
|
384 rf.gene = str(resFinder.iloc[i,4])
|
|
385 rf.shortGene = rf.gene
|
|
386 rf.coverage = str(resFinder.iloc[i,5])
|
|
387 rf.coverage_map = str(resFinder.iloc[i,6])
|
|
388 rf.gaps = str(resFinder.iloc[i,7])
|
|
389 rf.pCoverage = float(resFinder.iloc[i,8])
|
|
390 rf.pIdentity = float(resFinder.iloc[i,9])
|
|
391 rf.database = str(resFinder.iloc[i,10])
|
|
392 rf.accession = str(resFinder.iloc[i,11])
|
|
393 rf.product = str(resFinder.iloc[i,12])
|
|
394 rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist())
|
|
395 if (rf.sequence[6:] in plasmidContigs):
|
|
396 rf.source = "plasmid"
|
|
397 elif (rf.sequence[6:] in likelyPlasmidContigs):
|
|
398 rf.source = "likely plasmid"
|
|
399 else:
|
|
400 rf.source = "likely chromosome"
|
|
401 _rFinder[rf.gene]=rf
|
|
402 return _rFinder
|
|
403
|
|
404 def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs):
|
|
405 _rgiR = {}
|
|
406 RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0)
|
|
407 RGI = RGI.replace(numpy.nan, '', regex=True)
|
|
408
|
|
409 for i in range(len(RGI.index)):
|
|
410 r = RGIResult()
|
|
411 r.ORF_ID = str(RGI.iloc[i,0])
|
|
412 r.Contig = str(RGI.iloc[i,1])
|
|
413 r.Contig_Num = r.Contig[6:r.Contig.find("_")]
|
|
414 r.Start = int(RGI.iloc[i,2])
|
|
415 r.Stop = int(RGI.iloc[i,3])
|
|
416 r.Orientation = str(RGI.iloc[i,4])
|
|
417 r.Cut_Off = str(RGI.iloc[i,5])
|
|
418 r.Pass_Bitscore = int(RGI.iloc[i,6])
|
|
419 r.Best_Hit_Bitscore = float(RGI.iloc[i,7])
|
|
420 r.Best_Hit_ARO = str(RGI.iloc[i,8])
|
|
421 r.Best_Identities = float(RGI.iloc[i,9])
|
|
422 r.ARO = int(RGI.iloc[i,10])
|
|
423 r.Model_type = str(RGI.iloc[i,11])
|
|
424 r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12])
|
|
425 r.Other_SNPs = str(RGI.iloc[i,13])
|
|
426 r.Drug_Class = str(RGI.iloc[i,14])
|
|
427 r.Resistance_Mechanism = str(RGI.iloc[i,15])
|
|
428 r.AMR_Gene_Family = str(RGI.iloc[i,16])
|
|
429 r.Predicted_DNA = str(RGI.iloc[i,17])
|
|
430 r.Predicted_Protein = str(RGI.iloc[i,18])
|
|
431 r.CARD_Protein_Sequence = str(RGI.iloc[i,19])
|
|
432 r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20])
|
|
433 r.ID = str(RGI.iloc[i,21])
|
|
434 r.Model_ID = int(RGI.iloc[i,22])
|
|
435 r.row = "\t".join(str(x) for x in RGI.ix[i].tolist())
|
|
436 if (r.Contig_Num in plasmidContigs):
|
|
437 r.source = "plasmid"
|
|
438 elif (r.Contig_Num in likelyPlasmidContigs):
|
|
439 r.source = "likely plasmid"
|
|
440 else:
|
|
441 r.source = "likely chromosome"
|
|
442 _rgiR[r.Model_ID]=r
|
|
443 return _rgiR
|
3
|
444
|
|
445 def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
|
|
446 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918
|
|
447 #example resfinder:
|
|
448 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181
|
|
449
|
|
450 _pFinder = {} #***********************
|
|
451 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
|
|
452
|
|
453 for i in range(len(plasmidFinder.index)):
|
|
454 pf = starFinders()
|
|
455 pf.file = str(plasmidFinder.iloc[i,0])
|
|
456 pf.sequence = str(plasmidFinder.iloc[i,1])
|
|
457 pf.start = int(plasmidFinder.iloc[i,2])
|
|
458 pf.end = int(plasmidFinder.iloc[i,3])
|
|
459 pf.gene = str(plasmidFinder.iloc[i,4])
|
|
460 pf.shortGene = pf.gene[:pf.gene.index("_")]
|
|
461 pf.coverage = str(plasmidFinder.iloc[i,5])
|
|
462 pf.coverage_map = str(plasmidFinder.iloc[i,6])
|
|
463 pf.gaps = str(plasmidFinder.iloc[i,7])
|
|
464 pf.pCoverage = float(plasmidFinder.iloc[i,8])
|
|
465 pf.pIdentity = float(plasmidFinder.iloc[i,9])
|
|
466 pf.database = str(plasmidFinder.iloc[i,10])
|
|
467 pf.accession = str(plasmidFinder.iloc[i,11])
|
|
468 pf.product = str(plasmidFinder.iloc[i,12])
|
|
469 pf.source = "plasmid"
|
|
470 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
|
|
471 _pFinder[pf.gene]=pf
|
|
472 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
|
|
473 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
|
|
474 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
|
|
475 return _pFinder
|
1
|
476 #endregion
|
|
477
|
|
478 def Main():
|
3
|
479 outputDir = "./"
|
1
|
480 notes = []
|
|
481 #init the output list
|
|
482 output = []
|
|
483 jsonOutput = []
|
|
484
|
3
|
485 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID)
|
|
486 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID)
|
1
|
487
|
|
488 #region parse the mlst results
|
|
489 print("step 3: parsing mlst, plasmid, and amr results")
|
|
490
|
|
491 print("identifying MLST")
|
3
|
492 mlstHit = ParseMLSTResult(mlst, str(mlstScheme))#***********************
|
1
|
493 ToJson(mlstHit, "mlst.json") #write it to a json output
|
|
494 mlstHit = list(mlstHit.values())[0]
|
|
495
|
|
496 #endregion
|
|
497
|
|
498 #region parse mobsuite, resfinder and rgi results
|
|
499 print("identifying plasmid contigs and amr genes")
|
|
500
|
|
501 plasmidContigs = []
|
|
502 likelyPlasmidContigs = []
|
|
503 origins = []
|
|
504
|
|
505 #parse mobsuite results
|
3
|
506 mSuite = ParseMobsuiteResult(mobfindercontig) #outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
|
1
|
507 ToJson(mSuite, "mobsuite.json") #*************
|
3
|
508 mSuitePlasmids = ParseMobsuitePlasmids(mobfinderaggregate)#outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
|
1
|
509 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #*************
|
|
510
|
|
511 for key in mSuite:
|
|
512 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs:
|
|
513 if not (mSuite[key].rep_type == ''):
|
|
514 plasmidContigs.append(mSuite[key].contig_num)
|
|
515 else:
|
|
516 likelyPlasmidContigs.append(mSuite[key].contig_num)
|
|
517 for key in mSuite:
|
|
518 if mSuite[key].rep_type not in origins:
|
|
519 origins.append(mSuite[key].rep_type)
|
|
520
|
|
521 #parse resfinder AMR results
|
3
|
522 pFinder = ParsePlasmidFinderResult(plasmidfinder)
|
|
523 ToJson(pFinder, "origins.json")
|
|
524
|
|
525 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
|
1
|
526 ToJson(rFinder, "resfinder.json") #*************
|
|
527
|
3
|
528 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
|
1
|
529 ToJson(rgiAMR, "rgi.json") #*************
|
|
530
|
|
531 carbapenamases = []
|
|
532 amrGenes = []
|
|
533 for keys in rFinder:
|
|
534 carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")")
|
|
535 for keys in rgiAMR:
|
|
536 if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1):
|
|
537 if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases):
|
|
538 carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
|
|
539 else:
|
|
540 if (rgiAMR[keys].Best_Hit_ARO not in amrGenes):
|
|
541 amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
|
|
542 #endregion
|
|
543
|
|
544 #region output parsed mlst information
|
|
545 print("formatting mlst outputs")
|
|
546 output.append("\n\n\n~~~~~~~MLST summary~~~~~~~")
|
|
547 output.append("MLST determined species: " + mlstHit.species)
|
|
548 output.append("\nMLST Details: ")
|
|
549 output.append(mlstHit.row)
|
|
550
|
|
551 output.append("\nMLST information: ")
|
|
552 if (mlstHit.species == expectedSpecies):
|
|
553 output.append("MLST determined species is the same as expected species")
|
|
554 #notes.append("MLST determined species is the same as expected species")
|
|
555 else:
|
|
556 output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?")
|
|
557 notes.append("MLST: Not expected species. Possible contamination or mislabeling")
|
|
558
|
|
559 #endregion
|
|
560
|
|
561 #region output the parsed plasmid/amr results
|
|
562 output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n")
|
|
563
|
|
564 output.append("predicted plasmid origins: ")
|
|
565 output.append(";".join(origins))
|
|
566
|
|
567 output.append("\ndefinitely plasmid contigs")
|
|
568 output.append(";".join(plasmidContigs))
|
|
569
|
|
570 output.append("\nlikely plasmid contigs")
|
|
571 output.append(";".join(likelyPlasmidContigs))
|
|
572
|
|
573 output.append("\nmob-suite prediction details: ")
|
|
574 for key in mSuite:
|
|
575 output.append(mSuite[key].row)
|
|
576
|
|
577 output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n")
|
|
578 output.append("predicted carbapenamase Genes: ")
|
|
579 output.append(",".join(carbapenamases))
|
|
580 output.append("other RGI AMR Genes: ")
|
|
581 for key in rgiAMR:
|
|
582 output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")")
|
|
583
|
|
584 output.append("\nDetails about the carbapenamase Genes: ")
|
|
585 for key in rFinder:
|
|
586 output.append(rFinder[key].row)
|
|
587 output.append("\nDetails about the RGI AMR Genes: ")
|
|
588 for key in rgiAMR:
|
|
589 output.append(rgiAMR[key].row)
|
|
590
|
|
591 #write summary to a file
|
|
592 summaryDir = outputDir + "/summary/" + ID
|
3
|
593 out = open("summary.txt", 'w')
|
1
|
594 for item in output:
|
|
595 out.write("%s\n" % item)
|
|
596
|
|
597
|
|
598 #TSV output
|
6
|
599 lindaOut = []
|
1
|
600 tsvOut = []
|
6
|
601 lindaOut.append("new\tID\tQUALITY\tExpected Species\tMLST Scheme\tSequence Type\tMLST_ALLELE_1\tMLST_ALLELE_2\tMLST_ALLELE_3\tMLST_ALLELE_4\tMLST_ALLELE_5\tMLST_ALLELE_6\tMLST_ALLELE_7\tSEROTYPE\tK_CAPSULE\tPLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH\tPLASMID_1_CARBAPENEMASE\tPLASMID_1_INC_GROUP\tPLASMID_2_RFLP\tPLASMID_2_FAMILY\tPLASMID_2_BEST_MATCH\tPLASMID_2_COVERAGE\tPLASMID_2_SNVS_TO_BEST_MATCH\tPLASMID_2_CARBAPENEMASE\tPLASMID_2_INC_GROUP")
|
|
602
|
|
603 tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
|
1
|
604 #start with ID
|
6
|
605 temp = "\t"
|
1
|
606 temp += (ID + "\t")
|
|
607 temp += expectedSpecies + "\t"
|
|
608
|
|
609 #move into MLST
|
|
610 temp += mlstHit.species + "\t"
|
|
611 temp += str(mlstHit.seqType) + "\t"
|
|
612 temp += mlstHit.scheme + "\t"
|
|
613
|
|
614 #now onto AMR genes
|
|
615 temp += ";".join(carbapenamases) + "\t"
|
|
616 temp += ";".join(amrGenes) + "\t"
|
|
617
|
|
618 #lastly plasmids
|
|
619 temp+= str(len(mSuitePlasmids)) + "\t"
|
|
620 plasmidID = ""
|
|
621 contigs = ""
|
|
622 lengths = ""
|
|
623 rep_type = ""
|
|
624 mobility = ""
|
|
625 neighbour = ""
|
|
626 for keys in mSuitePlasmids:
|
|
627 plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";"
|
|
628 contigs += str(mSuitePlasmids[keys].num_contigs) + ";"
|
|
629 lengths += str(mSuitePlasmids[keys].total_length) + ";"
|
|
630 rep_type += str(mSuitePlasmids[keys].rep_types) + ";"
|
|
631 mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";"
|
|
632 neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";"
|
|
633 temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t"
|
|
634 temp += ";".join(plasmidContigs) + "\t"
|
|
635 temp += ";".join(likelyPlasmidContigs)
|
|
636 tsvOut.append(temp)
|
|
637
|
|
638 summaryDir = outputDir + "/summary/" + ID
|
3
|
639 out = open("summary.tsv", 'w')
|
1
|
640 for item in tsvOut:
|
|
641 out.write("%s\n" % item)
|
|
642 #endregion
|
|
643
|
|
644
|
|
645 start = time.time()#time the analysis
|
|
646 print("Starting workflow...")
|
|
647 #analysis time
|
|
648 Main()
|
|
649
|
|
650 end = time.time()
|
5
|
651 print("Finished!\nThe analysis used: " + str(end-start) + " seconds") |