comparison resfinder/cge/resfinder.py @ 0:55051a9bc58d draft default tip

Uploaded
author dcouvin
date Mon, 10 Jan 2022 20:06:07 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:55051a9bc58d
1 #!/usr/bin/env python3
2 from __future__ import division
3 import sys
4 import os
5 import time
6 import random
7 import re
8 from argparse import ArgumentParser
9 from tabulate import tabulate
10 import collections
11
12 from cgecore.blaster import Blaster
13 from cgecore.cgefinder import CGEFinder
14 from .output.table import TableResults
15
16
17 class ResFinder(CGEFinder):
18
19 def __init__(self, db_conf_file, notes, db_path, db_path_kma=None,
20 databases=None):
21 """
22 """
23 self.db_path = db_path
24
25 if(db_path_kma is None):
26 self.db_path_kma = db_path
27 else:
28 self.db_path_kma = db_path_kma
29
30 self.configured_dbs = dict()
31 self.kma_db_files = None
32 self.load_db_config(db_conf_file=db_conf_file)
33
34 self.databases = []
35 self.load_databases(databases=databases)
36
37 self.phenos = dict()
38 self.load_notes(notes=notes)
39
40 self.blast_results = None
41 # self.kma_results = None
42 # self.results = None
43
44 @staticmethod
45 def old_results_to_standard_output(result, software, version, run_date,
46 run_cmd, id, mutation=False,
47 tableresults=None):
48 """
49 """
50 std_results = TableResults(software, version, run_date, run_cmd, id)
51 headers = [
52 "template_name",
53 "template_length",
54 "template_start_pos",
55 "template_end_pos",
56 "aln_length",
57 "aln_identity",
58 "aln_gaps",
59 "aln_template_string",
60 "aln_query_string",
61 "aln_homology_string",
62 "template_variant",
63 "acc_no",
64 "query_id",
65 "query_start_pos",
66 "query_end_pos",
67 "query_depth",
68 "blast_eval",
69 "blast_bitscore",
70 "pval",
71 "software_score"
72 ]
73
74 for db_name, db in result.items():
75 if(db_name == "excluded"):
76 continue
77
78 if(db == "No hit found"):
79 continue
80
81 std_results.add_table(db_name)
82 std_db = std_results.long[db_name]
83 std_db.add_headers(headers)
84 std_db.lock_headers = True
85
86 for unique_id, hit_db in db.items():
87 if(unique_id in result["excluded"]):
88 continue
89 # TODO: unique_id == unique_db_id
90 sbjct = hit_db["sbjct_header"].split("_")
91 template = sbjct[0]
92 variant = sbjct[1]
93 acc = "_".join(sbjct[2:])
94 unique_db_id = ("{}_{}".format(template, acc))
95 std_db[unique_db_id] = {
96 "template_name": template,
97 "template_variant": variant,
98 "acc_no": acc,
99 "template_length": hit_db["sbjct_length"],
100 "template_start_pos": hit_db["sbjct_start"],
101 "template_end_pos": hit_db["sbjct_end"],
102 "aln_length": hit_db["HSP_length"],
103 "aln_identity": hit_db["perc_ident"],
104 "aln_gaps": hit_db["gaps"],
105 "aln_template_string": hit_db["sbjct_string"],
106 "aln_query_string": hit_db["query_string"],
107 "aln_homology_string": hit_db["homo_string"],
108 "query_id": hit_db["contig_name"],
109 "query_start_pos": hit_db["query_start"],
110 "query_end_pos": hit_db["query_end"],
111 "query_depth": hit_db.get("query_depth", "NA"),
112 "blast_eval": hit_db.get("evalue", "NA"),
113 "blast_bitscore": hit_db.get("bit", "NA"),
114 "pval": hit_db.get("p_value", "NA"),
115 "software_score": hit_db["cal_score"]
116 }
117
118 return std_results
119
120 def write_results(self, out_path, result, res_type, software="ResFinder"):
121 """
122 """
123 if(res_type == ResFinder.TYPE_BLAST):
124 result_str = self.results_to_str(
125 res_type=res_type, results=result.results,
126 query_align=result.gene_align_query,
127 homo_align=result.gene_align_homo,
128 sbjct_align=result.gene_align_sbjct)
129 elif(res_type == ResFinder.TYPE_KMA):
130 result_str = self.results_to_str(res_type=res_type,
131 results=result)
132
133 with open(out_path + "/{}_results_tab.txt".format(software), "w") as fh:
134 fh.write(result_str[0])
135 with open(out_path + "/{}_results_table.txt".format(software), "w") as fh:
136 fh.write(result_str[1])
137 with open(out_path + "/{}_results.txt".format(software), "w") as fh:
138 fh.write(result_str[2])
139 with open(out_path + "/{}_Resistance_gene_seq.fsa".format(software), "w") as fh:
140 fh.write(result_str[3])
141 with open(out_path + "/{}_Hit_in_genome_seq.fsa".format(software), "w") as fh:
142 fh.write(result_str[4])
143
144 def blast(self, inputfile, out_path, min_cov=0.9, threshold=0.6,
145 blast="blastn", allowed_overlap=0):
146 """
147 """
148 blast_run = Blaster(inputfile=inputfile, databases=self.databases,
149 db_path=self.db_path, out_path=out_path,
150 min_cov=min_cov, threshold=threshold, blast=blast,
151 allowed_overlap=allowed_overlap)
152 self.blast_results = blast_run.results
153 return blast_run
154
155 def results_to_str(self, res_type, results, query_align=None,
156 homo_align=None, sbjct_align=None):
157
158 if(res_type != ResFinder.TYPE_BLAST
159 and res_type != ResFinder.TYPE_KMA):
160 sys.exit("resfinder.py error: result method was not provided or "
161 "not recognized.")
162
163 # Write the header for the tab file
164 tab_str = ("Resistance gene\tIdentity\tAlignment Length/Gene Length\t"
165 "Coverage\tPosition in reference\tContig\t"
166 "Position in contig\tPhenotype\tAccession no.\n")
167
168 table_str = ""
169 txt_str = ""
170 ref_str = ""
171 hit_str = ""
172
173 # Getting and writing out the results
174 titles = dict()
175 rows = dict()
176 headers = dict()
177 txt_file_seq_text = dict()
178 split_print = collections.defaultdict(list)
179
180 for db in results:
181 if(db == "excluded"):
182 continue
183
184 # Clean up dbs with only excluded hits
185 no_hits = True
186 for hit in results[db]:
187 if(hit not in results["excluded"]):
188 no_hits = False
189 break
190 if(no_hits):
191 results[db] = "No hit found"
192
193 profile = str(self.configured_dbs[db][0])
194 if results[db] == "No hit found":
195 table_str += ("%s\n%s\n\n" % (profile, results[db]))
196 else:
197 titles[db] = "%s" % (profile)
198 headers[db] = ["Resistance gene", "Identity",
199 "Alignment Length/Gene Length", "Coverage",
200 "Position in reference", "Contig",
201 "Position in contig", "Phenotype",
202 "Accession no."]
203 table_str += ("%s\n" % (profile))
204 table_str += ("Resistance gene\tIdentity\t"
205 "Alignment Length/Gene Length\tCoverage\t"
206 "Position in reference\tContig\t"
207 "Position in contig\tPhenotype\t"
208 "Accession no.\n")
209
210 rows[db] = list()
211 txt_file_seq_text[db] = list()
212
213 for hit in results[db]:
214 if(hit in results["excluded"]):
215 continue
216
217 res_header = results[db][hit]["sbjct_header"]
218 tmp = res_header.split("_")
219 gene = tmp[0]
220 acc = "_".join(tmp[2:])
221 ID = results[db][hit]["perc_ident"]
222 coverage = results[db][hit]["perc_coverage"]
223 sbjt_length = results[db][hit]["sbjct_length"]
224 HSP = results[db][hit]["HSP_length"]
225 positions_contig = "{}..{}".format(
226 results[db][hit]["query_start"],
227 results[db][hit]["query_end"])
228 positions_ref = "{}..{}".format(
229 results[db][hit]["sbjct_start"],
230 results[db][hit]["sbjct_end"])
231 contig_name = results[db][hit]["contig_name"]
232 pheno = self.phenos.get(gene, ("Warning: gene is missing "
233 "from Notes file. Please "
234 "inform curator."))
235
236 pheno = pheno.strip()
237
238 if "split_length" in results[db][hit]:
239 total_HSP = results[db][hit]["split_length"]
240 split_print[res_header].append([gene, ID, total_HSP,
241 sbjt_length, coverage,
242 positions_ref,
243 contig_name,
244 positions_contig,
245 pheno, acc])
246 tab_str += ("%s\t%s\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
247 % (gene, ID, HSP, sbjt_length, coverage,
248 positions_ref, contig_name,
249 positions_contig, pheno, acc)
250 )
251 else:
252 # Write tabels
253 table_str += ("%s\t%.2f\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s"
254 "\n" % (gene, ID, HSP, sbjt_length,
255 coverage, positions_ref,
256 contig_name, positions_contig,
257 pheno, acc)
258 )
259 tab_str += ("%s\t%.2f\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
260 % (gene, ID, HSP, sbjt_length, coverage,
261 positions_ref, contig_name,
262 positions_contig, pheno, acc)
263 )
264
265 # Saving the output to write the txt result table
266 hsp_length = "%s/%s" % (HSP, sbjt_length)
267 rows[db].append([gene, ID, hsp_length, coverage,
268 positions_ref, contig_name,
269 positions_contig, pheno, acc])
270
271 # Writing subjet/ref sequence
272 if(res_type == ResFinder.TYPE_BLAST):
273 ref_seq = sbjct_align[db][hit]
274 elif(res_type == ResFinder.TYPE_KMA):
275 ref_seq = results[db][hit]["sbjct_string"]
276
277 ref_str += (">%s_%s\n" % (gene, acc))
278 for i in range(0, len(ref_seq), 60):
279 ref_str += ("%s\n" % (ref_seq[i:i + 60]))
280
281 # Getting the header and text for the txt file output
282 sbjct_start = results[db][hit]["sbjct_start"]
283 sbjct_end = results[db][hit]["sbjct_end"]
284 text = ("%s, ID: %.2f %%, "
285 "Alignment Length/Gene Length: %s/%s, "
286 "Coverage: %s, "
287 "Positions in reference: %s..%s, Contig name: %s, "
288 "Position: %s" % (gene, ID, HSP, sbjt_length,
289 coverage, sbjct_start, sbjct_end,
290 contig_name, positions_contig))
291 hit_str += (">%s\n" % text)
292
293 # Writing query/hit sequence
294 if(res_type == ResFinder.TYPE_BLAST):
295 hit_seq = query_align[db][hit]
296 elif(res_type == ResFinder.TYPE_KMA):
297 hit_seq = results[db][hit]["query_string"]
298
299 for i in range(0, len(hit_seq), 60):
300 hit_str += ("%s\n" % (hit_seq[i:i + 60]))
301
302 # Saving the output to print the txt result file allignemts
303 if(res_type == ResFinder.TYPE_BLAST):
304 txt_file_seq_text[db].append((text, ref_seq,
305 homo_align[db][hit],
306 hit_seq))
307 elif(res_type == ResFinder.TYPE_KMA):
308 txt_file_seq_text[db].append(
309 (text, ref_seq, results[db][hit]["homo_string"],
310 hit_seq))
311
312 for res in split_print:
313 gene = split_print[res][0][0]
314 ID = split_print[res][0][1]
315 HSP = split_print[res][0][2]
316 sbjt_length = split_print[res][0][3]
317 coverage = split_print[res][0][4]
318 positions_ref = split_print[res][0][5]
319 contig_name = split_print[res][0][6]
320 positions_contig = split_print[res][0][7]
321 pheno = split_print[res][0][8]
322 acc = split_print[res][0][9]
323
324 total_coverage = 0
325
326 for i in range(1, len(split_print[res])):
327 ID = "%s, %.2f" % (ID, split_print[res][i][1])
328 total_coverage += split_print[res][i][4]
329 positions_ref = (positions_ref + ", "
330 + split_print[res][i][5])
331 contig_name = (contig_name + ", "
332 + split_print[res][i][6])
333 positions_contig = (positions_contig + ", "
334 + split_print[res][i][7])
335
336 table_str += ("%s\t%s\t%s/%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
337 % (gene, ID, HSP, sbjt_length, coverage,
338 positions_ref, contig_name,
339 positions_contig, pheno, acc)
340 )
341
342 hsp_length = "%s/%s" % (HSP, sbjt_length)
343
344 rows[db].append([gene, ID, hsp_length, coverage,
345 positions_ref, contig_name,
346 positions_contig, pheno, acc])
347
348 table_str += ("\n")
349
350 # Writing table txt for all hits
351 for db in titles:
352 # Txt file table
353 table = ResFinder.text_table(titles[db], headers[db], rows[db])
354 txt_str += table
355
356 # Writing alignment txt for all hits
357 for db in titles:
358 # Txt file alignments
359 txt_str += ("##################### %s #####################\n"
360 % (db))
361 for text in txt_file_seq_text[db]:
362 txt_str += ("%s\n\n" % (text[0]))
363 for i in range(0, len(text[1]), 60):
364 txt_str += ("%s\n" % (text[1][i:i + 60]))
365 txt_str += ("%s\n" % (text[2][i:i + 60]))
366 txt_str += ("%s\n\n" % (text[3][i:i + 60]))
367 txt_str += ("\n")
368
369 return (tab_str, table_str, txt_str, ref_str, hit_str)
370
371 @staticmethod
372 def text_table(title, headers, rows, table_format='psql'):
373 ''' Create text table
374
375 USAGE:
376 >>> from tabulate import tabulate
377 >>> title = 'My Title'
378 >>> headers = ['A','B']
379 >>> rows = [[1,2],[3,4]]
380 >>> print(text_table(title, headers, rows))
381 +-----------+
382 | My Title |
383 +-----+-----+
384 | A | B |
385 +=====+=====+
386 | 1 | 2 |
387 | 3 | 4 |
388 +-----+-----+
389 '''
390 # Create table
391 table = tabulate(rows, headers, tablefmt=table_format)
392 # Prepare title injection
393 width = len(table.split('\n')[0])
394 tlen = len(title)
395 if tlen + 4 > width:
396 # Truncate oversized titles
397 tlen = width - 4
398 title = title[:tlen]
399 spaces = width - 2 - tlen
400 left_spacer = ' ' * int(spaces / 2)
401 right_spacer = ' ' * (spaces - len(left_spacer))
402 # Update table with title
403 table = '\n'.join(['+%s+' % ('-' * (width - 2)),
404 '|%s%s%s|' % (left_spacer,
405 title, right_spacer),
406 table, '\n'])
407 return table
408
409 def load_notes(self, notes):
410 with open(notes, 'r') as f:
411 for line in f:
412 line = line.strip()
413 if line.startswith("#"):
414 continue
415 else:
416 tmp = line.split(":")
417
418 self.phenos[tmp[0]] = "%s %s" % (tmp[1], tmp[2])
419
420 if(tmp[2].startswith("Alternate name; ")):
421 self.phenos[tmp[2][16:]] = "%s %s" % (tmp[1], tmp[2])
422
423 def load_databases(self, databases):
424 """
425 """
426 # Check if databases and config file are correct/correponds
427 if databases == '':
428 sys.exit("Input Error: No database was specified!\n")
429 elif databases is None:
430 # Choose all available databases from the config file
431 self.databases = self.configured_dbs.keys()
432 else:
433 # Handle multiple databases
434 databases = databases.split(',')
435 # Check that the ResFinder DBs are valid
436 for db_prefix in databases:
437 if db_prefix in self.configured_dbs:
438 self.databases.append(db_prefix)
439 else:
440 sys.exit("Input Error: Provided database was not "
441 "recognised! (%s)\n" % db_prefix)
442
443 def load_db_config(self, db_conf_file):
444 """
445 """
446 extensions = []
447 with open(db_conf_file) as f:
448 for line in f:
449 line = line.strip()
450
451 if not line:
452 continue
453
454 if line[0] == '#':
455 if 'extensions:' in line:
456 extensions = [s.strip()
457 for s in line.split('extensions:')[-1]
458 .split(',')]
459 continue
460
461 tmp = line.split('\t')
462 if len(tmp) != 3:
463 sys.exit(("Input Error: Invalid line in the database"
464 " config file!\nA proper entry requires 3 tab "
465 "separated columns!\n%s") % (line))
466
467 db_prefix = tmp[0].strip()
468 name = tmp[1].split('#')[0].strip()
469 description = tmp[2]
470
471 # Check if all db files are present
472 for ext in extensions:
473 db = "%s/%s.%s" % (self.db_path, db_prefix, ext)
474 if not os.path.exists(db):
475 sys.exit(("Input Error: The database file (%s) "
476 "could not be found!") % (db))
477
478 if db_prefix not in self.configured_dbs:
479 self.configured_dbs[db_prefix] = []
480 self.configured_dbs[db_prefix].append(name)
481
482 if len(self.configured_dbs) == 0:
483 sys.exit("Input Error: No databases were found in the "
484 "database config file!")
485
486 # Loading paths for KMA databases.
487 for drug in self.configured_dbs:
488 kma_db = self.db_path_kma + drug
489 self.kma_db_files = [kma_db + ".b", kma_db + ".length.b",
490 kma_db + ".name.b", kma_db + ".align.b"]
491
492
493 if __name__ == '__main__':
494
495 ##########################################################################
496 # PARSE COMMAND LINE OPTIONS
497 ##########################################################################
498
499 parser = ArgumentParser()
500 parser.add_argument("-i", "--inputfile",
501 dest="inputfile",
502 help="Input file",
503 default=None)
504 parser.add_argument("-1", "--fastq1",
505 help="Raw read data file 1.",
506 default=None)
507 parser.add_argument("-2", "--fastq2",
508 help="Raw read data file 2 (only required if \
509 data is paired-end).",
510 default=None)
511 parser.add_argument("-o", "--outputPath",
512 dest="out_path",
513 help="Path to blast output",
514 default='')
515
516 parser.add_argument("-b", "--blastPath",
517 dest="blast_path",
518 help="Path to blast",
519 default='blastn')
520 parser.add_argument("-p", "--databasePath",
521 dest="db_path",
522 help="Path to the databases",
523 default='')
524
525 parser.add_argument("-k", "--kmaPath",
526 dest="kma_path",
527 help="Path to KMA",
528 default=None)
529 parser.add_argument("-q", "--databasePathKMA",
530 dest="db_path_kma",
531 help="Path to the directories containing the \
532 KMA indexed databases. Defaults to the \
533 directory 'kma_indexing' inside the \
534 databasePath directory.",
535 default=None)
536
537 parser.add_argument("-d", "--databases",
538 dest="databases",
539 help="Databases chosen to search in - if none \
540 are specified all are used",
541 default=None)
542 parser.add_argument("-l", "--min_cov",
543 dest="min_cov",
544 help="Minimum coverage",
545 default=0.60)
546 parser.add_argument("-t", "--threshold",
547 dest="threshold",
548 help="Blast threshold for identity",
549 default=0.90)
550 parser.add_argument("-nano", "--nanopore",
551 action="store_true",
552 dest="nanopore",
553 help="If nanopore data is used",
554 default=False)
555 args = parser.parse_args()
556
557 ##########################################################################
558 # MAIN
559 ##########################################################################
560
561 # Defining varibales
562
563 min_cov = args.min_cov
564 threshold = args.threshold
565
566 # Check if valid database is provided
567 if args.db_path is None:
568 sys.exit("Input Error: No database directory was provided!\n")
569 elif not os.path.exists(args.db_path):
570 sys.exit("Input Error: The specified database directory does not "
571 "exist!\n")
572 else:
573 # Check existence of config file
574 db_config_file = '%s/config' % (args.db_path)
575 if not os.path.exists(db_config_file):
576 sys.exit("Input Error: The database config file could not be "
577 "found!")
578 # Save path
579 db_path = args.db_path
580
581 # Check existence of notes file
582 notes_path = "%s/notes.txt" % (args.db_path)
583 if not os.path.exists(notes_path):
584 sys.exit('Input Error: notes.txt not found! (%s)' % (notes_path))
585
586 # Check for input
587 if args.inputfile is None and args.fastq1 is None:
588 sys.exit("Input Error: No Input were provided!\n")
589
590 # Check if valid input file for assembly is provided
591 if args.inputfile:
592 if not os.path.exists(args.inputfile):
593 sys.exit("Input Error: Input file does not exist!\n")
594 else:
595 inputfile = args.inputfile
596 else:
597 inputfile = None
598
599 # Check if valid input files for raw data
600 if args.fastq1:
601
602 if not os.path.exists(args.fastq1):
603 sys.exit("Input Error: fastq1 file does not exist!\n")
604 else:
605 input_fastq1 = args.fastq1
606
607 if args.fastq2:
608 if not os.path.exists(args.fastq2):
609 sys.exit("Input Error: fastq2 file does not exist!\n")
610 else:
611 input_fastq2 = args.fastq2
612 else:
613 input_fastq2 = None
614 else:
615 input_fastq1 = None
616 input_fastq2 = None
617
618 # Check if valid output directory is provided
619 if not os.path.exists(args.out_path):
620 sys.exit("Input Error: Output dirctory does not exists!\n")
621 else:
622 out_path = args.out_path
623
624 # If input is an assembly, then use BLAST
625 if(inputfile is not None):
626 finder = ResFinder(db_conf_file=db_config_file,
627 databases=args.databases,
628 db_path=db_path, notes=notes_path)
629
630 blast_run = finder.blast(inputfile=inputfile, out_path=out_path,
631 min_cov=min_cov, threshold=threshold,
632 blast=args.blast_path)
633
634 finder.write_results(out_path=out_path, result=blast_run,
635 res_type=ResFinder.TYPE_BLAST)
636
637 # If the input is raw read data, then use KMA
638 elif(input_fastq1 is not None):
639 finder = ResFinder(db_conf_file=db_config_file,
640 databases=args.databases,
641 db_path=db_path, db_path_kma=args.db_path_kma,
642 notes=notes_path)
643
644 # if input_fastq2 is None, it is ignored by the kma method.
645 if args.nanopore:
646 kma_run = finder.kma(inputfile_1=input_fastq1,
647 inputfile_2=input_fastq2, kma_add_args='-ont -md 5')
648 else:
649 kma_run = finder.kma(inputfile_1=input_fastq1,
650 inputfile_2=input_fastq2)
651
652 finder.write_results(out_path=out_path, result=kma_run,
653 res_type=ResFinder.TYPE_KMA)