# HG changeset patch # User public-health-bioinformatics # Date 1578003274 18000 # Node ID 2262e531c50b4729260ac5a3014fa29ef6b95b31 # Parent 912a3a3dc08286302d73c9ac02d621393cbfd194 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/screen_abricate_report commit 50a464c3e6f87ca8d2c874842cbcee370d8aa9c4" diff -r 912a3a3dc082 -r 2262e531c50b screen_abricate_report.py --- a/screen_abricate_report.py Thu Dec 19 20:31:11 2019 -0500 +++ b/screen_abricate_report.py Thu Jan 02 17:14:34 2020 -0500 @@ -7,6 +7,27 @@ import re +class Range(object): + """ + Used to limit the min_coverage and min_identity args to range 0.0 - 100.0 + """ + def __init__(self, start, end): + self.start = start + self.end = end + + def __eq__(self, other): + return self.start <= other <= self.end + + def __contains__(self, item): + return self.__eq__(item) + + def __iter__(self): + yield self + + def __repr__(self): + return str(self.start) + " - " + str(self.end) + + def parse_screen_file(screen_file): screen = [] with open(screen_file) as f: @@ -24,6 +45,16 @@ return fieldnames +def detect_gene(abricate_report_row, regex, min_coverage, min_identity): + gene_of_interest = bool(re.search(regex, abricate_report_row['GENE'])) + sufficient_coverage = float(abricate_report_row['%COVERAGE']) >= min_coverage + sufficient_identity = float(abricate_report_row['%IDENTITY']) >= min_identity + if gene_of_interest and sufficient_coverage and sufficient_identity: + return True + else: + return False + + def main(args): screen = parse_screen_file(args.screening_file) gene_detection_status_fieldnames = ['gene_name', 'detected'] @@ -44,20 +75,25 @@ 'detected': False } for abricate_report_row in abricate_report_reader: - if re.search(gene['regex'], abricate_report_row['GENE']): + if detect_gene(abricate_report_row, gene['regex'], args.min_coverage, args.min_identity): gene_detection_status['detected'] = True screened_report_writer.writerow(abricate_report_row) gene_detection_status_writer.writerow(gene_detection_status) f1.seek(0) # return file pointer to start of abricate report + next(abricate_report_reader) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("abricate_report", help="Input: Abricate report to screen (tsv)") parser.add_argument("--screening_file", help="Input: List of genes to screen for (tsv)") - parser.add_argument("--screened_report", help=("Output: Screened abricate report ", - "including only genes of interest (tsv)")) - parser.add_argument("--gene_detection_status", help=("Output: detection status for all genes ", - "listed in the screening file (tsv)")) + parser.add_argument("--screened_report", + help=("Output: Screened abricate report, including only genes of interest (tsv)")) + parser.add_argument("--gene_detection_status", + help=("Output: detection status for all genes listed in the screening file (tsv)")) + parser.add_argument("--min_coverage", type=float, default=90.0, + choices=Range(0.0, 100.0), help=("Minimum percent coverage")) + parser.add_argument("--min_identity", type=float, default=90.0, + choices=Range(0.0, 100.0), help=("Minimum percent identity")) args = parser.parse_args() main(args) diff -r 912a3a3dc082 -r 2262e531c50b screen_abricate_report.xml --- a/screen_abricate_report.xml Thu Dec 19 20:31:11 2019 -0500 +++ b/screen_abricate_report.xml Thu Jan 02 17:14:34 2020 -0500 @@ -1,4 +1,4 @@ - + Screens an abricate report for genes of interest @@ -12,7 +12,9 @@ #end if --screening_file '${input_screening_file}' --screened_report '${screened_report}' - --gene_detection_status '${gene_detection_status}' && + --gene_detection_status '${gene_detection_status}' + --min_coverage '${min_coverage}' + --min_identity '${min_identity}' && cp '${input_screening_file}' '${output_screening_file}' ]]> @@ -34,6 +36,8 @@ + + @@ -65,7 +69,7 @@ gene of interest, and the second column is a regular expression that can be used to identify examples of that gene in the 'GENE' column of the abricate report. - For example, one might use the regex 'KPC\-{d+}' to identify all alleles of the + For example, one might use the regex '^KPC-\d+$' to identify all alleles of the KPC gene (KPC-2, KPC-3, KPC-4, ..., KPC-10, ...) ]]> diff -r 912a3a3dc082 -r 2262e531c50b test-data/SAMN13042171_abricate_report_screened.tsv --- a/test-data/SAMN13042171_abricate_report_screened.tsv Thu Dec 19 20:31:11 2019 -0500 +++ b/test-data/SAMN13042171_abricate_report_screened.tsv Thu Jan 02 17:14:34 2020 -0500 @@ -1,2 +1,3 @@ #FILE SEQUENCE START END STRAND GENE COVERAGE COVERAGE_MAP GAPS %COVERAGE %IDENTITY DATABASE ACCESSION PRODUCT RESISTANCE SAMN13042171.fasta contig00063 2498 3310 - NDM-1 1-813/813 =============== 0/0 100.00 100.00 card FN396876:2407-3220 NDM-1 is a metallo-beta-lactamase isolated from Klebsiella pneumoniae with nearly complete resistance to all beta-lactam antibiotics. cephalosporin/penam/carbapenem/cephamycin +SAMN13042171.fasta contig00035 9413 10237 - OXA-9 1-825/825 =============== 0/0 100.00 100.00 card M55547:1-826 OXA-9 is a beta-lactamase found in Klebsiella pneumoniae. cephalosporin/penam diff -r 912a3a3dc082 -r 2262e531c50b test-data/SAMN13042171_gene_detection_status.tsv --- a/test-data/SAMN13042171_gene_detection_status.tsv Thu Dec 19 20:31:11 2019 -0500 +++ b/test-data/SAMN13042171_gene_detection_status.tsv Thu Jan 02 17:14:34 2020 -0500 @@ -1,4 +1,5 @@ gene_name detected KPC False NDM True -OXA-48 False +OXA True +rpoB2 False diff -r 912a3a3dc082 -r 2262e531c50b test-data/screening_file.tsv --- a/test-data/screening_file.tsv Thu Dec 19 20:31:11 2019 -0500 +++ b/test-data/screening_file.tsv Thu Jan 02 17:14:34 2020 -0500 @@ -1,4 +1,5 @@ gene_name regex -KPC KPC -NDM NDM -OXA-48 OXA\-48 +KPC ^KPC-\d+$ +NDM ^NDM-\d+$ +OXA ^OXA-\d+$ +rpoB2 rpoB2