Mercurial > repos > public-health-bioinformatics > adjust_bracken_for_unclassified_reads
diff adjust_bracken_kreport_for_unclassified_reads.py @ 3:899a650587ed draft default tip
planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/adjust_bracken_for_unclassified_reads commit 24535690aedb81353cf5e036dc4577022d9604ad
author | public-health-bioinformatics |
---|---|
date | Thu, 27 Oct 2022 19:13:25 +0000 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Add an 'unclassified' row to a Kraken-style Bracken report.

The number of unclassified sequences is read from a Kraken report, a matching
'unclassified' row is placed in front of the rows of the Kraken-style Bracken
report, and every percentage is recomputed against the new total (the Bracken
'root' sequence count plus the unclassified count). The adjusted report is
written to stdout as tab-separated text.
"""

import argparse
import csv
import json
import sys


# Column order of a report line; used both for parsing and for writing.
REPORT_FIELDNAMES = [
    'percentage',
    'seqs_total',
    'seqs_this_level',
    'taxonomic_level',
    'ncbi_taxid',
    'taxon_name',
]


def parse_kraken_report(kraken_report_path):
    """Parse a six-column, tab-separated report into a list of dicts.

    Blank lines are skipped. Each remaining line yields one dict keyed by the
    names in REPORT_FIELDNAMES; 'percentage' is converted to float,
    'seqs_total' and 'seqs_this_level' to int, the other fields stay strings.

    :param kraken_report_path: path of the report file to read
    :return: list of dicts, one per non-blank line, in file order
    :raises ValueError: if a line does not have exactly six columns or a
        numeric column cannot be converted
    """
    kraken_report = []
    with open(kraken_report_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            fields = stripped.split('\t')
            if len(fields) != len(REPORT_FIELDNAMES):
                raise ValueError(
                    '{}: line {}: expected {} tab-separated columns, found {}'.format(
                        kraken_report_path, line_number, len(REPORT_FIELDNAMES), len(fields)
                    )
                )
            kraken_line = dict(zip(REPORT_FIELDNAMES, fields))
            kraken_line['percentage'] = float(kraken_line['percentage'])
            kraken_line['seqs_total'] = int(kraken_line['seqs_total'])
            kraken_line['seqs_this_level'] = int(kraken_line['seqs_this_level'])
            kraken_report.append(kraken_line)

    return kraken_report


def _find_taxon(report, taxon_name):
    """Return the first row whose 'taxon_name' equals taxon_name, or None."""
    for row in report:
        if row['taxon_name'] == taxon_name:
            return row
    return None


def _format_percentage(count, total):
    """Return count as a percentage of total with two decimals ('0.00' if total is 0)."""
    if total == 0:
        return '{:.2f}'.format(0.0)
    # Explicit float conversion keeps true division even where `/` on two
    # ints would truncate.
    return '{:.2f}'.format(float(count) / float(total) * 100)


def main(args):
    """Write the Bracken report, adjusted for unclassified reads, to stdout.

    :param args: object with attributes 'kraken_report' and
        'kraken_style_bracken_report', both paths to report files
    :raises IndexError: if the Kraken-style Bracken report has no 'root' row
    :raises ValueError: if either report cannot be parsed
    """
    kraken_report = parse_kraken_report(args.kraken_report)
    kraken_style_bracken_report = parse_kraken_report(args.kraken_style_bracken_report)

    # A report without an 'unclassified' row contributes zero unclassified reads.
    unclassified_row = _find_taxon(kraken_report, 'unclassified')
    if unclassified_row is None:
        kraken_report_unclassified_seqs = 0
    else:
        kraken_report_unclassified_seqs = unclassified_row['seqs_this_level']

    root_row = _find_taxon(kraken_style_bracken_report, 'root')
    if root_row is None:
        raise IndexError(
            "no 'root' entry found in kraken-style bracken report: {}".format(
                args.kraken_style_bracken_report
            )
        )

    total_seqs = root_row['seqs_total'] + kraken_report_unclassified_seqs

    # The excel-tab dialect terminates rows with '\r\n' by default; override
    # it so the output uses plain '\n' line endings.
    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=REPORT_FIELDNAMES,
        dialect='excel-tab',
        lineterminator='\n',
    )

    writer.writerow({
        'percentage': _format_percentage(kraken_report_unclassified_seqs, total_seqs),
        'seqs_total': kraken_report_unclassified_seqs,
        'seqs_this_level': kraken_report_unclassified_seqs,
        'taxonomic_level': 'U',
        'ncbi_taxid': 0,
        'taxon_name': 'unclassified',
    })

    for row in kraken_style_bracken_report:
        adjusted_row = dict(row)
        adjusted_row['percentage'] = _format_percentage(row['seqs_total'], total_seqs)
        writer.writerow(adjusted_row)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Add an 'unclassified' row to a Kraken-style Bracken report."
    )
    parser.add_argument(
        '-k', '--kraken-report', required=True,
        help='Kraken report supplying the unclassified sequence count',
    )
    parser.add_argument(
        '-b', '--kraken-style-bracken-report', required=True,
        help='Kraken-style Bracken report to adjust',
    )
    args = parser.parse_args()
    main(args)