Mercurial > repos > public-health-bioinformatics > adjust_bracken_for_unclassified_reads
view adjust_bracken_kreport_for_unclassified_reads.py @ 3:899a650587ed draft default tip
planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/adjust_bracken_for_unclassified_reads commit 24535690aedb81353cf5e036dc4577022d9604ad
author | public-health-bioinformatics |
---|---|
date | Thu, 27 Oct 2022 19:13:25 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Add Kraken's unclassified reads back into a kraken-style Bracken report.

The script reads a Kraken report and a kraken-style Bracken report, takes the
unclassified read count from the Kraken report, recomputes every Bracken
percentage against (Bracken 'root' total + unclassified reads), and writes the
adjusted report, with an 'unclassified' row prepended, to stdout as
tab-separated text without a header line.
"""

import argparse
import csv
import json
import sys

# Column order of a report line; used both for parsing and for output.
REPORT_FIELDNAMES = [
    'percentage',
    'seqs_total',
    'seqs_this_level',
    'taxonomic_level',
    'ncbi_taxid',
    'taxon_name',
]


def parse_kraken_report(kraken_report_path):
    """Parse a six-column, tab-separated Kraken-style report.

    Args:
        kraken_report_path: Path to the report file.

    Returns:
        A list with one dict per non-blank line, in file order. Keys are
        'percentage' (float), 'seqs_total' (int), 'seqs_this_level' (int),
        'taxonomic_level', 'ncbi_taxid' and 'taxon_name' (all str).

    Raises:
        ValueError: If a non-blank line does not have exactly six
            tab-separated fields, or a numeric field cannot be converted.
    """
    kraken_report = []
    with open(kraken_report_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            # Only the two ends of the line are stripped, so any leading
            # spaces inside the final taxon_name column are preserved.
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no record; skip them
                # instead of failing on the six-way unpack below.
                continue
            fields = stripped.split('\t')
            if len(fields) != len(REPORT_FIELDNAMES):
                raise ValueError(
                    '{}: line {}: expected {} tab-separated fields, got {}'.format(
                        kraken_report_path,
                        line_number,
                        len(REPORT_FIELDNAMES),
                        len(fields),
                    )
                )
            (percentage, seqs_total, seqs_this_level,
             taxonomic_level, ncbi_taxid, taxon_name) = fields
            kraken_report.append({
                'percentage': float(percentage),
                'seqs_total': int(seqs_total),
                'seqs_this_level': int(seqs_this_level),
                'taxonomic_level': taxonomic_level,
                'ncbi_taxid': ncbi_taxid,
                'taxon_name': taxon_name,
            })

    return kraken_report


def _find_taxon(report, taxon_name):
    """Return the first row whose 'taxon_name' equals *taxon_name*, else None."""
    for row in report:
        if row['taxon_name'] == taxon_name:
            return row
    return None


def _format_percentage(count, total):
    """Return count/total as a percentage string with two decimals.

    A zero total yields '0.00' rather than raising ZeroDivisionError.
    """
    if not total:
        return '0.00'
    # float() keeps this a true division even under a Python 2 interpreter.
    return '{:.2f}'.format(float(count) / total * 100)


def main(args):
    """Write the Bracken report, adjusted for unclassified reads, to stdout.

    Args:
        args: Namespace with 'kraken_report' and 'kraken_style_bracken_report'
            attributes, each a path to a six-column report file.

    Raises:
        IndexError: If the kraken-style Bracken report has no 'root' row.
        ValueError: If either report contains a malformed line.
    """
    kraken_report = parse_kraken_report(args.kraken_report)
    kraken_style_bracken_report = parse_kraken_report(args.kraken_style_bracken_report)

    # A Kraken report without an 'unclassified' row means zero such reads.
    unclassified_row = _find_taxon(kraken_report, 'unclassified')
    if unclassified_row is None:
        unclassified_seqs = 0
    else:
        unclassified_seqs = unclassified_row['seqs_this_level']

    bracken_root = _find_taxon(kraken_style_bracken_report, 'root')
    if bracken_root is None:
        # IndexError is kept as the exception type for backward compatibility.
        raise IndexError("no 'root' entry found in kraken-style bracken report")

    # New denominator: reads Bracken assigned under root plus the reads
    # Kraken left unclassified.
    total_seqs = bracken_root['seqs_total'] + unclassified_seqs

    bracken_unclassified_entry = {
        'percentage': _format_percentage(unclassified_seqs, total_seqs),
        'seqs_total': unclassified_seqs,
        'seqs_this_level': unclassified_seqs,
        'taxonomic_level': 'U',
        'ncbi_taxid': 0,
        'taxon_name': 'unclassified',
    }

    writer = csv.DictWriter(sys.stdout, fieldnames=REPORT_FIELDNAMES, dialect='excel-tab')
    writer.writerow(bracken_unclassified_entry)
    for row in kraken_style_bracken_report:
        # Re-express each taxon's share against the enlarged total.
        row['percentage'] = _format_percentage(row['seqs_total'], total_seqs)
        writer.writerow(row)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Adjust a kraken-style Bracken report for unclassified reads.'
    )
    parser.add_argument(
        '-k', '--kraken-report', required=True,
        help='Kraken report; source of the unclassified read count',
    )
    parser.add_argument(
        '-b', '--kraken-style-bracken-report', required=True,
        help='kraken-style Bracken report to adjust',
    )
    args = parser.parse_args()
    main(args)