diff adjust_bracken_for_unclassified_reads.py @ 0:3ab9d37e547e draft

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/adjust_bracken_for_unclassified_reads commit 0d1d1f356cdfd8ef6dbcdd1bfe76c4637587ff53"
author public-health-bioinformatics
date Thu, 10 Mar 2022 21:35:14 +0000
parents
children 87459bd1615a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/adjust_bracken_for_unclassified_reads.py	Thu Mar 10 21:35:14 2022 +0000
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import json
+import sys
+
+def parse_bracken_abundances(bracken_abundances_path):
+    bracken_abundances = []
+    with open(bracken_abundances_path, 'r') as f:
+        reader = csv.DictReader(f, dialect='excel-tab')
+        for row in reader:
+            b = {}
+            b['name'] = row['name']
+            b['taxonomy_id'] = row['taxonomy_id']
+            b['taxonomy_lvl'] = row['taxonomy_lvl']
+            b['kraken_assigned_seqs'] = int(row['kraken_assigned_reads'])
+            b['bracken_assigned_seqs'] = int(row['new_est_reads'])
+            b['bracken_fraction_total_seqs'] = float(row['fraction_total_reads'])
+            bracken_abundances.append(b)
+
+    return bracken_abundances
+
+
+def parse_kraken_report(kraken_report_path):
+    kraken_report = []
+    with open(kraken_report_path, 'r') as f:
+        for line in f:
+            kraken_line = {}
+            [percentage, seqs_total, seqs_this_level, taxonomic_level, ncbi_taxid, taxon_name] = line.strip().split(None, 5)
+            kraken_line['percentage'] = float(percentage)
+            kraken_line['seqs_total'] = int(seqs_total)
+            kraken_line['seqs_this_level'] = int(seqs_this_level)
+            kraken_line['taxonomic_level'] = taxonomic_level
+            kraken_line['ncbi_taxid'] = ncbi_taxid
+            kraken_line['taxon_name'] = taxon_name
+            kraken_report.append(kraken_line)
+
+    return kraken_report
+
+
+def main(args):
+    kraken_report = parse_kraken_report(args.kraken_report)
+    bracken_abundances = parse_bracken_abundances(args.bracken_abundances)
+
+    kraken_report_unclassified_seqs = list(filter(lambda x: x['taxon_name'] == 'unclassified', kraken_report))[0]['seqs_this_level']
+    kraken_report_classified_seqs = list(filter(lambda x: x['taxon_name'] == 'root', kraken_report))[0]['seqs_total']
+
+    total_seqs = kraken_report_classified_seqs + kraken_report_unclassified_seqs
+    percent_unclassified = float(kraken_report_unclassified_seqs) / float(total_seqs)
+
+    bracken_unclassified_entry = {
+        'name': 'unclassified',
+        'taxonomy_id': 0,
+        'taxonomy_lvl': 'U',
+        'kraken_assigned_seqs': kraken_report_unclassified_seqs,
+        'bracken_assigned_seqs': kraken_report_unclassified_seqs,
+        'kraken_fraction_total_seqs': percent_unclassified,
+        'bracken_fraction_total_seqs': 0.0,
+    }
+
+    bracken_abundances = [bracken_unclassified_entry] + bracken_abundances
+
+    output_fieldnames = [
+        'name',
+        'taxonomy_id',
+        'taxonomy_lvl',
+        'kraken_assigned_seqs',
+        'bracken_assigned_seqs',
+        'total_seqs',
+        'kraken_fraction_total_seqs',
+        'bracken_fraction_total_seqs',
+    ]
+
+    writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='excel-tab')
+    writer.writeheader()
+    
+    for b in bracken_abundances:
+        b['total_seqs'] = total_seqs
+        kraken_adjusted_fraction_total_seqs = float(b['kraken_assigned_seqs']) / float(total_seqs)
+        b['kraken_fraction_total_seqs'] = '{:.6f}'.format(kraken_adjusted_fraction_total_seqs)
+        bracken_adjusted_fraction_total_seqs = float(b['bracken_assigned_seqs']) / float(total_seqs)
+        b['bracken_fraction_total_seqs'] = '{:.6f}'.format(bracken_adjusted_fraction_total_seqs)
+
+    for b in sorted(bracken_abundances, key=lambda x: x['bracken_fraction_total_seqs'], reverse=True):
+        writer.writerow(b)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-k', '--kraken-report')
+    parser.add_argument('-a', '--bracken-abundances')
+    args = parser.parse_args()
+    main(args)