comparison adjust_bracken_for_unclassified_reads.py @ 0:3ab9d37e547e draft

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/adjust_bracken_for_unclassified_reads commit 0d1d1f356cdfd8ef6dbcdd1bfe76c4637587ff53"
author public-health-bioinformatics
date Thu, 10 Mar 2022 21:35:14 +0000
parents
children 87459bd1615a
comparison
equal deleted inserted replaced
-1:000000000000 0:3ab9d37e547e
1 #!/usr/bin/env python
2
3 import argparse
4 import csv
5 import json
6 import sys
7
def parse_bracken_abundances(bracken_abundances_path):
    """Load a Bracken abundance table into a list of dictionaries.

    The file is read as tab-delimited text whose first line is a header.
    One dictionary is produced per data row, in file order, with keys:

    * ``name``, ``taxonomy_id``, ``taxonomy_lvl`` -- copied as strings
    * ``kraken_assigned_seqs`` -- ``kraken_assigned_reads`` as ``int``
    * ``bracken_assigned_seqs`` -- ``new_est_reads`` as ``int``
    * ``bracken_fraction_total_seqs`` -- ``fraction_total_reads`` as ``float``

    Any other columns present in the file are ignored.
    """
    with open(bracken_abundances_path, 'r') as handle:
        return [
            {
                'name': record['name'],
                'taxonomy_id': record['taxonomy_id'],
                'taxonomy_lvl': record['taxonomy_lvl'],
                'kraken_assigned_seqs': int(record['kraken_assigned_reads']),
                'bracken_assigned_seqs': int(record['new_est_reads']),
                'bracken_fraction_total_seqs': float(record['fraction_total_reads']),
            }
            for record in csv.DictReader(handle, dialect='excel-tab')
        ]
23
24
def parse_kraken_report(kraken_report_path):
    """Parse a Kraken report into a list of per-line dictionaries.

    Every non-blank line is split on whitespace into six fields; the sixth
    field keeps its internal spaces, so multi-word taxon names survive, while
    the leading indentation of the name is dropped by the split.

    Args:
        kraken_report_path: Path of the report file to read.

    Returns:
        A list with one dictionary per non-blank line, in file order, with
        keys ``percentage`` (float), ``seqs_total`` (int),
        ``seqs_this_level`` (int), ``taxonomic_level``, ``ncbi_taxid`` and
        ``taxon_name`` (all strings).

    Raises:
        ValueError: If a non-blank line has fewer than six fields, or one of
            its numeric fields cannot be converted.
    """
    kraken_report = []
    with open(kraken_report_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.strip().split(None, 5)
            if not fields:
                # Blank line (e.g. a trailing empty line): nothing to parse.
                continue
            if len(fields) < 6:
                raise ValueError(
                    'Malformed Kraken report line {}: expected 6 fields, found {}'.format(
                        line_number, len(fields)
                    )
                )
            percentage, seqs_total, seqs_this_level, taxonomic_level, ncbi_taxid, taxon_name = fields
            kraken_report.append({
                'percentage': float(percentage),
                'seqs_total': int(seqs_total),
                'seqs_this_level': int(seqs_this_level),
                'taxonomic_level': taxonomic_level,
                'ncbi_taxid': ncbi_taxid,
                'taxon_name': taxon_name,
            })

    return kraken_report
40
41
def _seqs_for_taxon(kraken_report, taxon_name, field):
    """Return ``field`` of the first report line named ``taxon_name``, or 0 if absent."""
    return next(
        (line[field] for line in kraken_report if line['taxon_name'] == taxon_name),
        0,
    )


def main(args):
    """Write Bracken abundances re-scaled to account for unclassified reads.

    Reads the Kraken report at ``args.kraken_report`` and the Bracken
    abundance table at ``args.bracken_abundances``, prepends an
    ``unclassified`` row, recomputes both fraction columns against the total
    number of sequences (classified + unclassified) and writes a
    tab-delimited table with a header to standard output, ordered by
    descending Bracken fraction.

    Args:
        args: Object with ``kraken_report`` and ``bracken_abundances``
            attributes holding the two input file paths.
    """
    kraken_report = parse_kraken_report(args.kraken_report)
    bracken_abundances = parse_bracken_abundances(args.bracken_abundances)

    # A report may lack an 'unclassified' row (no unclassified reads) or a
    # 'root' row (nothing classified); count the missing category as zero
    # instead of failing with an IndexError.
    unclassified_seqs = _seqs_for_taxon(kraken_report, 'unclassified', 'seqs_this_level')
    classified_seqs = _seqs_for_taxon(kraken_report, 'root', 'seqs_total')
    total_seqs = classified_seqs + unclassified_seqs

    def fraction_of_total(seqs):
        # With no sequences at all every fraction is reported as zero rather
        # than raising ZeroDivisionError.
        if not total_seqs:
            return 0.0
        return float(seqs) / float(total_seqs)

    # Unclassified reads are not redistributed, so the same count is used for
    # both the Kraken and the Bracken columns.
    unclassified_entry = {
        'name': 'unclassified',
        'taxonomy_id': 0,
        'taxonomy_lvl': 'U',
        'kraken_assigned_seqs': unclassified_seqs,
        'bracken_assigned_seqs': unclassified_seqs,
    }
    rows = [unclassified_entry] + bracken_abundances

    for row in rows:
        row['total_seqs'] = total_seqs
        row['kraken_fraction_total_seqs'] = '{:.6f}'.format(
            fraction_of_total(row['kraken_assigned_seqs'])
        )
        # Replaces the fraction read from the Bracken file, which did not
        # take unclassified reads into account.
        row['bracken_fraction_total_seqs'] = '{:.6f}'.format(
            fraction_of_total(row['bracken_assigned_seqs'])
        )

    output_fieldnames = [
        'name',
        'taxonomy_id',
        'taxonomy_lvl',
        'kraken_assigned_seqs',
        'bracken_assigned_seqs',
        'total_seqs',
        'kraken_fraction_total_seqs',
        'bracken_fraction_total_seqs',
    ]

    writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='excel-tab')
    writer.writeheader()

    # The fraction is stored as formatted text; compare it numerically so the
    # ordering does not depend on every value having the same printed width.
    ordered_rows = sorted(
        rows,
        key=lambda row: float(row['bracken_fraction_total_seqs']),
        reverse=True,
    )
    for row in ordered_rows:
        writer.writerow(row)
87
88
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Adjust Bracken abundance fractions to account for unclassified reads.'
    )
    # Both inputs are needed by main(); declaring them required makes argparse
    # report a missing option instead of open() failing later on a None path.
    parser.add_argument(
        '-k', '--kraken-report',
        required=True,
        help='Path to the Kraken report file',
    )
    parser.add_argument(
        '-a', '--bracken-abundances',
        required=True,
        help='Path to the Bracken abundances file (tab-delimited, with header)',
    )
    args = parser.parse_args()
    main(args)