comparison adjust_bracken_kreport_for_unclassified_reads.py @ 3:899a650587ed draft default tip

planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/adjust_bracken_for_unclassified_reads commit 24535690aedb81353cf5e036dc4577022d9604ad
author public-health-bioinformatics
date Thu, 27 Oct 2022 19:13:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:87459bd1615a 3:899a650587ed
1 #!/usr/bin/env python
2
3 import argparse
4 import csv
5 import json
6 import sys
7
8
def parse_kraken_report(kraken_report_path):
    """Parse a six-column, tab-separated kraken-style report.

    Every non-blank line must contain exactly six tab-separated fields, in
    this order: percentage, seqs_total, seqs_this_level, taxonomic_level,
    ncbi_taxid, taxon_name. Blank lines are skipped.

    :param kraken_report_path: Path of the report file to read.
    :return: A list with one dict per report line, in file order. Keys are
        'percentage' (float), 'seqs_total' (int), 'seqs_this_level' (int),
        'taxonomic_level', 'ncbi_taxid' and 'taxon_name' (all str).
    :raises ValueError: If a non-blank line does not have exactly six fields,
        or one of its numeric fields cannot be converted.
    """
    kraken_report = []
    with open(kraken_report_path, 'r') as f:
        for line in f:
            # Only the two ends of the line are stripped, so leading spaces
            # inside the final field (taxon_name) are preserved.
            stripped_line = line.strip()
            if not stripped_line:
                # A blank line (e.g. a trailing empty line) holds no record;
                # unpacking it below would raise ValueError.
                continue
            [
                percentage,
                seqs_total,
                seqs_this_level,
                taxonomic_level,
                ncbi_taxid,
                taxon_name,
            ] = stripped_line.split('\t')
            kraken_report.append({
                'percentage': float(percentage),
                'seqs_total': int(seqs_total),
                'seqs_this_level': int(seqs_this_level),
                'taxonomic_level': taxonomic_level,
                'ncbi_taxid': ncbi_taxid,
                'taxon_name': taxon_name,
            })

    return kraken_report
24
25
def _format_percentage(seqs, total_seqs):
    """Return seqs as a percentage of total_seqs, formatted to two decimals.

    A zero total yields '0.00' instead of raising ZeroDivisionError.
    """
    if total_seqs == 0:
        return '{:.2f}'.format(0.0)
    # float() forces true division even when run by a Python 2 interpreter.
    return '{:.2f}'.format(float(seqs) / total_seqs * 100)


def main(args):
    """Add kraken's unclassified reads to a kraken-style bracken report.

    Parses both reports, takes the unclassified read count from the kraken
    report (0 if it has no 'unclassified' row), and takes the classified read
    count from the 'root' row of the kraken-style bracken report. Every
    bracken row's percentage is recomputed against the sum of those two
    counts. The result is written to stdout as tab-separated rows, with an
    'unclassified' row first and no header row.

    :param args: Object with attributes ``kraken_report`` and
        ``kraken_style_bracken_report``, each a path to a report file.
    :raises IndexError: If the kraken-style bracken report has no 'root' row.
    """
    kraken_report = parse_kraken_report(args.kraken_report)
    kraken_style_bracken_report = parse_kraken_report(args.kraken_style_bracken_report)

    # No 'unclassified' row in the kraken report means zero unclassified reads.
    unclassified_rows = [row for row in kraken_report if row['taxon_name'] == 'unclassified']
    if unclassified_rows:
        kraken_report_unclassified_seqs = unclassified_rows[0]['seqs_this_level']
    else:
        kraken_report_unclassified_seqs = 0

    bracken_root_rows = [row for row in kraken_style_bracken_report if row['taxon_name'] == 'root']
    if not bracken_root_rows:
        raise IndexError("no 'root' entry found in kraken-style bracken report")
    kraken_style_bracken_report_classified_seqs = bracken_root_rows[0]['seqs_total']

    total_seqs = kraken_style_bracken_report_classified_seqs + kraken_report_unclassified_seqs

    output_fieldnames = [
        'percentage',
        'seqs_total',
        'seqs_this_level',
        'taxonomic_level',
        'ncbi_taxid',
        'taxon_name',
    ]

    # NOTE(review): the 'excel-tab' dialect terminates rows with '\r\n';
    # confirm downstream consumers of this report tolerate CRLF line endings.
    writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='excel-tab')

    bracken_unclassified_entry = {
        'percentage': _format_percentage(kraken_report_unclassified_seqs, total_seqs),
        'seqs_total': kraken_report_unclassified_seqs,
        'seqs_this_level': kraken_report_unclassified_seqs,
        'taxonomic_level': 'U',
        'ncbi_taxid': 0,
        'taxon_name': 'unclassified',
    }

    # Rescale each bracken row so percentages are relative to all reads,
    # classified plus unclassified.
    for row in kraken_style_bracken_report:
        row['percentage'] = _format_percentage(row['seqs_total'], total_seqs)

    writer.writerow(bracken_unclassified_entry)
    for row in kraken_style_bracken_report:
        writer.writerow(row)
66
67
if __name__ == '__main__':
    # Command-line entry point. Both report paths are mandatory: without
    # them main() would try to open None and fail with a TypeError.
    parser = argparse.ArgumentParser(
        description='Add the unclassified reads from a kraken report to a kraken-style bracken report.'
    )
    parser.add_argument(
        '-k', '--kraken-report',
        required=True,
        help='Path to the kraken report (source of the unclassified read count)',
    )
    parser.add_argument(
        '-b', '--kraken-style-bracken-report',
        required=True,
        help='Path to the kraken-style bracken report to adjust',
    )
    args = parser.parse_args()
    main(args)