changeset 3:899a650587ed draft default tip

planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/adjust_bracken_for_unclassified_reads commit 24535690aedb81353cf5e036dc4577022d9604ad
author public-health-bioinformatics
date Thu, 27 Oct 2022 19:13:25 +0000
parents 87459bd1615a
children
files adjust_bracken_kreport_for_unclassified_reads.py adjust_bracken_kreport_for_unclassified_reads.xml test-data/input/SRR17907745_kraken_style_bracken_report.txt
diffstat 3 files changed, 245 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/adjust_bracken_kreport_for_unclassified_reads.py	Thu Oct 27 19:13:25 2022 +0000
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import json
+import sys
+
+
+def parse_kraken_report(kraken_report_path):
+    kraken_report = []
+    with open(kraken_report_path, 'r') as f:
+        for line in f:
+            kraken_line = {}
+            [percentage, seqs_total, seqs_this_level, taxonomic_level, ncbi_taxid, taxon_name] = line.strip().split('\t')
+            kraken_line['percentage'] = float(percentage)
+            kraken_line['seqs_total'] = int(seqs_total)
+            kraken_line['seqs_this_level'] = int(seqs_this_level)
+            kraken_line['taxonomic_level'] = taxonomic_level
+            kraken_line['ncbi_taxid'] = ncbi_taxid
+            kraken_line['taxon_name'] = taxon_name
+            kraken_report.append(kraken_line)
+
+    return kraken_report
+
+
+def main(args):
+    kraken_report = parse_kraken_report(args.kraken_report)
+    kraken_style_bracken_report = parse_kraken_report(args.kraken_style_bracken_report)
+
+    try:
+        kraken_report_unclassified_seqs = list(filter(lambda x: x['taxon_name'] == 'unclassified', kraken_report))[0]['seqs_this_level']
+    except IndexError as e:
+        kraken_report_unclassified_seqs = 0
+    kraken_report_classified_seqs = list(filter(lambda x: x['taxon_name'] == 'root', kraken_report))[0]['seqs_total']
+    kraken_style_bracken_report_classified_seqs = list(filter(lambda x: x['taxon_name'] == 'root', kraken_style_bracken_report))[0]['seqs_total']
+
+    total_seqs = kraken_style_bracken_report_classified_seqs + kraken_report_unclassified_seqs
+    fraction_unclassified = float(kraken_report_unclassified_seqs) / float(total_seqs)
+
+    output_fieldnames = [
+        'percentage',
+        'seqs_total',
+        'seqs_this_level',
+        'taxonomic_level',
+        'ncbi_taxid',
+        'taxon_name',
+    ]
+    
+    writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='excel-tab')
+
+    bracken_unclassified_entry = {
+        'percentage': '{:.2f}'.format(kraken_report_unclassified_seqs / total_seqs * 100),
+        'seqs_total': kraken_report_unclassified_seqs,
+        'seqs_this_level': kraken_report_unclassified_seqs,
+        'taxonomic_level': 'U',
+        'ncbi_taxid': 0,
+        'taxon_name': 'unclassified',
+    }
+
+    for row in kraken_style_bracken_report:
+        row['percentage'] = '{:.2f}'.format(row['seqs_total'] / total_seqs * 100)
+
+    kraken_style_bracken_report_with_unclassified = [bracken_unclassified_entry] + kraken_style_bracken_report
+    for row in kraken_style_bracken_report_with_unclassified:
+        writer.writerow(row)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-k', '--kraken-report')
+    parser.add_argument('-b', '--kraken-style-bracken-report')
+    args = parser.parse_args()
+    main(args)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/adjust_bracken_kreport_for_unclassified_reads.xml	Thu Oct 27 19:13:25 2022 +0000
@@ -0,0 +1,29 @@
+<tool id="adjust_bracken_kreport_for_unclassified_reads" name="Adjust Kraken-Style Bracken Report for Unclassified Reads" version="0.1.0+galaxy0">
+    <description>Adjust kraken-style bracken report to account for unclassified reads</description>
+    <requirements>
+    </requirements>
+    <!-- All paths are single-quoted, including the redirect target. -->
+    <command detect_errors="exit_code"><![CDATA[
+        '$__tool_directory__/adjust_bracken_kreport_for_unclassified_reads.py'
+        --kraken-report '${kraken_report}'
+        --kraken-style-bracken-report '${kraken_style_bracken_report}'
+        > '${adjusted_report}'
+    ]]></command>
+    <inputs>
+        <param name="kraken_report" type="data" format="txt" label="Kraken report" help="Report used to obtain the number of unclassified reads"/>
+        <param name="kraken_style_bracken_report" type="data" format="txt" label="Kraken-style Bracken report" help="Report whose percentages will be adjusted"/>
+    </inputs>
+    <outputs>
+        <!-- The script writes tab-separated rows; 'tabular' also matches the ftype expected by the test below. -->
+        <data name="adjusted_report" label="Adjusted Report" format="tabular"/>
+    </outputs>
+    <tests>
+        <test>
+            <!-- TODO: 'kraken_style_bracken_report' and the expected output file are still empty; fill in matching test-data paths. -->
+            <param name="kraken_report" value="input/SRR17619849_kraken2.txt"/>
+            <param name="kraken_style_bracken_report" value=""/>
+            <output name="adjusted_report" file="" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Reads the number of unclassified reads from the Kraken report, adds an
+``unclassified`` row to the top of the Kraken-style Bracken report, and
+recalculates the percentage column of every row against the combined total
+(Bracken ``root`` reads plus unclassified reads).
+
+**Inputs**
+
+- A Kraken report (six tab-separated columns).
+- A Kraken-style Bracken report (same layout).
+
+**Output**
+
+A tab-separated report with the same six columns as the inputs.
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input/SRR17907745_kraken_style_bracken_report.txt	Thu Oct 27 19:13:25 2022 +0000
@@ -0,0 +1,143 @@
+100.00	2371444	0	R	1	root
+100.00	2371444	0	R1	131567	  cellular organisms
+99.99	2371306	0	D	2	    Bacteria
+99.99	2371287	0	P	1224	      Proteobacteria
+99.99	2371241	0	C	1236	        Gammaproteobacteria
+99.99	2371241	0	O	91347	          Enterobacterales
+99.25	2353760	0	F	543	            Enterobacteriaceae
+53.04	1257733	0	G	570	              Klebsiella
+44.15	1046984	1046984	S	1463165	                Klebsiella quasipneumoniae
+7.73	183381	183381	S	573	                Klebsiella pneumoniae
+0.26	6092	0	G1	2608929	                unclassified Klebsiella
+0.19	4605	4605	S	2015795	                  Klebsiella sp. LY
+0.03	757	757	S	2267618	                  Klebsiella sp. P1CD1
+0.01	318	318	S	2787706	                  Klebsiella sp. BDA134-6
+0.02	410	410	S	2488567	                  Klebsiella sp. FDAARGOS_511
+0.32	7585	7585	S	244366	                Klebsiella variicola
+0.08	1853	1853	S	571	                Klebsiella oxytoca
+0.05	1294	1294	S	548	                Klebsiella aerogenes
+0.05	1261	1261	S	2026240	                Klebsiella quasivariicola
+0.19	4517	4517	S	1134687	                Klebsiella michiganensis
+0.17	3937	3937	S	2058152	                Klebsiella grimontii
+0.03	659	659	S	2489010	                Klebsiella africana
+0.01	164	164	S	2153354	                Klebsiella huaxiensis
+33.69	798958	0	G	561	              Escherichia
+33.68	798809	798809	S	562	                Escherichia coli
+0.01	149	149	S	564	                Escherichia fergusonii
+6.10	144567	0	G	547	              Enterobacter
+6.09	144531	0	G1	354276	                Enterobacter cloacae complex
+3.68	87241	87241	S	158836	                  Enterobacter hormaechei
+2.40	56903	56903	S	550	                  Enterobacter cloacae
+0.01	282	282	S	1812935	                  Enterobacter roggenkampii
+0.00	91	91	S	208224	                  Enterobacter kobei
+0.00	13	13	S	61645	                  Enterobacter asburiae
+0.00	35	0	G1	2608935	                unclassified Enterobacter
+0.00	18	18	S	2596949	                  Enterobacter sp. E76
+0.00	17	17	S	2500132	                  Enterobacter sp. N18-03635
+5.77	136773	0	G	544	              Citrobacter
+5.65	134026	0	G1	1344959	                Citrobacter freundii complex
+5.65	133878	133878	S	546	                  Citrobacter freundii
+0.01	148	148	S	1639133	                  Citrobacter portucalensis
+0.12	2728	0	G1	2644389	                unclassified Citrobacter
+0.10	2352	2352	S	2742632	                  Citrobacter sp. RHBSTW-00053
+0.02	375	375	S	2742638	                  Citrobacter sp. RHBSTW-00137
+0.00	19	19	S	67825	                Citrobacter rodentium
+0.58	13709	0	G	590	              Salmonella
+0.21	5059	0	G1	2614656	                unclassified Salmonella
+0.09	2074	2074	S	2500542	                  Salmonella sp. SSDFZ54
+0.13	2985	2985	S	599	                  Salmonella sp.
+0.36	8649	8649	S	28901	                Salmonella enterica
+0.03	746	0	G	620	              Shigella
+0.03	746	746	S	621	                Shigella boydii
+0.02	554	0	G	160674	              Raoultella
+0.02	431	431	S	54291	                Raoultella ornithinolytica
+0.00	89	89	S	577	                Raoultella terrigena
+0.00	33	33	S	575	                Raoultella planticola
+0.02	409	0	G	1330547	              Kosakonia
+0.02	387	387	S	283686	                Kosakonia radicincitans
+0.00	22	22	S	497725	                Kosakonia oryzae
+0.01	182	0	G	83654	              Leclercia
+0.01	182	0	G1	2627398	                unclassified Leclercia
+0.01	182	182	S	2815358	                  Leclercia sp. 4-9-1-25
+0.00	46	0	G	1330546	              Pluralibacter
+0.00	35	35	S	1334193	                [Enterobacter] lignolyticus
+0.00	11	11	S	61647	                Pluralibacter gergoviae
+0.00	21	0	G	2815296	              Jejubacter
+0.00	21	21	S	2579935	                Jejubacter calystegiae
+0.00	13	0	G	158483	              Cedecea
+0.00	13	13	S	158822	                Cedecea neteri
+0.00	11	0	G	1330545	              Lelliottia
+0.00	11	11	S	61646	                Lelliottia amnigena
+0.00	11	0	G	2726810	              Scandinavium
+0.00	11	11	S	1851514	                Scandinavium goeteborgense
+0.00	11	0	G	1335483	              Shimwellia
+0.00	11	11	S	563	                Shimwellia blattae
+0.00	10	0	G	2055880	              Pseudescherichia
+0.00	10	10	S	566	                Pseudescherichia vulneris
+0.71	16876	0	F	1903409	            Erwiniaceae
+0.71	16788	0	G	82986	              Tatumella
+0.71	16788	0	G1	2649542	                unclassified Tatumella
+0.71	16788	16788	S	2487345	                  Tatumella sp. TA1
+0.00	53	0	G	53335	              Pantoea
+0.00	33	33	S	553	                Pantoea ananatis
+0.00	19	19	S	66269	                Pantoea stewartii
+0.00	34	0	G	551	              Erwinia
+0.00	34	34	S	79967	                Erwinia pyrifoliae
+0.02	525	0	F	1903411	            Yersiniaceae
+0.01	341	0	G	629	              Yersinia
+0.01	190	0	G1	1649845	                Yersinia pseudotuberculosis complex
+0.01	190	190	S	633	                  Yersinia pseudotuberculosis
+0.01	151	151	S	28152	                Yersinia kristensenii
+0.01	151	0	G	34037	              Rahnella
+0.00	79	79	S	34038	                Rahnella aquatilis
+0.00	71	0	G1	2635087	                unclassified Rahnella
+0.00	71	71	S	657334	                  Rahnella sp. WMR42
+0.00	31	0	G	613	              Serratia
+0.00	31	31	S	615	                Serratia marcescens
+0.00	79	0	F	1903410	            Pectobacteriaceae
+0.00	61	0	G	122277	              Pectobacterium
+0.00	61	61	S	2488639	                Pectobacterium versatile
+0.00	18	0	G	71655	              Brenneria
+0.00	18	18	S	1109412	                Brenneria goodwinii
+0.00	46	0	C	28211	        Alphaproteobacteria
+0.00	46	0	O	356	          Hyphomicrobiales
+0.00	46	0	F	41294	            Bradyrhizobiaceae
+0.00	46	0	G	1073	              Rhodopseudomonas
+0.00	46	46	S	1076	                Rhodopseudomonas palustris
+0.00	19	0	D1	1783272	      Terrabacteria group
+0.00	19	0	P	1239	        Firmicutes
+0.00	19	0	C	91061	          Bacilli
+0.00	19	0	O	1385	            Bacillales
+0.00	19	0	F	90964	              Staphylococcaceae
+0.00	19	0	G	1279	                Staphylococcus
+0.00	19	19	S	1280	                  Staphylococcus aureus
+0.01	138	0	D	2759	    Eukaryota
+0.01	138	0	D1	33154	      Opisthokonta
+0.01	138	0	K	33208	        Metazoa
+0.01	138	0	K1	6072	          Eumetazoa
+0.01	138	0	K2	33213	            Bilateria
+0.01	138	0	K3	33511	              Deuterostomia
+0.01	138	0	P	7711	                Chordata
+0.01	138	0	P1	89593	                  Craniata
+0.01	138	0	P2	7742	                    Vertebrata
+0.01	138	0	P3	7776	                      Gnathostomata
+0.01	138	0	P4	117570	                        Teleostomi
+0.01	138	0	P5	117571	                          Euteleostomi
+0.01	138	0	P6	8287	                            Sarcopterygii
+0.01	138	0	C	1338369	                              Dipnotetrapodomorpha
+0.01	138	0	C1	32523	                                Tetrapoda
+0.01	138	0	C2	32524	                                  Amniota
+0.01	138	0	C	40674	                                    Mammalia
+0.01	138	0	C1	32525	                                      Theria
+0.01	138	0	C2	9347	                                        Eutheria
+0.01	138	0	C3	1437010	                                          Boreoeutheria
+0.01	138	0	C4	314146	                                            Euarchontoglires
+0.01	138	0	O	9443	                                              Primates
+0.01	138	0	O1	376913	                                                Haplorrhini
+0.01	138	0	O2	314293	                                                  Simiiformes
+0.01	138	0	O3	9526	                                                    Catarrhini
+0.01	138	0	O4	314295	                                                      Hominoidea
+0.01	138	0	F	9604	                                                        Hominidae
+0.01	138	0	F1	207598	                                                          Homininae
+0.01	138	0	G	9605	                                                            Homo
+0.01	138	138	S	9606	                                                              Homo sapiens