Mercurial > repos > iuc > metaphlan
changeset 1:b89b0765695d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/metaphlan/ commit 2b87bc7417360e2b2c9ec0605d475909f6f0482f"
author | iuc |
---|---|
date | Mon, 17 May 2021 20:10:24 +0000 |
parents | f5df500fcc3c |
children | a92a632c4d9b |
files | formatoutput.py macros.xml metaphlan.xml |
diffstat | 3 files changed, 400 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Reformat a MetaPhlAn taxonomic profile.

Two sub-commands are provided:

* ``split_levels``: write one report per taxonomic level plus a combined
  ``all`` report
* ``format_for_krona``: write the species-level lineages in a
  tab-separated layout usable as Krona text input
'''

import argparse
import re
from contextlib import ExitStack
from pathlib import Path

taxo_level = {
    'k': 'kingdom',
    'p': 'phylum',
    'c': 'class',
    'o': 'order',
    'f': 'family',
    'g': 'genus',
    's': 'species',
    't': 'strains'}

# Column order of the combined report, from the highest to the lowest level
_LEVEL_ORDER = ('k', 'p', 'c', 'o', 'f', 'g', 's', 't')

# Rank prefix of a clade name ("k__", "p__", ...)
_RANK_PREFIX_RE = re.compile(r"\w__")


def _write_header(out_f, levels, legacy_output):
    '''
    Write the header line of a report

    :param out_f: Writable text file object
    :param levels: Iterable of level codes (keys of ``taxo_level``)
    :param legacy_output: Boolean for legacy output (no ``*_id`` columns)
    '''
    for level in levels:
        out_f.write("%s\t" % taxo_level[level])
        if not legacy_output:
            out_f.write("%s_id\t" % taxo_level[level])
    out_f.write("abundance\n")


def _abundance_field(fields):
    '''
    Return the abundance value of an already split profile line

    The last numeric column is used: the default (non legacy) MetaPhlAn
    output may end with a non-numeric ``additional_species`` column which
    must not be mistaken for the abundance.

    :param fields: List of the tab-separated columns of a line
    :return: Last column (clade name excluded) that parses as a number,
        or the very last column if none does
    '''
    for value in reversed(fields[1:]):
        try:
            float(value)
        except ValueError:
            continue
        return value
    return fields[-1]


def split_levels(metaphlan_output_fp, out_dp, legacy_output):
    '''
    Split default MetaPhlAn into a report for each taxonomic level

    One file per level (``kingdom``, ``phylum``, ..., ``strains``) and one
    combined file (``all``) are written into the output directory. Header
    lines (starting with ``#``), blank lines and rows whose clade name
    carries no ``x__`` rank prefix (e.g. ``UNKNOWN``) are skipped.

    :param metaphlan_output_fp: Path default MetaPhlAn output
    :param out_dp: Path to output directory (must already exist)
    :param legacy_output: Boolean for legacy output
    :raises KeyError: if the rank prefix of the deepest level of a row is
        not a key of ``taxo_level``
    :raises IndexError: if a row has fewer columns than expected
    '''
    out_dp = Path(out_dp)
    levels_number = len(taxo_level)
    # a level fills two columns (name + id) unless the output is legacy
    empty_level = '\t' if legacy_output else '\t\t'

    # ExitStack closes every opened file, even if parsing fails midway
    with ExitStack() as stack:
        metaphlan_output_f = stack.enter_context(
            open(metaphlan_output_fp, 'r'))
        abund_f = {
            level: stack.enter_context(
                open(out_dp / Path(taxo_level[level]), 'w'))
            for level in _LEVEL_ORDER}
        all_level_f = stack.enter_context(open(out_dp / Path('all'), 'w'))

        # write headers
        for level in _LEVEL_ORDER:
            _write_header(abund_f[level], [level], legacy_output)
        _write_header(all_level_f, _LEVEL_ORDER, legacy_output)

        # parse metaphlan file
        for line in metaphlan_output_f:
            # skip headers
            if line.startswith("#"):
                continue

            # split line (only the line ending is removed, so the last
            # value is kept intact when the file has no final newline)
            split_line = line.rstrip('\r\n').split('\t')
            ranks = [
                clade.partition('__') for clade in split_line[0].split('|')]
            # skip blank lines and clades without rank prefix (UNKNOWN)
            if not all(sep for _, sep, _ in ranks):
                continue

            if legacy_output:
                taxo_id = []
                abundance = split_line[1]
            else:
                taxo_id = split_line[1].split('|')
                # tolerate an id list shorter than the lineage
                taxo_id += [''] * (len(ranks) - len(taxo_id))
                abundance = split_line[2]

            # get taxon names
            names = [name.replace("_", " ") for _, _, name in ranks]

            # write the line in the all level file
            for i, taxo in enumerate(names):
                all_level_f.write("%s\t" % taxo)
                if not legacy_output:
                    all_level_f.write("%s\t" % taxo_id[i])
            # if not all taxon levels: keep abundance under its header
            all_level_f.write(empty_level * (levels_number - len(names)))
            all_level_f.write("%s\n" % abundance)

            # write the line in the file of its deepest level
            level_f = abund_f[ranks[-1][0]]
            level_f.write("%s\t" % names[-1])
            if not legacy_output:
                level_f.write("%s\t" % taxo_id[len(ranks) - 1])
            level_f.write("%s\n" % abundance)


def format_for_krona(metaphlan_output_fp, krona_out_fp):
    '''
    Format the species lines of a MetaPhlAn output for Krona

    Every line containing ``s__`` is written as the abundance followed by
    the lineage, one taxon per tab-separated column (rank prefixes removed
    and underscores replaced by spaces).

    :param metaphlan_output_fp: Path default MetaPhlAn output
    :param krona_out_fp: Path to output file for Krona
    '''
    with open(metaphlan_output_fp, 'r') as metaphlan_output_f:
        with open(krona_out_fp, 'w') as krona_out_f:
            for line in metaphlan_output_f:
                if "s__" not in line:
                    continue
                x = line.rstrip().split('\t')
                lineage = x[0].replace('|', '')
                # each rank prefix becomes a column separator, hence the
                # lineage starts with a tab (empty column after abundance)
                lineage = _RANK_PREFIX_RE.sub('\t', lineage)
                lineage = lineage.replace('_', ' ')
                krona_out_f.write(
                    "%s\t%s\n" % (_abundance_field(x), lineage))


def main():
    '''
    Parse the command line and run the requested sub-command
    '''
    parser = argparse.ArgumentParser(description='Format MetaPhlAn output')
    subparsers = parser.add_subparsers(dest='function')
    # split_levels
    split_levels_parser = subparsers.add_parser(
        'split_levels',
        help='Split default MetaPhlAn into a report for each taxonomic level')
    split_levels_parser.add_argument(
        '--metaphlan_output', required=True,
        help="Path to default MetaPhlAn output")
    split_levels_parser.add_argument(
        '--outdir', required=True,
        help="Path to output directory")
    split_levels_parser.add_argument(
        '--legacy-output', dest='legacy_output', action='store_true',
        help="Old MetaPhlAn2 two columns output")
    split_levels_parser.set_defaults(legacy_output=False)
    # format_for_krona
    format_for_krona_parser = subparsers.add_parser(
        'format_for_krona',
        help='Format default MetaPhlAn output for Krona')
    format_for_krona_parser.add_argument(
        '--metaphlan_output', required=True,
        help="Path to default MetaPhlAn output")
    format_for_krona_parser.add_argument(
        '--krona_output', required=True,
        help="Path to Krona output file")

    args = parser.parse_args()

    if args.function == 'split_levels':
        split_levels(
            Path(args.metaphlan_output),
            Path(args.outdir),
            args.legacy_output)
    elif args.function == 'format_for_krona':
        format_for_krona(
            Path(args.metaphlan_output),
            Path(args.krona_output))


if __name__ == '__main__':
    main()
--- a/macros.xml Mon Apr 19 20:56:20 2021 +0000 +++ b/macros.xml Mon May 17 20:10:24 2021 +0000 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <macros> - <token name="@TOOL_VERSION@">3.0.7</token> + <token name="@TOOL_VERSION@">3.0.8</token> <token name="@VERSION_SUFFIX@">0</token> <token name="@PROFILE@">20.01</token> <xml name="edam_ontology"> @@ -24,17 +24,4 @@ <citation type="doi">1101/2020.11.19.388223</citation> </citations> </xml> - <token name="@FILE_FORMATS@">fastq,fastq.gz,fastq.bz2,fasta,fasta.gz,fasta.bz2</token> - <xml name="tax_lev"> - <param argument="--tax_lev" type="select" label="Taxonomic level for the relative abundance output"> - <option value="a" selected="true">All taxonomic levels</option> - <option value="k">Kingdoms only</option> - <option value="p">Phyla only</option> - <option value="c">Classes only</option> - <option value="o">Orders only</option> - <option value="f">Families only</option> - <option value="g">Genera only</option> - <option value="s">Species only</option> - </param> - </xml> </macros>
--- a/metaphlan.xml Mon Apr 19 20:56:20 2021 +0000 +++ b/metaphlan.xml Mon May 17 20:10:24 2021 +0000 @@ -2,6 +2,32 @@ <description>to profile the composition of microbial communities</description> <macros> <import>macros.xml</import> + <xml name="tax_lev"> + <conditional name="tax_lev"> + <param argument="--tax_lev" type="select" label="Taxonomic level for the relative abundance output"> + <option value="a" selected="true">All taxonomic levels</option> + <option value="k">Kingdoms only</option> + <option value="p">Phyla only</option> + <option value="c">Classes only</option> + <option value="o">Orders only</option> + <option value="f">Families only</option> + <option value="g">Genera only</option> + <option value="s">Species only</option> + </param> + <when value="a"> + <param name="split_levels" type='boolean' checked="false" truevalue='true' falsevalue='false' + label="Generate a report for each taxonomic level?" help="It will be in addition to the default output"/> + </when> + <when value="k"/> + <when value="p"/> + <when value="c"/> + <when value="o"/> + <when value="f"/> + <when value="g"/> + <when value="s"/> + </conditional> + </xml> + <token name="@FILE_FORMATS@">fastq,fastq.gz,fastq.bz2,fasta,fasta.gz,fasta.bz2</token> </macros> <expand macro="edam_ontology"/> <expand macro="requirements"/> @@ -110,7 +136,7 @@ #end if -t '$analysis.analysis_type.t' #if $analysis.analysis_type.t == "rel_ab" or $analysis.analysis_type.t == "rel_ab_w_read_stats" - --tax_lev '$analysis.analysis_type.tax_lev' + --tax_lev '$analysis.analysis_type.tax_lev.tax_lev' #else if $analysis.analysis_type.t == "clade_specific_strain_tracker" --clade '$analysis.analysis_type.clade' #if str($analysis.analysis_type.min_ab) != '' @@ -159,6 +185,27 @@ && mv 'bowtie2out' '$bowtie2out' #end if + +#if $analysis.analysis_type.tax_lev.tax_lev == 'a' and $analysis.analysis_type.tax_lev.split_levels +&& +mkdir 'split_levels' +&& +python '$__tool_directory__/formatoutput.py' + split_levels + 
--metaphlan_output '$output_file' + --outdir 'split_levels' + $out.legacy_output +&& +ls split_levels +#end if + +#if $out.krona_output +&& +python '$__tool_directory__/formatoutput.py' + format_for_krona + --metaphlan_output '$output_file' + --krona_output '$krona_output_file' +#end if ]]></command> <inputs> <section name="inputs" title="Inputs" expanded="true"> @@ -294,6 +341,7 @@ label="Report the profiling using the CAMI output format?"/> <param argument="--unknown_estimation" type='boolean' checked="false" truevalue='--unknown_estimation' falsevalue='' label="Scale relative abundances to the number of reads mapping to known clades in order to estimate unknowness?"/> + <param name="krona_output" type='boolean' checked="false" truevalue='true' falsevalue='false' label="Output for Krona?"/> </section> </inputs> <outputs> @@ -305,9 +353,16 @@ <filter>inputs['in']['selector'] == "raw"</filter> </data> <data name="biom_output_file" format="biom1" label="${tool.name} on ${on_string}: BIOM file" /> + <collection name="levels" type="list" label="${tool.name} on ${on_string}: Predicted taxon relative abundances at each taxonomic levels" > + <discover_datasets pattern="(?P<designation>.+)" directory="split_levels/" format="tabular"/> + <filter>analysis['analysis_type']['tax_lev']['tax_lev'] == "a" and analysis['analysis_type']['tax_lev']['split_levels']</filter> + </collection> + <data name="krona_output_file" format="tabular" label="${tool.name} on ${on_string}: Predicted taxon relative abundances for Krona"> + <filter>out['krona_output']</filter> + </data> </outputs> <tests> - <test expect_num_outputs="4"> + <test expect_num_outputs="6"> <section name="inputs"> <conditional name="in"> <param name="selector" value="raw"/> @@ -329,6 +384,13 @@ </conditional> </section> <section name="analysis"> + <conditional name="analysis_type"> + <param name="t" value="rel_ab"/> + <conditional name="tax_lev"> + <param name="tax_lev" value="a"/> + <param name="split_levels" 
value="true"/> + </conditional> + </conditional> <param name="min_cu_len" value="2000"/> <param name="organism_profiling" value="add_viruses"/> <param name="stat" value="avg_g"/> @@ -343,6 +405,7 @@ <param name="legacy_output" value="false"/> <param name="CAMI_format_output" value="false"/> <param name="unknown_estimation" value="false"/> + <param name="krona_output" value="true"/> </section> <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size"> <assert_contents> @@ -365,6 +428,84 @@ <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/> </assert_contents> </output> + <output_collection name="levels" type="list" > + <element name="all" ftype="tabular"> + <assert_contents> + <has_text text="Gammaproteobacteria"/> + <has_text text="Corynebacterium accolens"/> + <has_n_columns n="17"/> + </assert_contents> + </element> + <element name="kingdom" ftype="tabular"> + <assert_contents> + <has_text text="kingdom_id"/> + <has_text text="Bacteria"/> + <has_n_columns n="3"/> + </assert_contents> + </element> + <element name="phylum" ftype="tabular"> + <assert_contents> + <has_text text="phylum_id"/> + <not_has_text text="kingdom_id"/> + <has_text text="Firmicutes"/> + <has_n_columns n="3"/> + </assert_contents> + </element> + <element name="class" ftype="tabular"> + <assert_contents> + <has_text text="class_id"/> + <not_has_text text="phylum_id"/> + <has_text text="Actinobacteria"/> + <has_n_columns n="3"/> + </assert_contents> + </element> + <element name="order" ftype="tabular"> + <assert_contents> + <has_text text="order_id"/> + <not_has_text text="class_id"/> + <has_text text="Propionibacteriales"/> + <has_n_columns n="3"/> + </assert_contents> + </element> + <element name="family" ftype="tabular"> + <assert_contents> + <has_text text="family_id"/> + <not_has_text text="order"/> + <has_text text="Propionibacteriaceae"/> + 
<has_n_columns n="3"/> + </assert_contents> + </element> + <element name="genus" ftype="tabular"> + <assert_contents> + <has_text text="genus_id"/> + <not_has_text text="family"/> + <has_text text="Cutibacterium"/> + <has_n_columns n="3"/> + </assert_contents> + </element> + <element name="species" ftype="tabular"> + <assert_contents> + <has_text text="species_id"/> + <not_has_text text="genus"/> + <has_text text="Corynebacterium accolens"/> + <has_n_columns n="3"/> + </assert_contents> + </element> + <element name="strains" ftype="tabular"> + <assert_contents> + <has_text text="strains_id"/> + <not_has_text text="species_id"/> + <has_n_columns n="3"/> + </assert_contents> + </element> + </output_collection> + <output name="krona_output_file" ftype="tabular"> + <assert_contents> + <not_has_text text="k__Bacteria"/> + <has_text text="Corynebacterium accolens"/> + <has_n_columns n="9"/> + </assert_contents> + </output> </test> <test expect_num_outputs="4"> <section name="inputs"> @@ -389,6 +530,13 @@ </conditional> </section> <section name="analysis"> + <conditional name="analysis_type"> + <param name="t" value="rel_ab"/> + <conditional name="tax_lev"> + <param name="tax_lev" value="a"/> + <param name="split_levels" value="false"/> + </conditional> + </conditional> <param name="min_cu_len" value="2000"/> <param name="organism_profiling" value="add_viruses"/> <param name="stat" value="avg_g"/> @@ -403,6 +551,7 @@ <param name="legacy_output" value="false"/> <param name="CAMI_format_output" value="false"/> <param name="unknown_estimation" value="false"/> + <param name="krona_output" value="false"/> </section> <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size"> <assert_contents> @@ -452,6 +601,13 @@ </conditional> </section> <section name="analysis"> + <conditional name="analysis_type"> + <param name="t" value="rel_ab"/> + <conditional name="tax_lev"> + <param name="tax_lev" value="a"/> + <param 
name="split_levels" value="false"/> + </conditional> + </conditional> <param name="min_cu_len" value="2000"/> <param name="organism_profiling" value="add_viruses"/> <param name="stat" value="avg_g"/> @@ -466,6 +622,7 @@ <param name="legacy_output" value="false"/> <param name="CAMI_format_output" value="false"/> <param name="unknown_estimation" value="false"/> + <param name="krona_output" value="false"/> </section> <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size"> <assert_contents> @@ -506,6 +663,13 @@ </conditional> </section> <section name="analysis"> + <conditional name="analysis_type"> + <param name="t" value="rel_ab"/> + <conditional name="tax_lev"> + <param name="tax_lev" value="a"/> + <param name="split_levels" value="false"/> + </conditional> + </conditional> <param name="min_cu_len" value="2000"/> <param name="organism_profiling" value="add_viruses"/> <param name="stat" value="avg_g"/> @@ -520,6 +684,7 @@ <param name="legacy_output" value="false"/> <param name="CAMI_format_output" value="false"/> <param name="unknown_estimation" value="false"/> + <param name="krona_output" value="false"/> </section> <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size"> <assert_contents> @@ -557,6 +722,13 @@ <param name="min_mapq_val" value="5"/> </section> <section name="analysis"> + <conditional name="analysis_type"> + <param name="t" value="rel_ab"/> + <conditional name="tax_lev"> + <param name="tax_lev" value="a"/> + <param name="split_levels" value="false"/> + </conditional> + </conditional> <param name="min_cu_len" value="2000"/> <param name="organism_profiling" value="add_viruses"/> <param name="stat" value="avg_g"/> @@ -571,6 +743,7 @@ <param name="legacy_output" value="false"/> <param name="CAMI_format_output" value="false"/> <param name="unknown_estimation" value="false"/> + <param name="krona_output" value="false"/> </section> <output 
name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size"> <assert_contents> @@ -586,7 +759,7 @@ </assert_contents> </output> </test> - <test expect_num_outputs="4"> + <test expect_num_outputs="6"> <section name="inputs"> <conditional name="in"> <param name="selector" value="raw"/> @@ -608,6 +781,13 @@ </conditional> </section> <section name="analysis"> + <conditional name="analysis_type"> + <param name="t" value="rel_ab"/> + <conditional name="tax_lev"> + <param name="tax_lev" value="a"/> + <param name="split_levels" value="true"/> + </conditional> + </conditional> <param name="min_cu_len" value="2000"/> <param name="organism_profiling" value="add_viruses"/> <param name="stat" value="avg_g"/> @@ -623,6 +803,7 @@ <param name="legacy_output" value="true"/> <param name="CAMI_format_output" value="false"/> <param name="unknown_estimation" value="false"/> + <param name="krona_output" value="true"/> </section> <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-legacy-abundances.tabular" compare="sim_size"> <assert_contents> @@ -647,6 +828,77 @@ <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/> </assert_contents> </output> + <output_collection name="levels" type="list" > + <element name="all" ftype="tabular"> + <assert_contents> + <has_text text="Gammaproteobacteria"/> + <has_text text="Corynebacterium accolens"/> + <has_n_columns n="9"/> + </assert_contents> + </element> + <element name="kingdom" ftype="tabular"> + <assert_contents> + <has_text text="kingdom"/> + <has_text text="Bacteria"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + <element name="phylum" ftype="tabular"> + <assert_contents> + <has_text text="phylum"/> + <has_text text="Firmicutes"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + <element name="class" ftype="tabular"> + <assert_contents> + <has_text 
text="class"/> + <has_text text="Actinobacteria"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + <element name="order" ftype="tabular"> + <assert_contents> + <has_text text="order"/> + <has_text text="Propionibacteriales"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + <element name="family" ftype="tabular"> + <assert_contents> + <has_text text="family"/> + <has_text text="Propionibacteriaceae"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + <element name="genus" ftype="tabular"> + <assert_contents> + <has_text text="genus"/> + <has_text text="Cutibacterium"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + <element name="species" ftype="tabular"> + <assert_contents> + <has_text text="species"/> + <has_text text="Corynebacterium accolens"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + <element name="strains" ftype="tabular"> + <assert_contents> + <has_text text="strains"/> + <has_n_columns n="2"/> + </assert_contents> + </element> + </output_collection> + <output name="krona_output_file" ftype="tabular"> + <assert_contents> + <not_has_text text="k__Bacteria"/> + <has_text text="Corynebacterium accolens"/> + <has_n_columns n="9"/> + </assert_contents> + </output> </test> </tests> <help><![CDATA[