changeset 11:ac24fff14f23 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2
author iuc
date Fri, 02 Dec 2022 10:52:48 +0000
parents a3395b1d871b
children d78faac2c6ef
files datasets_gene.xml datasets_genome.xml macros.xml test-data/geneids.txt
diffstat 4 files changed, 973 insertions(+), 113 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets_gene.xml	Fri Dec 02 10:52:48 2022 +0000
@@ -0,0 +1,536 @@
+<tool id="datasets_download_gene" name="NCBI Datasets Gene" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
+    <description>download gene sequences and metadata</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"></expand>
+    <command><![CDATA[
+## Build the `datasets download gene` call, then derive the TSV report(s)
+## from the downloaded ncbi_dataset.zip with `dataformat`.
+## NOTE(review): file_choices.decompress is not referenced anywhere in this
+## template; confirm whether the parameter is still needed.
+#import re
+@SETUP_CERTIFICATES@
+datasets download gene $query.subcommand.download_by
+#if $query.subcommand.download_by == 'taxon':
+    '$query.subcommand.taxon_positional'
+#else:
+    #if $query.subcommand.text_or_file.text_or_file == 'text':
+        ## identifiers may be separated by spaces and/or commas; each one is quoted
+        #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
+    #else
+        --inputfile '$query.subcommand.text_or_file.inputfile'
+    #end if
+#end if
+
+#if $query.subcommand.download_by != 'taxon' and $query.subcommand.ortholog:
+    --ortholog '$query.subcommand.ortholog'
+#end if
+
+#if $query.subcommand.download_by == 'symbol':
+    #if $query.subcommand.taxon
+        --taxon '$query.subcommand.taxon'
+    #end if
+#end if
+
+#if $query.subcommand.download_by == 'accession':
+    #if $query.subcommand.taxon_filter
+        --taxon-filter '$query.subcommand.taxon_filter'
+    #end if
+    ## optional integer: the string form is empty when no value was given
+    #if str($query.subcommand.include_flanks_bp)
+        --include-flanks-bp $query.subcommand.include_flanks_bp
+    #end if
+#end if
+
+#if $filters.fasta_filter_cond.fasta_filter_select
+    #if $filters.fasta_filter_cond.fasta_filter_select == 'text'
+        ## Drop empty entries and skip the flag entirely when nothing is left:
+        ## a bare flag would take the option that follows it as its value.
+        #set $fasta_filter_accessions = [x for x in str($filters.fasta_filter_cond.fasta_filter).split(',') if x]
+        #if $fasta_filter_accessions
+            --fasta-filter #echo ",".join(f"'{x}'" for x in $fasta_filter_accessions)
+        #end if
+    #else
+        --fasta-filter-file '$filters.fasta_filter_cond.fasta_filter_file'
+    #end if
+#end if
+
+## requested file types, or "none" when nothing was selected
+--include
+#if $file_choices.kingdom_cond.include
+    #echo ",".join($file_choices.kingdom_cond.include)
+#else
+    none
+#end if
+
+--no-progressbar
+
+## produce TSV report file (either gene or prok-gene)
+&&
+dataformat
+    tsv
+    $file_choices.kingdom_cond.kingdom_sel
+    --package ncbi_dataset.zip
+    --fields #echo ",".join($file_choices.kingdom_cond.report_columns)
+    > gene_data_report.tsv
+## if ! dataformat tsv gene --package ncbi_dataset.zip > gene_data_report.tsv 2> dataformat.log; then
+##     dataformat tsv prok-gene --package ncbi_dataset.zip > gene_data_report.tsv 2>> dataformat.log;
+## fi
+
+#if $file_choices.kingdom_cond.include and "product-report" in $file_choices.kingdom_cond.include
+    && dataformat tsv gene-product --package ncbi_dataset.zip > gene_product_report.tsv
+#end if
+
+## unzip the archive if any data files were requested (include is not empty);
+## this command does not request a dehydrated package, so there is no rehydrate step
+#if $file_choices.kingdom_cond.include
+    ## unzip
+    && 7z x -y ncbi_dataset.zip > 7z.log
+#end if
+]]></command>
+    <inputs>
+        <section name="query" title="Query" expanded="true">
+            <!-- Selects the positional subcommand of the download call; each branch exposes the options that the command template reads for that lookup mode. -->
+            <conditional name="subcommand">
+                <param name="download_by" type="select" label="Choose how to find genes to download">
+                    <option value="gene-id">By NCBI Gene ID</option>
+                    <option value="symbol">By Gene symbol</option>
+                    <option value="accession">By RefSeq nucleotide or protein accession</option>
+                    <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
+                </param>
+                <when value="gene-id">
+                    <expand macro="text_or_file" what="Gene ID" what_extended="NCBI Gene ID" help=""/>
+                    <expand macro="ortholog"/>
+                </when>
+                <when value="symbol">
+                    <expand macro="text_or_file" what="Gene Symbol" what_extended="NCBI Gene Symbol" help=""/>
+                    <expand macro="ortholog"/>
+                    <!-- digits are valid so that an NCBI taxid, which the help text advertises, survives sanitizing -->
+                    <param argument="--taxon" type="text" value="human" label="Species for gene symbol" help="NCBI taxid, common or scientific name">
+                        <sanitizer invalid_char="">
+                            <valid initial="string.letters,string.digits">
+                                <add value=" " />
+                                <add value="-" />
+                            </valid>
+                        </sanitizer>
+                    </param>
+                </when>
+                <when value="accession">
+                    <expand macro="text_or_file" what="Gene Accession" what_extended="NCBI Gene Accession" help=""/>
+                    <expand macro="ortholog"/>
+                    <!-- NOTE(review): digits are removed by this sanitizer, so a numeric taxid cannot be used as filter; confirm whether that is intended -->
+                    <param argument="--taxon-filter" type="text" value="" label="Limit gene sequences and annotation report file to specified taxon" help="any rank, only available for WP accessions">
+                        <sanitizer invalid_char="">
+                            <valid initial="string.letters">
+                                <add value=" " />
+                                <add value="-" />
+                            </valid>
+                        </sanitizer>
+                    </param>
+                    <param argument="--include-flanks-bp" type="integer" optional="true" min="0" label="Length of flanking nucleotides" help="WP accessions only"/>
+                </when>
+                <when value="taxon">
+                    <expand macro="taxon_positional"/>
+                </when>
+            </conditional>
+        </section>
+        <section name="filters" title="Filters and Limit">
+            <!-- Optional restriction of the downloaded sequences to a list of accessions, typed in or read from a dataset. -->
+            <conditional name="fasta_filter_cond" label="Filter protein and RNA sequences by RefSeq nucleotide and protein accessions">
+                <param name="fasta_filter_select" type="select" label="Apply filter">
+                    <option value="">No</option>
+                    <option value="text">Enter accessions</option>
+                    <option value="file">Read a list of accessions from a dataset</option>
+                </param>
+                <when value=""/>
+                <when value="text">
+                    <param argument="--fasta-filter" type="text" label="RefSeq nucleotide and protein accessions" help="Comma separated">
+                        <!-- accessions such as NP_542432.2 contain an underscore and a version dot; both must pass the sanitizer or the filter value is corrupted -->
+                        <sanitizer invalid_char="">
+                            <valid initial="string.letters,string.digits">
+                                <add value="," />
+                                <add value="_" />
+                                <add value="." />
+                            </valid>
+                        </sanitizer>
+                    </param>
+                </when>
+                <when value="file">
+                    <param argument="--fasta-filter-file" type="data" format="txt" label="Dataset with list of RefSeq nucleotide and protein accessions" help=""/>
+                </when>
+            </conditional>
+        </section>
+        <section name="file_choices" title="Output options" expanded="true">
+            <!-- The kingdom selects the report type passed to dataformat and, per branch, the report columns and the file types that can be included. -->
+            <conditional name="kingdom_cond">
+                <param name="kingdom_sel" type="select" label="Kingdom" help="Prokaryotic: Accessions starting with WP_. Data report has a different format and the rna, cds, 3/5' UTR and gene-product report are not supported.">
+                    <option value="gene">Eukaryote</option>
+                    <option value="prok-gene">Prokaryote</option>
+                </param>
+                <when value="gene">
+                    <expand macro="gene_tsv_report_columns">
+                        <option value="gene-id" selected="true">NCBI GeneID</option>
+                        <option value="gene-type" selected="true">Gene Type</option>
+                        <option value="common-name" selected="true">Common Name</option>
+                        <option value="description" selected="true">Description</option>
+                        <option value="symbol" selected="true">Symbol</option>
+                        <option value="synonyms" selected="true">Synonyms</option>
+                        <option value="tax-id" selected="true">Taxonomic ID</option>
+                        <option value="tax-name" selected="true">Taxonomic Name</option>
+                    </expand>
+                    <expand macro="include">
+                        <expand macro="gene_includes">
+                            <option value="rna" selected="true">transcript (rna)</option>
+                            <option value="cds">nucleotide coding sequences (cds)</option>
+                            <option value="5p-utr">5'-UTR (5p-utr)</option>
+                            <option value="3p-utr">3'-UTR (3p-utr)</option>
+                            <option value="product-report">gene product report (product-report)</option>
+                        </expand>
+                    </expand>
+                </when>
+                <when value="prok-gene">
+                    <expand macro="prok_gene_tsv_report_columns">
+                        <option value="accession" selected="true">Accession</option>
+                        <option value="description" selected="true">Description</option>
+                        <option value="ec-number" selected="true">EC Number</option>
+                        <option value="gene-symbol" selected="true">Gene Symbol</option>
+                        <option value="mapping-count" selected="true">Number of Genome Mappings</option>
+                        <option value="protein-length" selected="true">Protein Length</option>
+                        <option value="protein-name" selected="true">Protein Name</option>
+                    </expand>
+                    <expand macro="include">
+                        <expand macro="gene_includes"/>
+                    </expand>
+                </when>
+            </conditional>
+            <!-- NOTE(review): decompress is not referenced by the command template of this tool; confirm whether it is still needed -->
+            <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- The data report is always produced; every other output exists only when its file type is among the selected include values. -->
+        <data name="gene_data_report" format="tabular" label="NCBI Gene Datasets: Data Report" from_work_dir="gene_data_report.tsv"/>
+        <data name="gene_product_report" format="tabular" label="NCBI Gene Datasets: Product Report" from_work_dir="gene_product_report.tsv">
+            <filter>file_choices['kingdom_cond']['include'] and "product-report" in file_choices['kingdom_cond']['include']</filter>
+        </data>
+        <data name="gene_fasta" label="NCBI Gene Datasets: Gene fasta" format="fasta" from_work_dir="ncbi_dataset/data/gene.fna">
+            <filter>file_choices['kingdom_cond']['include'] and "gene" in file_choices['kingdom_cond']['include']</filter>
+        </data>
+        <data name="rna_fasta" label="NCBI Gene Datasets: RNA fasta" format="fasta" from_work_dir="ncbi_dataset/data/rna.fna">
+            <filter>file_choices['kingdom_cond']['include'] and "rna" in file_choices['kingdom_cond']['include']</filter>
+        </data>
+        <data name="protein_fasta" label="NCBI Gene Datasets: protein fasta" format="fasta" from_work_dir="ncbi_dataset/data/protein.faa">
+            <filter>file_choices['kingdom_cond']['include'] and "protein" in file_choices['kingdom_cond']['include']</filter>
+        </data>
+        <data name="cds_fasta" label="NCBI Gene Datasets: CDS fasta" format="fasta" from_work_dir="ncbi_dataset/data/cds.fna">
+            <filter>file_choices['kingdom_cond']['include'] and "cds" in file_choices['kingdom_cond']['include']</filter>
+        </data>
+        <!-- the 3' UTR output must follow the 3p-utr selection, not the 5p-utr one -->
+        <data name="threep_utr_fasta" label="NCBI Gene Datasets: 3' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/3p_utr.fna">
+            <filter>file_choices['kingdom_cond']['include'] and "3p-utr" in file_choices['kingdom_cond']['include']</filter>
+        </data>
+        <data name="fivep_utr_fasta" label="NCBI Gene Datasets: 5' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/5p_utr.fna">
+            <filter>file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- 1: datasets download gene gene-id 672; no output options are set and the test expects the data report plus RNA and protein FASTA -->
+        <test expect_num_outputs="3">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="gene-id"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="672"/>
+                </conditional>
+            </conditional>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="human"/>
+                    <has_text text="BRCA1"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+            <output name="rna_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+            <output name="protein_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- 2: datasets download gene gene-id 2597 14433; both IDs are given as one comma separated text value -->
+        <test expect_num_outputs="3">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="gene-id"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="2597,14433"/>
+                </conditional>
+            </conditional>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="house mouse"/>
+                    <has_text text="glyceraldehyde-3-phosphate dehydrogenase"/>
+                    <has_n_lines n="3"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+            <output name="rna_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+            <output name="protein_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- 3: gene IDs read from the dataset geneids.txt, ortholog value Haplorrhini,Strepsirrhini, include set to gene and cds FASTA -->
+        <test expect_num_outputs="3">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="gene-id"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="file"/>
+                    <param name="inputfile" value="geneids.txt"/>
+                </conditional>
+                <param name="ortholog" value="Haplorrhini,Strepsirrhini"/>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value="gene,cds"/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="baboon"/>
+                    <has_text text="glyceraldehyde-3-phosphate dehydrogenase"/>
+                    <has_n_lines n="31"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+            <output name="gene_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+            <output name="cds_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- 4: datasets download gene symbol tp53; include is left empty, so the data report is the only expected output -->
+        <test expect_num_outputs="1">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="symbol"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="tp53"/>
+                </conditional>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value=""/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="human"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- 5: datasets download gene symbol brca1 \-\-taxon mouse; include is 3p-utr, 5p-utr and product-report, so four outputs are expected -->
+        <test expect_num_outputs="4">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="symbol"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="brca1"/>
+                </conditional>
+                <param name="taxon" value="mouse"/>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value="3p-utr,5p-utr,product-report"/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="house mouse"/>
+                    <has_text text="Brca1"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+            <output name="gene_product_report">
+                <assert_contents>
+                    <has_text text="house mouse"/>
+                    <has_text text="XR_004936704.1"/>
+                    <has_n_lines n="137"/>
+                    <has_n_columns n="38"/>
+                </assert_contents>
+            </output>
+            <output name="threep_utr_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+            <output name="fivep_utr_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- 6: datasets download gene symbol brca1 \-\-ortholog rodentia; include is empty, only the data report is expected -->
+        <test expect_num_outputs="1">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="symbol"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="brca1"/>
+                </conditional>
+                <param name="ortholog" value="rodentia"/>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value=""/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="rat"/>
+                    <has_text text="Brca1"/>
+                    <has_n_lines n="38"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- 7: datasets download gene accession NP_000483.3; include is empty, only the data report is expected -->
+        <test expect_num_outputs="1">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="accession"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="NP_000483.3"/>
+                </conditional>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value=""/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="human"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- 8: datasets download gene accession NM_000546.6 NM_000492.4 (space separated in one text value) + ortholog; only the data report is expected -->
+        <test expect_num_outputs="1">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="accession"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="NM_000546.6 NM_000492.4"/>
+                </conditional>
+                <param name="ortholog" value="true"/>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value=""/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="human"/>
+                    <has_n_lines n="823"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- 9: datasets download gene accession WP_004675351.1 + include_flanks_bp; uses the prok-gene report (7 columns) and also asserts the flag on the generated command line -->
+        <test expect_num_outputs="3">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="accession"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="WP_004675351.1"/>
+                </conditional>
+                <param name="include_flanks_bp" value="100"/>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="kingdom_sel" value="prok-gene"/>
+                    <param name="include" value="gene,protein"/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="glcE"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="7"/>
+                </assert_contents>
+            </output>
+            <output name="gene_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+            <output name="protein_fasta">
+                <assert_contents>
+                    <has_text text=">"/>
+                </assert_contents>
+            </output>
+            <assert_command>
+                <has_text text="include-flanks-bp 100"/>
+            </assert_command>
+        </test> 
+
+        <!-- 10: datasets download gene taxon human; include is empty, only the data report is expected -->
+        <test expect_num_outputs="1">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="taxon"/>
+                <param name="taxon_positional" value="human"/>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value=""/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="human"/>
+                    <has_n_lines n="72533"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+        </test> 
+        <!-- 11: datasets download gene taxon human + \-\-fasta-filter NP_542432.2; the protein FASTA assertion is has_text ">" with n="1" -->
+        <test expect_num_outputs="2">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="taxon"/>
+                <param name="taxon_positional" value="human"/>
+            </conditional>
+            <section name="file_choices">
+                <conditional name="kingdom_cond">
+                    <param name="include" value="protein"/>
+                </conditional>
+            </section>
+            <section name="filters">
+                <conditional name="fasta_filter_cond">
+                    <param name="fasta_filter_select" value="text"/>
+                    <param name="fasta_filter" value="NP_542432.2"/>
+                </conditional>
+            </section>
+            <output name="gene_data_report">
+                <assert_contents>
+                    <has_text text="human"/>
+                    <has_n_lines n="72533"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+            <output name="protein_fasta">
+                <assert_contents>
+                    <has_text text=">" n="1" />
+                </assert_contents>
+            </output></test>
+    </tests>
+    <help>
+<![CDATA[
+**Download Gene Datasets from NCBI**
+
+Download a gene dataset (gene sequence, transcript, amino acid sequences,
+nucleotide coding sequences, 5'-UTR, 3'-UTR) as well as gene and gene
+product reports. Genes can be referred by gene id, symbol, accession,
+or taxon.
+]]>
+    </help>
+</tool>
--- a/datasets_genome.xml	Mon Nov 21 11:40:05 2022 +0000
+++ b/datasets_genome.xml	Fri Dec 02 10:52:48 2022 +0000
@@ -5,16 +5,17 @@
     </macros>
     <expand macro="requirements"></expand>
     <command><![CDATA[
+#import re
 @SETUP_CERTIFICATES@
 datasets download genome $query.subcommand.download_by
 #if $query.subcommand.download_by == 'accession':
     #if $query.subcommand.text_or_file.text_or_file == 'text':
-        #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x)
+        #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
     #else
         --inputfile '$query.subcommand.text_or_file.inputfile'
     #end if
 #else:
-    '$query.subcommand.taxon'
+    '$query.subcommand.taxon_positional'
     $query.subcommand.tax_exact_match
 #end if
 $filters.reference
@@ -37,24 +38,55 @@
     --search '$filters.search_term'
 #end for
 --no-progressbar
-#if $uncompressed
-&& 7z x -y ncbi_dataset.zip
-#else
-&& 7z l ncbi_dataset.zip > ncbi_dataset.txt
+--dehydrated
+
+## produce TSV report file
+&& dataformat tsv genome 
+    --package ncbi_dataset.zip
+    --fields #echo ",".join($file_choices.report_columns) 
+    > genome_data_report.tsv
+
+## unzip and rehydrate if any data is to be downloaded (include is not None)
+#if $file_choices.include
+    ## unzip
+    && 7z x -y ncbi_dataset.zip > 7z.log
+
+    ## rehydrate
+    && datasets rehydrate
+        --directory ./
+        #if not $file_choices.decompress
+            --gzip
+        #end if
+        --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}
+
+    ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
+    && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
+
+    ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
+    && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
+    #if $file_choices.decompress
+        && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
+    #end if
+
+    #if "seq-report" in $file_choices.include
+        && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
+    #end if
+    
+    && true  ## because Galaxy removes trailing ; from command
 #end if
 ]]></command>
     <inputs>
         <section name="query" title="Query" expanded="true">
             <conditional name="subcommand">
                 <param name="download_by" type="select" label="Choose how to find genomes to download">
-                    <option value="accession">Download by NCBI assembly or BioProject accession</option>
-                    <option value="taxon">Download by taxon</option>
+                    <option value="accession">By NCBI assembly or BioProject accession</option>
+                    <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
                 </param>
                 <when value="accession">
                     <expand macro="text_or_file"/>
                 </when>
                 <when value="taxon">
-                    <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/>
+                    <expand macro="taxon_positional"/>
                     <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
                 </when>
             </conditional>
@@ -67,7 +99,6 @@
                 <option value="latest">Latest</option>
                 <option value="all">All</option>
             </param>
-            <!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank-->
             <expand macro="assembly_source"/>
             <expand macro="chromosomes"/>
             <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
@@ -78,82 +109,93 @@
                 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
             </repeat>
         </section>
-        <section name="file_choices" title="File Choices" expanded="true">
-            <expand macro="include"/>
+        <section name="file_choices" title="Output options" expanded="true">
+            <expand macro="tsv_report_columns">
+                <option value="accession" selected="true">accession</option>
+                <option value="organism-name" selected="true">organism-name</option>
+                <option value="assminfo-submitter" selected="true">assminfo-submitter</option>
+                <option value="assminfo-name" selected="true">assminfo-name</option>
+            </expand>
+            <expand macro="include">
+                <expand macro="genome_includes"/>
+            </expand>
+            <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
         </section>
-        <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
     </inputs>
     <outputs>
-        <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip">
-            <filter>not uncompressed</filter>
-        </data>
-        <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
-            <filter>not uncompressed</filter>
-        </data>
-        <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
-            <filter>uncompressed</filter>
-        </data>
+        <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
         <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
         </collection>
         <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)"  directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
         </collection>
         <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
         </collection>
         <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
             <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter>
+            <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
             <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter>
+            <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
             <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
-            <filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter>
+            <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
         </collection>
     </outputs>
     <tests>
-        <test expect_num_outputs="2">
+        <!-- Note: All but one test use the non-default decompress="true"
+
+            this is because (at 11/22) Galaxy can not apply text assertions on the content
+            of compressed files https://github.com/galaxyproject/galaxy/pull/15085
+        
+            So with decompress="true" more powerful assertions are possible.
+            A single test checks the default, ie decompress="false".
+        -->
+        <test expect_num_outputs="1">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
-                <param name="text_or_file" value="text"/>
-                <param name="taxon" value="human"/>
+                <param name="taxon_positional" value="human"/>
             </conditional>
             <param name="chromosomes" value="21"/>
-            <param name="include" value=""/>
-            <param name="uncompressed" value="false"/>
             <param name="released_before" value="01/01/2018"/>
-            <output name="archive_contents">
+            <section name="file_choices">
+                <param name="include" value=""/>
+            </section>
+            <output name="genome_data_report">
                 <assert_contents>
-                    <has_text text="ncbi_dataset/data/dataset_catalog.json"/>
+                    <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
+                    <has_n_lines n="144"/>
+                    <has_n_columns n="4"/>
                 </assert_contents>
             </output>
         </test>
         <test expect_num_outputs="2">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
-                <param name="text_or_file" value="text"/>
-                <param name="taxon" value="human"/>
+                <param name="taxon_positional" value="human"/>
             </conditional>
             <param name="chromosomes" value="21"/>
-            <param name="include" value="genome"/>
-            <param name="uncompressed" value="true"/>
             <param name="assembly_level" value="chromosome,complete"/>
             <param name="released_before" value="01/01/2018"/>
+            <section name="file_choices">
+                <param name="include" value="genome"/>
+                <param name="decompress" value="true"/>
+            </section>
             <output_collection name="genome_fasta" type="list:list" count="14">
                 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
                 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
@@ -174,22 +216,24 @@
             <output name="genome_data_report">
                 <assert_contents>
                     <has_text text="Homo sapiens"/>
+                    <has_n_columns n="4"/>
                 </assert_contents>
             </output>
         </test>
-        <!-- same as precious test but assembly_source (refseq which removes some of the genomes) -->
+        <!-- same as previous test but assembly_source (refseq which removes some of the genomes) -->
         <test expect_num_outputs="2">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
-                <param name="text_or_file" value="text"/>
-                <param name="taxon" value="human"/>
+                <param name="taxon_positional" value="human"/>
             </conditional>
             <param name="chromosomes" value="21"/>
-            <param name="include" value="genome"/>
-            <param name="uncompressed" value="true"/>
             <param name="assembly_level" value="chromosome,complete"/>
             <param name="assembly_source" value="refseq"/>
             <param name="released_before" value="01/01/2018"/>
+            <section name="file_choices">
+                <param name="include" value="genome"/>
+                <param name="decompress" value="true"/>
+            </section>
             <output_collection name="genome_fasta" type="list:list" count="2">
                 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
@@ -197,6 +241,8 @@
             <output name="genome_data_report">
                 <assert_contents>
                     <has_text text="Homo sapiens"/>
+                    <has_n_lines n="5"/>
+                    <has_n_columns n="4"/>
                 </assert_contents>
             </output>
         </test>
@@ -208,21 +254,41 @@
                     <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                 </conditional>
             </conditional>
-            <param name="include" value="seq-report,gtf,cds"/>
-            <param name="uncompressed" value="true"/>
             <param name="released_before" value="01/01/2007"/>
+            <section name="file_choices">
+                <param name="include" value="seq-report,gtf,cds"/>
+                <param name="decompress" value="true"/>
+            </section>
             <output name="genome_data_report">
                 <assert_contents>
                     <has_text text="GCF_000013305.1"/>
+                    <has_n_lines n="3"/>
+                    <has_n_columns n="4"/>
                 </assert_contents>
             </output>
+            <output_collection name="sequence_report" type="list" count="2" >
+                <element name="GCF_000007445.1">
+                    <assert_contents>
+                        <has_text text="GCF_000007445.1"/>
+                        <has_n_lines n="2"/>
+                        <has_n_columns n="14"/>
+                    </assert_contents>
+                </element>
+                <element name="GCF_000013305.1">
+                    <assert_contents>
+                        <has_text text="GCF_000013305.1"/>
+                        <has_n_lines n="2"/>
+                        <has_n_columns n="14"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
             <output_collection name="genomic_gtf" type="list">
                 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
             </output_collection>
             <output_collection name="genomic_cds" type="list">
-                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
-                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
+                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/>
+                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/>
             </output_collection>
         </test>
         <test expect_num_outputs="4">
@@ -233,12 +299,17 @@
                     <param name="inputfile" value="accessions.txt"/>
                 </conditional>
             </conditional>
-            <param name="include" value="seq-report,gbff,gff3"/>
-            <param name="uncompressed" value="true"/>
             <param name="released_before" value="01/01/2007"/>
+            <section name="file_choices">
+                <param name="include" value="seq-report,gff3,gbff"/>
+                <param name="decompress" value="true"/>
+            </section>
             <output name="genome_data_report">
                 <assert_contents>
-                   <has_text text="SAMN02604181"/>
+                   <has_text text="GCF_000013305.1"/>
+                   <has_text text="GCF_000007445.1"/>
+                    <has_n_lines n="3"/>
+                    <has_n_columns n="4"/>
                 </assert_contents>
             </output>
             <output_collection name="genomic_gff" type="list">
@@ -250,7 +321,9 @@
                 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
             </output_collection>
         </test>
-        <test expect_num_outputs="2">
+
+        <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
+        <test expect_num_outputs="2" expect_failure="true">
             <conditional name="query|subcommand">
                 <param name="download_by" value="accession"/>
                 <conditional name="text_or_file">
@@ -258,32 +331,14 @@
                     <param name="accession" value="GCF_000001405"/>
                 </conditional>
             </conditional>
-            <param name="include" value="seq-report"/>
-            <param name="uncompressed" value="true"/>
             <param name="released_before" value="01/01/2015"/>
             <param name="assembly_version" value="all"/>
-            <output_collection name="sequence_report" count="4">
-                <element name="GCF_000001405.25">
-                    <assert_contents>
-                        <has_text text="assignedMoleculeLocationType"/>
-                    </assert_contents>
-                 </element>
-                <element name="GCF_000001405.26">
-                    <assert_contents>
-                        <has_text text="assignedMoleculeLocationType"/>
-                    </assert_contents>
-                 </element>
-                <element name="GCF_000001405.27">
-                    <assert_contents>
-                        <has_text text="assignedMoleculeLocationType"/>
-                    </assert_contents>
-                 </element>
-                <element name="GCF_000001405.28">
-                    <assert_contents>
-                        <has_text text="assignedMoleculeLocationType"/>
-                    </assert_contents>
-                 </element>
-            </output_collection>
+            <section name="file_choices">
+                <param name="include" value="seq-report"/>
+            </section>
+            <!-- 
+            <output_collection name="sequence_report" type="list" count="4" >
+            -->
         </test>
         <test expect_num_outputs="5">
             <conditional name="query|subcommand">
@@ -293,11 +348,64 @@
                     <param name="accession" value="GCF_000146045.2"/>
                 </conditional>
             </conditional>
-            <param name="include" value="seq-report,genome,rna,cds"/>
-            <param name="uncompressed" value="true"/>
+            <section name="file_choices">
+                <param name="include" value="genome,protein,rna,cds"/>
+                <param name="decompress" value="true"/>
+            </section>
             <output_collection name="genome_fasta" type="list:list" count="1">
                 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
             </output_collection>
+            <output_collection name="protein_fasta" type="list" count="1">
+                <element name="GCF_000146045.2" decompress="true">
+                    <assert_contents>
+                        <has_text text=">"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="rna_fasta" type="list" count="1">
+                <element name="GCF_000146045.2" decompress="true">
+                    <assert_contents>
+                        <has_text text=">"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <!-- same as the previous test, but use the default value for decompress,
+            see comment at the beginning of the tests -->
+        <test expect_num_outputs="5">
+            <conditional name="query|subcommand">
+                <param name="download_by" value="accession"/>
+                <conditional name="text_or_file">
+                    <param name="text_or_file" value="text"/>
+                    <param name="accession" value="GCF_000146045.2"/>
+                </conditional>
+            </conditional>
+            <section name="file_choices">
+                <param name="include" value="genome,protein,rna,cds"/>
+            </section>
+            <output_collection name="genome_fasta" type="list:list" count="1">
+                <element name="GCF_000146045.2">
+                    <element name="GCF_000146045.2_R64" ftype="fasta.gz">
+                        <assert_contents>
+                            <has_size value="3843460"/>
+                        </assert_contents>
+                    </element>
+                </element>
+            </output_collection>
+            <output_collection name="protein_fasta" type="list" count="1">
+                <element name="GCF_000146045.2" ftype="fasta.gz">
+                    <assert_contents>
+                        <has_size value="1844838"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="rna_fasta" type="list" count="1">
+                <element name="GCF_000146045.2" ftype="fasta.gz">
+                    <assert_contents>
+                        <has_size value="2784534"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
         </test>
         <test expect_num_outputs="3">
             <conditional name="query|subcommand">
@@ -307,8 +415,11 @@
                     <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
                 </conditional>
             </conditional>
-            <param name="include" value="seq-report,genome"/>
-            <param name="uncompressed" value="true"/>
+            <section name="file_choices">
+                <param name="include" value="seq-report,genome"/>
+                <param name="decompress" value="true"/>
+            </section>
+            <output_collection name="sequence_report" type="list" count="2"/>
             <output_collection name="genome_fasta" type="list:list" count="2">
                 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
                 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
@@ -320,12 +431,10 @@
         <test expect_num_outputs="1" expect_test_failure="true">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
-                <param name="text_or_file" value="text"/>
-                <param name="taxon" value="4932"/>
+                <param name="taxon_positional" value="4932"/>
                 <param name="tax_exact_match" value="true"/>
             </conditional>
             <param name="include" value=""/>
-            <param name="uncompressed" value="true"/>
             <output name="genome_data_report">
                 <assert_contents>
                    <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
@@ -338,15 +447,18 @@
 **Download Genome Datasets from NCBI**
 
 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
-Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.
+Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.
+
+The download is a three step process:
 
-Tthe default genome dataset includes the following files (if available):
- * data_report.jsonl (genome assembly and annotation metadata, not always available)
- * genomic.fna (genomic sequences)
- * rna.fna (transcript sequences)
- * protein.faa (protein sequences)
- * genomic.gff (genome annotation in gff3 format)
- * dataset_catalog.json (a list of files and file types included in the dataset)
+1. A "dehydrated" zip file is downloaded (which includes the metadata and the download URLs)
+2. The metadata is transformed into a tabular (TSV) file
+3. The data is hydrated (the actual data is downloaded)
+
+The 3rd step can be skipped by unselecting all output types in the `Include` parameter.
+This makes it possible to inspect the metadata prior to the actual data download. It also
+allows using the tool to query for data sets of interest (and their accessions), which
+can then be downloaded in a second call using the accessions.
 ]]>
     </help>
 
--- a/macros.xml	Mon Nov 21 11:40:05 2022 +0000
+++ b/macros.xml	Fri Dec 02 10:52:48 2022 +0000
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@TOOL_VERSION@">14.3</token>
+    <token name="@TOOL_VERSION@">14.4</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">21.01</token>
     <token name="@LICENSE@">MIT</token>
@@ -39,8 +39,10 @@
                 <option value="file">Read a list of @WHAT_EXTENDED@s from a dataset</option>
             </param>
             <when value="text">
-                <param name="accession" type="text" label="Enter space separated list of @WHAT@s" help="@HELP@">
-                    <yield/>
+                <!-- command section also allows spaces as separator for backward compatibility
+                     prefer comma because this is used also in other text params-->
+                <param name="accession" type="text" label="Enter comma separated list of @WHAT@s" help="@HELP@">
+                    <validator type="length" min="1" message="Provide at least one @WHAT@"/>
                 </param>
             </when>
             <when value="file">
@@ -59,18 +61,45 @@
             </sanitizer>
         </param>
     </xml>
-    <xml name="include">
-        <param argument="--include" type="select" multiple="true" optional="true">
-            <option value="genome" selected="true">genomic sequence (genome)</option>
-            <option value="rna">transcript (rna)</option>
-            <option value="protein">amnio acid sequences (protein)</option>
-            <option value="cds">nucleotide coding sequences (cds)</option>
-            <option value="gff3">general feature file (gff3)</option>
-            <option value="gtf">gene transfer format (gtf)</option>
-            <option value="gbff">GenBank flat file (gbff)</option>
-            <option value="seq-report">sequence report file (seq-report)</option>
+
+    <xml name="taxon_positional">
+        <param name="taxon_positional" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/>
+    </xml>
+
+    <xml name="ortholog">
+        <param argument="--ortholog" type="text" label="Retrieve orthologs for taxa" help="Retrieve data for an ortholog set. Provide one or more comma separated taxa (any rank) to filter results or 'all' for the complete set.">
+            <sanitizer invalid_char="">
+                <valid initial="string.letters,string.digits">
+                    <add value=" " />
+                    <add value="," />
+                    <add value="-" />
+                </valid>
+            </sanitizer>
         </param>
     </xml>
+
+    <xml name="include">
+        <param argument="--include" type="select" multiple="true" optional="true" label="Include" help="Download the following datasets (if available)">
+            <yield/>
+        </param>
+    </xml>
+    <xml name="genome_includes">
+        <option value="genome" selected="true">genomic sequence (genome)</option>
+        <option value="rna">transcript (rna)</option>
+        <option value="protein">amino acid sequences (protein)</option>
+        <option value="cds">nucleotide coding sequences (cds)</option>
+        <option value="gff3">general feature file (gff3)</option>
+        <option value="gtf">gene transfer format (gtf)</option>
+        <option value="gbff">GenBank flat file (gbff)</option>
+        <option value="seq-report">sequence report file (seq-report)</option>
+        <yield/>
+    </xml>
+    <xml name="gene_includes">
+        <option value="gene">gene sequence (gene)</option>
+        <option value="protein" selected="true">amino acid sequences (protein)</option>
+        <yield/>
+    </xml>
+
     <token name="@INCLUDE@"><![CDATA[
         --include
         #if $file_choices.include
@@ -79,6 +108,187 @@
             none
         #end if
     ]]></token>
+    <xml name="tsv_report_columns">
+        <param name="report_columns" type="select" multiple="true" optional="false" label="Columns in the report">
+            <option value="accession">Assembly Accession</option>
+            <option value="annotinfo-busco-complete">Annotation BUSCO Complete</option>
+            <option value="annotinfo-busco-duplicated">Annotation BUSCO Duplicated</option>
+            <option value="annotinfo-busco-fragmented">Annotation BUSCO Fragmented</option>
+            <option value="annotinfo-busco-lineage">Annotation BUSCO Lineage</option>
+            <option value="annotinfo-busco-missing">Annotation BUSCO Missing</option>
+            <option value="annotinfo-busco-singlecopy">Annotation BUSCO Single Copy</option>
+            <option value="annotinfo-busco-totalcount">Annotation BUSCO Total Count</option>
+            <option value="annotinfo-busco-ver">Annotation BUSCO Version</option>
+            <option value="annotinfo-featcount-gene-non-coding">Annotation Count Gene Non-coding</option>
+            <option value="annotinfo-featcount-gene-other">Annotation Count Gene Other</option>
+            <option value="annotinfo-featcount-gene-protein-coding">Annotation Count Gene Protein-coding</option>
+            <option value="annotinfo-featcount-gene-pseudogene">Annotation Count Gene Pseudogene</option>
+            <option value="annotinfo-featcount-gene-total">Annotation Count Gene Total</option>
+            <option value="annotinfo-method">Annotation Method</option>
+            <option value="annotinfo-name">Annotation Name</option>
+            <option value="annotinfo-pipeline">Annotation Pipeline</option>
+            <option value="annotinfo-provider">Annotation Provider</option>
+            <option value="annotinfo-release-date">Annotation Release Date</option>
+            <option value="annotinfo-release-version">Annotation Release Version</option>
+            <option value="annotinfo-report-url">Annotation Report URL</option>
+            <option value="annotinfo-software-version">Annotation Software Version</option>
+            <option value="annotinfo-status">Annotation Status</option>
+            <option value="assminfo-atypicalis-atypical">Assembly Atypical Is Atypical</option>
+            <option value="assminfo-atypicalwarnings">Assembly Atypical Warnings</option>
+            <option value="assminfo-bioproject-lineage-accession">Assembly BioProject Lineage Accession</option>
+            <option value="assminfo-bioproject-lineage-parent-accession">Assembly BioProject Lineage Parent Accession</option>
+            <option value="assminfo-bioproject-lineage-parent-accessions">Assembly BioProject Lineage Parent Accessions</option>
+            <option value="assminfo-bioproject-lineage-title">Assembly BioProject Lineage Title</option>
+            <option value="assminfo-biosample-accession">Assembly BioSample Accession</option>
+            <option value="assminfo-biosample-attribute-name">Assembly BioSample Attribute Name</option>
+            <option value="assminfo-biosample-attribute-value">Assembly BioSample Attribute Value</option>
+            <option value="assminfo-biosample-bioproject-accession">Assembly BioSample BioProject Accession</option>
+            <option value="assminfo-biosample-bioproject-parent-accession">Assembly BioSample BioProject Parent Accession</option>
+            <option value="assminfo-biosample-bioproject-parent-accessions">Assembly BioSample BioProject Parent Accessions</option>
+            <option value="assminfo-biosample-bioproject-title">Assembly BioSample BioProject Title</option>
+            <option value="assminfo-biosample-description-comment">Assembly BioSample Description Comment</option>
+            <option value="assminfo-biosample-description-organism-common-name">Assembly BioSample Description Organism Common Name</option>
+            <option value="assminfo-biosample-description-organism-infraspecific-breed">Assembly BioSample Description Organism Infraspecific Names Breed</option>
+            <option value="assminfo-biosample-description-organism-infraspecific-cultivar">Assembly BioSample Description Organism Infraspecific Names Cultivar</option>
+            <option value="assminfo-biosample-description-organism-infraspecific-ecotype">Assembly BioSample Description Organism Infraspecific Names Ecotype</option>
+            <option value="assminfo-biosample-description-organism-infraspecific-isolate">Assembly BioSample Description Organism Infraspecific Names Isolate</option>
+            <option value="assminfo-biosample-description-organism-infraspecific-sex">Assembly BioSample Description Organism Infraspecific Names Sex</option>
+            <option value="assminfo-biosample-description-organism-infraspecific-strain">Assembly BioSample Description Organism Infraspecific Names Strain</option>
+            <option value="assminfo-biosample-description-organism-name">Assembly BioSample Description Organism Name</option>
+            <option value="assminfo-biosample-description-organism-pangolin">Assembly BioSample Description Organism Pangolin Classification</option>
+            <option value="assminfo-biosample-description-organism-tax-id">Assembly BioSample Description Organism Taxonomic ID</option>
+            <option value="assminfo-biosample-description-title">Assembly BioSample Description Title</option>
+            <option value="assminfo-biosample-ids-db">Assembly BioSample Sample Identifiers Database</option>
+            <option value="assminfo-biosample-ids-label">Assembly BioSample Sample Identifiers Label</option>
+            <option value="assminfo-biosample-ids-value">Assembly BioSample Sample Identifiers Value</option>
+            <option value="assminfo-biosample-last-updated">Assembly BioSample Last updated</option>
+            <option value="assminfo-biosample-models">Assembly BioSample Models</option>
+            <option value="assminfo-biosample-owner-contact-lab">Assembly BioSample Owner Contact Lab</option>
+            <option value="assminfo-biosample-owner-name">Assembly BioSample Owner Name</option>
+            <option value="assminfo-biosample-package">Assembly BioSample Package</option>
+            <option value="assminfo-biosample-publication-date">Assembly BioSample Publication date</option>
+            <option value="assminfo-biosample-status-status">Assembly BioSample Status Status</option>
+            <option value="assminfo-biosample-status-when">Assembly BioSample Status When</option>
+            <option value="assminfo-biosample-submission-date">Assembly BioSample Submission date</option>
+            <option value="assminfo-blast-url">Assembly Blast URL</option>
+            <option value="assminfo-description">Assembly Description</option>
+            <option value="assminfo-level">Assembly Level</option>
+            <option value="assminfo-linked-assmaccession">Assembly Linked Assembly Accession</option>
+            <option value="assminfo-linked-assmtype">Assembly Linked Assembly Type</option>
+            <option value="assminfo-name">Assembly Name</option>
+            <option value="assminfo-paired-assmaccession">Assembly Paired Assembly Accession</option>
+            <option value="assminfo-paired-assmname">Assembly Paired Assembly Name</option>
+            <option value="assminfo-paired-assmstatus">Assembly Paired Assembly Status</option>
+            <option value="assminfo-refseq-category">Assembly Refseq Category</option>
+            <option value="assminfo-sequencing-tech">Assembly Sequencing Tech</option>
+            <option value="assminfo-status">Assembly Status</option>
+            <option value="assminfo-submission-date">Assembly Submission Date</option>
+            <option value="assminfo-submitter">Assembly Submitter</option>
+            <option value="assminfo-synonym">Assembly Synonym</option>
+            <option value="assminfo-type">Assembly Type</option>
+            <option value="assmstats-contig-l50">Assembly Stats Contig L50</option>
+            <option value="assmstats-contig-n50">Assembly Stats Contig N50</option>
+            <option value="assmstats-gaps-between-scaffolds-count">Assembly Stats Gaps Between Scaffolds Count</option>
+            <option value="assmstats-gc-count">Assembly Stats GC Count</option>
+            <option value="assmstats-gc-percent">Assembly Stats GC Percent</option>
+            <option value="assmstats-number-of-component-sequences">Assembly Stats Number of Component Sequences</option>
+            <option value="assmstats-number-of-contigs">Assembly Stats Number of Contigs</option>
+            <option value="assmstats-number-of-scaffolds">Assembly Stats Number of Scaffolds</option>
+            <option value="assmstats-scaffold-l50">Assembly Stats Scaffold L50</option>
+            <option value="assmstats-scaffold-n50">Assembly Stats Scaffold N50</option>
+            <option value="assmstats-total-number-of-chromosomes">Assembly Stats Total Number of Chromosomes</option>
+            <option value="assmstats-total-sequence-len">Assembly Stats Total Sequence Length</option>
+            <option value="assmstats-total-ungapped-len">Assembly Stats Total Ungapped Length</option>
+            <option value="current-accession">Current Accession</option>
+            <option value="organelle-assembly-name">Organelle Assembly Name</option>
+            <option value="organelle-bioproject-accessions">Organelle BioProject Accessions</option>
+            <option value="organelle-description">Organelle Description</option>
+            <option value="organelle-infraspecific-name">Organelle Infraspecific Name</option>
+            <option value="organelle-submitter">Organelle Submitter</option>
+            <option value="organelle-total-seq-length">Organelle Total Seq Length</option>
+            <option value="organism-common-name">Organism Common Name</option>
+            <option value="organism-infraspecific-breed">Organism Infraspecific Names Breed</option>
+            <option value="organism-infraspecific-cultivar">Organism Infraspecific Names Cultivar</option>
+            <option value="organism-infraspecific-ecotype">Organism Infraspecific Names Ecotype</option>
+            <option value="organism-infraspecific-isolate">Organism Infraspecific Names Isolate</option>
+            <option value="organism-infraspecific-sex">Organism Infraspecific Names Sex</option>
+            <option value="organism-infraspecific-strain">Organism Infraspecific Names Strain</option>
+            <option value="organism-name">Organism Name</option>
+            <option value="organism-pangolin">Organism Pangolin Classification</option>
+            <option value="organism-tax-id">Organism Taxonomic ID</option>
+            <option value="source_database">Source Database</option>
+            <option value="wgs-contigs-url">WGS contigs URL</option>
+            <option value="wgs-project-accession">WGS project accession</option>
+            <option value="wgs-url">WGS URL</option>
+            <yield/>
+        </param>
+    </xml>
+
+    <xml name="gene_tsv_report_columns">
+        <param name="report_columns" type="select" multiple="true" optional="false" label="Columns in the report">
+            <option value="annotation-assembly-accession">Annotation Assembly Accession</option>
+            <option value="annotation-assembly-name">Annotation Assembly Name</option>
+            <option value="annotation-genomic-range-accession">Annotation Genomic Range Accession</option>
+            <option value="annotation-genomic-range-exon-order">Annotation Genomic Range Exons Order</option>
+            <option value="annotation-genomic-range-exon-orientation">Annotation Genomic Range Exons Orientation</option>
+            <option value="annotation-genomic-range-exon-start">Annotation Genomic Range Exons Start</option>
+            <option value="annotation-genomic-range-exon-stop">Annotation Genomic Range Exons Stop</option>
+            <option value="annotation-genomic-range-range-order">Annotation Genomic Range Order</option>
+            <option value="annotation-genomic-range-range-orientation">Annotation Genomic Range Orientation</option>
+            <option value="annotation-genomic-range-range-start">Annotation Genomic Range Start</option>
+            <option value="annotation-genomic-range-range-stop">Annotation Genomic Range Stop</option>
+            <option value="annotation-genomic-range-seq-name">Annotation Genomic Range Seq Name</option>
+            <option value="annotation-release-date">Annotation Release Date</option>
+            <option value="annotation-release-name">Annotation Release Name</option>
+            <option value="chromosomes">Chromosomes</option>
+            <option value="common-name">Common Name</option>
+            <option value="description">Description</option>
+            <option value="ensembl-geneids">Ensembl GeneIDs</option>
+            <option value="gene-id">NCBI GeneID</option>
+            <option value="gene-type">Gene Type</option>
+            <option value="genomic-region-gene-range-accession">Genomic Region Gene Range Sequence Accession</option>
+            <option value="genomic-region-gene-range-range-order">Genomic Region Gene Range Order</option>
+            <option value="genomic-region-gene-range-range-orientation">Genomic Region Gene Range Orientation</option>
+            <option value="genomic-region-gene-range-range-start">Genomic Region Gene Range Start</option>
+            <option value="genomic-region-gene-range-range-stop">Genomic Region Gene Range Stop</option>
+            <option value="genomic-region-genomic-region-type">Genomic Region Genomic Region Type</option>
+            <option value="group-id">Gene Group Identifier</option>
+            <option value="group-method">Gene Group Method</option>
+            <option value="name-authority">Nomenclature Authority</option>
+            <option value="name-id">Nomenclature ID</option>
+            <option value="omim-ids">OMIM IDs</option>
+            <option value="orientation">Orientation</option>
+            <option value="ref-standard-gene-range-accession">Reference Standard Gene Range Sequence Accession</option>
+            <option value="ref-standard-gene-range-range-order">Reference Standard Gene Range Order</option>
+            <option value="ref-standard-gene-range-range-orientation">Reference Standard Gene Range Orientation</option>
+            <option value="ref-standard-gene-range-range-start">Reference Standard Gene Range Start</option>
+            <option value="ref-standard-gene-range-range-stop">Reference Standard Gene Range Stop</option>
+            <option value="ref-standard-genomic-region-type">Reference Standard Genomic Region Type</option>
+            <option value="replaced-gene-id">Replaced NCBI GeneID</option>
+            <option value="rna-type">RNA Type</option>
+            <option value="swissprot-accessions">SwissProt Accessions</option>
+            <option value="symbol">Symbol</option>
+            <option value="synonyms">Synonyms</option>
+            <option value="tax-id">Taxonomic ID</option>
+            <option value="tax-name">Taxonomic Name</option>
+            <yield/>
+        </param>
+    </xml>
+    <xml name="prok_gene_tsv_report_columns">
+        <param name="report_columns" type="select" multiple="true" optional="false" label="Columns in the report">
+            <option value="accession">Accession</option>
+            <option value="description">Description</option>
+            <option value="ec-number">EC Number</option>
+            <option value="gene-symbol">Gene Symbol</option>
+            <option value="mapping-count">Number of Genome Mappings</option>
+            <option value="name-evidence-accession">Protein Name EvidenceAccession</option>
+            <option value="name-evidence-category">Protein Name EvidenceCategory</option>
+            <option value="name-evidence-source">Protein Name EvidenceSource</option>
+            <option value="protein-length">Protein Length</option>
+            <option value="protein-name">Protein Name</option>
+            <yield/>
+        </param>
+    </xml>
     <xml name="released_options" token_released_what="genomes" token_before_or_after="before">
         <param argument="--released-@BEFORE_OR_AFTER@" type="text" optional="true" label="Only include @RELEASED_WHAT@ that have been released @BEFORE_OR_AFTER@ a specified date (MM/DD/YYYY)">
             <validator type="regex" message="enter a date in the form MM/DD/YYYY">[0-9]{2}/[0-9]{2}/[0-9]{4}</validator>
@@ -93,9 +303,9 @@
 #end if
     </token>
 
-    <xml name="genome_fasta_assert" tokens="el1,el2,expression" token_expression_n="1">
+    <xml name="genome_fasta_assert" tokens="el1,el2,expression" token_ftype="fasta" token_expression_n="1">
         <element name="@EL1@">
-            <element name="@EL2@">
+            <element name="@EL2@" ftype="@FTYPE@" decompress="true">
                 <assert_contents>
                     <has_text_matching expression="@EXPRESSION@" n="@EXPRESSION_N@"/>
                 </assert_contents>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/geneids.txt	Fri Dec 02 10:52:48 2022 +0000
@@ -0,0 +1,2 @@
+2597
+14433