Mercurial > repos > iuc > ncbi_datasets

<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>download genome sequence, annotation and metadata</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"></expand>
    <command><![CDATA[
## Cheetah template that assembles the `datasets download genome` call and then
## either extracts the resulting archive or lists its contents.
## The @...@ tokens are expanded from macros.xml (their definitions are not visible in this file).
@SETUP_CERTIFICATES@
datasets download genome $query.subcommand.download_by
## Positional query: a list of accessions (typed in or read from a file) or a single taxon.
#if $query.subcommand.download_by == 'accession':
    #if $query.subcommand.text_or_file.text_or_file == 'text':
        ## Split the free-text field on spaces, drop empty items and single-quote each accession.
        #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x)
    #else
        --inputfile '$query.subcommand.text_or_file.inputfile'
    #end if
#else:
    '$query.subcommand.taxon'
    $query.subcommand.tax_exact_match
#end if
## Boolean filters render as their flag (truevalue) or as an empty string (falsevalue).
$filters.reference
$filters.annotated
#if $filters.assembly_level:
    --assembly-level $filters.assembly_level
#end if
--assembly-version $filters.assembly_version
#if $filters.assembly_source:
    --assembly-source $filters.assembly_source
#end if
#if $filters.chromosomes:
    --chromosomes '$filters.chromosomes'
#end if
$filters.exclude_atypical
@INCLUDE@
@RELEASED_BEFORE@
@RELEASED_AFTER@
## Every instance of the "search" repeat exposes its text parameter as `.search`
## (name derived from argument="--search"), so the value must be read from the
## loop variable and not from $filters directly.
#for $term in $filters.search:
    --search '$term.search'
#end for
--no-progressbar
## Post-processing: extract the archive in place, or only write a listing of its
## contents (the zip itself is then collected as an output).
#if $uncompressed
&& 7z x -y ncbi_dataset.zip
#else
&& 7z l ncbi_dataset.zip > ncbi_dataset.txt
#end if
]]></command>
    <!-- Tool form. Parameter names declared here are the ones the command template reads
         as $query.*, $filters.*, $file_choices.* and $uncompressed. Every expand macro
         comes from the imported macros.xml, whose content is not visible in this file. -->
    <inputs>
        <!-- Query: what to download, either by accession or by taxon. -->
        <section name="query" title="Query" expanded="true">
            <conditional name="subcommand">
                <!-- The selected value is passed verbatim as the sub-command of "datasets download genome". -->
                <param name="download_by" type="select" label="Choose how to find genomes to download">
                    <option value="accession">Download by NCBI assembly or BioProject accession</option>
                    <option value="taxon">Download by taxon</option>
                </param>
                <when value="accession">
                    <expand macro="text_or_file"/>
                </when>
                <when value="taxon">
                    <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/>
                    <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
                </when>
            </conditional>
        </section>
        <!-- Filters: optional restrictions applied to the set of matching assemblies. -->
        <section name="filters" title="Filters and Limit">
            <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
            <expand macro="annotation"/>
            <expand macro="assembly_level"/>
            <param argument="--assembly-version" type="select" label="Assembly version(s)">
                <option value="latest">Latest</option>
                <option value="all">All</option>
            </param>
            <!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank-->
            <expand macro="assembly_source"/>
            <expand macro="chromosomes"/>
            <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
            <!-- Same macro expanded twice; the second expansion overrides before_or_after with "after". -->
            <expand macro="released_options"/>
            <expand macro="released_options" before_or_after="after"/>

            <!-- Zero or more free-text search terms; each repeat instance holds one "search" value. -->
            <repeat name="search" title="Add search terms">
                <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
            </repeat>
        </section>
        <!-- File choices: the "include" selection is also read by the output filters as file_choices['include']. -->
        <section name="file_choices" title="File Choices" expanded="true">
            <expand macro="include"/>
        </section>
        <!-- Checked: the archive is extracted and its files become outputs.
             Unchecked: the zip and a listing of its contents are returned instead. -->
        <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
    </inputs>
    <!-- Outputs. With "uncompressed" unchecked only the zip archive and its content listing
         are produced. With it checked, the assembly data report is always produced and one
         collection per selected file type is discovered below ncbi_dataset/data, using the
         sub-directory name as the element identifier. -->
    <outputs>
        <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip">
            <filter>not uncompressed</filter>
        </data>
        <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
            <filter>not uncompressed</filter>
        </data>
        <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
            <filter>uncompressed</filter>
        </data>
        <!-- The extension dot is escaped so that it only matches a literal ".", as in the other patterns. -->
        <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report\.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter>
        </collection>
        <!-- Nested list: outer identifier is the sub-directory, inner identifier is the file name
             without an optional "_genomic" suffix and without the extension. The negative lookahead
             skips file names starting with "rna" or "cds_from", which have their own collections. -->
        <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter>
        </collection>
        <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter>
        </collection>
        <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Taxon query with the archive left compressed: the two outputs are the zip
             and the listing of its contents; only the listing is checked. -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="include" value=""/>
            <param name="uncompressed" value="false"/>
            <param name="released_before" value="01/01/2018"/>
            <output name="archive_contents">
                <assert_contents>
                    <has_text text="ncbi_dataset/data/dataset_catalog.json"/>
                </assert_contents>
            </output>
        </test>
        <!-- Taxon query restricted to chromosome 21 with genome FASTA requested: the two
             outputs are the data report and the nested genome_fasta collection. -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="include" value="genome"/>
            <param name="uncompressed" value="true"/>
            <param name="assembly_level" value="chromosome,complete"/>
            <param name="released_before" value="01/01/2018"/>
            <output_collection name="genome_fasta" type="list:list" count="14">
                <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/>
                <!-- TODO chromosomes argument (or data) seems not reliable https://github.com/ncbi/datasets/issues/188-->
                <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
                <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                </assert_contents>
            </output>
        </test>
        <!-- same as the previous test, but with assembly_source set to refseq, which removes some of the genomes -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="include" value="genome"/>
            <param name="uncompressed" value="true"/>
            <param name="assembly_level" value="chromosome,complete"/>
            <!-- Restricting the source to refseq leaves only the two GCF_ assemblies. -->
            <param name="assembly_source" value="refseq"/>
            <param name="released_before" value="01/01/2018"/>
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                </assert_contents>
            </output>
        </test>
        <!-- Two accessions typed as text with seq-report, gtf and cds requested: four outputs
             (data report plus three collections); the gtf and cds collections are compared
             against reference files. -->
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                </conditional>
            </conditional>
            <param name="include" value="seq-report,gtf,cds"/>
            <param name="uncompressed" value="true"/>
            <param name="released_before" value="01/01/2007"/>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                </assert_contents>
            </output>
            <output_collection name="genomic_gtf" type="list">
                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_cds" type="list">
                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
            </output_collection>
        </test>
        <!-- Accessions read from an input file with seq-report, gbff and gff3 requested: four
             outputs (data report plus three collections); the gff and gbff collections are
             compared against reference files. -->
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="file"/>
                    <param name="inputfile" value="accessions.txt"/>
                </conditional>
            </conditional>
            <param name="include" value="seq-report,gbff,gff3"/>
            <param name="uncompressed" value="true"/>
            <param name="released_before" value="01/01/2007"/>
            <output name="genome_data_report">
                <assert_contents>
                   <has_text text="SAMN02604181"/>
                </assert_contents>
            </output>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_gbff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
            </output_collection>
        </test>
        <!-- Accession given without a version suffix and assembly_version set to "all": the
             sequence report collection is expected to hold four versions (.25 to .28). -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000001405"/>
                </conditional>
            </conditional>
            <param name="include" value="seq-report"/>
            <param name="uncompressed" value="true"/>
            <param name="released_before" value="01/01/2015"/>
            <param name="assembly_version" value="all"/>
            <output_collection name="sequence_report" count="4">
                <element name="GCF_000001405.25">
                    <assert_contents>
                        <has_text text="assignedMoleculeLocationType"/>
                    </assert_contents>
                 </element>
                <element name="GCF_000001405.26">
                    <assert_contents>
                        <has_text text="assignedMoleculeLocationType"/>
                    </assert_contents>
                 </element>
                <element name="GCF_000001405.27">
                    <assert_contents>
                        <has_text text="assignedMoleculeLocationType"/>
                    </assert_contents>
                 </element>
                <element name="GCF_000001405.28">
                    <assert_contents>
                        <has_text text="assignedMoleculeLocationType"/>
                    </assert_contents>
                 </element>
            </output_collection>
        </test>
        <!-- Single accession with seq-report, genome, rna and cds requested: five outputs
             (data report plus four collections); only the genome FASTA collection is asserted. -->
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <param name="include" value="seq-report,genome,rna,cds"/>
            <param name="uncompressed" value="true"/>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
        </test>
        <!-- Two accessions with seq-report and genome requested: three outputs (data report plus
             two collections); the genome FASTA collection must hold one entry per accession. -->
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
                </conditional>
            </conditional>
            <param name="include" value="seq-report,genome"/>
            <param name="uncompressed" value="true"/>
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
        </test>
        <!-- tax_exact_match seems not able to filter out strains
             https://github.com/ncbi/datasets/issues/187
             hence we set  expect_test_failure="true"-->
        <test expect_num_outputs="1" expect_test_failure="true">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon" value="4932"/>
                <param name="tax_exact_match" value="true"/>
            </conditional>
            <param name="include" value=""/>
            <param name="uncompressed" value="true"/>
            <output name="genome_data_report">
                <assert_contents>
                    <!-- A strain-level record must not appear when only the exact taxon is requested. -->
                    <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
**Download Genome Datasets from NCBI**

Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.

The default genome dataset includes the following files (if available):
 * data_report.jsonl (genome assembly and annotation metadata, not always available)
 * genomic.fna (genomic sequences)
 * rna.fna (transcript sequences)
 * protein.faa (protein sequences)
 * genomic.gff (genome annotation in gff3 format)
 * dataset_catalog.json (a list of files and file types included in the dataset)
]]>
    </help>

</tool>
author	iuc
date	Mon, 21 Nov 2022 11:40:05 +0000
parents	18eed8fa7f23
children	ac24fff14f23