Mercurial > repos > iuc > ncbi_datasets

<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>download genome sequence, annotation and metadata</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"></expand>
    <command><![CDATA[
#import re
@SETUP_CERTIFICATES@
datasets download genome $query.subcommand.download_by
#if $query.subcommand.download_by == 'accession':
    #if $query.subcommand.text_or_file.text_or_file == 'text':
        #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
    #else
        --inputfile '$query.subcommand.text_or_file.inputfile'
    #end if
#else:
    '$query.subcommand.taxon_positional'
    $query.subcommand.tax_exact_match
#end if
$filters.reference
$filters.annotated
#if $filters.assembly_level:
    --assembly-level $filters.assembly_level
#end if
--assembly-version $filters.assembly_version
#if $filters.assembly_source:
    --assembly-source $filters.assembly_source
#end if
#if $filters.chromosomes:
    --chromosomes '$filters.chromosomes'
#end if
$filters.exclude_atypical
@INCLUDE@
@RELEASED_BEFORE@
@RELEASED_AFTER@
#for search_term in $filters.search:
    --search '$filters.search_term'
#end for
--no-progressbar
--dehydrated

## produce TSV report file
&& dataformat tsv genome
    --package ncbi_dataset.zip
    --fields #echo ",".join($file_choices.report_columns)
    > genome_data_report.tsv

## unzip and rehydrate if any data is to be downloaded (include is not None)
#if $file_choices.include
    ## unzip
    && 7z x -y ncbi_dataset.zip > 7z.log

    ## rehydrate
    && datasets rehydrate
        --directory ./
        #if not $file_choices.decompress
            --gzip
        #end if
        --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}

    ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
    && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;

    ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
    && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
    #if $file_choices.decompress
        && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
    #end if

    #if "seq-report" in $file_choices.include
        && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
    #end if

    && true  ## because Galaxy removes trailing ; from command
#end if
]]></command>
    <inputs>
        <section name="query" title="Query" expanded="true">
            <conditional name="subcommand">
                <param name="download_by" type="select" label="Choose how to find genomes to download">
                    <option value="accession">By NCBI assembly or BioProject accession</option>
                    <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
                </param>
                <when value="accession">
                    <expand macro="text_or_file"/>
                </when>
                <when value="taxon">
                    <expand macro="taxon_positional"/>
                    <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
                </when>
            </conditional>
        </section>
        <section name="filters" title="Filters and Limit">
            <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
            <expand macro="annotation"/>
            <expand macro="assembly_level"/>
            <param argument="--assembly-version" type="select" label="Assembly version(s)">
                <option value="latest">Latest</option>
                <option value="all">All</option>
            </param>
            <expand macro="assembly_source"/>
            <expand macro="chromosomes"/>
            <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
            <expand macro="released_options"/>
            <expand macro="released_options" before_or_after="after"/>

            <repeat name="search" title="Add search terms">
                <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
            </repeat>
        </section>
        <section name="file_choices" title="Output options" expanded="true">
            <expand macro="tsv_report_columns">
                <option value="accession" selected="true">accession</option>
                <option value="organism-name" selected="true">organism-name</option>
                <option value="assminfo-submitter" selected="true">assminfo-submitter</option>
                <option value="assminfo-name" selected="true">assminfo-name</option>
            </expand>
            <expand macro="include">
                <expand macro="genome_includes"/>
            </expand>
            <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
        </section>
    </inputs>
    <outputs>
        <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
        <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
        </collection>
        <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)"  directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
        </collection>
        <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
        </collection>
        <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Note: All but one test use the non-default decompress="true"

            this is because (at 11/22) Galaxy can not apply text assertions on the content
            of compressed files https://github.com/galaxyproject/galaxy/pull/15085

            So with decompress="true" more powerfull assertions are powerful.
            A single test checks the default, ie decompress="false".
        -->
        <test expect_num_outputs="1">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="released_before" value="01/01/2018"/>
            <section name="file_choices">
                <param name="include" value=""/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
                    <has_n_lines n="144"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="assembly_level" value="chromosome,complete"/>
            <param name="released_before" value="01/01/2018"/>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="14">
                <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/>
                <!-- TODO chromosomes argument (or data) seems not reliable https://github.com/ncbi/datasets/issues/188-->
                <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
                <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <!-- same as previous test but assembly_source (refseq which removes some of the genomes) -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="assembly_level" value="chromosome,complete"/>
            <param name="assembly_source" value="refseq"/>
            <param name="released_before" value="01/01/2018"/>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                    <has_n_lines n="5"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                </conditional>
            </conditional>
            <param name="released_before" value="01/01/2007"/>
            <section name="file_choices">
                <param name="include" value="seq-report,gtf,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="sequence_report" type="list" count="2" >
                <element name="GCF_000007445.1">
                    <assert_contents>
                        <has_text text="GCF_000007445.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="14"/>
                    </assert_contents>
                </element>
                <element name="GCF_000013305.1">
                    <assert_contents>
                        <has_text text="GCF_000013305.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="14"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="genomic_gtf" type="list">
                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_cds" type="list">
                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/>
            </output_collection>
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="file"/>
                    <param name="inputfile" value="accessions.txt"/>
                </conditional>
            </conditional>
            <param name="released_before" value="01/01/2007"/>
            <section name="file_choices">
                <param name="include" value="seq-report,gff3,gbff"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                   <has_text text="GCF_000013305.1"/>
                   <has_text text="GCF_000007445.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_gbff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
            </output_collection>
        </test>

        <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
        <test expect_num_outputs="2" expect_failure="true">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000001405"/>
                </conditional>
            </conditional>
            <param name="released_before" value="01/01/2015"/>
            <param name="assembly_version" value="all"/>
            <section name="file_choices">
                <param name="include" value="seq-report"/>
            </section>
            <!--
            <output_collection name="sequence_report" type="list" count="4" >
            -->
        </test>
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text=">"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text=">"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- same as the previous test, but use the default value for decompress,
            see comment at the beginning of the tests -->
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <element name="GCF_000146045.2">
                    <element name="GCF_000146045.2_R64" ftype="fasta.gz">
                        <assert_contents>
                            <has_size value="3843460"/>
                        </assert_contents>
                    </element>
                </element>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size value="1844838"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size value="2784534"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="seq-report,genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="sequence_report" type="list" count="2"/>
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
        </test>
        <!-- tax_exact_match seems not able to filter out strains
             https://github.com/ncbi/datasets/issues/187
             hence we set  expect_test_failure="true"-->
        <test expect_num_outputs="1" expect_test_failure="true">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="4932"/>
                <param name="tax_exact_match" value="true"/>
            </conditional>
            <param name="include" value=""/>
            <output name="genome_data_report">
                <assert_contents>
                   <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
**Download Genome Datasets from NCBI**

Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.

The download is a three step process:

1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL)
2. The metadata is transformed into a tabular (TSV) file
3. The data is hydrated (the actual data is downloaded)

The 3rd step can be skipped by unselecting all output types in the `Include` parameter.
Thereby its possible to inspect the metadata prior to the actual data download. Also this
allows to use the tool for querying data sets (and their accessions) of interest which
can then be downloaded in a second call using the accessions.
]]>
    </help>

</tool>
author	iuc
date	Fri, 02 Dec 2022 10:52:48 +0000
parents	a3395b1d871b
children	d78faac2c6ef