Mercurial > repos > iuc > ncbi_datasets
diff datasets_genome.xml @ 11:ac24fff14f23 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2
author | iuc |
---|---|
date | Fri, 02 Dec 2022 10:52:48 +0000 |
parents | a3395b1d871b |
children | d78faac2c6ef |
line wrap: on
line diff
--- a/datasets_genome.xml Mon Nov 21 11:40:05 2022 +0000 +++ b/datasets_genome.xml Fri Dec 02 10:52:48 2022 +0000 @@ -5,16 +5,17 @@ </macros> <expand macro="requirements"></expand> <command><![CDATA[ +#import re @SETUP_CERTIFICATES@ datasets download genome $query.subcommand.download_by #if $query.subcommand.download_by == 'accession': #if $query.subcommand.text_or_file.text_or_file == 'text': - #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) + #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x) #else --inputfile '$query.subcommand.text_or_file.inputfile' #end if #else: - '$query.subcommand.taxon' + '$query.subcommand.taxon_positional' $query.subcommand.tax_exact_match #end if $filters.reference @@ -37,24 +38,55 @@ --search '$filters.search_term' #end for --no-progressbar -#if $uncompressed -&& 7z x -y ncbi_dataset.zip -#else -&& 7z l ncbi_dataset.zip > ncbi_dataset.txt +--dehydrated + +## produce TSV report file +&& dataformat tsv genome + --package ncbi_dataset.zip + --fields #echo ",".join($file_choices.report_columns) + > genome_data_report.tsv + +## unzip and rehydrate if any data is to be downloaded (include is not None) +#if $file_choices.include + ## unzip + && 7z x -y ncbi_dataset.zip > 7z.log + + ## rehydrate + && datasets rehydrate + --directory ./ + #if not $file_choices.decompress + --gzip + #end if + --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10} + + ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery + && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \; + + ## unzip all compressed (non-fasta) files (jsonl files are just named .gz) + && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \; + #if $file_choices.decompress + && find ncbi_dataset \( -name "*.gz" ! 
-name "*fasta.gz" \) -exec gunzip {} \; + #end if + + #if "seq-report" in $file_choices.include + && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \; + #end if + + && true ## because Galaxy removes trailing ; from command #end if ]]></command> <inputs> <section name="query" title="Query" expanded="true"> <conditional name="subcommand"> <param name="download_by" type="select" label="Choose how to find genomes to download"> - <option value="accession">Download by NCBI assembly or BioProject accession</option> - <option value="taxon">Download by taxon</option> + <option value="accession">By NCBI assembly or BioProject accession</option> + <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option> </param> <when value="accession"> <expand macro="text_or_file"/> </when> <when value="taxon"> - <param name="taxon" type="text" label="Enter taxon" help="e.g. 
human, mouse, bos taurus, etc."/> + <expand macro="taxon_positional"/> <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/> </when> </conditional> @@ -67,7 +99,6 @@ <option value="latest">Latest</option> <option value="all">All</option> </param> - <!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank--> <expand macro="assembly_source"/> <expand macro="chromosomes"/> <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/> @@ -78,82 +109,93 @@ <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> </repeat> </section> - <section name="file_choices" title="File Choices" expanded="true"> - <expand macro="include"/> + <section name="file_choices" title="Output options" expanded="true"> + <expand macro="tsv_report_columns"> + <option value="accession" selected="true">accession</option> + <option value="organism-name" selected="true">organism-name</option> + <option value="assminfo-submitter" selected="true">assminfo-submitter</option> + <option value="assminfo-name" selected="true">assminfo-name</option> + </expand> + <expand macro="include"> + <expand macro="genome_includes"/> + </expand> + <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/> </section> - <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/> </inputs> <outputs> - <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> - <filter>not uncompressed</filter> - </data> - <data 
name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt"> - <filter>not uncompressed</filter> - </data> - <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl"> - <filter>uncompressed</filter> - </data> + <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/> <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter> + <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter> </collection> <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> - <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter> + <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?\.(?P<ext>fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <filter>file_choices['include'] and "genome" in file_choices['include']</filter> </collection> <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" 
match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter> + <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <filter>file_choices['include'] and "rna" in file_choices['include']</filter> </collection> <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter> + <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <filter>file_choices['include'] and "protein" in file_choices['include']</filter> </collection> <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> - <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter> + <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> + <filter>file_choices['include'] and "cds" in file_choices['include']</filter> </collection> <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter> + <filter>file_choices['include'] and "gff3" in 
file_choices['include']</filter> </collection> <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter> + <filter>file_choices['include'] and "gtf" in file_choices['include']</filter> </collection> <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> - <filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter> + <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> </collection> </outputs> <tests> - <test expect_num_outputs="2"> + <!-- Note: All but one test use the non-default decompress="true" + + this is because (at 11/22) Galaxy cannot apply text assertions on the content + of compressed files https://github.com/galaxyproject/galaxy/pull/15085 + + So with decompress="true" more powerful assertions are possible. + A single test checks the default, i.e. decompress="false". 
+ --> + <test expect_num_outputs="1"> <conditional name="query|subcommand"> <param name="download_by" value="taxon"/> - <param name="text_or_file" value="text"/> - <param name="taxon" value="human"/> + <param name="taxon_positional" value="human"/> </conditional> <param name="chromosomes" value="21"/> - <param name="include" value=""/> - <param name="uncompressed" value="false"/> <param name="released_before" value="01/01/2018"/> - <output name="archive_contents"> + <section name="file_choices"> + <param name="include" value=""/> + </section> + <output name="genome_data_report"> <assert_contents> - <has_text text="ncbi_dataset/data/dataset_catalog.json"/> + <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> + <has_n_lines n="144"/> + <has_n_columns n="4"/> </assert_contents> </output> </test> <test expect_num_outputs="2"> <conditional name="query|subcommand"> <param name="download_by" value="taxon"/> - <param name="text_or_file" value="text"/> - <param name="taxon" value="human"/> + <param name="taxon_positional" value="human"/> </conditional> <param name="chromosomes" value="21"/> - <param name="include" value="genome"/> - <param name="uncompressed" value="true"/> <param name="assembly_level" value="chromosome,complete"/> <param name="released_before" value="01/01/2018"/> + <section name="file_choices"> + <param name="include" value="genome"/> + <param name="decompress" value="true"/> + </section> <output_collection name="genome_fasta" type="list:list" count="14"> <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> @@ -174,22 +216,24 @@ <output name="genome_data_report"> <assert_contents> <has_text text="Homo sapiens"/> + <has_n_columns n="4"/> </assert_contents> </output> </test> - <!-- same as precious test but assembly_source (refseq which removes some of the genomes) --> + <!-- same as previous test but 
assembly_source (refseq which removes some of the genomes) --> <test expect_num_outputs="2"> <conditional name="query|subcommand"> <param name="download_by" value="taxon"/> - <param name="text_or_file" value="text"/> - <param name="taxon" value="human"/> + <param name="taxon_positional" value="human"/> </conditional> <param name="chromosomes" value="21"/> - <param name="include" value="genome"/> - <param name="uncompressed" value="true"/> <param name="assembly_level" value="chromosome,complete"/> <param name="assembly_source" value="refseq"/> <param name="released_before" value="01/01/2018"/> + <section name="file_choices"> + <param name="include" value="genome"/> + <param name="decompress" value="true"/> + </section> <output_collection name="genome_fasta" type="list:list" count="2"> <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> @@ -197,6 +241,8 @@ <output name="genome_data_report"> <assert_contents> <has_text text="Homo sapiens"/> + <has_n_lines n="5"/> + <has_n_columns n="4"/> </assert_contents> </output> </test> @@ -208,21 +254,41 @@ <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> </conditional> </conditional> - <param name="include" value="seq-report,gtf,cds"/> - <param name="uncompressed" value="true"/> <param name="released_before" value="01/01/2007"/> + <section name="file_choices"> + <param name="include" value="seq-report,gtf,cds"/> + <param name="decompress" value="true"/> + </section> <output name="genome_data_report"> <assert_contents> <has_text text="GCF_000013305.1"/> + <has_n_lines n="3"/> + <has_n_columns n="4"/> </assert_contents> </output> + <output_collection name="sequence_report" type="list" count="2" > + <element name="GCF_000007445.1"> + <assert_contents> + <has_text text="GCF_000007445.1"/> + <has_n_lines n="2"/> + <has_n_columns n="14"/> + </assert_contents> + </element> + <element 
name="GCF_000013305.1"> + <assert_contents> + <has_text text="GCF_000013305.1"/> + <has_n_lines n="2"/> + <has_n_columns n="14"/> + </assert_contents> + </element> + </output_collection> <output_collection name="genomic_gtf" type="list"> <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> </output_collection> <output_collection name="genomic_cds" type="list"> - <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> - <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> + <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/> + <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/> </output_collection> </test> <test expect_num_outputs="4"> @@ -233,12 +299,17 @@ <param name="inputfile" value="accessions.txt"/> </conditional> </conditional> - <param name="include" value="seq-report,gbff,gff3"/> - <param name="uncompressed" value="true"/> <param name="released_before" value="01/01/2007"/> + <section name="file_choices"> + <param name="include" value="seq-report,gff3,gbff"/> + <param name="decompress" value="true"/> + </section> <output name="genome_data_report"> <assert_contents> - <has_text text="SAMN02604181"/> + <has_text text="GCF_000013305.1"/> + <has_text text="GCF_000007445.1"/> + <has_n_lines n="3"/> + <has_n_columns n="4"/> </assert_contents> </output> <output_collection name="genomic_gff" type="list"> @@ -250,7 +321,9 @@ <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> </output_collection> </test> - <test expect_num_outputs="2"> + + <!-- should not fail https://github.com/ncbi/datasets/issues/194 --> + <test expect_num_outputs="2" expect_failure="true"> <conditional 
name="query|subcommand"> <param name="download_by" value="accession"/> <conditional name="text_or_file"> @@ -258,32 +331,14 @@ <param name="accession" value="GCF_000001405"/> </conditional> </conditional> - <param name="include" value="seq-report"/> - <param name="uncompressed" value="true"/> <param name="released_before" value="01/01/2015"/> <param name="assembly_version" value="all"/> - <output_collection name="sequence_report" count="4"> - <element name="GCF_000001405.25"> - <assert_contents> - <has_text text="assignedMoleculeLocationType"/> - </assert_contents> - </element> - <element name="GCF_000001405.26"> - <assert_contents> - <has_text text="assignedMoleculeLocationType"/> - </assert_contents> - </element> - <element name="GCF_000001405.27"> - <assert_contents> - <has_text text="assignedMoleculeLocationType"/> - </assert_contents> - </element> - <element name="GCF_000001405.28"> - <assert_contents> - <has_text text="assignedMoleculeLocationType"/> - </assert_contents> - </element> - </output_collection> + <section name="file_choices"> + <param name="include" value="seq-report"/> + </section> + <!-- + <output_collection name="sequence_report" type="list" count="4" > + --> </test> <test expect_num_outputs="5"> <conditional name="query|subcommand"> @@ -293,11 +348,64 @@ <param name="accession" value="GCF_000146045.2"/> </conditional> </conditional> - <param name="include" value="seq-report,genome,rna,cds"/> - <param name="uncompressed" value="true"/> + <section name="file_choices"> + <param name="include" value="genome,protein,rna,cds"/> + <param name="decompress" value="true"/> + </section> <output_collection name="genome_fasta" type="list:list" count="1"> <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> </output_collection> + <output_collection name="protein_fasta" type="list" 
count="1"> + <element name="GCF_000146045.2" decompress="true"> + <assert_contents> + <has_text text=">"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="rna_fasta" type="list" count="1"> + <element name="GCF_000146045.2" decompress="true"> + <assert_contents> + <has_text text=">"/> + </assert_contents> + </element> + </output_collection> + </test> + <!-- same as the previous test, but use the default value for decompress, + see comment at the beginning of the tests --> + <test expect_num_outputs="5"> + <conditional name="query|subcommand"> + <param name="download_by" value="accession"/> + <conditional name="text_or_file"> + <param name="text_or_file" value="text"/> + <param name="accession" value="GCF_000146045.2"/> + </conditional> + </conditional> + <section name="file_choices"> + <param name="include" value="genome,protein,rna,cds"/> + </section> + <output_collection name="genome_fasta" type="list:list" count="1"> + <element name="GCF_000146045.2"> + <element name="GCF_000146045.2_R64" ftype="fasta.gz"> + <assert_contents> + <has_size value="3843460"/> + </assert_contents> + </element> + </element> + </output_collection> + <output_collection name="protein_fasta" type="list" count="1"> + <element name="GCF_000146045.2" ftype="fasta.gz"> + <assert_contents> + <has_size value="1844838"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="rna_fasta" type="list" count="1"> + <element name="GCF_000146045.2" ftype="fasta.gz"> + <assert_contents> + <has_size value="2784534"/> + </assert_contents> + </element> + </output_collection> </test> <test expect_num_outputs="3"> <conditional name="query|subcommand"> @@ -307,8 +415,11 @@ <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/> </conditional> </conditional> - <param name="include" value="seq-report,genome"/> - <param name="uncompressed" value="true"/> + <section name="file_choices"> + <param name="include" value="seq-report,genome"/> + 
<param name="decompress" value="true"/> + </section> + <output_collection name="sequence_report" type="list" count="2"/> <output_collection name="genome_fasta" type="list:list" count="2"> <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/> <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> @@ -320,12 +431,10 @@ <test expect_num_outputs="1" expect_test_failure="true"> <conditional name="query|subcommand"> <param name="download_by" value="taxon"/> - <param name="text_or_file" value="text"/> - <param name="taxon" value="4932"/> + <param name="taxon_positional" value="4932"/> <param name="tax_exact_match" value="true"/> </conditional> <param name="include" value=""/> - <param name="uncompressed" value="true"/> <output name="genome_data_report"> <assert_contents> <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> @@ -338,15 +447,18 @@ **Download Genome Datasets from NCBI** Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. -Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. +Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon. 
+ +The download is a three-step process: -Tthe default genome dataset includes the following files (if available): - * data_report.jsonl (genome assembly and annotation metadata, not always available) - * genomic.fna (genomic sequences) - * rna.fna (transcript sequences) - * protein.faa (protein sequences) - * genomic.gff (genome annotation in gff3 format) - * dataset_catalog.json (a list of files and file types included in the dataset) +1. A "dehydrated" zip file is downloaded, which includes the metadata and the download URLs +2. The metadata is transformed into a tabular (TSV) file +3. The data is hydrated (the actual data is downloaded) + +The 3rd step can be skipped by unselecting all output types in the `Include` parameter. +This makes it possible to inspect the metadata prior to the actual data download. It also +allows the tool to be used for querying data sets (and their accessions) of interest, which +can then be downloaded in a second call using the accessions. ]]> </help>