diff datasets_genome.xml @ 21:0f3b3813b6ae draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit c1c3f90e4aa7dc258aa61d98ec4eac0b97eef426
author iuc
date Mon, 21 Jul 2025 19:28:16 +0000
parents da8260eba74b
children
line wrap: on
line diff
--- a/datasets_genome.xml	Mon Mar 17 11:05:45 2025 +0000
+++ b/datasets_genome.xml	Mon Jul 21 19:28:16 2025 +0000
@@ -4,9 +4,14 @@
         <import>macros.xml</import>
     </macros>
     <expand macro="bio_tools"/>
-    <expand macro="requirements"></expand>
+    <expand macro="requirements"/>
     <expand macro="version_command"/>
-    <command><![CDATA[
+    <stdio> <!-- "Warning"/"skipping" on stderr only flag a warning; a match on "ERROR" fails the job -->
+        <regex match="Warning" source="stderr" level="warning" description="NCBI datasets wrote a warning to stderr"/>
+        <regex match="skipping" source="stderr" level="warning" description="NCBI datasets wrote a 'skipping' message to stderr; some requested data may be missing"/>
+        <regex match="ERROR" level="fatal"/>
+    </stdio>
+    <command detect_errors="exit_code"><![CDATA[
 #import re
 @SETUP_CERTIFICATES@
 datasets download genome $query.subcommand.download_by
@@ -116,7 +121,6 @@
             </param>
             <expand macro="released_options"/>
             <expand macro="released_options" before_or_after="after"/>
-
             <repeat name="search" title="Add search terms">
                 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
             </repeat>
@@ -137,35 +141,35 @@
     <outputs>
         <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
         <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
         </collection>
         <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)"  directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
         </collection>
         <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
         </collection>
         <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
         </collection>
         <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data/" recurse="true" match_relative_path="true"/>
             <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
         </collection>
     </outputs>
@@ -175,8 +179,10 @@
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="human"/>
             </conditional>
-            <param name="chromosomes" value="21"/>
-            <param name="released_before" value="01/01/2018"/>
+            <section name="filters">
+                <param name="chromosomes" value="21"/>
+                <param name="released_before" value="01/01/2018"/>
+            </section>
             <section name="file_choices">
                 <!-- include a sequence (which should be downloaded as fasta.gz)
                      and one non-sequence (which should be decompressed) output -->
@@ -184,19 +190,19 @@
             </section>
             <output name="genome_data_report">
                 <assert_contents>
-                    <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
+                    <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
                     <has_n_lines n="142"/>
                     <has_n_columns n="4"/>
                 </assert_contents>
             </output>
-            <output_collection name="rna_fasta" type="list" count="1">
+            <output_collection name="rna_fasta" type="list" count="2">
                 <element name="GCF_000306695.2" decompress="true">
                     <assert_contents>
-                        <has_text text=">"/>
+                        <has_text text="&gt;"/>
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="genomic_gff" type="list">
+            <output_collection name="genomic_gff" type="list" count="12">
                 <element name="GCF_000306695.2">
                     <assert_contents>
                         <has_n_lines min="1000000"/>
@@ -214,26 +220,27 @@
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="human"/>
             </conditional>
-            <param name="chromosomes" value="21"/>
-            <param name="assembly_level" value="chromosome,complete"/>
-            <param name="released_before" value="01/01/2018"/>
+            <section name="filters">
+                <param name="chromosomes" value="21"/>
+                <param name="assembly_level" value="chromosome,complete"/>
+                <param name="released_before" value="01/01/2018"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="genome"/>
                 <param name="decompress" value="true"/>
             </section>
-            <output_collection name="genome_fasta" type="list:list" count="12">
-                <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
-                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
+            <output_collection name="genome_fasta" type="list:list" count="11">
+                <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression="&gt;"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression="&gt;"/>
                 <!-- According to  https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 -->
                 <!--
                 <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
@@ -253,10 +260,12 @@
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="human"/>
             </conditional>
-            <param name="chromosomes" value="21"/>
-            <param name="assembly_level" value="chromosome,complete"/>
-            <param name="assembly_source" value="refseq"/>
-            <param name="released_before" value="01/01/2018"/>
+            <section name="filters">
+                <param name="chromosomes" value="21"/>
+                <param name="assembly_level" value="chromosome,complete"/>
+                <param name="assembly_source" value="refseq"/>
+                <param name="released_before" value="01/01/2018"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="genome"/>
                 <param name="decompress" value="true"/>
@@ -288,7 +297,9 @@
                     <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                 </conditional>
             </conditional>
-            <param name="released_before" value="01/01/2007"/>
+            <section name="filters">
+                <param name="released_before" value="01/01/2007"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="seq-report,gtf,cds"/>
                 <param name="decompress" value="true"/>
@@ -300,7 +311,7 @@
                     <has_n_columns n="4"/>
                 </assert_contents>
             </output>
-            <output_collection name="sequence_report" type="list" count="2" >
+            <output_collection name="sequence_report" type="list" count="2">
                 <element name="GCF_000007445.1">
                     <assert_contents>
                         <has_text text="GCF_000007445.1"/>
@@ -316,7 +327,7 @@
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="genomic_gtf" type="list">
+            <output_collection name="genomic_gtf" type="list" count="2">
                 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
             </output_collection>
@@ -333,7 +344,9 @@
                     <param name="inputfile" value="accessions.txt"/>
                 </conditional>
             </conditional>
-            <param name="released_before" value="01/01/2007"/>
+            <section name="filters">
+                <param name="released_before" value="01/01/2007"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="seq-report,gff3,gbff"/>
                 <param name="decompress" value="true"/>
@@ -355,7 +368,6 @@
                 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
             </output_collection>
         </test>
-
         <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
         <test expect_num_outputs="2">
             <conditional name="query|subcommand">
@@ -365,8 +377,10 @@
                     <param name="accession" value="GCF_000001405"/>
                 </conditional>
             </conditional>
-            <param name="released_before" value="01/01/2015"/>
-            <param name="assembly_version" value="all"/>
+            <section name="filters">
+                <param name="released_before" value="01/01/2015"/>
+                <param name="assembly_version" value="all"/>
+            </section>
             <section name="file_choices">
                 <param name="include" value="seq-report"/>
             </section>
@@ -395,19 +409,19 @@
                 <param name="decompress" value="true"/>
             </section>
             <output_collection name="genome_fasta" type="list:list" count="1">
-                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
             </output_collection>
             <output_collection name="protein_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" decompress="true">
                     <assert_contents>
-                        <has_text text=">"/>
+                        <has_text text="&gt;"/>
                     </assert_contents>
                 </element>
             </output_collection>
             <output_collection name="rna_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" decompress="true">
                     <assert_contents>
-                        <has_text text=">"/>
+                        <has_text text="&gt;"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -437,7 +451,7 @@
             <output_collection name="protein_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" ftype="fasta.gz">
                     <assert_contents>
-                        <has_size value="1845038" delta="2000"/>
+                        <has_size value="1847862" delta="2000"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -463,22 +477,21 @@
             </section>
             <output_collection name="sequence_report" type="list" count="2"/>
             <output_collection name="genome_fasta" type="list:list" count="2">
-                <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
-                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression="&gt;NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression="&gt;NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
             </output_collection>
         </test>
         <!-- tax_exact_match should filter out strains
              https://github.com/ncbi/datasets/issues/187 -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="2">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="4932"/>
                 <param name="tax_exact_match" value="true"/>
             </conditional>
-            <param name="include" value=""/>
             <output name="genome_data_report">
                 <assert_contents>
-                   <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
+                    <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
                 </assert_contents>
             </output>
         </test>