changeset 12:d78faac2c6ef draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit affffdbe7237a2c0ba5793c0e7dd11cebb8413a9
author iuc
date Sat, 03 Dec 2022 13:29:32 +0000
parents ac24fff14f23
children 1e188c9610c3
files datasets_gene.xml datasets_genome.xml macros.xml
diffstat 3 files changed, 46 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/datasets_gene.xml	Fri Dec 02 10:52:48 2022 +0000
+++ b/datasets_gene.xml	Sat Dec 03 13:29:32 2022 +0000
@@ -1,9 +1,11 @@
 <tool id="datasets_download_gene" name="NCBI Datasets Gene" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
     <description>download gene sequences and metadata</description>
+    <expand macro="bio_tools"/>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements"></expand>
+    <expand macro="version_command"/>
     <command><![CDATA[
 #import re
 @SETUP_CERTIFICATES@
@@ -286,7 +288,7 @@
                 <assert_contents>
                     <has_text text="baboon"/>
                     <has_text text="glyceraldehyde-3-phosphate dehydrogenase"/>
-                    <has_n_lines n="31"/>
+                    <has_n_lines min="30"/>
                     <has_n_columns n="8"/>
                 </assert_contents>
             </output>
@@ -350,7 +352,7 @@
                 <assert_contents>
                     <has_text text="house mouse"/>
                     <has_text text="XR_004936704.1"/>
-                    <has_n_lines n="137"/>
+                    <has_n_lines min="130"/>
                     <has_n_columns n="38"/>
                 </assert_contents>
             </output>
@@ -384,7 +386,7 @@
                 <assert_contents>
                     <has_text text="rat"/>
                     <has_text text="Brca1"/>
-                    <has_n_lines n="38"/>
+                    <has_n_lines min="30"/>
                     <has_n_columns n="8"/>
                 </assert_contents>
             </output>
@@ -430,7 +432,7 @@
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
-                    <has_n_lines n="823"/>
+                    <has_n_lines min="800"/>
                     <has_n_columns n="8"/>
                 </assert_contents>
             </output>
@@ -488,7 +490,7 @@
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
-                    <has_n_lines n="72533"/>
+                    <has_n_lines min="72000"/>
                     <has_n_columns n="8"/>
                 </assert_contents>
             </output>
@@ -513,7 +515,7 @@
             <output name="gene_data_report">
                 <assert_contents>
                     <has_text text="human"/>
-                    <has_n_lines n="72533"/>
+                    <has_n_lines min="72000"/>
                     <has_n_columns n="8"/>
                 </assert_contents>
             </output>
--- a/datasets_genome.xml	Fri Dec 02 10:52:48 2022 +0000
+++ b/datasets_genome.xml	Sat Dec 03 13:29:32 2022 +0000
@@ -1,9 +1,11 @@
 <tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
     <description>download genome sequence, annotation and metadata</description>
+    <expand macro="bio_tools"/>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements"></expand>
+    <expand macro="version_command"/>
     <command><![CDATA[
 #import re
 @SETUP_CERTIFICATES@
@@ -63,8 +65,10 @@
     && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
 
     ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
+    ## note "not decompress" means that the datasets are provided compressed (datasets rehydrate is called with --gzip)
+    ##      in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
     && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
-    #if $file_choices.decompress
+    #if not $file_choices.decompress
         && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
     #end if
 
@@ -166,7 +170,7 @@
             So with decompress="true" more powerful assertions are possible.
             A single test checks the default, ie decompress="false".
         -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="human"/>
@@ -174,7 +178,9 @@
             <param name="chromosomes" value="21"/>
             <param name="released_before" value="01/01/2018"/>
             <section name="file_choices">
-                <param name="include" value=""/>
+                <!-- include a sequence (which should be downloaded as fasta.gz)
+                     and one non-sequence (which should be decompressed) output -->
+                <param name="include" value="rna,gff3"/>
             </section>
             <output name="genome_data_report">
                 <assert_contents>
@@ -183,6 +189,26 @@
                     <has_n_columns n="4"/>
                 </assert_contents>
             </output>
+            <output_collection name="rna_fasta" type="list" count="1">
+                <element name="GCF_000306695.2" decompress="true">
+                    <assert_contents>
+                        <has_text text=">"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="genomic_gff" type="list">
+                <element name="GCF_000306695.2">
+                    <assert_contents>
+                        <has_n_lines min="1000000"/>
+                        <has_line line="##gff-version 3"/>
+                        <!-- TODO this will only work when the galaxy python packages for 22.05 have been released
+                            <has_n_columns n="9" comment="#"/> -->
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <assert_command>
+                <has_text text="gunzip"/>
+            </assert_command>
         </test>
         <test expect_num_outputs="2">
             <conditional name="query|subcommand">
--- a/macros.xml	Fri Dec 02 10:52:48 2022 +0000
+++ b/macros.xml	Sat Dec 03 13:29:32 2022 +0000
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">14.4</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">21.01</token>
     <token name="@LICENSE@">MIT</token>
     <token name="@PROFILE_AND_LICENSE@">profile="@PROFILE@" license="@LICENSE@"</token>
@@ -15,6 +15,14 @@
             <requirement type="package" version="16.02">p7zip</requirement>
         </requirements>
     </xml>
+    <xml name="bio_tools"><!-- xrefs block linking the tool to the bio.tools entry "ncbi_datasets" -->
+        <xrefs>
+            <xref type="bio.tools">ncbi_datasets</xref>
+        </xrefs>
+    </xml>
+    <xml name="version_command"><!-- wrapped in a version_command element so that expanding this macro yields the tag Galaxy reads -->
+        <version_command><![CDATA[datasets --version | cut -d" " -f 3]]></version_command>
+    </xml>
     <xml name="annotation">
         <param argument="--annotated" type="boolean" truevalue="--annotated" falsevalue="" label="Only include genomes with annotation ?"/>
     </xml>