diff fetch_fasta_from_NCBI.xml @ 5:706fe8139955 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit b5ef783237b244d684e26b1ed1cc333a8305ce3e"
author artbio
date Tue, 16 Mar 2021 23:26:58 +0000
parents c667d0ee39f5
children 4af77e1af12a
line wrap: on
line diff
--- a/fetch_fasta_from_NCBI.xml	Wed Nov 29 17:38:52 2017 -0500
+++ b/fetch_fasta_from_NCBI.xml	Tue Mar 16 23:26:58 2021 +0000
@@ -1,111 +1,104 @@
-<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0">
-  <description></description>
-  <command><![CDATA[
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="3.0.0">
+    <description></description>
+    <requirements> <!-- package dependencies declared for this tool -->
+        <requirement type="package" version="1.25.9">urllib3</requirement> <!-- NOTE(review): presumably imported by fetch_fasta_from_NCBI.py; confirm -->
+    </requirements>
+    <command><![CDATA[
     python '$__tool_directory__'/fetch_fasta_from_NCBI.py
-        -i "$queryString"
-        -d $dbname
-        -l '$logfile'
-        -c
-        -o '$outfile';
-    #if $dry_run == ""
-        number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1');
-        python '$__tool_directory__'/fetch_fasta_from_NCBI.py
-            -i "$queryString"
-            -d $dbname
-            -u
-            -l '$logfile'
-            -o 'uid_outfile';
-        UID_array=( \$(head uid_outfile) );
-        array_len=\${#UID_array[@]};
-        counter=0;
-        number_of_groups=\$((array_len / 200000));
-        modulo=\$((array_len % 200000));
-        if [ "\$modulo" -gt 0 ];then
-            number_of_groups=\$((number_of_groups + 1));
-        fi;
-        group_number=1;
-        echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile;
-        for ((i=0; i+200000<array_len;i+=200000)); do
-            echo "----- Group number: \$group_number -----" >> $logfile;
-            echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt;
-            echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt;
-            python '$__tool_directory__'/fetch_fasta_from_NCBI.py
-                -d $dbname
-                -l '$logfile'
-                -o 'tmp1_outfile'
-                --UID_list uid_list_1.txt&
-            python '$__tool_directory__'/fetch_fasta_from_NCBI.py
-                -d $dbname
-                -l 'tmp1_logfile'
-                -o 'tmp2_outfile'
-                --UID_list uid_list_2.txt&
-            wait;
-            cat tmp1_outfile tmp2_outfile>> $outfile;
-            rm tmp1_outfile tmp2_outfile;
-            cat tmp1_logfile >> $logfile;
-            rm tmp1_logfile;
-            rm uid_list_1.txt uid_list_2.txt;
-            group_number=\$((group_number + 1));
-            counter=\$(( counter + 200000 ));
-        done;
-        echo "----- Group number: \$group_number -----" >> $logfile;
-        echo "----- Last group -----" >> $logfile;
-        if [ "\$counter" -lt "\$array_len" ]; then
-            echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt;
-            python '$__tool_directory__'/fetch_fasta_from_NCBI.py
-                -d $dbname
-                -l '$logfile'
-                -o 'tmp_outfile'
-                --UID_list uid_list.txt;
-            rm uid_list.txt;
-            cat tmp_outfile >> $outfile;
-            rm tmp_outfile;
-        fi;
-    #end if
+        #if $query.option == 'query':
+            --query '$query.queryString'
+        #else:
+            --iud_file '$query.iud_list'
+        #end if
+        --dbname '$dbname'
+        --logfile '$logfile'
+        #if $fetch_option == 'fasta':
+            --fasta '$fasta'
+        #end if
   ]]></command>
+  <inputs>
 
-  <inputs>
-    <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple: Drosophila melanogaster[Organism] AND Gcn5[Title]">
-      <sanitizer>
-        <valid initial="string.printable">
-          <remove value="&quot;"/>
-          <remove value="\"/>
-        </valid>
-        <mapping initial="none">
-          <add source="&quot;" target="\&quot;"/>
-          <add source="\" target="\\"/>
-        </mapping>
-      </sanitizer>
-    </param>
+    <conditional name="query"> <!-- input mode: a free-text Entrez query or a dataset listing UIDs -->
+        <param name="option" type="select" label="retrieve data from query or UID list" display="radio">
+            <option value="query" selected="true">Query string</option>
+            <option value="list">UID list</option>
+        </param>
+        <when value="query">
+            <param name="queryString" type="text" size="5x80" area="True"
+                   value=""
+                   label="Query to NCBI in entrez format"
+                   help="example: `Drosophila melanogaster[Organism] AND Gcn5[Title]`">
+                <!-- The command single-quotes this value, so a single quote is the only -->
+                <!-- character needing escape: it is replaced by '"'"' (close, quote, reopen). -->
+                <sanitizer>
+                    <valid initial="string.printable">
+                        <remove value="&apos;"/>
+                    </valid>
+                    <mapping initial="none">
+                        <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;"/>
+                    </mapping>
+                </sanitizer>
+            </param>
+        </when>
+        <when value="list">
+            <param name="iud_list" format="txt,tabular" type="data" label="A list of NCBI UIDs"
+                   help="a file with a single column of UIDs, in txt or tabular format"/>
+        </when>
+    </conditional>
     <param name="dbname" type="select" label="NCBI database">
       <option value="nuccore">Nucleotide</option>
       <option value="protein">Protein</option>
     </param>
-    <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/>
+    <param name="fetch_option" type="select" label="select what will be retrieved"> <!-- 'fasta' adds the fasta output; 'justiuds' skips it -->
+      <option value="fasta" selected="true">Fasta and UIDs</option>
+      <option value="justiuds">Only UIDs</option>
+    </param>
   </inputs>
   <outputs>
-    <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" >
-      <filter> dry_run == False</filter>
+    <data name="fasta" format="fasta" label="Fasta sequences retrieved from NCBI" >
+      <filter>fetch_option == "fasta"</filter>
     </data>
-    <data format="txt" name="logfile" label="${tool.name}: log"/>
+    <data name="UIDs" format="txt" label="UIDs" from_work_dir="retrieved_uid_list.txt">
+      <filter>query['option'] == "query"</filter>
+    </data>
+    <data format="txt" name="logfile" label="logs"/>
   </outputs>
   <tests>
       <test>
           <param name="queryString" value="9629650[gi]" />
           <param name="dbname" value="nuccore" />
-          <output name="outfilename" ftype="fasta" file="output.fa" />
+          <param name="fetch_option" value="fasta"/>
+          <output name="fasta" ftype="fasta" file="output.fa" />
       </test>
       <test>
           <param name="queryString" value="CU929326[Accession]" />
           <param name="dbname" value="nuccore" />
-          <param name="date_filter" value="1"/>
-          <param name="dry_run" value="True"/>
+          <param name="fetch_option" value="justiuds"/>
           <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/>
       </test>
       <test>
-          <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" />
-          <output name="outfilename" ftype="fasta" >
-              <metadata name="sequences" value="13" />
+          <param name="option" value="list" />
+          <param name="iud_list" value="input_list.txt" ftype="txt" />
+          <param name="dbname" value="nuccore" />
+          <param name="fetch_option" value="fasta"/>
+          <output name="fasta" ftype="fasta" file="output_list.fa"/>
+      </test>
+      <test>
+          <param name="queryString" value="Drosophila[Organism] AND 2017[Modification Date] AND virus" />
+          <param name="dbname" value="nuccore" />
+          <param name="fetch_option" value="fasta"/>
+          <output name="fasta" ftype="fasta" >
+              <metadata name="sequences" value="9" />
+          </output>
+      </test>
+      <test>
+          <param name="queryString" value="labalbalbalbaalablalbabal[Title]" />
+          <param name="dbname" value="nuccore" />
+          <param name="fetch_option" value="justiuds"/>
+          <output name="logfile" ftype="txt">
+              <assert_contents>
+                  <has_line_matching expression=".*Found\s+0\s+UIDs" />
+              </assert_contents>
           </output>
       </test>
   </tests>
@@ -114,7 +107,7 @@
 
 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query.
 
-The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for metaVisitor use purpose
+For metaVisitor use, the tool can be run with the query "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]"
 
 See `Entrez help`_ for explanation of query formats
 
@@ -126,6 +119,33 @@
 
 Retrieval progress is reported in the log dataset.
 
+**Options**::
+  <![CDATA[
+  usage: fetch_fasta_from_NCBI.py [-h] [--query QUERY_STRING]
+                                  [--iud_file IUDS_FILE] [--output OUTNAME]
+                                  [--dbname DBNAME] [--fasta GET_FASTA]
+                                  [--logfile LOGFILE]
+                                  [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
+  
+  Retrieve data from NCBI
+  
+  optional arguments:
+    -h, --help            show this help message and exit
+    --query QUERY_STRING, -i QUERY_STRING
+                          NCBI Query String
+    --iud_file IUDS_FILE  input list of iuds to be fetched
+    --output OUTNAME, -o OUTNAME
+                          output file name
+    --dbname DBNAME, -d DBNAME
+                          database type
+    --fasta GET_FASTA, -F GET_FASTA
+                          retrieve fasta sequences
+    --logfile LOGFILE, -l LOGFILE
+                          log file (default=stderr)
+    --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}
+                          logging level (default: INFO)
+  ]]>
+
 **Acknowledgments**
 
 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.