view fetch_fasta_from_NCBI.xml @ 6:4af77e1af12a draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/fetch_fasta_from_ncbi commit 1a90ac7cdeb35399011207ab43c78043fd5c7287
author artbio
date Sat, 14 Oct 2023 22:34:56 +0000
parents 706fe8139955
children
line wrap: on
line source

<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="3.1.0">
    <!-- Short description shown next to the tool name; intentionally left empty here. -->
    <description></description>
    <!-- Package dependencies resolved for the job environment: urllib3 pinned to 1.25.9. -->
    <requirements>
        <requirement type="package" version="1.25.9">urllib3</requirement>
    </requirements>
    <!-- Command template (Cheetah): runs fetch_fasta_from_NCBI.py from the tool directory.
         The identifiers come either from an Entrez query string or from a file of UIDs,
         depending on the "query" conditional. The fasta output path is passed only when
         fetch_option is 'fasta'. Every substituted value, including the fasta output
         path, is single-quoted so it reaches the script as a single argument. -->
    <command><![CDATA[
    python '$__tool_directory__'/fetch_fasta_from_NCBI.py
        #if $query.option == 'query':
            --query '$query.queryString'
        #else:
            --iud_file '$query.iud_list'
        #end if
        --dbname '$dbname'
        --logfile '$logfile'
        #if $fetch_option == 'fasta':
            --fasta '$fasta'
        #end if
  ]]></command>
  <inputs>

    <!-- Source of the identifiers: an Entrez query string, or a dataset listing UIDs. -->
    <conditional name="query">
        <param name="option" type="select" label="retrieve data from query or UID list" display="radio">
            <option value="query" selected="true">Query string</option>
            <option value="list">UID list</option>
        </param>
        <when value="query">
            <param name="queryString" type="text" size="5x80" area="true"
                   value=""
                   label="Query to NCBI in entrez format"
                   help="example: `Drosophila melanogaster[Organism] AND Gcn5[Title]`">
            <!-- The command template wraps this value in single quotes, so a single
                 quote typed by the user must never reach the shell as-is: it is mapped
                 to '"'"' (close the quoted string, emit a double-quoted single quote,
                 reopen the quoted string).
                 NOTE(review): double quotes and backslashes are still backslash-escaped
                 by the mapping below; inside single quotes those backslashes are passed
                 to the script literally. Confirm the script expects that. -->
            <sanitizer>
                <valid initial="string.printable">
                    <remove value="&quot;"/>
                    <remove value="\"/>
                    <remove value="&apos;"/>
                </valid>
                <mapping initial="none">
                    <add source="&quot;" target="\&quot;"/>
                    <add source="\" target="\\"/>
                    <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;"/>
                </mapping>
            </sanitizer>
            </param>
        </when>
        <when value="list">
            <param name="iud_list" format="txt,tabular" type="data" label="A list of NCBI UIDs"
                   help="a file with a single column of UIDs, in txt or tabular format"/>
        </when>
    </conditional>
    <!-- NCBI database to search; the first option is used when none is selected explicitly. -->
    <param name="dbname" type="select" label="NCBI database">
      <option value="nuccore">Nucleotide</option>
      <option value="protein">Protein</option>
    </param>
    <!-- Controls whether fasta sequences are fetched in addition to the UIDs. -->
    <param name="fetch_option" type="select" label="select what will be retrieved">
      <option value="fasta" selected="true">Fasta and UIDs</option>
      <option value="justiuds">Only UIDs</option>
    </param>
  </inputs>
  <outputs>
    <!-- Fasta dataset: only created when fetch_option is "fasta". -->
    <data name="fasta" format="fasta" label="Fasta sequences retrieved from NCBI">
        <filter>fetch_option == "fasta"</filter>
    </data>
    <!-- UID list picked up from retrieved_uid_list.txt in the job working directory;
         only created when the input is a query string. -->
    <data name="UIDs" format="txt" from_work_dir="retrieved_uid_list.txt" label="UIDs">
        <filter>query['option'] == "query"</filter>
    </data>
    <!-- Log dataset: has no filter, so it is always created. -->
    <data name="logfile" format="txt" label="logs"/>
  </outputs>
  <tests>
      <!-- Query string + fasta retrieval: fasta, UIDs and log are produced. -->
      <test expect_num_outputs="3">
          <conditional name="query">
              <param name="option" value="query"/>
              <param name="queryString" value="9629650[gi]"/>
          </conditional>
          <param name="dbname" value="nuccore"/>
          <param name="fetch_option" value="fasta"/>
          <output name="fasta" file="output.fa" ftype="fasta"/>
      </test>
      <!-- Query string, UIDs only: no fasta output; the log is compared by size. -->
      <test expect_num_outputs="2">
          <conditional name="query">
              <param name="option" value="query"/>
              <param name="queryString" value="CU929326[Accession]"/>
          </conditional>
          <param name="dbname" value="nuccore"/>
          <param name="fetch_option" value="justiuds"/>
          <output name="logfile" file="dry_run.log" ftype="txt" compare="sim_size"/>
      </test>
      <!-- UID list input + fasta retrieval: no UIDs output is produced. -->
      <test expect_num_outputs="2">
          <conditional name="query">
              <param name="option" value="list"/>
              <param name="iud_list" value="input_list.txt" ftype="txt"/>
          </conditional>
          <param name="dbname" value="nuccore"/>
          <param name="fetch_option" value="fasta"/>
          <output name="fasta" file="output_list.fa" ftype="fasta"/>
      </test>
      <!-- Multi-term query: checks the sequence count recorded in the fasta metadata. -->
      <test expect_num_outputs="3">
          <conditional name="query">
              <param name="option" value="query"/>
              <param name="queryString" value="Drosophila[Organism] AND 2017[Modification Date] AND virus"/>
          </conditional>
          <param name="dbname" value="nuccore"/>
          <param name="fetch_option" value="fasta"/>
          <output name="fasta" ftype="fasta">
              <metadata name="sequences" value="2"/>
          </output>
      </test>
      <!-- Query expected to match nothing: the log must report zero UIDs. -->
      <test expect_num_outputs="2">
          <conditional name="query">
              <param name="option" value="query"/>
              <param name="queryString" value="labalbalbalbaalablalbabal[Title]"/>
          </conditional>
          <param name="dbname" value="nuccore"/>
          <param name="fetch_option" value="justiuds"/>
          <output name="logfile" ftype="txt">
              <assert_contents>
                  <has_line_matching expression=".*Found\s+0\s+UIDs"/>
              </assert_contents>
          </output>
      </test>
  </tests>
  <help>
**What it does**

This tool retrieves nucleotide/peptide sequences from the corresponding
NCBI database (nuccore or protein) for a given entrez query.

For use with metaVisitor, the tool can be run with the query
"txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]".

See `Entrez help`_ for explanation of query formats

Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.

By choosing to retrieve only the identifiers (instead of fasta sequences), you can
also run your query without sequence retrieval and get the number of sequences your
query will fetch.

Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset).

Retrieval progress is reported in the log dataset.

**Options**::
  <![CDATA[
  usage: fetch_fasta_from_NCBI.py [-h] [--query QUERY_STRING]
                                  [--iud_file IUDS_FILE] [--output OUTNAME]
                                  [--dbname DBNAME] [--fasta GET_FASTA]
                                  [--logfile LOGFILE]
                                  [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
  
  Retrieve data from NCBI
  
  optional arguments:
    -h, --help            show this help message and exit
    --query QUERY_STRING, -i QUERY_STRING
                          NCBI Query String
    --iud_file IUDS_FILE  input list of iuds to be fetched
    --output OUTNAME, -o OUTNAME
                          output file name
    --dbname DBNAME, -d DBNAME
                          database type
    --fasta GET_FASTA, -F GET_FASTA
                          retrieve fasta sequences
    --logfile LOGFILE, -l LOGFILE
                          log file (default=stderr)
    --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}
                          logging level (default: INFO)
  ]]>


.. _Entrez help: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options

  </help>
  <!-- Publication to cite for this tool, referenced by DOI. -->
  <citations>
      <citation type="doi">10.1186/1471-2105-14-73</citation>
  </citations>
</tool>