Mercurial > repos > artbio > fetch_fasta_from_ncbi
diff fetch_fasta_from_NCBI.xml @ 5:706fe8139955 draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit b5ef783237b244d684e26b1ed1cc333a8305ce3e"
author | artbio |
---|---|
date | Tue, 16 Mar 2021 23:26:58 +0000 |
parents | c667d0ee39f5 |
children | 4af77e1af12a |
line wrap: on
line diff
--- a/fetch_fasta_from_NCBI.xml Wed Nov 29 17:38:52 2017 -0500 +++ b/fetch_fasta_from_NCBI.xml Tue Mar 16 23:26:58 2021 +0000 @@ -1,111 +1,104 @@ -<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0"> - <description></description> - <command><![CDATA[ +<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="3.0.0"> + <description></description> + <requirements> + <requirement type="package" version="1.25.9">urllib3</requirement> + </requirements> + <command><![CDATA[ python '$__tool_directory__'/fetch_fasta_from_NCBI.py - -i "$queryString" - -d $dbname - -l '$logfile' - -c - -o '$outfile'; - #if $dry_run == "" - number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1'); - python '$__tool_directory__'/fetch_fasta_from_NCBI.py - -i "$queryString" - -d $dbname - -u - -l '$logfile' - -o 'uid_outfile'; - UID_array=( \$(head uid_outfile) ); - array_len=\${#UID_array[@]}; - counter=0; - number_of_groups=\$((array_len / 200000)); - modulo=\$((array_len % 200000)); - if [ "\$modulo" -gt 0 ];then - number_of_groups=\$((number_of_groups + 1)); - fi; - group_number=1; - echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile; - for ((i=0; i+200000<array_len;i+=200000)); do - echo "----- Group number: \$group_number -----" >> $logfile; - echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt; - echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt; - python '$__tool_directory__'/fetch_fasta_from_NCBI.py - -d $dbname - -l '$logfile' - -o 'tmp1_outfile' - --UID_list uid_list_1.txt& - python '$__tool_directory__'/fetch_fasta_from_NCBI.py - -d $dbname - -l 'tmp1_logfile' - -o 'tmp2_outfile' - --UID_list uid_list_2.txt& - wait; - cat tmp1_outfile tmp2_outfile>> $outfile; - rm tmp1_outfile tmp2_outfile; - cat tmp1_logfile >> $logfile; - rm tmp1_logfile; - rm uid_list_1.txt uid_list_2.txt; - group_number=\$((group_number + 1)); - counter=\$(( counter + 200000 )); - done; - echo "----- Group number: \$group_number -----" >> $logfile; - echo "----- Last group -----" >> $logfile; - if [ "\$counter" -lt "\$array_len" ]; then - echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt; - python '$__tool_directory__'/fetch_fasta_from_NCBI.py - -d $dbname - -l '$logfile' - -o 'tmp_outfile' - --UID_list uid_list.txt; - rm uid_list.txt; - cat tmp_outfile >> $outfile; - rm tmp_outfile; - fi; - #end if + #if $query.option == 'query': + --query '$query.queryString' + #else: + --iud_file '$query.iud_list' + #end if + --dbname '$dbname' + --logfile '$logfile' + #if $fetch_option == 'fasta': + --fasta $fasta + #end if ]]></command> + <inputs> - <inputs> - <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple: Drosophila melanogaster[Organism] AND Gcn5[Title]"> - <sanitizer> - <valid initial="string.printable"> - <remove value="""/> - <remove value="\"/> - </valid> - <mapping initial="none"> - <add source=""" target="\""/> - <add source="\" target="\\"/> - </mapping> - </sanitizer> - </param> + <conditional name="query"> + <param name="option" type="select" label="retrieve data from query or IUD list" display="radio"> + <option value="query" selected="true">Query string</option> + <option value="list">IUD list</option> + </param> + <when value="query"> + <param name="queryString" type="text" size="5x80" area="True" + value="" + label="Query to NCBI in entrez format" + help="exemple: `Drosophila melanogaster[Organism] AND Gcn5[Title]`"> + <sanitizer> + <valid initial="string.printable"> + <remove value="""/> + <remove value="\"/> + </valid> + <mapping initial="none"> + <add source=""" target="\""/> + <add source="\" target="\\"/> + </mapping> + </sanitizer> + </param> + </when> + <when value="list"> + <param name="iud_list" format="txt,tabular" type="data" label="A list of NCBI UIDs" + help="a file with a single column of UIDs, in txt or tabular format"/> + </when> + </conditional> <param name="dbname" type="select" label="NCBI database"> <option value="nuccore">Nucleotide</option> <option value="protein">Protein</option> </param> - <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/> + <param name="fetch_option" type="select" label="select what will be retrieved"> + <option value="fasta" selected="true">Fasta and IUDs</option> + <option value="justiuds">Only IUDs</option> + </param> </inputs> <outputs> - <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > - <filter> dry_run == False</filter> + <data name="fasta" format="fasta" label="Fasta sequences retrieved from NCBI" > + <filter>fetch_option == "fasta"</filter> </data> - <data format="txt" name="logfile" label="${tool.name}: log"/> + <data name="UIDs" format="txt" label="UIDs" from_work_dir="retrieved_uid_list.txt"> + <filter>query['option'] == "query"</filter> + </data> + <data format="txt" name="logfile" label="logs"/> </outputs> <tests> <test> <param name="queryString" value="9629650[gi]" /> <param name="dbname" value="nuccore" /> - <output name="outfilename" ftype="fasta" file="output.fa" /> + <param name="fetch_option" value="fasta"/> + <output name="fasta" ftype="fasta" file="output.fa" /> </test> <test> <param name="queryString" value="CU929326[Accession]" /> <param name="dbname" value="nuccore" /> - <param name="date_filter" value="1"/> - <param name="dry_run" value="True"/> + <param name="fetch_option" value="justiuds"/> <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> </test> <test> - <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" /> - <output name="outfilename" ftype="fasta" > - <metadata name="sequences" value="13" /> + <param name="option" value="list" /> + <param name="iud_list" value="input_list.txt" ftype="txt" /> + <param name="dbname" value="nuccore" /> + <param name="fetch_option" value="fasta"/> + <output name="fasta" ftype="fasta" file="output_list.fa"/> + </test> + <test> + <param name="queryString" value="Drosophila[Organism] AND 2017[Modification Date] AND virus" /> + <param name="dbname" value="nuccore" /> + <param name="fetch_option" value="fasta"/> + <output name="fasta" ftype="fasta" > + <metadata name="sequences" value="9" /> + </output> + </test> + <test> + <param name="queryString" value="labalbalbalbaalablalbabal[Title]" /> + <param name="dbname" value="nuccore" /> + <param name="fetch_option" value="justiuds"/> + <output name="logfile" ftype="txt"> + <assert_contents> + <has_line_matching expression=".*Found\s+0\s+UIDs" /> + </assert_contents> </output> </test> </tests> @@ -114,7 +107,7 @@ This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query. -The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for metaVisitor use purpose +The tool can be set with the query "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for metaVisitor use purpose See `Entrez help`_ for explanation of query formats @@ -126,6 +119,33 @@ Retrieval progress is reported in the log dataset. +**Options**:: + <![CDATA[ + usage: fetch_fasta_from_NCBI.py [-h] [--query QUERY_STRING] + [--iud_file IUDS_FILE] [--output OUTNAME] + [--dbname DBNAME] [--fasta GET_FASTA] + [--logfile LOGFILE] + [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] + + Retrieve data from NCBI + + optional arguments: + -h, --help show this help message and exit + --query QUERY_STRING, -i QUERY_STRING + NCBI Query String + --iud_file IUDS_FILE input list of iuds to be fetched + --output OUTNAME, -o OUTNAME + output file name + --dbname DBNAME, -d DBNAME + database type + --fasta GET_FASTA, -F GET_FASTA + retrieve fasta sequences + --logfile LOGFILE, -l LOGFILE + log file (default=stderr) + --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} + logging level (default: INFO) + ]]> + **Acknowledgments** This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.