comparison fetch_fasta_from_NCBI.xml @ 5:706fe8139955 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit b5ef783237b244d684e26b1ed1cc333a8305ce3e"
author artbio
date Tue, 16 Mar 2021 23:26:58 +0000
parents c667d0ee39f5
children 4af77e1af12a
comparison
equal deleted inserted replaced
4:c667d0ee39f5 5:706fe8139955
1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0"> 1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="3.0.0">
2 <description></description> 2 <description></description>
3 <command><![CDATA[ 3 <requirements>
4 <requirement type="package" version="1.25.9">urllib3</requirement>
5 </requirements>
6 <command><![CDATA[
4 python '$__tool_directory__'/fetch_fasta_from_NCBI.py 7 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
5 -i "$queryString" 8 #if $query.option == 'query':
6 -d $dbname 9 --query '$query.queryString'
7 -l '$logfile' 10 #else:
8 -c 11 --iud_file '$query.iud_list'
9 -o '$outfile'; 12 #end if
10 #if $dry_run == "" 13 --dbname '$dbname'
11 number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1'); 14 --logfile '$logfile'
12 python '$__tool_directory__'/fetch_fasta_from_NCBI.py 15 #if $fetch_option == 'fasta':
13 -i "$queryString" 16 --fasta $fasta
14 -d $dbname 17 #end if
15 -u
16 -l '$logfile'
17 -o 'uid_outfile';
18 UID_array=( \$(head uid_outfile) );
19 array_len=\${#UID_array[@]};
20 counter=0;
21 number_of_groups=\$((array_len / 200000));
22 modulo=\$((array_len % 200000));
23 if [ "\$modulo" -gt 0 ];then
24 number_of_groups=\$((number_of_groups + 1));
25 fi;
26 group_number=1;
27 echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile;
28 for ((i=0; i+200000<array_len;i+=200000)); do
29 echo "----- Group number: \$group_number -----" >> $logfile;
30 echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt;
31 echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt;
32 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
33 -d $dbname
34 -l '$logfile'
35 -o 'tmp1_outfile'
36 --UID_list uid_list_1.txt&
37 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
38 -d $dbname
39 -l 'tmp1_logfile'
40 -o 'tmp2_outfile'
41 --UID_list uid_list_2.txt&
42 wait;
43 cat tmp1_outfile tmp2_outfile>> $outfile;
44 rm tmp1_outfile tmp2_outfile;
45 cat tmp1_logfile >> $logfile;
46 rm tmp1_logfile;
47 rm uid_list_1.txt uid_list_2.txt;
48 group_number=\$((group_number + 1));
49 counter=\$(( counter + 200000 ));
50 done;
51 echo "----- Group number: \$group_number -----" >> $logfile;
52 echo "----- Last group -----" >> $logfile;
53 if [ "\$counter" -lt "\$array_len" ]; then
54 echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt;
55 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
56 -d $dbname
57 -l '$logfile'
58 -o 'tmp_outfile'
59 --UID_list uid_list.txt;
60 rm uid_list.txt;
61 cat tmp_outfile >> $outfile;
62 rm tmp_outfile;
63 fi;
64 #end if
65 ]]></command> 18 ]]></command>
19 <inputs>
66 20
67 <inputs> 21 <conditional name="query">
68 <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple: Drosophila melanogaster[Organism] AND Gcn5[Title]"> 22 <param name="option" type="select" label="retrieve data from query or IUD list" display="radio">
69 <sanitizer> 23 <option value="query" selected="true">Query string</option>
70 <valid initial="string.printable"> 24 <option value="list">IUD list</option>
71 <remove value="&quot;"/> 25 </param>
72 <remove value="\"/> 26 <when value="query">
73 </valid> 27 <param name="queryString" type="text" size="5x80" area="True"
74 <mapping initial="none"> 28 value=""
75 <add source="&quot;" target="\&quot;"/> 29 label="Query to NCBI in entrez format"
76 <add source="\" target="\\"/> 30 help="exemple: `Drosophila melanogaster[Organism] AND Gcn5[Title]`">
77 </mapping> 31 <sanitizer>
78 </sanitizer> 32 <valid initial="string.printable">
79 </param> 33 <remove value="&quot;"/>
34 <remove value="\"/>
35 </valid>
36 <mapping initial="none">
37 <add source="&quot;" target="\&quot;"/>
38 <add source="\" target="\\"/>
39 </mapping>
40 </sanitizer>
41 </param>
42 </when>
43 <when value="list">
44 <param name="iud_list" format="txt,tabular" type="data" label="A list of NCBI UIDs"
45 help="a file with a single column of UIDs, in txt or tabular format"/>
46 </when>
47 </conditional>
80 <param name="dbname" type="select" label="NCBI database"> 48 <param name="dbname" type="select" label="NCBI database">
81 <option value="nuccore">Nucleotide</option> 49 <option value="nuccore">Nucleotide</option>
82 <option value="protein">Protein</option> 50 <option value="protein">Protein</option>
83 </param> 51 </param>
84 <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/> 52 <param name="fetch_option" type="select" label="select what will be retrieved">
53 <option value="fasta" selected="true">Fasta and IUDs</option>
54 <option value="justiuds">Only IUDs</option>
55 </param>
85 </inputs> 56 </inputs>
86 <outputs> 57 <outputs>
87 <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > 58 <data name="fasta" format="fasta" label="Fasta sequences retrieved from NCBI" >
88 <filter> dry_run == False</filter> 59 <filter>fetch_option == "fasta"</filter>
89 </data> 60 </data>
90 <data format="txt" name="logfile" label="${tool.name}: log"/> 61 <data name="UIDs" format="txt" label="UIDs" from_work_dir="retrieved_uid_list.txt">
62 <filter>query['option'] == "query"</filter>
63 </data>
64 <data format="txt" name="logfile" label="logs"/>
91 </outputs> 65 </outputs>
92 <tests> 66 <tests>
93 <test> 67 <test>
94 <param name="queryString" value="9629650[gi]" /> 68 <param name="queryString" value="9629650[gi]" />
95 <param name="dbname" value="nuccore" /> 69 <param name="dbname" value="nuccore" />
96 <output name="outfilename" ftype="fasta" file="output.fa" /> 70 <param name="fetch_option" value="fasta"/>
71 <output name="fasta" ftype="fasta" file="output.fa" />
97 </test> 72 </test>
98 <test> 73 <test>
99 <param name="queryString" value="CU929326[Accession]" /> 74 <param name="queryString" value="CU929326[Accession]" />
100 <param name="dbname" value="nuccore" /> 75 <param name="dbname" value="nuccore" />
101 <param name="date_filter" value="1"/> 76 <param name="fetch_option" value="justiuds"/>
102 <param name="dry_run" value="True"/>
103 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> 77 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/>
104 </test> 78 </test>
105 <test> 79 <test>
106 <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" /> 80 <param name="option" value="list" />
107 <output name="outfilename" ftype="fasta" > 81 <param name="iud_list" value="input_list.txt" ftype="txt" />
108 <metadata name="sequences" value="13" /> 82 <param name="dbname" value="nuccore" />
83 <param name="fetch_option" value="fasta"/>
84 <output name="fasta" ftype="fasta" file="output_list.fa"/>
85 </test>
86 <test>
87 <param name="queryString" value="Drosophila[Organism] AND 2017[Modification Date] AND virus" />
88 <param name="dbname" value="nuccore" />
89 <param name="fetch_option" value="fasta"/>
90 <output name="fasta" ftype="fasta" >
91 <metadata name="sequences" value="9" />
92 </output>
93 </test>
94 <test>
95 <param name="queryString" value="labalbalbalbaalablalbabal[Title]" />
96 <param name="dbname" value="nuccore" />
97 <param name="fetch_option" value="justiuds"/>
98 <output name="logfile" ftype="txt">
99 <assert_contents>
100 <has_line_matching expression=".*Found\s+0\s+UIDs" />
101 </assert_contents>
109 </output> 102 </output>
110 </test> 103 </test>
111 </tests> 104 </tests>
112 <help> 105 <help>
113 **What it does** 106 **What it does**
114 107
115 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query. 108 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query.
116 109
117 The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for use with metaVisitor 110 The tool can be set with the query "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for use with metaVisitor
118 111
119 See `Entrez help`_ for explanation of query formats 112 See `Entrez help`_ for explanation of query formats
120 113
121 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields. 114 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.
122 115
123 By checking the checkbox you can also run your query without sequence retrieval and get the number of sequences your query will fetch. 116 By checking the checkbox you can also run your query without sequence retrieval and get the number of sequences your query will fetch.
124 117
125 Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset) 118 Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset)
126 119
127 Retrieval progress is reported in the log dataset. 120 Retrieval progress is reported in the log dataset.
121
122 **Options**::
123 <![CDATA[
124 usage: fetch_fasta_from_NCBI.py [-h] [--query QUERY_STRING]
125 [--iud_file IUDS_FILE] [--output OUTNAME]
126 [--dbname DBNAME] [--fasta GET_FASTA]
127 [--logfile LOGFILE]
128 [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
129
130 Retrieve data from NCBI
131
132 optional arguments:
133 -h, --help show this help message and exit
134 --query QUERY_STRING, -i QUERY_STRING
135 NCBI Query String
136 --iud_file IUDS_FILE input list of iuds to be fetched
137 --output OUTNAME, -o OUTNAME
138 output file name
139 --dbname DBNAME, -d DBNAME
140 database type
141 --fasta GET_FASTA, -F GET_FASTA
142 retrieve fasta sequences
143 --logfile LOGFILE, -l LOGFILE
144 log file (default=stderr)
145 --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}
146 logging level (default: INFO)
147 ]]>
128 148
129 **Acknowledgments** 149 **Acknowledgments**
130 150
131 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_. 151 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.
132 152