Mercurial > repos > artbio > fetch_fasta_from_ncbi
comparison fetch_fasta_from_NCBI.xml @ 5:706fe8139955 draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit b5ef783237b244d684e26b1ed1cc333a8305ce3e"
author | artbio |
---|---|
date | Tue, 16 Mar 2021 23:26:58 +0000 |
parents | c667d0ee39f5 |
children | 4af77e1af12a |
comparison
equal
deleted
inserted
replaced
4:c667d0ee39f5 | 5:706fe8139955 |
---|---|
1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0"> | 1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="3.0.0"> |
2 <description></description> | 2 <description></description> |
3 <command><![CDATA[ | 3 <requirements> |
4 <requirement type="package" version="1.25.9">urllib3</requirement> | |
5 </requirements> | |
6 <command><![CDATA[ | |
4 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | 7 python '$__tool_directory__'/fetch_fasta_from_NCBI.py |
5 -i "$queryString" | 8 #if $query.option == 'query': |
6 -d $dbname | 9 --query '$query.queryString' |
7 -l '$logfile' | 10 #else: |
8 -c | 11 --iud_file '$query.iud_list' |
9 -o '$outfile'; | 12 #end if |
10 #if $dry_run == "" | 13 --dbname '$dbname' |
11 number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1'); | 14 --logfile '$logfile' |
12 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | 15 #if $fetch_option == 'fasta': |
13 -i "$queryString" | 16 --fasta $fasta |
14 -d $dbname | 17 #end if |
15 -u | |
16 -l '$logfile' | |
17 -o 'uid_outfile'; | |
18 UID_array=( \$(head uid_outfile) ); | |
19 array_len=\${#UID_array[@]}; | |
20 counter=0; | |
21 number_of_groups=\$((array_len / 200000)); | |
22 modulo=\$((array_len % 200000)); | |
23 if [ "\$modulo" -gt 0 ];then | |
24 number_of_groups=\$((number_of_groups + 1)); | |
25 fi; | |
26 group_number=1; | |
27 echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile; | |
28 for ((i=0; i+200000<array_len;i+=200000)); do | |
29 echo "----- Group number: \$group_number -----" >> $logfile; | |
30 echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt; | |
31 echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt; | |
32 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | |
33 -d $dbname | |
34 -l '$logfile' | |
35 -o 'tmp1_outfile' | |
36 --UID_list uid_list_1.txt& | |
37 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | |
38 -d $dbname | |
39 -l 'tmp1_logfile' | |
40 -o 'tmp2_outfile' | |
41 --UID_list uid_list_2.txt& | |
42 wait; | |
43 cat tmp1_outfile tmp2_outfile>> $outfile; | |
44 rm tmp1_outfile tmp2_outfile; | |
45 cat tmp1_logfile >> $logfile; | |
46 rm tmp1_logfile; | |
47 rm uid_list_1.txt uid_list_2.txt; | |
48 group_number=\$((group_number + 1)); | |
49 counter=\$(( counter + 200000 )); | |
50 done; | |
51 echo "----- Group number: \$group_number -----" >> $logfile; | |
52 echo "----- Last group -----" >> $logfile; | |
53 if [ "\$counter" -lt "\$array_len" ]; then | |
54 echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt; | |
55 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | |
56 -d $dbname | |
57 -l '$logfile' | |
58 -o 'tmp_outfile' | |
59 --UID_list uid_list.txt; | |
60 rm uid_list.txt; | |
61 cat tmp_outfile >> $outfile; | |
62 rm tmp_outfile; | |
63 fi; | |
64 #end if | |
65 ]]></command> | 18 ]]></command> |
19 <inputs> | |
66 | 20 |
67 <inputs> | 21 <conditional name="query"> |
68 <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple: Drosophila melanogaster[Organism] AND Gcn5[Title]"> | 22 <param name="option" type="select" label="retrieve data from query or IUD list" display="radio"> |
69 <sanitizer> | 23 <option value="query" selected="true">Query string</option> |
70 <valid initial="string.printable"> | 24 <option value="list">IUD list</option> |
71 <remove value="""/> | 25 </param> |
72 <remove value="\"/> | 26 <when value="query"> |
73 </valid> | 27 <param name="queryString" type="text" size="5x80" area="True" |
74 <mapping initial="none"> | 28 value="" |
75 <add source=""" target="\""/> | 29 label="Query to NCBI in entrez format" |
76 <add source="\" target="\\"/> | 30 help="exemple: `Drosophila melanogaster[Organism] AND Gcn5[Title]`"> |
77 </mapping> | 31 <sanitizer> |
78 </sanitizer> | 32 <valid initial="string.printable"> |
79 </param> | 33 <remove value="""/> |
34 <remove value="\"/> | |
35 </valid> | |
36 <mapping initial="none"> | |
37 <add source=""" target="\""/> | |
38 <add source="\" target="\\"/> | |
39 </mapping> | |
40 </sanitizer> | |
41 </param> | |
42 </when> | |
43 <when value="list"> | |
44 <param name="iud_list" format="txt,tabular" type="data" label="A list of NCBI UIDs" | |
45 help="a file with a single column of UIDs, in txt or tabular format"/> | |
46 </when> | |
47 </conditional> | |
80 <param name="dbname" type="select" label="NCBI database"> | 48 <param name="dbname" type="select" label="NCBI database"> |
81 <option value="nuccore">Nucleotide</option> | 49 <option value="nuccore">Nucleotide</option> |
82 <option value="protein">Protein</option> | 50 <option value="protein">Protein</option> |
83 </param> | 51 </param> |
84 <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/> | 52 <param name="fetch_option" type="select" label="select what will be retrieved"> |
53 <option value="fasta" selected="true">Fasta and IUDs</option> | |
54 <option value="justiuds">Only IUDs</option> | |
55 </param> | |
85 </inputs> | 56 </inputs> |
86 <outputs> | 57 <outputs> |
87 <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > | 58 <data name="fasta" format="fasta" label="Fasta sequences retrieved from NCBI" > |
88 <filter> dry_run == False</filter> | 59 <filter>fetch_option == "fasta"</filter> |
89 </data> | 60 </data> |
90 <data format="txt" name="logfile" label="${tool.name}: log"/> | 61 <data name="UIDs" format="txt" label="UIDs" from_work_dir="retrieved_uid_list.txt"> |
62 <filter>query['option'] == "query"</filter> | |
63 </data> | |
64 <data format="txt" name="logfile" label="logs"/> | |
91 </outputs> | 65 </outputs> |
92 <tests> | 66 <tests> |
93 <test> | 67 <test> |
94 <param name="queryString" value="9629650[gi]" /> | 68 <param name="queryString" value="9629650[gi]" /> |
95 <param name="dbname" value="nuccore" /> | 69 <param name="dbname" value="nuccore" /> |
96 <output name="outfilename" ftype="fasta" file="output.fa" /> | 70 <param name="fetch_option" value="fasta"/> |
71 <output name="fasta" ftype="fasta" file="output.fa" /> | |
97 </test> | 72 </test> |
98 <test> | 73 <test> |
99 <param name="queryString" value="CU929326[Accession]" /> | 74 <param name="queryString" value="CU929326[Accession]" /> |
100 <param name="dbname" value="nuccore" /> | 75 <param name="dbname" value="nuccore" /> |
101 <param name="date_filter" value="1"/> | 76 <param name="fetch_option" value="justiuds"/> |
102 <param name="dry_run" value="True"/> | |
103 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> | 77 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> |
104 </test> | 78 </test> |
105 <test> | 79 <test> |
106 <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" /> | 80 <param name="option" value="list" /> |
107 <output name="outfilename" ftype="fasta" > | 81 <param name="iud_list" value="input_list.txt" ftype="txt" /> |
108 <metadata name="sequences" value="13" /> | 82 <param name="dbname" value="nuccore" /> |
83 <param name="fetch_option" value="fasta"/> | |
84 <output name="fasta" ftype="fasta" file="output_list.fa"/> | |
85 </test> | |
86 <test> | |
87 <param name="queryString" value="Drosophila[Organism] AND 2017[Modification Date] AND virus" /> | |
88 <param name="dbname" value="nuccore" /> | |
89 <param name="fetch_option" value="fasta"/> | |
90 <output name="fasta" ftype="fasta" > | |
91 <metadata name="sequences" value="9" /> | |
92 </output> | |
93 </test> | |
94 <test> | |
95 <param name="queryString" value="labalbalbalbaalablalbabal[Title]" /> | |
96 <param name="dbname" value="nuccore" /> | |
97 <param name="fetch_option" value="justiuds"/> | |
98 <output name="logfile" ftype="txt"> | |
99 <assert_contents> | |
100 <has_line_matching expression=".*Found\s+0\s+UIDs" /> | |
101 </assert_contents> | |
109 </output> | 102 </output> |
110 </test> | 103 </test> |
111 </tests> | 104 </tests> |
112 <help> | 105 <help> |
113 **What it does** | 106 **What it does** |
114 | 107 |
115 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query. | 108 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query. |
116 | 109 |
117 The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for use with metaVisitor | 110 The tool can be set with the query "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for use with metaVisitor |
118 | 111 |
119 See `Entrez help`_ for explanation of query formats | 112 See `Entrez help`_ for explanation of query formats |
120 | 113 |
121 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields. | 114 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields. |
122 | 115 |
123 By checking the checkbox you can also run your query without sequence retrieval and get the number of sequences your query will fetch. | 116 By checking the checkbox you can also run your query without sequence retrieval and get the number of sequences your query will fetch. |
124 | 117 |
125 Note that the tool may fail in case of interrupted connection with the NCBI database (see the log dataset) | 118 Note that the tool may fail in case of interrupted connection with the NCBI database (see the log dataset) |
126 | 119 |
127 Retrieval progress is reported in the log dataset. | 120 Retrieval progress is reported in the log dataset. |
121 | |
122 **Options**:: | |
123 <![CDATA[ | |
124 usage: fetch_fasta_from_NCBI.py [-h] [--query QUERY_STRING] | |
125 [--iud_file IUDS_FILE] [--output OUTNAME] | |
126 [--dbname DBNAME] [--fasta GET_FASTA] | |
127 [--logfile LOGFILE] | |
128 [--loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL}] | |
129 | |
130 Retrieve data from NCBI | |
131 | |
132 optional arguments: | |
133 -h, --help show this help message and exit | |
134 --query QUERY_STRING, -i QUERY_STRING | |
135 NCBI Query String | |
136 --iud_file IUDS_FILE input list of iuds to be fetched | |
137 --output OUTNAME, -o OUTNAME | |
138 output file name | |
139 --dbname DBNAME, -d DBNAME | |
140 database type | |
141 --fasta GET_FASTA, -F GET_FASTA | |
142 retrieve fasta sequences | |
143 --logfile LOGFILE, -l LOGFILE | |
144 log file (default=stderr) | |
145 --loglevel {DEBUG,INFO,WARNING,ERROR,CRITICAL} | |
146 logging level (default: INFO) | |
147 ]]> | |
128 | 148 |
129 **Acknowledgments** | 149 **Acknowledgments** |
130 | 150 |
131 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_. | 151 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_. |
132 | 152 |