Mercurial > repos > artbio > fetch_fasta_from_ncbi
comparison fetch_fasta_from_NCBI.xml @ 4:c667d0ee39f5 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca3070e85c370b914ffa0562afe12b363e05aea4
author | artbio |
---|---|
date | Wed, 29 Nov 2017 17:38:52 -0500 |
parents | 8be88084f89c |
children | 706fe8139955 |
comparison
equal
deleted
inserted
replaced
3:8be88084f89c | 4:c667d0ee39f5 |
---|---|
1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.2.1"> | 1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0"> |
2 <description></description> | 2 <description></description> |
3 <command><![CDATA[ | 3 <command><![CDATA[ |
4 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | 4 python '$__tool_directory__'/fetch_fasta_from_NCBI.py |
5 -i "$queryString" | 5 -i "$queryString" |
6 -d $dbname | 6 -d $dbname |
7 -l '$logfile' | 7 -l '$logfile' |
8 $dry_run | 8 -c |
9 -o '$outfile' | 9 -o '$outfile'; |
10 #if $dry_run == "" | |
11 number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1'); | |
12 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | |
13 -i "$queryString" | |
14 -d $dbname | |
15 -u | |
16 -l '$logfile' | |
17 -o 'uid_outfile'; | |
18 UID_array=( \$(head uid_outfile) ); | |
19 array_len=\${#UID_array[@]}; | |
20 counter=0; | |
21 number_of_groups=\$((array_len / 200000)); | |
22 modulo=\$((array_len % 200000)); | |
23 if [ "\$modulo" -gt 0 ];then | |
24 number_of_groups=\$((number_of_groups + 1)); | |
25 fi; | |
26 group_number=1; | |
27 echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile; | |
28 for ((i=0; i+200000<array_len;i+=200000)); do | |
29 echo "----- Group number: \$group_number -----" >> $logfile; | |
30 echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt; | |
31 echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt; | |
32 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | |
33 -d $dbname | |
34 -l '$logfile' | |
35 -o 'tmp1_outfile' | |
36 --UID_list uid_list_1.txt& | |
37 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | |
38 -d $dbname | |
39 -l 'tmp1_logfile' | |
40 -o 'tmp2_outfile' | |
41 --UID_list uid_list_2.txt& | |
42 wait; | |
43 cat tmp1_outfile tmp2_outfile>> $outfile; | |
44 rm tmp1_outfile tmp2_outfile; | |
45 cat tmp1_logfile >> $logfile; | |
46 rm tmp1_logfile; | |
47 rm uid_list_1.txt uid_list_2.txt; | |
48 group_number=\$((group_number + 1)); | |
49 counter=\$(( counter + 200000 )); | |
50 done; | |
51 echo "----- Group number: \$group_number -----" >> $logfile; | |
52 echo "----- Last group -----" >> $logfile; | |
53 if [ "\$counter" -lt "\$array_len" ]; then | |
54 echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt; | |
55 python '$__tool_directory__'/fetch_fasta_from_NCBI.py | |
56 -d $dbname | |
57 -l '$logfile' | |
58 -o 'tmp_outfile' | |
59 --UID_list uid_list.txt; | |
60 rm uid_list.txt; | |
61 cat tmp_outfile >> $outfile; | |
62 rm tmp_outfile; | |
63 fi; | |
64 #end if | |
10 ]]></command> | 65 ]]></command> |
11 | 66 |
12 <inputs> | 67 <inputs> |
13 <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="example:'Drosophila melanogaster[Organism] AND Gcn5[Title]"> | 68 <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="example: Drosophila melanogaster[Organism] AND Gcn5[Title]"> |
14 <sanitizer> | 69 <sanitizer> |
15 <valid initial="string.printable"> | 70 <valid initial="string.printable"> |
16 <remove value="&quot;"/> | 71 <remove value="&quot;"/> |
17 <remove value="\"/> | 72 <remove value="\"/> |
18 </valid> | 73 </valid> |
24 </param> | 79 </param> |
25 <param name="dbname" type="select" label="NCBI database"> | 80 <param name="dbname" type="select" label="NCBI database"> |
26 <option value="nuccore">Nucleotide</option> | 81 <option value="nuccore">Nucleotide</option> |
27 <option value="protein">Protein</option> | 82 <option value="protein">Protein</option> |
28 </param> | 83 </param> |
29 <param name="dry_run" type="boolean" label="Dry run to get the number of sequences?" truevalue="--count" falsevalue="" checked="false"/> | 84 <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/> |
30 </inputs> | 85 </inputs> |
31 <outputs> | 86 <outputs> |
32 <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > | 87 <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > |
33 <filter> dry_run == False</filter> | 88 <filter> dry_run == False</filter> |
34 </data> | 89 </data> |
35 <data format="txt" name="logfile" label="${tool.name}: log"/> | 90 <data format="txt" name="logfile" label="${tool.name}: log"/> |
36 </outputs> | 91 </outputs> |
37 <tests> | 92 <tests> |
38 <test> | 93 <test> |
39 <param name="queryString" value="9629650[gi]" /> | 94 <param name="queryString" value="9629650[gi]" /> |
40 <param name="dbname" value="nuccore" /> | 95 <param name="dbname" value="nuccore" /> |
41 <output name="outfilename" ftype="fasta" file="output.fa" /> | 96 <output name="outfilename" ftype="fasta" file="output.fa" /> |
42 </test> | 97 </test> |
43 <test> | 98 <test> |
44 <param name="queryString" value="CU929326[Accession]" /> | 99 <param name="queryString" value="CU929326[Accession]" /> |
45 <param name="dbname" value="nuccore" /> | 100 <param name="dbname" value="nuccore" /> |
46 <param name="date_filter" value="1"/> | 101 <param name="date_filter" value="1"/> |
47 <param name="dry_run" value="True"/> | 102 <param name="dry_run" value="True"/> |
48 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> | 103 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> |
49 </test> | 104 </test> |
105 <test> | |
106 <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" /> | |
107 <output name="outfilename" ftype="fasta" > | |
108 <metadata name="sequences" value="13" /> | |
109 </output> | |
110 </test> | |
50 </tests> | 111 </tests> |
51 <help> | 112 <help> |
52 **What it does** | 113 **What it does** |
53 | 114 |
54 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database for a given entrez query. | 115 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query. |
55 | 116 |
56 The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" for use with metaVisitor | 117 The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" for use with metaVisitor |
57 | 118 |
58 See `Entrez help`_ for explanation of query formats | 119 See `Entrez help`_ for explanation of query formats |
59 | 120 |
60 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields. | 121 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields. |
61 | 122 |
123 By checking the checkbox you can also run your query without sequence retrieval and get the number of sequences your query will fetch. | |
124 | |
62 Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset) | 125 Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset) |
126 | |
127 Retrieval progress is reported in the log dataset. | |
63 | 128 |
64 **Acknowledgments** | 129 **Acknowledgments** |
65 | 130 |
66 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_. | 131 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_. |
67 | 132 |