comparison dante.xml @ 32:393fb45bd50f draft

planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
author petr-novak
date Tue, 04 Feb 2025 09:48:33 +0000
parents ae4cebdccf74
children 726b8447eb09
comparison
equal deleted inserted replaced
31:ae4cebdccf74 32:393fb45bd50f
1 <tool id="dante" name="Domain based ANnotation of Transposable Elements - DANTE" version="2.6.1"> 1 <tool id="dante" name="Domain based ANnotation of Transposable Elements - DANTE" version="2.6.2">
2 <description> Tool for annotation of transposable elements based on the similarity to conserved protein domains database. </description> 2 <description> Tool for annotation of transposable elements based on the similarity to conserved protein domains database. </description>
3 <requirements> 3 <requirements>
4 <requirement type="package">dante=0.2.6</requirement> 4 <requirement type="package">dante=0.2.5</requirement>
5 </requirements> 5 </requirements>
6 <stdio> 6 <stdio>
7 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" /> 7 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
8 <regex match="error" source="stderr" level="fatal" description="Unknown error" /> 8 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
9 </stdio> 9 </stdio>
10 <command> 10 <command><![CDATA[
11 #if str($input_type.input_type_selector) == "aln" 11 #if str($input_type.input_type_selector) == "aln"
12 parse_aln.py -a $(input_sequences) -f sequences.fasta -p sequences.profile 12 parse_aln.py -a $(input_sequences) -f sequences.fasta -p sequences.profile
13 &amp;&amp; 13 &&
14 INPUT_SEQUENCES="sequences.fasta" 14 INPUT_SEQUENCES="sequences.fasta"
15 #else 15 #else
16 INPUT_SEQUENCES=$(input_sequences) 16 INPUT_SEQUENCES=$(input_sequences)
17 #end if 17 #end if
18 &amp;&amp; 18 &&
19 19 grep -c "^>" \${INPUT_SEQUENCES}
20 &&
21 NUMBER_OF_SEQUENCES=`grep -c "^>" \${INPUT_SEQUENCES}`
22 &&
23 if [ \${NUMBER_OF_SEQUENCES} -gt 10000 ]; then
24 SHORT_READS="--short_reads";
25 else
26 SHORT_READS="";
27 fi
28
29 &&
20 dante --query \${INPUT_SEQUENCES} --domain_gff ${DomGff} 30 dante --query \${INPUT_SEQUENCES} --domain_gff ${DomGff}
21 --database $database 31 --database $database
22 --scoring_matrix ${scoring_matrix} 32 --scoring_matrix ${scoring_matrix}
23 --cpu \${GALAXY_SLOTS:-1} 33 --cpu \${GALAXY_SLOTS:-1}
24 &amp;&amp; 34 \${SHORT_READS}
35
36 &&
25 dante_gff_output_filtering.py --dom_gff ${DomGff} 37 dante_gff_output_filtering.py --dom_gff ${DomGff}
26 --domains_prot_seq ${Domains_filtered} --domains_filtered ${DomGff_filtered} 38 --domains_prot_seq ${Domains_filtered} --domains_filtered ${DomGff_filtered}
27 --output_dir . 39 --output_dir .
28 --selected_dom All --th_identity 0.35 40 --selected_dom All --th_identity 0.35
29 --th_similarity 0.45 --th_length 0.8 41 --th_similarity 0.45 --th_length 0.8
30 --interruptions 3 --max_len_proportion 1.2 42 --interruptions 3 --max_len_proportion 1.2
31 --element_type '' 43 --element_type ''
32 44
33 #if str($input_type.input_type_selector) == "aln" 45 #if str($input_type.input_type_selector) == "aln"
34 &amp;&amp; 46 &&
35 coverage2gff.py -p sequences.profile -g ${DomGff} 47 coverage2gff.py -p sequences.profile -g ${DomGff}
36 #end if 48 #end if
37 49
38 #if str($iterative) == "Yes" 50 #if str($iterative) == "Yes"
39 &amp;&amp; 51 &&
40 dante_gff_output_filtering.py --dom_gff ${DomGff} 52 dante_gff_output_filtering.py --dom_gff ${DomGff}
41 --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff 53 --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff
42 --output_dir . 54 --output_dir .
43 --selected_dom All --th_identity 0.35 55 --selected_dom All --th_identity 0.35
44 --th_similarity 0.45 --th_length 0.9 56 --th_similarity 0.45 --th_length 0.9
45 --interruptions 1 --max_len_proportion 1.1 57 --interruptions 1 --max_len_proportion 1.1
46 --element_type '' 58 --element_type ''
47 &amp;&amp; 59 &&
48 60
49 61
50 62
51 fasta2database.py domains_filtered.fasta domains_filtered.db 63 fasta2database.py domains_filtered.fasta domains_filtered.db
52 domains_filtered.class 64 domains_filtered.class
53 &amp;&amp; 65 &&
54 66
55 lastdb -p domains_filtered.db domains_filtered.db 67 lastdb -p domains_filtered.db domains_filtered.db
56 &amp;&amp; 68 &&
57 69
58 dante.py --query \${INPUT_SEQUENCES} --domain_gff ${DomGff2} 70 dante.py --query \${INPUT_SEQUENCES} --domain_gff ${DomGff2}
59 --protein_database domains_filtered.db 71 --protein_database domains_filtered.db
60 --classification domains_filtered.class 72 --classification domains_filtered.class
61 --scoring_matrix BL80 73 --scoring_matrix BL80
62 74
63 75
64 #if str($input_type.input_type_selector) == "aln" 76 #if str($input_type.input_type_selector) == "aln"
65 &amp;&amp; 77 &&
66 coverage2gff.py -p sequences.profile -g ${DomGff2} 78 coverage2gff.py -p sequences.profile -g ${DomGff2}
67 #end if 79 #end if
68 #end if 80 #end if
69 81 ]]>
70 </command> 82 </command>
71 <inputs> 83 <inputs>
72 84
73 <conditional name="input_type"> 85 <conditional name="input_type">
74 <param name="input_type_selector" type="select" label="Choose the type of sequence data"> 86 <param name="input_type_selector" type="select" label="Choose the type of sequence data">
125 137
126 138
127 <help> 139 <help>
128 140
129 141
130 142
131 143
132 **WHAT IT DOES** 144 **WHAT IT DOES**
133 145
134 This tool uses external aligning programme `LAST`_ and RepeatExplorer database of TE protein domains(REXdb) (Viridiplantae and Metazoa) 146 This tool uses external aligning programme `LAST`_ and RepeatExplorer database of TE protein domains(REXdb) (Viridiplantae and Metazoa)
135 147
136 .. _LAST: http://last.cbrc.jp/ 148 .. _LAST: http://last.cbrc.jp/
137 149
138 *Lastal* runs similarity search to find hits between query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) form a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters which means each cluster is built from forward or reverse stranded hits exclusively. The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These assigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by "|" character. Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For single position only the hits reaching certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classification strings assigned to one region (cluster) there are three situations that can occur: 150 *Lastal* runs similarity search to find hits between query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) form a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters which means each cluster is built from forward or reverse stranded hits exclusively. 
The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These assigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by "|" character. Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For single position only the hits reaching certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classification strings assigned to one region (cluster) there are three situations that can occur:
139 151
140 1. There is a single classification string assigned to each position as well as classifications along all the positions in the region are mutually uniform, in this case domain's final classification is equivalent to this unique classification. 152 1. There is a single classification string assigned to each position as well as classifications along all the positions in the region are mutually uniform, in this case domain's final classification is equivalent to this unique classification.
141 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings 153 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings
142 3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous 154 3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous
143 155
144 **There are 2 outputs produced by this tool:** 156 **There are 2 outputs produced by this tool:**
145 157
146 1. GFF3 file of all protein domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated pieces of information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using *Protein Domain Filter* tool 158 1. GFF3 file of all protein domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated pieces of information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using *Protein Domain Filter* tool
147 159
148 - Attributes reported always: 160 - Attributes reported always:
149 161
150 Name 162 Name
151 type of domain; if ambiguous reported with slash 163 type of domain; if ambiguous reported with slash
152 164
153 Final_classification 165 Final_classification
154 definite classification based on all partial classifications of Region_hits_classifications attribute or 166 definite classification based on all partial classifications of Region_hits_classifications attribute or
155 "Ambiguous_domain" when there is an ambiguous domain type 167 "Ambiguous_domain" when there is an ambiguous domain type
156 168
157 Region_Hits_Classifications 169 Region_Hits_Classifications
158 all hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification 170 all hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification
159 171
160 - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region): 172 - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region):
161 173
162 Best_hit 174 Best_hit
163 classification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers 175 classification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers
164 176
165 Best_Hit_DB_Pos 177 Best_Hit_DB_Pos
166 showing which part of the original database domain corresponding to the Best Hit was aligned on query DNA (e.g. **Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents region from 17th to 75th of total 79 amino acids in the original domain from the database) 178 showing which part of the original database domain corresponding to the Best Hit was aligned on query DNA (e.g. **Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents region from 17th to 75th of total 79 amino acids in the original domain from the database)
167 179
168 DB_Seq 180 DB_Seq
169 database protein sequence of the best hit mapped to the query DNA 181 database protein sequence of the best hit mapped to the query DNA
170 182
171 Query_Seq 183 Query_Seq
172 alignment sequence of the query DNA for the best hit 184 alignment sequence of the query DNA for the best hit
173 185
174 Identity 186 Identity
175 ratio of identical amino acids in alignment sequence to the length of alignment 187 ratio of identical amino acids in alignment sequence to the length of alignment
176 188
177 Similarity 189 Similarity
178 ratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment 190 ratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment
179 191
180 Relat_Length 192 Relat_Length
181 ratio of gapless length of the aligned protein sequence to the whole length of the database protein 193 ratio of gapless length of the aligned protein sequence to the whole length of the database protein
182 194
183 Relat_Interruptions 195 Relat_Interruptions
184 number of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA 196 number of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA
185 197
186 Hit_to_DB_Length 198 Hit_to_DB_Length
187 proportion of alignment length to the original length of the protein domain from database 199 proportion of alignment length to the original length of the protein domain from database
188 200
189 201
190 202
191 !NOTE: Tool can on average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive elements occurrence) as well as computing resources. Maximum running time of the tool is 7 days. 203 !NOTE: Tool can on average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive elements occurrence) as well as computing resources. Maximum running time of the tool is 7 days.
192 204
193 </help> 205 </help>
194 </tool> 206 </tool>