Mercurial > repos > petr-novak > dante
annotate dante.xml @ 32:393fb45bd50f draft
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
author | petr-novak |
---|---|
date | Tue, 04 Feb 2025 09:48:33 +0000 |
parents | ae4cebdccf74 |
children | 726b8447eb09 |
rev | line source |
---|---|
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
1 <tool id="dante" name="Domain based ANnotation of Transposable Elements - DANTE" version="2.6.2"> |
0 | 2 <description> Tool for annotation of transposable elements based on the similarity to conserved protein domains database. </description> |
3 <requirements> | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
4 <requirement type="package">dante=0.2.5</requirement> |
0 | 5 </requirements> |
15 | 6 <stdio> |
7 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" /> | |
8 <regex match="error" source="stderr" level="fatal" description="Unknown error" /> | |
9 </stdio> | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
10 <command><![CDATA[ |
15 | 11 #if str($input_type.input_type_selector) == "aln" |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
12 parse_aln.py -a $(input_sequences) -f sequences.fasta -p sequences.profile |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
13 && |
15 | 14 INPUT_SEQUENCES="sequences.fasta" |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
15 #else |
15 | 16 INPUT_SEQUENCES=$(input_sequences) |
17 #end if | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
18 && |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
19 grep -c "^>" \${INPUT_SEQUENCES} |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
20 && |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
21 NUMBER_OF_SEQUENCES=`grep -c "^>" \${INPUT_SEQUENCES}` |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
22 && |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
23 if [ \${NUMBER_OF_SEQUENCES} -gt 10000 ]; then |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
24 SHORT_READS="--short_reads"; |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
25 else |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
26 SHORT_READS=""; |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
27 fi |
15 | 28 |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
29 && |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
30 dante --query \${INPUT_SEQUENCES} --domain_gff ${DomGff} |
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
31 --database $database |
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
32 --scoring_matrix ${scoring_matrix} |
24
df99812ded92
"planemo upload commit a0a9b02c60a91942a271b8b35648c0b152fe1ebd-dirty"
petr-novak
parents:
23
diff
changeset
|
33 --cpu \${GALAXY_SLOTS:-1} |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
34 \${SHORT_READS} |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
35 |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
36 && |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
37 dante_gff_output_filtering.py --dom_gff ${DomGff} |
22 | 38 --domains_prot_seq ${Domains_filtered} --domains_filtered ${DomGff_filtered} |
39 --output_dir . | |
40 --selected_dom All --th_identity 0.35 | |
41 --th_similarity 0.45 --th_length 0.8 | |
42 --interruptions 3 --max_len_proportion 1.2 | |
43 --element_type '' | |
15 | 44 |
45 #if str($input_type.input_type_selector) == "aln" | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
46 && |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
47 coverage2gff.py -p sequences.profile -g ${DomGff} |
15 | 48 #end if |
10 | 49 |
15 | 50 #if str($iterative) == "Yes" |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
51 && |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
52 dante_gff_output_filtering.py --dom_gff ${DomGff} |
15 | 53 --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff |
54 --output_dir . | |
55 --selected_dom All --th_identity 0.35 | |
56 --th_similarity 0.45 --th_length 0.9 | |
57 --interruptions 1 --max_len_proportion 1.1 | |
58 --element_type '' | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
59 && |
10 | 60 |
15 | 61 |
62 | |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
63 fasta2database.py domains_filtered.fasta domains_filtered.db |
15 | 64 domains_filtered.class |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
65 && |
10 | 66 |
15 | 67 lastdb -p domains_filtered.db domains_filtered.db |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
68 && |
15 | 69 |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
70 dante.py --query \${INPUT_SEQUENCES} --domain_gff ${DomGff2} |
15 | 71 --protein_database domains_filtered.db |
72 --classification domains_filtered.class | |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
73 --scoring_matrix BL80 |
15 | 74 |
10 | 75 |
15 | 76 #if str($input_type.input_type_selector) == "aln" |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
77 && |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
78 coverage2gff.py -p sequences.profile -g ${DomGff2} |
15 | 79 #end if |
80 #end if | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
81 ]]> |
15 | 82 </command> |
83 <inputs> | |
0 | 84 |
15 | 85 <conditional name="input_type"> |
86 <param name="input_type_selector" type="select" label="Choose the type of sequence data"> | |
87 <option value="fasta" selected="true">Fasta</option> | |
88 <option value="aln">Aln file</option> | |
89 </param> | |
90 <when value="fasta"> | |
91 <param name="input_sequences" type="data" format="fasta" label="Sequences in fasta format"/> | |
92 </when> | |
93 <when value="aln"> | |
94 <param name="input_sequences" type="data" format="txt" label="Sequences in ALN format (extracted from RepeatExplorer)"/> | |
95 </when> | |
96 </conditional> | |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
97 <param name="database" type="select" label="Select REXdb database"> |
30
f0663cdbae66
planemo upload commit 29868d121127a8bb509a42fb917b09f669ad4a09-dirty
petr-novak
parents:
28
diff
changeset
|
98 <option value="Viridiplantae_v4.0" selected="true">Viridiplantae_v4.0</option> |
23
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
99 <option value="Viridiplantae_v3.0" selected="true">Viridiplantae_v3.0</option> |
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
100 <option value="Metazoa_v3.1" selected="true">Metazoa_v3.1</option> |
e2bbc79f0fac
"planemo upload commit baf4ca09569b1b709c37f2df712e778da05edaf9-dirty"
petr-novak
parents:
22
diff
changeset
|
101 <option value="Viridiplantae_v2.2" selected="true">Viridiplantae_v2.2</option> |
30
f0663cdbae66
planemo upload commit 29868d121127a8bb509a42fb917b09f669ad4a09-dirty
petr-novak
parents:
28
diff
changeset
|
102 <option value="Metazoa_v3.0" selected="true">Metazoa_v3.0</option> |
15 | 103 </param> |
104 <param name="scoring_matrix" type="select" label="Select scoring matrix"> | |
105 <option value="BL80" selected="true" >BLOSUM80</option> | |
106 <option value="BL62">BLOSUM62</option> | |
107 <option value="MIQS">MIQS</option> | |
108 </param> | |
0 | 109 |
15 | 110 <param name="iterative" type="select" label="Run iterative search" truevalue="true" valsevalue="false" |
111 help="Second iteration run search against database of proteins extracted from query. Second iteration can yield some extra hits in some cases."> | |
112 <option value="No" selected="true">No</option> | |
113 <option value="Yes">Yes</option> | |
114 </param> | |
115 </inputs> | |
0 | 116 |
15 | 117 <outputs> |
22 | 118 <data format="gff3" name="DomGff" label="DANTE on ${on_string}, full output" /> |
119 <data format="gff3" name="DomGff_filtered" label="DANTE on ${on_string}, filtered output" /> | |
120 <data format="fasta" name="Domains_filtered" label="DANTE on ${on_string}, protein domains, filtered output" /> | |
15 | 121 <data format="gff3" name="DomGff2" label="DANTE on ${on_string}: 2nd pass"> |
122 <filter>iterative == "Yes" </filter> | |
123 </data> | |
124 </outputs> | |
125 <tests> | |
126 <test> | |
127 <param name="input_type" value="fasta"/> | |
128 <param name="input_sequences" value="GEPY_test_long_1.fa"/> | |
129 <param name="db_type" value="Viridiplantae_v3.0"/> | |
130 <param name="scoring_matrix" value="BL80"/> | |
131 <param name="iterative" value="No"/> | |
132 <output name="DomGff" value="GEPY_test_long_1_output_unfiltered.gff3"/> | |
133 </test> | |
0 | 134 |
15 | 135 |
136 </tests> | |
0 | 137 |
138 | |
15 | 139 <help> |
0 | 140 |
15 | 141 |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
142 |
15 | 143 |
144 **WHAT IT DOES** | |
145 | |
146 This tool uses the external aligning programme `LAST`_ and the RepeatExplorer database of TE protein domains (REXdb) (Viridiplantae and Metazoa) | |
147 | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
148 .. _LAST: http://last.cbrc.jp/ |
15 | 149 |
150 *Lastal* runs similarity search to find hits between query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) form a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters which means each cluster is built from forward or reverse stranded hits exclusively. The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These assigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by "|" character. Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For single position only the hits reaching certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classification strings assigned to one region (cluster) there are three situations that can occur: | |
0 | 151 |
15 | 152 1. A single classification string is assigned to each position and the classifications along all the positions in the region are mutually uniform; in this case the domain's final classification is equivalent to this unique classification. |
153 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings | |
154 3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
155 |
15 | 156 **There are 2 outputs produced by this tool:** |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
157 |
15 | 158 1. GFF3 file of all protein domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated pieces of information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using the *Protein Domain Filter* tool |
0 | 159 |
15 | 160 - Attributes reported always: |
161 | |
162 Name | |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
163 type of domain; if ambiguous reported with slash |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
164 |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
165 Final_classification |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
166 definite classification based on all partial classifications of Region_hits_classifications attribute or |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
167 "Ambiguous_domain" when there is an ambiguous domain type |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
168 |
15 | 169 Region_Hits_Classifications |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
170 all hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
171 |
15 | 172 - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region): |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
173 |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
174 Best_hit |
0 | 175 classification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
176 |
15 | 177 Best_Hit_DB_Pos |
0 | 178 showing which part of the original database domain corresponding to the Best Hit was aligned on query DNA (e.g. **Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents region from 17th to 75th of total 79 amino acids in the original domain from the database) |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
179 |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
180 DB_Seq |
0 | 181 database protein sequence of the best hit mapped to the query DNA |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
182 |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
183 Query_Seq |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
184 alignment sequence of the query DNA for the best hit |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
185 |
15 | 186 Identity |
0 | 187 ratio of identical amino acids in alignment sequence to the length of alignment |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
188 |
15 | 189 Similarity |
0 | 190 ratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
191 |
15 | 192 Relat_Length |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
193 ratio of gapless length of the aligned protein sequence to the whole length of the database protein |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
194 |
15 | 195 Relat_Interruptions |
0 | 196 number of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
197 |
15 | 198 Hit_to_DB_Length |
0 | 199 proportion of alignment length to the original length of the protein domain from database |
32
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
200 |
393fb45bd50f
planemo upload commit ca3e700d61477ca5de353babb7f5f1db469d937b-dirty
petr-novak
parents:
31
diff
changeset
|
201 |
0 | 202 |
15 | 203 !NOTE: Tool can on average process 0.5 Gbp of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive elements occurrence) as well as computing resources. Maximum running time of the tool is 7 days. |
0 | 204 |
15 | 205 </help> |
0 | 206 </tool> |
207 |