annotate dante.xml @ 14:a6c55d1bdb6c draft

Uploaded
author petr-novak
date Wed, 28 Aug 2019 08:08:47 -0400
parents d0431a839606
children 3151a72a6671
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
1 <tool id="dante" name="Domain based ANnotation of Transposable Elements - DANTE" version="1.0.0">
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
2 <description> Tool for annotation of transposable elements based on the similarity to conserved protein domains database. </description>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
3 <requirements>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
4 <requirement type="package">last</requirement>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
5 <requirement type="package">numpy</requirement>
3
18d6c1c66798 Uploaded
petr-novak
parents: 0
diff changeset
6 <requirement type="package" version="1.0">rexdb</requirement>
10
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
7 <requirement type="set_environment">REXDB</requirement>
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
8 </requirements>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
9 <stdio>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
10 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
11 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
12 </stdio>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
13 <command>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
14 python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff}
3
18d6c1c66798 Uploaded
petr-novak
parents: 0
diff changeset
15 --protein_database \${REXDB}/${db_type}_pdb
6
6dcecbe81d78 Uploaded
petr-novak
parents: 3
diff changeset
16 --classification \${REXDB}/${db_type}_class
10
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
17 --scoring_matrix ${scoring_matrix}
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
18 &amp;&amp;
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
19
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
20 python3 ${__tool_directory__}/dante_gff_output_filtering.py --dom_gff ${DomGff}
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
21 --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
22 --output_dir .
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
23 --selected_dom All --th_identity 0.35
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
24 --th_similarity 0.45 --th_length 0.9
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
25 --interruptions 1 --max_len_proportion 1.1
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
26 --element_type '' &amp;&amp;
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
27
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
28 python3 ${__tool_directory__}/fasta2database.py domains_filtered.fasta domains_filtered.db
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
29 domains_filtered.class &amp;&amp;
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
30
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
31 lastdb -p domains_filtered.db domains_filtered.db &amp;&amp;
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
32
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
33 python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff2}
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
34 --protein_database domains_filtered.db
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
35 --classification domains_filtered.class
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
36 --scoring_matrix BL80
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
37
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
38 </command>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
39 <inputs>
10
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
40 <param format="fasta" type="data" name="input"
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
41 label="Choose your input sequence" help="Input DNA must be in proper fasta format, multi-fasta containing more sequences is allowed" />
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
42
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
43 <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help="">
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
44 <options from_file="rexdb_versions.loc">
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
45 <column name="name" index="0"/>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
46 <column name="value" index="1"/>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
47 </options>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
48 </param>
10
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
49
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
50 <param name="scoring_matrix" type="select" label="Select scoring matrix">
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
51 <option value="BL80" selected="true" >BLOSUM80</option>
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
52 <option value="BL62">BLOSUM62</option>
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
53 <option value="MIQS">MIQS</option>
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
54 </param>
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
55 </inputs>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
56
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
57 <outputs>
10
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
58 <data format="gff3" name="DomGff" label="protein domains detected in ${input.hid} - 1st pass (unfiltered)" />
d0431a839606 Uploaded
petr-novak
parents: 6
diff changeset
59 <data format="gff3" name="DomGff2" label="protein domains detected in ${input.hid} - 2nd pass (unfiltered)" />
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
60 </outputs>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
61 <help>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
62
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
63 THIS IS A PRIMARY OUTPUT THAT SHOULD UNDERGO FURTHER QUALITY FILTERING TO GET RID OF POTENTIAL FALSE POSITIVE DOMAINS
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
64
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
65 **WHAT IT DOES**
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
66
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
67 This tool uses the external alignment programme `LAST`_ and the RepeatExplorer database of TE protein domains (REXdb) (Viridiplantae and Metazoa)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
68
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
69 .. _LAST: http://last.cbrc.jp/
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
70
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
71 *Lastal* runs a similarity search to find hits between the query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) form a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters, which means each cluster is built from forward or reverse stranded hits exclusively. The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These assigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by the "|" character. Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For a single position only the hits reaching a certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classification strings assigned to one region (cluster) there are three situations that can occur:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
72
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
73 1. A single classification string is assigned to each position and the classifications along all the positions in the region are mutually uniform; in this case the domain's final classification is equivalent to this unique classification.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
74 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
75 3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
76
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
77 **There are 2 outputs produced by this tool:**
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
78
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
79 1. GFF3 file of all protein domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated pieces of information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using the *Protein Domain Filter* tool
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
80
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
81 - Attributes reported always:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
82
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
83 Name
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
84 type of domain; if ambiguous reported with slash
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
85
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
86 Final_classification
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
87 definite classification based on all partial classifications of Region_Hits_Classifications attribute or
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
88 "Ambiguous_domain" when there is an ambiguous domain type
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
89
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
90 Region_Hits_Classifications
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
91 all hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
92
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
93 - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
94
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
95 Best_Hit
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
96 classification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
97
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
98 Best_Hit_DB_Pos
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
99 showing which part of the original database domain corresponding to the Best Hit was aligned on query DNA (e.g. **Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents region from 17th to 75th of total 79 amino acids in the original domain from the database)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
100
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
101 DB_Seq
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
102 database protein sequence of the best hit mapped to the query DNA
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
103
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
104 Query_Seq
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
105 alignment sequence of the query DNA for the best hit
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
106
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
107 Identity
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
108 ratio of identical amino acids in alignment sequence to the length of alignment
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
109
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
110 Similarity
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
111 ratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
112
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
113 Relat_Length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
114 ratio of gapless length of the aligned protein sequence to the whole length of the database protein
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
115
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
116 Relat_Interruptions
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
117 number of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
118
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
119 Hit_to_DB_Length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
120 proportion of alignment length to the original length of the protein domain from database
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
121
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
122
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
123
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
124 !NOTE: Tool can on average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive elements occurrence) as well as computing resources. Maximum running time of the tool is 7 days.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
125
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
126 </help>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
127 </tool>
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
128