comparison dante.xml @ 15:3151a72a6671 draft

Uploaded
author petr-novak
date Tue, 03 Sep 2019 05:20:02 -0400
parents d0431a839606
children 1eabd42e00ef
comparison
equal deleted inserted replaced
14:a6c55d1bdb6c 15:3151a72a6671
4 <requirement type="package">last</requirement> 4 <requirement type="package">last</requirement>
5 <requirement type="package">numpy</requirement> 5 <requirement type="package">numpy</requirement>
6 <requirement type="package" version="1.0">rexdb</requirement> 6 <requirement type="package" version="1.0">rexdb</requirement>
7 <requirement type="set_environment">REXDB</requirement> 7 <requirement type="set_environment">REXDB</requirement>
8 </requirements> 8 </requirements>
9 <stdio> 9 <stdio>
10 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" /> 10 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
11 <regex match="error" source="stderr" level="fatal" description="Unknown error" /> 11 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
12 </stdio> 12 </stdio>
13 <command> 13 <command>
14 python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff} 14 #if str($input_type.input_type_selector) == "aln"
15 --protein_database \${REXDB}/${db_type}_pdb 15 python3 ${__tool_directory__}/parse_aln.py -a $(input_sequences) -f sequences.fasta -p sequences.profile
16 --classification \${REXDB}/${db_type}_class 16 &amp;&amp;
17 --scoring_matrix ${scoring_matrix} 17 INPUT_SEQUENCES="sequences.fasta"
18 &amp;&amp; 18 #else
19 INPUT_SEQUENCES=$(input_sequences)
20 #end if
21 &amp;&amp;
19 22
20 python3 ${__tool_directory__}/dante_gff_output_filtering.py --dom_gff ${DomGff}
21 --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff
22 --output_dir .
23 --selected_dom All --th_identity 0.35
24 --th_similarity 0.45 --th_length 0.9
25 --interruptions 1 --max_len_proportion 1.1
26 --element_type '' &amp;&amp;
27 23
28 python3 ${__tool_directory__}/fasta2database.py domains_filtered.fasta domains_filtered.db 24 python3 ${__tool_directory__}/dante.py --query \${INPUT_SEQUENCES} --domain_gff ${DomGff}
29 domains_filtered.class &amp;&amp; 25 --protein_database \${REXDB}/${db_type}_pdb
26 --classification \${REXDB}/${db_type}_class
27 --scoring_matrix ${scoring_matrix}
30 28
31 lastdb -p domains_filtered.db domains_filtered.db &amp;&amp;
32 29
33 python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff2} 30 #if str($input_type.input_type_selector) == "aln"
34 --protein_database domains_filtered.db 31 &amp;&amp;
35 --classification domains_filtered.class 32 python3 ${__tool_directory__}/coverage2gff.py -p sequences.profile -g ${DomGff}
36 --scoring_matrix BL80 33 #end if
37 34
38 </command> 35 #if str($iterative) == "Yes"
39 <inputs> 36 &amp;&amp;
40 <param format="fasta" type="data" name="input" 37 python3 ${__tool_directory__}/dante_gff_output_filtering.py --dom_gff ${DomGff}
41 label="Choose your input sequence" help="Input DNA must be in proper fasta format, multi-fasta containing more sequences is allowed" /> 38 --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff
39 --output_dir .
40 --selected_dom All --th_identity 0.35
41 --th_similarity 0.45 --th_length 0.9
42 --interruptions 1 --max_len_proportion 1.1
43 --element_type ''
44 &amp;&amp;
42 45
43 <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help="">
44 <options from_file="rexdb_versions.loc">
45 <column name="name" index="0"/>
46 <column name="value" index="1"/>
47 </options>
48 </param>
49 46
50 <param name="scoring_matrix" type="select" label="Select scoring matrix">
51 <option value="BL80" selected="true" >BLOSUM80</option>
52 <option value="BL62">BLOSUM62</option>
53 <option value="MIQS">MIQS</option>
54 </param>
55 </inputs>
56 47
57 <outputs> 48 python3 ${__tool_directory__}/fasta2database.py domains_filtered.fasta domains_filtered.db
58 <data format="gff3" name="DomGff" label="protein domains detected in ${input.hid} - 1st pass (unfiltered)" /> 49 domains_filtered.class
59 <data format="gff3" name="DomGff2" label="protein domains detected in ${input.hid} - 2nd pass (unfiltered)" /> 50 &amp;&amp;
60 </outputs>
61 <help>
62 51
63 THIS IS A PRIMARY OUTPUT THAT SHOULD UNDERGO FURTHER QUALITY FILTERING TO GET RID OF POTENTIAL FALSE POSITIVE DOMAINS 52 lastdb -p domains_filtered.db domains_filtered.db
53 &amp;&amp;
64 54
65 **WHAT IT DOES** 55 python3 ${__tool_directory__}/dante.py --query \${INPUT_SEQUENCES} --domain_gff ${DomGff2}
56 --protein_database domains_filtered.db
57 --classification domains_filtered.class
58 --scoring_matrix BL80
66 59
67 This tool uses external aligning programme `LAST`_ and RepeatExplorer database of TE protein domains(REXdb) (Viridiplantae and Metazoa)
68 60
69 .. _LAST: http://last.cbrc.jp/ 61 #if str($input_type.input_type_selector) == "aln"
62 &amp;&amp;
63 python3 ${__tool_directory__}/coverage2gff.py -p sequences.profile -g ${DomGff2}
64 #end if
65 #end if
70 66
71 *Lastal* runs similarity search to find hits between query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) form a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters which means each cluster is built from forward or reverse stranded hits exclusively. The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These assigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by "|" character. Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For single position only the hits reaching certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classification strings assigned to one region (cluster) there are three situations that can occur: 67 </command>
68 <inputs>
72 69
73 1. There is a single classification string assigned to each position as well as classifications along all the positions in the region are mutually uniform, in this case domain's final classification is equivalent to this unique classification. 70 <conditional name="input_type">
74 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings 71 <param name="input_type_selector" type="select" label="Choose the type of sequence data">
75 3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous 72 <option value="fasta" selected="true">Fasta</option>
76 73 <option value="aln">Aln file</option>
77 **There are 2 outputs produced by this tool:** 74 </param>
78 75 <when value="fasta">
79 1. GFF3 file of all protein domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated pieces of information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using *Protein Domain Filter* tool 76 <param name="input_sequences" type="data" format="fasta" label="Sequences in fasta format"/>
77 </when>
78 <when value="aln">
79 <param name="input_sequences" type="data" format="txt" label="Sequences in ALN format (extracted from RepeatExplorer)"/>
80 </when>
81 </conditional>
82 <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help="">
83 <options from_file="rexdb_versions.loc">
84 <column name="name" index="0"/>
85 <column name="value" index="1"/>
86 </options>
87 </param>
80 88
81 - Attributes reported always: 89 <param name="scoring_matrix" type="select" label="Select scoring matrix">
90 <option value="BL80" selected="true" >BLOSUM80</option>
91 <option value="BL62">BLOSUM62</option>
92 <option value="MIQS">MIQS</option>
93 </param>
82 94
83 Name 95 <param name="iterative" type="select" label="Run iterative search" truevalue="true" valsevalue="false"
96 help="Second iteration run search against database of proteins extracted from query. Second iteration can yield some extra hits in some cases.">
97 <option value="No" selected="true">No</option>
98 <option value="Yes">Yes</option>
99 </param>
100 </inputs>
101
102 <outputs>
103 <data format="gff3" name="DomGff" label="DANTE on ${on_string}" />
104 <data format="gff3" name="DomGff2" label="DANTE on ${on_string}: 2nd pass">
105 <filter>iterative == "Yes" </filter>
106 </data>
107 </outputs>
108 <tests>
109 <test>
110 <param name="input_type" value="fasta"/>
111 <param name="input_sequences" value="GEPY_test_long_1.fa"/>
112 <param name="db_type" value="Viridiplantae_v3.0"/>
113 <param name="scoring_matrix" value="BL80"/>
114 <param name="iterative" value="No"/>
115 <output name="DomGff" value="GEPY_test_long_1_output_unfiltered.gff3"/>
116 </test>
117
118
119 </tests>
120
121
122 <help>
123
124
125 THIS IS A PRIMARY OUTPUT THAT SHOULD UNDERGO FURTHER QUALITY FILTERING TO GET RID OF POTENTIAL FALSE POSITIVE DOMAINS
126
127 **WHAT IT DOES**
128
129 This tool uses external aligning programme `LAST`_ and RepeatExplorer database of TE protein domains(REXdb) (Viridiplantae and Metazoa)
130
131 .. _LAST: http://last.cbrc.jp/
132
133 *Lastal* runs similarity search to find hits between query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) form a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters which means each cluster is built from forward or reverse stranded hits exclusively. The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These assigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by "|" character. Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For single position only the hits reaching certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classification strings assigned to one region (cluster) there are three situations that can occur:
134
135 1. There is a single classification string assigned to each position as well as classifications along all the positions in the region are mutually uniform, in this case domain's final classification is equivalent to this unique classification.
136 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings
137 3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous
138
139 **There are 2 outputs produced by this tool:**
140
141 1. GFF3 file of all protein domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated pieces of information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using *Protein Domain Filter* tool
142
143 - Attributes reported always:
144
145 Name
84 type of domain; if ambiguous reported with slash 146 type of domain; if ambiguous reported with slash
85 147
86 Final_classification 148 Final_classification
87 definite classification based on all partial classifications of Region_hits_classifications attribute or 149 definite classification based on all partial classifications of Region_hits_classifications attribute or
88 "Ambiguous_domain" when there is an ambiguous domain type 150 "Ambiguous_domain" when there is an ambiguous domain type
89 151
90 Region_Hits_Classifications 152 Region_Hits_Classifications
91 all hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification 153 all hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification
92 154
93 - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region): 155 - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region):
94 156
95 Best_hit 157 Best_hit
96 classification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers 158 classification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers
97 159
98 Best_Hit_DB_Pos 160 Best_Hit_DB_Pos
99 showing which part of the original database domain corresponding to the Best Hit was aligned on query DNA (e.g. **Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents region from 17th to 75th of total 79 amino acids in the original domain from the database) 161 showing which part of the original database domain corresponding to the Best Hit was aligned on query DNA (e.g. **Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents region from 17th to 75th of total 79 amino acids in the original domain from the database)
100 162
101 DB_Seq 163 DB_Seq
102 database protein sequence of the best hit mapped to the query DNA 164 database protein sequence of the best hit mapped to the query DNA
103 165
104 Query_Seq 166 Query_Seq
105 alignment sequence of the query DNA for the best hit 167 alignment sequence of the query DNA for the best hit
106 168
107 Identity 169 Identity
108 ratio of identical amino acids in alignment sequence to the length of alignment 170 ratio of identical amino acids in alignment sequence to the length of alignment
109 171
110 Similarity 172 Similarity
111 ratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment 173 ratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment
112 174
113 Relat_Length 175 Relat_Length
114 ratio of gapless length of the aligned protein sequence to the whole length of the database protein 176 ratio of gapless length of the aligned protein sequence to the whole length of the database protein
115 177
116 Relat_Interruptions 178 Relat_Interruptions
117 number of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA 179 number of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA
118 180
119 Hit_to_DB_Length 181 Hit_to_DB_Length
120 proportion of alignment length to the original length of the protein domain from database 182 proportion of alignment length to the original length of the protein domain from database
121 183
122 184
123 185
124 !NOTE: Tool can on average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive element occurrence) as well as computing resources. Maximum running time of the tool is 7 days. 186 !NOTE: Tool can on average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive element occurrence) as well as computing resources. Maximum running time of the tool is 7 days.
125 187
126 </help> 188 </help>
127 </tool> 189 </tool>
128 190