<tool id="blastxml_to_tabular_selectable" name="BLAST XML to selected tabular columns" version="0.0.9">
    <!--
        Galaxy tool definition that runs blastxml_to_tabular_selectable.py on
        one or more BLAST XML datasets and writes the result as a tabular
        dataset. A second tabular dataset listing the queries without hits is
        produced only when the "unmatched" checkbox is ticked.
    -->
    <description>Convert BLAST XML output to tabular</description>
    <command interpreter="python">
        blastxml_to_tabular_selectable.py -o $tabular_file
        ## Column selection: pass the ticked column names when the "cols"
        ## format is chosen and at least one column is ticked; in every other
        ## case pass the format keyword itself (std, ext, or cols with nothing
        ## ticked).
        ## NOTE(review): how the script treats a bare 'cols' value is not
        ## visible from this wrapper - confirm.
        #if $output.out_format == 'cols' and $output.columns:
            -c '$output.columns'
        #else
            -c '$output.out_format'
        #end if
        ## Boolean parameters expand to their flag (truevalue) or to nothing.
        $qdef
        $allqueries
        ## The extra output path is only passed when the checkbox is ticked,
        ## matching the filter on the unmatched_file output.
        #if $unmatched:
            -u $unmatched_file
        #end if
        ## Optional integers: the string form is compared with the empty
        ## string so the option is left out when the field is blank.
        #if $maxhits.__str__ != '':
            --maxhits $maxhits
        #end if
        #if $maxhsps.__str__ != '':
            --maxhsps $maxhsps
        #end if
        ## One positional argument per selected input dataset.
        #for i in $blastxml_file#${i} #end for#
        ## $blastxml_file
    </command>
    <inputs>
        <!-- multiple="true": several BLAST XML datasets can be converted in one job. -->
        <param name="blastxml_file" type="data" format="blastxml" multiple="true" label="BLAST results as XML"/>
        <!-- Checkboxes whose truevalue is the command-line flag inserted by the command template. -->
        <param name="qdef" type="boolean" truevalue="-d" falsevalue="" checked="False" label="Use Iteration_query-def value for qseqid"/>
        <param name="allqueries" type="boolean" truevalue="-a" falsevalue="" checked="False" label="Output all queries including those with no hits"/>
        <param name="unmatched" type="boolean" truevalue="-u" falsevalue="" checked="False" label="Output a list with queries having no hits"/>
        <!-- Optional limits; both default to 1 and must be at least 1 when given. -->
        <param name="maxhits" type="integer" value="1" optional="true" label="Maximum number of Hits to display for a query">
            <validator type="in_range" min="1" />
        </param>
        <param name="maxhsps" type="integer" value="1" optional="true" label="Maximum number of HSPs to display for a Hit">
            <validator type="in_range" min="1" />
        </param>

        <!-- Output layout: two fixed presets, or a free choice of columns. -->
        <conditional name="output">
            <param name="out_format" type="select" label="Output format">
                <option value="std" selected="True">Tabular (standard 12 columns)</option>
                <option value="ext">Tabular (extended 24 columns)</option>
                <option value="cols">Tabular (select columns to output)</option>
            </param>
            <when value="std"/>
            <when value="ext"/>
            <when value="cols">
                <!-- Option values are the column names handed to the script through -c. -->
                <param name="columns" type="select" multiple="true" display="checkboxes" label="Output columns">
                    <option value="qseqid"> 1 qseqid Query Seq-id (ID of your sequence)</option>
                    <option value="sseqid"> 2 sseqid Subject Seq-id (ID of the database hit)</option>
                    <option value="pident"> 3 pident Percentage of identical matches</option>
                    <option value="length"> 4 length Alignment length</option>
                    <option value="mismatch"> 5 mismatch Number of mismatches</option>
                    <option value="gapopen"> 6 gapopen Number of gap openings</option>
                    <option value="qstart"> 7 qstart Start of alignment in query</option>
                    <option value="qend"> 8 qend End of alignment in query</option>
                    <option value="sstart"> 9 sstart Start of alignment in subject (database hit)</option>
                    <option value="send">10 send End of alignment in subject (database hit)</option>
                    <option value="evalue">11 evalue Expectation value (E-value)</option>
                    <option value="bitscore">12 bitscore Bit score</option>
                    <option value="sallseqid">13 sallseqid All subject Seq-id(s), separated by a ';'</option>
                    <option value="score">14 score Raw score</option>
                    <option value="nident">15 nident Number of identical matches</option>
                    <option value="positive">16 positive Number of positive-scoring matches</option>
                    <option value="gaps">17 gaps Total number of gaps</option>
                    <option value="ppos">18 ppos Percentage of positive-scoring matches</option>
                    <option value="qframe">19 qframe Query frame</option>
                    <option value="sframe">20 sframe Subject frame</option>
                    <option value="qseq">21 qseq Aligned part of query sequence</option>
                    <option value="sseq">22 sseq Aligned part of subject sequence</option>
                    <option value="qlen">23 qlen Query sequence length</option>
                    <option value="slen">24 slen Subject sequence length</option>
                    <option value="salltitles">25 salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="tabular_file" format="tabular" label="BLAST results as tabular for ${on_string}" />
        <!-- Only created when the "unmatched" checkbox is ticked. -->
        <data name="unmatched_file" format="tabular" label="Query sequences with no hits for ${on_string}">
            <filter>unmatched == True</filter>
        </data>
    </outputs>
    <requirements>
    </requirements>
    <tests>
        <!-- Standard 12 column preset. -->
        <test>
            <param name="blastxml_file" value="blastp_rhodopsin_proteins.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <output name="tabular_file" file="blastp_rhodopsin_proteins_std.tabular" ftype="tabular" />
        </test>
        <!-- Extended preset with the default hit and HSP limits. -->
        <test>
            <param name="blastxml_file" value="blastp_rhodopsin_proteins.xml" ftype="blastxml" />
            <param name="out_format" value="ext" />
            <output name="tabular_file" file="blastp_rhodopsin_proteins_ext.tabular" ftype="tabular" />
        </test>
        <!-- Explicit column selection. -->
        <test>
            <param name="blastxml_file" value="blastp_rhodopsin_proteins.xml" ftype="blastxml" />
            <param name="out_format" value="cols" />
            <param name="columns" value="qseqid,sseqid,length,bitscore" />
            <output name="tabular_file" file="blastp_rhodopsin_proteins_selcol.tabular" ftype="tabular" />
        </test>
        <!-- Extended preset with the hit and HSP limits raised to 10. -->
        <test>
            <param name="blastxml_file" value="blastp_rhodopsin_proteins.xml" ftype="blastxml" />
            <param name="out_format" value="ext" />
            <param name="maxhits" value="10" />
            <param name="maxhsps" value="10" />
            <output name="tabular_file" file="blastp_rhodopsin_proteins_ext_allhits.tabular" ftype="tabular" />
        </test>
        <!-- All queries reported, plus the separate list of queries without hits. -->
        <test>
            <param name="blastxml_file" value="blastp_rhodopsin_proteins.xml" ftype="blastxml" />
            <param name="out_format" value="ext" />
            <param name="maxhits" value="1" />
            <param name="maxhsps" value="1" />
            <param name="unmatched" value="True" />
            <param name="allqueries" value="True" />
            <output name="tabular_file" file="blastp_rhodopsin_proteins_ext_allqueries.tabular" ftype="tabular" />
            <output name="unmatched_file" file="unmatched_queries.tabular" ftype="tabular" />
        </test>
    </tests>
    <!-- Help is reStructuredText: it is kept at column 0 and the table cells stay aligned under their borders. -->
    <help>

**What it does**

NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
formats including tabular and a more detailed XML format. A complex workflow
may need both the XML and the tabular output - but running BLAST twice is
slow and wasteful.

This tool takes the BLAST XML output and by default converts it into the
standard 12 column tabular equivalent:

====== ========= ============================================
Column NCBI name Description
------ --------- --------------------------------------------
     1 qseqid    Query Seq-id (ID of your sequence)
     2 sseqid    Subject Seq-id (ID of the database hit)
     3 pident    Percentage of identical matches
     4 length    Alignment length
     5 mismatch  Number of mismatches
     6 gapopen   Number of gap openings
     7 qstart    Start of alignment in query
     8 qend      End of alignment in query
     9 sstart    Start of alignment in subject (database hit)
    10 send      End of alignment in subject (database hit)
    11 evalue    Expectation value (E-value)
    12 bitscore  Bit score
====== ========= ============================================

The BLAST+ tools can optionally output additional columns of information,
but this takes longer to calculate. Most (but not all) of these columns are
included by selecting the extended tabular output. The extra columns are
included *after* the standard 12 columns. This is so that you can write
workflow filtering steps that accept either the 12 or 24 column tabular
BLAST output.

====== ============= ===========================================
Column NCBI name     Description
------ ------------- -------------------------------------------
    13 sallseqid     All subject Seq-id(s), separated by a ';'
    14 score         Raw score
    15 nident        Number of identical matches
    16 positive      Number of positive-scoring matches
    17 gaps          Total number of gaps
    18 ppos          Percentage of positive-scoring matches
    19 qframe        Query frame
    20 sframe        Subject frame
    21 qseq          Aligned part of query sequence
    22 sseq          Aligned part of subject sequence
    23 qlen          Query sequence length
    24 slen          Subject sequence length
    25 salltitles    All subject title(s), separated by a '&lt;&gt;'
====== ============= ===========================================

Beware that the XML file (and thus the conversion) and the tabular output
direct from BLAST+ may differ in the presence of XXXX masking on regions
of low complexity (columns 21 and 22), and thus also calculated figures like
the percentage identity (column 3).

    </help>
</tool>