diff tools/ncbi_blast_plus/blastxml_to_tabular.xml @ 2:45ba7c750bc8 draft

This update should have no functional effect. Renamed files (since xml.py will be inaccurate once BLAST databases are added in future update).
author peterjc
date Thu, 20 Sep 2012 10:12:43 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml	Thu Sep 20 10:12:43 2012 -0400
@@ -0,0 +1,127 @@
+<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.8">
+    <description>Convert BLAST XML output to tabular</description>
+    <command interpreter="python">
+      blastxml_to_tabular.py $blastxml_file $tabular_file $out_format
+    </command>
+    <inputs>
+        <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/> 
+        <param name="out_format" type="select" label="Output format">
+            <option value="std" selected="True">Tabular (standard 12 columns)</option>
+            <option value="ext">Tabular (extended 24 columns)</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="tabular_file" format="tabular" label="BLAST results as tabular" />
+    </outputs>
+    <requirements>
+    </requirements>
+    <tests>
+        <test>
+            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin.tabluar -->
+            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin_22c.tabluar -->
+            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted_ext.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_sample.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastp output -->
+            <output name="tabular_file" file="blastp_sample_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastx output -->
+            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <!-- Note this has some white space and XXXX masking differences from the actual blastx output -->
+            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted_ext.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastx_sample.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastx output -->
+            <output name="tabular_file" file="blastx_sample_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastp output -->
+            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_std.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <!-- Note this has some white space differences from the actual blastp output -->
+            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" />
+        </test>
+    </tests>
+    <help>
+    
+**What it does**
+
+NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
+formats including tabular and a more detailed XML format. A complex workflow
+may need both the XML and the tabular output - but running BLAST twice is
+slow and wasteful.
+
+This tool takes the BLAST XML output and by default converts it into the
+standard 12 column tabular equivalent:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+     1 qseqid    Query Seq-id (ID of your sequence)
+     2 sseqid    Subject Seq-id (ID of the database hit)
+     3 pident    Percentage of identical matches
+     4 length    Alignment length
+     5 mismatch  Number of mismatches
+     6 gapopen   Number of gap openings
+     7 qstart    Start of alignment in query
+     8 qend      End of alignment in query
+     9 sstart    Start of alignment in subject (database hit)
+    10 send      End of alignment in subject (database hit)
+    11 evalue    Expectation value (E-value)
+    12 bitscore  Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 22 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name     Description
+------ ------------- -------------------------------------------
+    13 sallseqid     All subject Seq-id(s), separated by a ';'
+    14 score         Raw score
+    15 nident        Number of identical matches
+    16 positive      Number of positive-scoring matches
+    17 gaps          Total number of gaps
+    18 ppos          Percentage of positive-scoring matches
+    19 qframe        Query frame
+    20 sframe        Subject frame
+    21 qseq          Aligned part of query sequence
+    22 sseq          Aligned part of subject sequence
+    23 qlen          Query sequence length
+    24 slen          Subject sequence length
+====== ============= ===========================================
+
+Beware that the XML file (and thus the conversion) and the tabular output
+direct from BLAST+ may differ in the presence of XXXX masking on regions
+low complexity (columns 21 and 22), and thus also calculated figures like
+the percentage idenity (column 3).
+
+    </help>
+</tool>