Mercurial > repos > galaxyp > blastxml_to_tabular_selectable
comparison blastxml_to_tabular_selectable.py @ 0:2bd0cbccb3c6
Uploaded
| author | galaxyp |
|---|---|
| date | Wed, 08 Oct 2014 19:38:28 -0400 |
| parents | |
| children | 5da5dcc5e13a |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:2bd0cbccb3c6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Convert a BLAST XML file to 12 column tabular output | |
| 3 | |
| 4 Takes three command line options, input BLAST XML filename, output tabular | |
| 5 BLAST filename, output format (std for standard 12 columns, or ext for the | |
| 6 extended 25 columns offered in the BLAST+ wrappers). | |
| 7 | |
| 8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart | |
| 9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which | |
| 10 mean: | |
| 11 | |
| 12 ====== ========= ============================================ | |
| 13 Column NCBI name Description | |
| 14 ------ --------- -------------------------------------------- | |
| 15 1 qseqid Query Seq-id (ID of your sequence) | |
| 16 2 sseqid Subject Seq-id (ID of the database hit) | |
| 17 3 pident Percentage of identical matches | |
| 18 4 length Alignment length | |
| 19 5 mismatch Number of mismatches | |
| 20 6 gapopen Number of gap openings | |
| 21 7 qstart Start of alignment in query | |
| 22 8 qend End of alignment in query | |
| 23 9 sstart Start of alignment in subject (database hit) | |
| 24 10 send End of alignment in subject (database hit) | |
| 25 11 evalue Expectation value (E-value) | |
| 26 12 bitscore Bit score | |
| 27 ====== ========= ============================================ | |
| 28 | |
| 29 The additional columns offered in the Galaxy BLAST+ wrappers are: | |
| 30 | |
| 31 ====== ============= =========================================== | |
| 32 Column NCBI name Description | |
| 33 ------ ------------- ------------------------------------------- | |
| 34 13 sallseqid All subject Seq-id(s), separated by a ';' | |
| 35 14 score Raw score | |
| 36 15 nident Number of identical matches | |
| 37 16 positive Number of positive-scoring matches | |
| 38 17 gaps Total number of gaps | |
| 39 18 ppos Percentage of positive-scoring matches | |
| 40 19 qframe Query frame | |
| 41 20 sframe Subject frame | |
| 42 21 qseq Aligned part of query sequence | |
| 43 22 sseq Aligned part of subject sequence | |
| 44 23 qlen Query sequence length | |
| 45 24 slen Subject sequence length | |
| 46 25 salltitles All subject titles, separated by '<>' | |
| 47 ====== ============= =========================================== | |
| 48 | |
| 49 Most of these fields are given explicitly in the XML file, others some like | |
| 50 the percentage identity and the number of gap openings must be calculated. | |
| 51 | |
| 52 Be aware that the sequence in the extended tabular output or XML direct from | |
| 53 BLAST+ may or may not use XXXX masking on regions of low complexity. This | |
| 54 can throw the off the calculation of percentage identity and gap openings. | |
| 55 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard, | |
| 56 with these numbers changing depending on whether or not the low complexity | |
| 57 filter is used.] | |
| 58 | |
| 59 This script attempts to produce identical output to what BLAST+ would have done. | |
| 60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra | |
| 61 space character (probably a bug). | |
| 62 """ | |
| 63 import sys | |
| 64 import re | |
| 65 import os | |
| 66 from optparse import OptionParser | |
| 67 | |
| 68 if "-v" in sys.argv or "--version" in sys.argv: | |
| 69 print "v0.0.12" | |
| 70 sys.exit(0) | |
| 71 | |
| 72 if sys.version_info[:2] >= ( 2, 5 ): | |
| 73 try: | |
| 74 from xml.etree import cElementTree as ElementTree | |
| 75 except ImportError: | |
| 76 from xml.etree import ElementTree as ElementTree | |
| 77 else: | |
| 78 from galaxy import eggs | |
| 79 import pkg_resources; pkg_resources.require( "elementtree" ) | |
| 80 from elementtree import ElementTree | |
| 81 | |
| 82 def stop_err( msg ): | |
| 83 sys.stderr.write("%s\n" % msg) | |
| 84 sys.exit(1) | |
| 85 | |
| 86 usage = "usage: %prog [options] blastxml[,...]" | |
| 87 parser = OptionParser(usage=usage) | |
| 88 parser.add_option('-o','--output', dest='output', default = None, help='output file path', metavar="FILE") | |
| 89 parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|colname[,colname,...]] std: 12 column, ext: 25 column, or user specified columns") | |
| 90 parser.add_option("-a", "--allqueries", action="store_true", dest="allqueries", default=False, help="ouptut row for each query, including those with no hits") | |
| 91 parser.add_option("-d", "--qdef", action="store_true", dest="qdef", default=False, help="use Iteration_query-def for qseqid") | |
| 92 parser.add_option('-u', '--unmatched', dest='unmatched', default = None, help='unmatched queries file path', metavar="FILE") | |
| 93 parser.add_option('-m', '--maxhits', dest='maxhits', type="int", default = 1, help='maximum number of HITs for output for a Query') | |
| 94 parser.add_option('-p', '--maxhsps', dest='maxhsps', type="int", default = 1, help='maximum number of HSPs for output for a Hit') | |
| 95 (options, args) = parser.parse_args() | |
| 96 | |
| 97 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') | |
| 98 | |
| 99 if len(args) < 1: | |
| 100 parser.error("no BLASTXML input files") | |
| 101 | |
| 102 if options.columns == "std": | |
| 103 out_fmt = 'std' | |
| 104 extended = False | |
| 105 columns = colnames[0:12] | |
| 106 elif options.columns == "x22": | |
| 107 stop_err("Format argument x22 has been replaced with ext (extended 25 columns)") | |
| 108 elif options.columns == "ext": | |
| 109 out_fmt = 'ext' | |
| 110 extended = True | |
| 111 columns = colnames[0:25] | |
| 112 else: | |
| 113 out_fmt = 'cols' | |
| 114 extended = True | |
| 115 sel_cols = options.columns.split(',') | |
| 116 columns = [] | |
| 117 for col in sel_cols: | |
| 118 if col in colnames: | |
| 119 columns.append(col) | |
| 120 | |
| 121 for in_file in args: | |
| 122 if not os.path.isfile(in_file): | |
| 123 stop_err("Input BLAST XML file not found: %s" % in_file) | |
| 124 | |
| 125 unhitfile = open(options.unmatched,'w') if options.unmatched else None | |
| 126 | |
| 127 re_default_query_id = re.compile("^Query_\d+$") | |
| 128 assert re_default_query_id.match("Query_101") | |
| 129 assert not re_default_query_id.match("Query_101a") | |
| 130 assert not re_default_query_id.match("MyQuery_101") | |
| 131 re_default_subject_id = re.compile("^Subject_\d+$") | |
| 132 assert re_default_subject_id.match("Subject_1") | |
| 133 assert not re_default_subject_id.match("Subject_") | |
| 134 assert not re_default_subject_id.match("Subject_12a") | |
| 135 assert not re_default_subject_id.match("TheSubject_1") | |
| 136 | |
| 137 header = "#%s\n" % "\t".join(columns) | |
| 138 outfile = open(options.output, 'w') if options.output else sys.stdout | |
| 139 outfile.write(header) | |
| 140 | |
| 141 def handle_event(event, elem): | |
| 142 blast_program = None | |
| 143 if event == "end" and elem.tag == "BlastOutput_program": | |
| 144 blast_program = elem.text | |
| 145 # for every <Iteration> tag | |
| 146 if event == "end" and elem.tag == "Iteration": | |
| 147 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | |
| 148 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | |
| 149 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
| 150 # <Iteration_query-len>406</Iteration_query-len> | |
| 151 # <Iteration_hits></Iteration_hits> | |
| 152 # | |
| 153 #Or, from BLAST 2.2.24+ run online | |
| 154 # <Iteration_query-ID>Query_1</Iteration_query-ID> | |
| 155 # <Iteration_query-def>Sample</Iteration_query-def> | |
| 156 # <Iteration_query-len>516</Iteration_query-len> | |
| 157 # <Iteration_hits>... | |
| 158 qseqid = elem.findtext("Iteration_query-ID") | |
| 159 if options.qdef or re_default_query_id.match(qseqid): | |
| 160 #Place holder ID, take the first word of the query definition | |
| 161 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] | |
| 162 qlen = int(elem.findtext("Iteration_query-len")) | |
| 163 # If no hits in this iteration | |
| 164 if not elem.find("Iteration_hits/Hit"): | |
| 165 if unhitfile: | |
| 166 unhitfile.write("%s\n" % qseqid) | |
| 167 if options.allqueries: | |
| 168 if columns and len(columns) > 0: | |
| 169 v = [] | |
| 170 for name in columns: | |
| 171 if name == 'qseqid': | |
| 172 v.append(qseqid) | |
| 173 elif name == 'qlen': | |
| 174 v.append(str(qlen)) | |
| 175 else: | |
| 176 v.append('') | |
| 177 outfile.write("\t".join(v) + "\n") | |
| 178 # for every <Hit> within <Iteration> | |
| 179 for hitn, hit in enumerate(elem.findall("Iteration_hits/Hit")): | |
| 180 if hitn >= options.maxhits: | |
| 181 break | |
| 182 #Expecting either this, | |
| 183 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> | |
| 184 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> | |
| 185 # <Hit_accession>P56514</Hit_accession> | |
| 186 #or, | |
| 187 # <Hit_id>Subject_1</Hit_id> | |
| 188 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | |
| 189 # <Hit_accession>Subject_1</Hit_accession> | |
| 190 # | |
| 191 #apparently depending on the parse_deflines switch | |
| 192 # | |
| 193 #Or, with a local database not using -parse_seqids can get this, | |
| 194 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> | |
| 195 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> | |
| 196 # <Hit_accession>2</Hit_accession> | |
| 197 sseqid = hit.findtext("Hit_id").split(None,1)[0] | |
| 198 hit_def = sseqid + " " + hit.findtext("Hit_def") | |
| 199 if re_default_subject_id.match(sseqid) \ | |
| 200 and sseqid == hit.findtext("Hit_accession"): | |
| 201 #Place holder ID, take the first word of the subject definition | |
| 202 hit_def = hit.findtext("Hit_def") | |
| 203 sseqid = hit_def.split(None,1)[0] | |
| 204 # for every <Hsp> within <Hit> | |
| 205 for hspn,hsp in enumerate(hit.findall("Hit_hsps/Hsp")): | |
| 206 if hspn >= options.maxhsps: | |
| 207 break | |
| 208 nident = hsp.findtext("Hsp_identity") | |
| 209 length = hsp.findtext("Hsp_align-len") | |
| 210 pident = "%0.2f" % (100*float(nident)/float(length)) | |
| 211 | |
| 212 q_seq = hsp.findtext("Hsp_qseq") | |
| 213 h_seq = hsp.findtext("Hsp_hseq") | |
| 214 m_seq = hsp.findtext("Hsp_midline") | |
| 215 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) | |
| 216 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ | |
| 217 len(h_seq.replace('-', ' ').split())-1) | |
| 218 | |
| 219 mismatch = m_seq.count(' ') + m_seq.count('+') \ | |
| 220 - q_seq.count('-') - h_seq.count('-') | |
| 221 #TODO - Remove this alternative mismatch calculation and test | |
| 222 #once satisifed there are no problems | |
| 223 expected_mismatch = len(q_seq) \ | |
| 224 - sum(1 for q,h in zip(q_seq, h_seq) \ | |
| 225 if q == h or q == "-" or h == "-") | |
| 226 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X") | |
| 227 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): | |
| 228 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \ | |
| 229 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), | |
| 230 int(mismatch), expected_mismatch)) | |
| 231 | |
| 232 #TODO - Remove this alternative identity calculation and test | |
| 233 #once satisifed there are no problems | |
| 234 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h) | |
| 235 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): | |
| 236 stop_err("%s vs %s identities, expected %i <= %i <= %i" \ | |
| 237 % (qseqid, sseqid, expected_identity, int(nident), | |
| 238 expected_identity + q_seq.count("X"))) | |
| 239 | |
| 240 | |
| 241 evalue = hsp.findtext("Hsp_evalue") | |
| 242 if evalue == "0": | |
| 243 evalue = "0.0" | |
| 244 else: | |
| 245 evalue = "%0.0e" % float(evalue) | |
| 246 | |
| 247 bitscore = float(hsp.findtext("Hsp_bit-score")) | |
| 248 if bitscore < 100: | |
| 249 #Seems to show one decimal place for lower scores | |
| 250 bitscore = "%0.1f" % bitscore | |
| 251 else: | |
| 252 #Note BLAST does not round to nearest int, it truncates | |
| 253 bitscore = "%i" % bitscore | |
| 254 | |
| 255 values = [qseqid, | |
| 256 sseqid, | |
| 257 pident, | |
| 258 length, #hsp.findtext("Hsp_align-len") | |
| 259 str(mismatch), | |
| 260 gapopen, | |
| 261 hsp.findtext("Hsp_query-from"), #qstart, | |
| 262 hsp.findtext("Hsp_query-to"), #qend, | |
| 263 hsp.findtext("Hsp_hit-from"), #sstart, | |
| 264 hsp.findtext("Hsp_hit-to"), #send, | |
| 265 evalue, #hsp.findtext("Hsp_evalue") in scientific notation | |
| 266 bitscore, #hsp.findtext("Hsp_bit-score") rounded | |
| 267 ] | |
| 268 | |
| 269 if extended: | |
| 270 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) | |
| 271 salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >")) | |
| 272 #print hit_def, "-->", sallseqid | |
| 273 positive = hsp.findtext("Hsp_positive") | |
| 274 ppos = "%0.2f" % (100*float(positive)/float(length)) | |
| 275 qframe = hsp.findtext("Hsp_query-frame") | |
| 276 sframe = hsp.findtext("Hsp_hit-frame") | |
| 277 if blast_program == "blastp": | |
| 278 #Probably a bug in BLASTP that they use 0 or 1 depending on format | |
| 279 if qframe == "0": qframe = "1" | |
| 280 if sframe == "0": sframe = "1" | |
| 281 slen = int(hit.findtext("Hit_len")) | |
| 282 values.extend([sallseqid, | |
| 283 hsp.findtext("Hsp_score"), #score, | |
| 284 nident, | |
| 285 positive, | |
| 286 hsp.findtext("Hsp_gaps"), #gaps, | |
| 287 ppos, | |
| 288 qframe, | |
| 289 sframe, | |
| 290 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking | |
| 291 q_seq, | |
| 292 h_seq, | |
| 293 str(qlen), | |
| 294 str(slen), | |
| 295 salltitles, | |
| 296 ]) | |
| 297 if out_fmt == 'cols': | |
| 298 if columns and len(columns) > 0: | |
| 299 v = [] | |
| 300 for name in columns: | |
| 301 v.append(values[colnames.index(name)]) | |
| 302 values = v | |
| 303 #print "\t".join(values) | |
| 304 outfile.write("\t".join(values) + "\n") | |
| 305 # prevents ElementTree from growing large datastructure | |
| 306 root.clear() | |
| 307 elem.clear() | |
| 308 | |
| 309 | |
| 310 for in_file in args: | |
| 311 # get an iterable | |
| 312 try: | |
| 313 context = ElementTree.iterparse(in_file, events=("start", "end")) | |
| 314 except: | |
| 315 stop_err("Invalid data format.") | |
| 316 # turn it into an iterator | |
| 317 context = iter(context) | |
| 318 # get the root element | |
| 319 try: | |
| 320 event, root = context.next() | |
| 321 except: | |
| 322 stop_err( "Invalid data format." ) | |
| 323 for event, elem in context: | |
| 324 handle_event(event, elem) | |
| 325 | |
| 326 if unhitfile: | |
| 327 unhitfile.close() | |
| 328 if options.output: | |
| 329 outfile.close() |
