0
|
1 #!/usr/bin/python
|
|
2 """Convert a BLAST XML file to 12 column tabular output
|
|
3
|
|
4 This tool can be used both via command line and via a local Galaxy install.
|
|
5 Galaxy uses .loc files as indicated by the tool_data_table_conf.xml.sample.
|
|
6 The command line version uses .tab versions of the above files:
|
|
7 blast_reporting_fields.loc
|
|
8 fasta_reference_dbs.loc
|
|
9 So for command-line use, ensure the .tab files are updated to their .loc counterparts.
|
|
10
|
|
11 Takes three command line options, input BLAST XML filename, output tabular
|
|
12 BLAST filename, output format (std for standard 12 columns, or ext for the
|
|
13 extended 24 columns offered in the BLAST+ wrappers).
|
|
14
|
|
15 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
|
|
16 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
|
|
17 mean:
|
|
18
|
|
19 ====== ========= ============================================
|
|
20 Column NCBI name Description
|
|
21 ------ --------- --------------------------------------------
|
|
22 1 qseqid Query Seq-id (ID of your sequence)
|
|
23 2 sseqid Subject Seq-id (ID of the database hit)
|
|
24 3 pident Percentage of identical matches
|
|
25 4 length Alignment length
|
|
26 5 mismatch Number of mismatches
|
|
27 6 gapopen Number of gap openings
|
|
28 7 qstart Start of alignment in query
|
|
29 8 qend End of alignment in query
|
|
30 9 sstart Start of alignment in subject (database hit)
|
|
31 10 send End of alignment in subject (database hit)
|
|
32 11 evalue Expectation value (E-value)
|
|
33 12 bitscore Bit score
|
|
34 ====== ========= ============================================
|
|
35
|
|
36 The additional columns offered in the Galaxy BLAST+ wrappers are:
|
|
37
|
|
38 ====== ============= ===========================================
|
|
39 Column NCBI name Description
|
|
40 ------ ------------- -------------------------------------------
|
|
41 13 sallseqid All subject Seq-id(s), separated by a ';'
|
|
42 14 score Raw score
|
|
43 15 nident Number of identical matches
|
|
44 16 positive Number of positive-scoring matches
|
|
45 17 gaps Total number of gaps
|
|
46 18 ppos Percentage of positive-scoring matches
|
|
47 19 qframe Query frame
|
|
48 20 sframe Subject frame
|
|
49 21 qseq Aligned part of query sequence
|
|
50 22 sseq Aligned part of subject sequence
|
|
51 23 qlen Query sequence length
|
|
52 24 slen Subject sequence length
|
|
53 ====== ============= ===========================================
|
|
54
|
|
55 Very slight modifications were made to the "BLAST XML to tabular" tool that
|
|
56 ships with Galaxy to output two more column columns:
|
|
57
|
|
58 ====== ============= ===========================================
|
|
59 Column NCBI name Description
|
|
60 ------ ------------- -------------------------------------------
|
|
61 25 pcov Percentage coverage
|
|
62 26 sallseqdescr All subject Seq-descr(s), separated by a ','
|
|
63 ====== ============= ===========================================
|
|
64
|
|
65 Most of these fields are given explicitly in the XML file, others some like
|
|
66 the percentage identity and the number of gap openings must be calculated.
|
|
67
|
|
68 In addition an option exists to select particular columns for the output
|
|
69 report. Reference bin columns will be added if they have been included in
|
|
70 search.
|
|
71
|
|
72 A command line version can be used. Type blast_reporting.py -h for help.
|
|
73
|
|
74 Be aware that the sequence in the extended tabular output or XML direct from
|
|
75 BLAST+ may or may not use XXXX masking on regions of low complexity. This
|
|
76 can throw the off the calculation of percentage identity and gap openings.
|
|
77 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
|
|
78 with these numbers changing depending on whether or not the low complexity
|
|
79 filter is used.]
|
|
80
|
|
81 This script attempts to produce identical output to what BLAST+ would have done.
|
|
82 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
|
|
83 space character (probably a bug).
|
|
84
|
|
85 python blast_reporting.py in_file out_tabular_file out_html_file out_format
|
|
86 """
|
|
87 import sys
|
|
88 import re
|
|
89 import os.path
|
|
90 import common
|
|
91 import reference_bins
|
|
92 #import templates.html_report
|
|
93
|
|
94 if __name__ == '__main__' and __package__ is None:
|
|
95 from os import path
|
|
96 sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
|
|
97
|
|
98 if sys.version_info[:2] >= ( 2, 5 ):
|
|
99 import xml.etree.cElementTree as ElementTree
|
|
100 else:
|
|
101 from galaxy import eggs
|
|
102 import pkg_resources; pkg_resources.require( "elementtree" )
|
|
103 from elementtree import ElementTree
|
|
104
|
|
105 class GenericRecord(object): pass
|
|
106
|
|
107 class XMLRecordScan(object):
|
|
108 """
|
|
109 XML Input file usually looks like:
|
|
110
|
|
111 <BlastOutput>
|
|
112 <BlastOutput_program>blastn</BlastOutput_program>
|
|
113 <BlastOutput_param>
|
|
114 <Parameters>
|
|
115 <Parameters_expect>0.001</Parameters_expect>
|
|
116 <Parameters_sc-match>1</Parameters_sc-match>
|
|
117 <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
|
|
118 <Parameters_gap-open>0</Parameters_gap-open>
|
|
119 <Parameters_gap-extend>0</Parameters_gap-extend>
|
|
120 <Parameters_filter>L;m;</Parameters_filter>
|
|
121 </Parameters>
|
|
122 </BlastOutput_param>
|
|
123 <Iteration>
|
|
124 <Iteration_iter-num>1</Iteration_iter-num>
|
|
125 <Iteration_query-ID>Query_1</Iteration_query-ID>
|
|
126 <Iteration_query-def>ENA|EF604038|EF604038.1 Uncultured bacterium clone 16saw43-2g09.q1k 16S ribosomal RNA gene, partial sequence</Iteration_query-def>
|
|
127 <Iteration_query-len>1364</Iteration_query-len>
|
|
128 <Iteration_hits>
|
|
129
|
|
130 <Hit>
|
|
131 <Hit_num>1</Hit_num>
|
|
132 <Hit_id>gi|444439670|ref|NR_074985.1|</Hit_id>
|
|
133 <Hit_hsps>...
|
|
134 <Hsp>...
|
|
135 """
|
|
136 def __init__(self, options, output_format):
|
|
137 """ Creates a record object that holds field data for each <hit> iteration in Blastn XML data
|
|
138
|
|
139 .record object: holds values read in from <XML> <hit> record mainly.
|
|
140 .tags dictionary: XML tags and the record.[x] fields/attributes that should be set to tag values.
|
|
141 .column_format dictionary: Name to field count dictionary used for selecting # of output fields
|
|
142 .fieldSpec dictionary: Specification of each possible field's type (for validation), full name, and suitability for sorting, filtering, etc.
|
|
143 .custom_columns array takes list of custom columns to output. (If sorting by a column it must be in this list)
|
|
144 .reference_bins dictionary
|
|
145
|
|
146 """
|
|
147 self.record = GenericRecord() # Set up so we can use object attributes.
|
|
148
|
|
149 #This is a list of all incomming blast generated XML fields that we want to capture
|
|
150 # self.record gets all underscored variables values as well as new derived ones in process() below
|
|
151 self.tags = {
|
|
152 "BlastOutput_program": '_blast_program',
|
|
153 "Iteration_query-ID": '_qseqid',
|
|
154 "Iteration_query-def": '_qdef',
|
|
155 "Iteration_query-len": '_qlen', #extended+ calc
|
|
156
|
|
157 "Hit_id": '_hit_id',
|
|
158 "Hit_def": '_hit_def', #extended+ calc
|
|
159 "Hit_accession": '_hit_acc',
|
|
160 "Hit_len": '_hit_len',
|
|
161
|
|
162 "Hsp_bit-score": '_bitscore', #basic
|
|
163 "Hsp_score": '_score', #extended
|
|
164 "Hsp_evalue": '_evalue', #basic
|
|
165 "Hsp_query-from": '_qstart', #basic, extended+ calc
|
|
166 "Hsp_query-to": '_qend', #basic, extended+ calc
|
|
167 "Hsp_hit-from": '_sstart', #basic
|
|
168 "Hsp_hit-to": '_send', #basic
|
|
169 "Hsp_query-frame": '_qframe', #extended only
|
|
170 "Hsp_hit-frame": '_sframe', #extended only
|
|
171 "Hsp_identity": '_nident', #extended
|
|
172 "Hsp_positive": '_positive', #extended
|
|
173 "Hsp_gaps": '_gaps', #extended
|
|
174 "Hsp_align-len": '_length', #basic
|
|
175 "Hsp_qseq": '_qseq', #extended
|
|
176 "Hsp_hseq": '_sseq', #extended
|
|
177 "Hsp_midline": '_mseq' #basic
|
|
178 }
|
|
179
|
|
180 self.column_format = {
|
|
181 'std':12,
|
|
182 'ext':24,
|
|
183 'std+seqs':12,
|
|
184 'ext+':26,
|
|
185 'custom':1
|
|
186 }
|
|
187
|
|
188 if not output_format in self.column_format:
|
|
189 common.stop_err("Format argument should be std (12 column) or ext (extended 24 columns) or ext+ (extended 26+ columns) or custom (you choose fields). Format argument x22 has been replaced with ext (extended 24 columns)")
|
|
190
|
|
191 # Array of columns destined for tab-delimited output - This defines default ORDER of fields too.
|
|
192 # Raw data fields that never get output: _bitscore, _evalue, _qframe, _sframe,
|
|
193 # and this that has no m_frame equivalent: _mseq
|
|
194 self.columns_in = 'qseqid sseqid pident _length mismatch gapopen _qstart _qend _sstart _send evalue bitscore \
|
|
195 sallseqid _score _nident _positive _gaps ppos qframe sframe _qseq _sseq qlen slen \
|
|
196 pcov sallseqdescr accessionid sseqdescr _mseq'.split()
|
|
197
|
|
198 fieldSpecFile = os.path.join(os.path.dirname(__file__), 'blast_reporting_fields.tab')
|
|
199 self.field_spec = common.FieldSpec(fieldSpecFile, self.columns_in)
|
|
200
|
|
201 # Include first N fields from .columns according to format.
|
|
202 # In all cases qseqid is included.
|
|
203 # Default everything to "column".
|
|
204 columns_out = self.columns_in[0:self.column_format[output_format]]
|
|
205
|
|
206 # This column list is designed for creating phylogeny reports.
|
|
207 if output_format == 'std+seqs': columns_out.extend(['_qseq','_sseq'])
|
|
208
|
|
209 self.columns = self.field_spec.initColumns(columns_out, options.custom_fields)
|
|
210
|
|
211 # We're making these columns hidden for this particular HTML report format
|
|
212 # UNLESS they are mentioned in options.custom_fields
|
|
213 if output_format == 'std+seqs':
|
|
214 for (ptr, target) in enumerate(self.columns):
|
|
215 if target['field'] in ['_qseq','_sseq'] and options.custom_fields and not target['field'] in options.custom_fields :
|
|
216 target['group'] = 'hidden'
|
|
217
|
|
218 # ADD SELECTED BINS TO COLUMN LIST;
|
|
219 self.binManager = reference_bins.ReferenceBins()
|
|
220 self.binManager.build_bins(options.reference_bins, self.columns)
|
|
221
|
|
222 def setRecordAttr(self, tag, text):
|
|
223 #self.record is a class object (not a dictionary) so using setattr()
|
|
224 setattr(self.record, self.tags[tag], text)
|
|
225
|
|
226 # Called after set() has processed a bunch of <hit> ...</hit> tags
|
|
227 def processRecord(self) :
|
|
228
|
|
229 bline = self.record
|
|
230
|
|
231 # NCBI notes: Expecting either this,
|
|
232 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
|
|
233 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
|
|
234 # <Hit_accession>P56514</Hit_accession>
|
|
235 #or,
|
|
236 # <Hit_id>Subject_1</Hit_id>
|
|
237 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
|
|
238 # <Hit_accession>Subject_1</Hit_accession>
|
|
239 #or,
|
|
240 # <Hit_id>Subject_1</Hit_id>
|
|
241 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
|
|
242 # <Hit_accession>Subject_1</Hit_accession>
|
|
243 #apparently depending on the parse_deflines switch
|
|
244
|
|
245 sseqid = self.record._hit_id.split(None,1)[0]
|
|
246
|
|
247 # If Hit_id == Hit_accession AND it is a default "Subject_1" ...
|
|
248 # OR Hit_accession IN Hit_id and BL_ORD_ID|XXXX contains hit_accession
|
|
249 if common.re_default_subject_id.match(sseqid) and sseqid.find(bline._hit_acc):
|
|
250 # and sseqid == bline._hit_acc:
|
|
251 #Place holder ID, take the first word of the subject definition
|
|
252 hit_def = bline._hit_def
|
|
253 sseqid = hit_def.split(None,1)[0]
|
|
254 else:
|
|
255 hit_def = sseqid + " " + bline._hit_def
|
|
256
|
|
257 self.record.sseqid = sseqid
|
|
258
|
|
259 if common.re_default_ncbi_id.match(sseqid):
|
|
260 self.record.accessionid = sseqid.split('|')[3]
|
|
261 elif common.re_default_ref_id.match(sseqid):
|
|
262 self.record.accessionid = sseqid.split('|')[1]
|
|
263 else:
|
|
264 # Have to use the whole string.
|
|
265 self.record.accessionid = sseqid
|
|
266
|
|
267
|
|
268 # NCBI notes: Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
|
|
269 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
|
|
270 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
|
|
271 # <Iteration_query-len>406</Iteration_query-len>
|
|
272 # <Iteration_hits></Iteration_hits>
|
|
273 #
|
|
274 #Or, from BLAST 2.2.24+ run online
|
|
275 # <Iteration_query-ID>Query_1</Iteration_query-ID>
|
|
276 # <Iteration_query-def>Sample</Iteration_query-def>
|
|
277 # <Iteration_query-len>516</Iteration_query-len>
|
|
278 # <Iteration_hits>...
|
|
279
|
|
280 # Note BioPython's approach http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO.blast_xml-pysrc.html
|
|
281 # ... if hit_id.startswith('gnl|BL_ORD_ID|'): ...
|
|
282
|
|
283 if common.re_default_query_id.match(bline._qseqid):
|
|
284 #Place holder ID, take the first word of the query definition
|
|
285 qseqid = bline._qdef.split(None,1)[0]
|
|
286 else:
|
|
287 qseqid = bline._qseqid
|
|
288
|
|
289 self.record.qseqid = qseqid
|
|
290
|
|
291 self.record.evalue = "0.0" if bline._evalue == "0" else "%0.0e" % float(bline._evalue)
|
|
292
|
|
293 # NCBI notes:
|
|
294 # if bline._bitscore < 100:
|
|
295 # #Seems to show one decimal place for lower scores
|
|
296 # bitscore = "%0.1f" % bline._bitscore
|
|
297 # else:
|
|
298 # #Note BLAST does not round to nearest int, it truncates
|
|
299 # bitscore = "%i" % bline._bitscore
|
|
300 bitscore = float(bline._bitscore)
|
|
301 self.record.bitscore = "%0.1f" % bitscore if bitscore < 100 else "%i" % bitscore
|
|
302
|
|
303 self.record.pident = "%0.2f" % (100*float(bline._nident)/float(bline._length))
|
|
304
|
|
305 self.record.gapopen = str(len(bline._qseq.replace('-', ' ').split())-1 + \
|
|
306 len(bline._sseq.replace('-', ' ').split())-1)
|
|
307
|
|
308 mismatch = bline._mseq.count(' ') + bline._mseq.count('+') \
|
|
309 - bline._qseq.count('-') - bline._sseq.count('-')
|
|
310 #assert len(bline._qseq) == len(bline._sseq) == len(bline._mseq) == int(bline._length)
|
|
311 self.record.mismatch = str(mismatch)
|
|
312
|
|
313 # Extended fields
|
|
314 #sallseqid gets ";" delimited list of first words in each hit_def "x>y>z" expression.
|
|
315 #Nov 7 2013 fix: https://github.com/peterjc/galaxy_blast/blob/master/tools/ncbi_blast_plus/blastxml_to_tabular.py
|
|
316 hit_def_array = hit_def.split(" >") #Note: elem.text below converts escaped ">" back to ">"
|
|
317 try:
|
|
318 self.record.sallseqid = ";".join(name.split(None,1)[0] for name in hit_def_array)
|
|
319 except IndexError as e:
|
|
320 common.stop_err("Problem splitting multiple hit ids?\n%r\n--> %s" % (hit_def, e))
|
|
321
|
|
322 # Calculate accession ids, and check bin(s) for them, update record accordingly.
|
|
323 self.binManager.setStatus(self.record)
|
|
324
|
|
325 self.record.ppos = "%0.2f" % (100*float(bline._positive)/float(bline._length))
|
|
326 qframe = bline._qframe
|
|
327 sframe = bline._sframe
|
|
328 if bline._blast_program == "blastp":
|
|
329 #Probably a bug in BLASTP that they use 0 or 1 depending on format
|
|
330 if qframe == "0": qframe = "1"
|
|
331 if sframe == "0": sframe = "1"
|
|
332
|
|
333 self.record.qframe = qframe
|
|
334 self.record.sframe = sframe
|
|
335 self.record.slen = str(int(bline._hit_len))
|
|
336 self.record.qlen = str(int(bline._qlen))
|
|
337
|
|
338 #extended+
|
|
339 self.record.pcov = "%0.2f" % (float(int(bline._qend) - int(bline._qstart) + 1)/int(bline._qlen) * 100)
|
|
340 sallseqdescr = []
|
|
341 try:
|
|
342 for name in hit_def_array:
|
|
343 id_desc = name.split(None,1)
|
|
344 if len(id_desc) == 1: sallseqdescr.append('missing description - database issue')
|
|
345 else: sallseqdescr.append(id_desc[1])
|
|
346 except IndexError as e:
|
|
347 common.stop_err("Problem splitting multiple hits?\n%r\n--> %s" % (hit_def, e))
|
|
348 # Example sallseqdescr is "Mus musculus ribosomal protein S8 (Rps8), mRNA ;Mus musculus ES cells cDNA, RIKEN full-length enriched library, clone:2410041L12 product:ribosomal protein S8, full insert sequence"
|
|
349
|
|
350 self.record.sallseqdescr = ";".join(sallseqdescr)
|
|
351 self.record.sseqdescr = sallseqdescr[0]
|
|
352
|
|
353 return True # One may return false anywhere above to filter out current <Hsp> record.
|
|
354
|
|
355
|
|
356 # Tab-delimited order is important, so we can't just cycle through (unordered) self.record attributes.
|
|
357 #
|
|
358 # @uses .record object with field attributes
|
|
359 # @uses .prelim_columns (used before final column selection)
|
|
360 def outputTabDelimited(self):
|
|
361 values = []
|
|
362
|
|
363 for col in self.columns:
|
|
364 values.append(getattr(self.record, col['field']))
|
|
365
|
|
366 return '\t'.join(values) + '\n'
|
|
367
|
|
368
|
|
369
|
|
370 class ReportEngine(object):
|
|
371
|
|
372 def __init__(self): pass
|
|
373
|
|
374 def __main__(self):
|
|
375
|
|
376
|
|
377 ## *************************** Parse Command Line *****************************
|
|
378 parser = common.MyParser(
|
|
379 description = 'Generates tab-delimited table report based on BLAST XML results.',
|
|
380 usage = 'python blast_reporting.py [blastxml_input_file] [out_format] [tabular_output_file] [option: html_output_file] [option: selection_output_file:id_1:id_2:id_3] [options]',
|
|
381 epilog="""Details:
|
|
382
|
|
383 This tool can be used both via command line and via a local Galaxy install.
|
|
384 Galaxy uses .loc files (blast_reporting_fields.loc, fasta_reference_dbs.loc)
|
|
385 as indicated by the tool's tool_data_table_conf.xml.sample. The command line script
|
|
386 uses .tab versions (located in the script's folder) which need to reflect any changes
|
|
387 made in the .loc versions.
|
|
388
|
|
389 Note: the selection file option is used mainly by the galaxy blast reporting tool.
|
|
390
|
|
391 [out_format] is one of:
|
|
392 "std" : standard 12 column
|
|
393 "std+seqs" : standard 12 column plus search and matched sequences
|
|
394 "ext" : extended 24 column
|
|
395 "ext+": 26+ column
|
|
396 "custom": Use only given field selections.
|
|
397
|
|
398 Use -i to see possible field (column) selections as defined by blast_reporting_fields.tab.
|
|
399
|
|
400 REFERENCE_BINS: Selected bins have their columns shown in output table for clarity, even when custom fields are selected, unless selecting the bin "exclude" option.
|
|
401
|
|
402 FILTERS:
|
|
403 Format: ([field_name]:[comparator] [value];)*
|
|
404 e.g. "pident: gt 97; sallseqdescr: excludes bovine|clone|environmental|swine|uncultivated|uncultured|unidentified"
|
|
405 [comparator] =
|
|
406 == numeric equal
|
|
407 != numeric not equal
|
|
408 gt numeric greater than
|
|
409 gte numeric greater than or equal to
|
|
410 lt numeric less than
|
|
411 lte numeric less than or equal to
|
|
412 includes (search text fields for included words/phrases)
|
|
413 excludes (same as above but exclude result if text found)
|
|
414
|
|
415 Textual comparisons may have a value consisting of phrases to search for separated by "|" (disjunction).
|
|
416
|
|
417
|
|
418 """)
|
|
419
|
|
420 parser.set_defaults(row_limit=0)
|
|
421 # Don't use "-h" , it is reserved for --help!
|
|
422
|
|
423 parser.add_option('-b', '--bins', type='string', dest='reference_bins',
|
|
424 help='Provide a comma-delimited list of reference databases to check, along with their sort order, and a flag to exclude them if desired, e.g. "16Sncbi desc,euzby desc,16Srdp exclude". See -i option for a list of available databases.')
|
|
425
|
|
426 parser.add_option('-c', '--columns', type='string', dest='custom_fields',
|
|
427 help='To modify sorting and formatting, specify a comma-delimited list of field specifications of the form: "[field_name]:[column|table|section]:[asc|desc|none]:[new label text];..." .')
|
|
428
|
|
429 parser.add_option('-f', '--filter', type='string', dest='filters',
|
|
430 help='Provide a semicolon-delimited list of fields and their criteria to filter by.')
|
|
431
|
|
432 parser.add_option('-i', '--info', dest='info', default=False, action='store_true',
|
|
433 help='Provides list of columns and their descriptions, for use in filter, sort and custom column lists. Shows a list of available sequence type reference bins as well')
|
|
434
|
|
435 parser.add_option('-l', '--label', type='string', dest='column_labels',
|
|
436 help='Include field labels in first row of tab-delimited result table as short names or data field names (or none)')
|
|
437
|
|
438 parser.add_option('-n', '--number', type='int', dest='row_limit',
|
|
439 help='Provide a limit to the number of rows of returned data. The default 0=unlimited.')
|
|
440
|
|
441 #TESTING
|
|
442 parser.add_option('-B', '--refbins', type='string', dest='refbins',
|
|
443 help='Testing library_data form input.')
|
|
444
|
|
445 parser.add_option('-r', '--redundant', dest='drop_redundant_hits', default=False, action='store_true',
|
|
446 help='Return only first match to a gene bank id result.')
|
|
447
|
|
448 parser.add_option('-t', '--tests', dest='test_ids', help='Enter "all" or comma-separated id(s) of tests to run.')
|
|
449
|
|
450 options, args = parser.parse_args()
|
|
451
|
|
452 if options.test_ids:
|
|
453
|
|
454 # Future: read this spec from test-data folder itself?
|
|
455 tests = {
|
|
456 '1a': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1a.tabular','options':''},
|
|
457 '1a1':{'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1a1.tabular blast_reporting_1a1.html','options':'-r'},
|
|
458 '1b': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1b.tabular','options':'-r -f "pident:gte 97"'},
|
|
459 '1c': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1c.tabular','options':'-r -f "pident:gte 97" -c "pident:column:asc" -l label'},
|
|
460 '1d': {
|
|
461 'input':'blast_reporting_1.blastxml ext+',
|
|
462 'outputs':'blast_reporting_1d.tabular blast_reporting_1d.html blast_reporting_1d1.tabular',
|
|
463 'options':'-r -f "pident:gte 99" -l label -n 5'
|
|
464 }
|
|
465 }
|
|
466 common.testSuite(options.test_ids, tests, '/tmp/')
|
|
467
|
|
468 import time
|
|
469 time_start = time.time()
|
|
470
|
|
471 # "info" command provides a dump of all the fields that can be displayed from the Blast search.
|
|
472 if options.info:
|
|
473 # CAN WE LOCATE THIS FILE AS GALAXY SEES IT??????
|
|
474 print 'FIELDS:\n'
|
|
475 field_spec_path = os.path.join(os.path.dirname(__file__), 'blast_reporting_fields.tab')
|
|
476 fields = common.FieldSpec(field_spec_path)
|
|
477 for field in sorted(fields.dict.keys()):
|
|
478 print field + "\t" + fields.getAttribute(field,'type') + "\t" + fields.getAttribute(field,'name')
|
|
479
|
|
480 print '\nREFERENCE BINS:\n'
|
|
481 field_spec_path = os.path.join(os.path.dirname(__file__), 'fasta_reference_dbs.tab')
|
|
482 fields = common.FieldSpec(field_spec_path)
|
|
483 for field in sorted(fields.dict.keys()):
|
|
484 print field + "\t" + fields.getAttribute(field, 'path') + field + '/accession_ids.tab' + '\t' + fields.getAttribute(field, 'name')
|
|
485
|
|
486 sys.exit(1)
|
|
487
|
|
488 try:
|
|
489 in_file, output_format, out_tabular_file = args[0:3]
|
|
490
|
|
491 except:
|
|
492 common.stop_err("Expecting 3 arguments: input BLAST XML file, out format (std | std+seqs | ext | ext+ | custom), and output tabular file")
|
|
493
|
|
494 try:
|
|
495 # Get an iterable, see http://effbot.org/zone/element-iterparse.htm
|
|
496 context = ElementTree.iterparse(in_file, events=("start","end")) # By default only does end events.
|
|
497 context = iter(context)
|
|
498 event, root = context.next() # Creates reference to root element on 'start' event, for housecleaning below.
|
|
499 except:
|
|
500 common.stop_err("Invalid data format. !!")
|
|
501
|
|
502 tagGroup = XMLRecordScan(options, output_format)
|
|
503 fieldFilter = common.FieldFilter(tagGroup, options) # .filter list field names are changed above.
|
|
504
|
|
505 if options.reference_bins: print 'Database bins: %s' % str([bin.name for (ptr, bin) in enumerate(tagGroup.binManager.reference_bins) ]).translate(None, "[']")
|
|
506 if options.custom_fields: print 'Customized Fields: %s' % options.custom_fields
|
|
507 if options.filters: print 'Filters: ' + options.filters
|
|
508 if options.drop_redundant_hits: print 'Throwing out redundant hits...'
|
|
509
|
|
510 # ************************ FILE OUTPUT *****************************
|
|
511 # IT IS CRITICAL THAT EVERY <HIT>/<HSP> RETURN A COMPLETE XML SET OF TAGS OTHERWISE PREV. RECORD VALUES PERSIST
|
|
512 # NOTE: GALAXY 2012 has bug in html data display - it will show duplicate records OCCASIONALLY (at least on some browsers). You have to download data file to verify there are no duplicates
|
|
513
|
|
514 row_count = 0
|
|
515 row_count_filtered = 0
|
|
516 outfile = open(out_tabular_file, 'w')
|
|
517 query_stats = []
|
|
518
|
|
519 for event, elem in context:
|
|
520
|
|
521 # Alternative is to wipe Hit/Hsp fields on event == "start".
|
|
522 tag = elem.tag
|
|
523 if event == 'end':
|
|
524 if tag in tagGroup.tags : #Content of these tags fills a tabular line with column info.
|
|
525 tagGroup.setRecordAttr(tag, elem.text)
|
|
526 if tag == 'Iteration_query-def':
|
|
527 row_count = 0
|
|
528 row_count_filtered = 0
|
|
529 query_stats.append({'id':elem.text, 'rows' : 0, 'filtered_rows' : 0})
|
|
530
|
|
531 # Process each </hsp> record
|
|
532 elif tag == 'Hsp':
|
|
533 row_count += 1
|
|
534 query_stats[-1]['rows'] = row_count # real rows, not clipped
|
|
535 if options.row_limit == 0 or row_count_filtered < options.row_limit:
|
|
536
|
|
537 # Transform <Hsp> record & add field info.
|
|
538 if tagGroup.processRecord():
|
|
539
|
|
540 #if tagGroup.processFilters():
|
|
541 if fieldFilter.process(tagGroup.record):
|
|
542 row_count_filtered +=1
|
|
543 query_stats[-1]['filtered_rows'] = row_count_filtered
|
|
544 outfile.write(tagGroup.outputTabDelimited())
|
|
545
|
|
546 root.clear() # Clears references from root to (now unused) children to keep iterated datastructure small ???
|
|
547
|
|
548 elem.clear() # I think root.clear() cover this case.
|
|
549
|
|
550 root.clear()
|
|
551 outfile.close()
|
|
552
|
|
553
|
|
554 # Use fast Linux "sort" after filtering & file write
|
|
555 common.fileSort(out_tabular_file, tagGroup.columns)
|
|
556
|
|
557 """
|
|
558 The "Selection file" option is meant for galaxy UI use in conjunction
|
|
559 with the "Select Subsets on data" tool. If a selection_file is called
|
|
560 for, then we need to extract its id as well. For that we have to test
|
|
561 for somewhat odd expression from xml-generated command line, the
|
|
562 [$selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id]
|
|
563 Selection list doesn't necessarily need the HTML selectable report template,
|
|
564 but that template was designed to feed the galaxy "Select subsets" tool with its data.
|
|
565 """
|
|
566
|
|
567 if len(args) > 4 and args[4] != 'None:None:None:None':
|
|
568 selection_file_data = args[4]
|
|
569
|
|
570 #print ('selection file data:' + selection_file_data)
|
|
571 sel_file_fields = selection_file_data.split(':')
|
|
572 selection_file = sel_file_fields[0]
|
|
573 # From galaxy, incoming format is $selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id
|
|
574 # From command line, user won't have specified any of this, so ignore.
|
|
575 if len(sel_file_fields) > 3:
|
|
576 options.dataset_selection_id = sel_file_fields[3]
|
|
577
|
|
578 else:
|
|
579 options.dataset_selection_id = 0
|
|
580
|
|
581 common.fileSelections(out_tabular_file, selection_file, tagGroup, options)
|
|
582
|
|
583
|
|
584 """
|
|
585 We must have a template in order to write anything to above html output file.
|
|
586 All report templates need to be listed in the module's tabular data "blast_reporting_templates" folder.
|
|
587 # There are two possible HTML Report template locations:
|
|
588 # 1) The stock reports included in the module in the "templates/" subfolder, e.g. html_report.py
|
|
589 # 2) User customized templates. To set this up:
|
|
590 - add a custom template folder in a location of your choice.
|
|
591 - Copy this module's templates folder into it.
|
|
592 - The new folder must be in python's sys.path, which is achieved by adding a .pth file to python's site-packages folder.. E.g. set up /usr/lib/python2.6/site-packages/galaxy-custom-modules.pth to contain "/usr/local/galaxy/shared/python2.6_galaxy_custom_modules"
|
|
593 , and place 'templates_custom/html_report.py' in there.
|
|
594 """
|
|
595 if len(args) > 3:
|
|
596 out_html_file = args[3] #Galaxy-generated
|
|
597 # args[5] = html_template, default from galaxy xml is 'templates.html_report', but testing can receive 'None' value
|
|
598 if len(args) > 5 and len(args[5].strip()) > 0 and not args[5].strip() == 'None':
|
|
599
|
|
600 html_template = args[5] #User-selected
|
|
601 if not html_template.translate(None, "._-" ).isalnum():
|
|
602 common.stop_err("The HTML Report template name is not correct. It should be a python class path like templates.html_report)! : " + html_template)
|
|
603
|
|
604 else:
|
|
605 html_template = 'templates.html_report'
|
|
606
|
|
607 try:
|
|
608 # See http://stackoverflow.com/questions/769534/dynamic-loading-of-python-modules
|
|
609 HTMLReportModule = __import__(html_template, fromlist=['does not in fact matter what goes here!'])
|
|
610 # Now create final tabular, html (or future: xml) data
|
|
611 htmlManager = HTMLReportModule.HTMLReport(tagGroup, options, query_stats)
|
|
612 # htmlManager might not be initialized if the caller couldn't provide all the data the particular template needed.
|
|
613 htmlManager.render(out_tabular_file, out_html_file)
|
|
614
|
|
615 except ImportError:
|
|
616 common.stop_err("Unable to locate HTML Report template! : " + html_template)
|
|
617
|
|
618
|
|
619 common.fileTabular(out_tabular_file, tagGroup, options)
|
|
620
|
|
621 print('Execution time (seconds): ' + str(int(time.time()-time_start)))
|
|
622
|
|
623
|
|
624 if __name__ == '__main__':
|
|
625 # Command line access
|
|
626 reportEngine = ReportEngine()
|
|
627 reportEngine.__main__()
|