comparison blast_reporting.py @ 0:7db7ecc78ad6 draft

Uploaded
author damion
date Mon, 02 Mar 2015 20:46:00 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7db7ecc78ad6
1 #!/usr/bin/python
2 """Convert a BLAST XML file to 12 column tabular output
3
4 This tool can be used both via command line and via a local Galaxy install.
5 Galaxy uses .loc files as indicated by the tool_data_table_conf.xml.sample.
6 The command line version uses .tab versions of the above files:
7 blast_reporting_fields.loc
8 fasta_reference_dbs.loc
9 So for command-line use, ensure the .tab files are updated to their .loc counterparts.
10
11 Takes three command line options, input BLAST XML filename, output tabular
12 BLAST filename, output format (std for standard 12 columns, or ext for the
13 extended 24 columns offered in the BLAST+ wrappers).
14
15 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
16 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
17 mean:
18
19 ====== ========= ============================================
20 Column NCBI name Description
21 ------ --------- --------------------------------------------
22 1 qseqid Query Seq-id (ID of your sequence)
23 2 sseqid Subject Seq-id (ID of the database hit)
24 3 pident Percentage of identical matches
25 4 length Alignment length
26 5 mismatch Number of mismatches
27 6 gapopen Number of gap openings
28 7 qstart Start of alignment in query
29 8 qend End of alignment in query
30 9 sstart Start of alignment in subject (database hit)
31 10 send End of alignment in subject (database hit)
32 11 evalue Expectation value (E-value)
33 12 bitscore Bit score
34 ====== ========= ============================================
35
36 The additional columns offered in the Galaxy BLAST+ wrappers are:
37
38 ====== ============= ===========================================
39 Column NCBI name Description
40 ------ ------------- -------------------------------------------
41 13 sallseqid All subject Seq-id(s), separated by a ';'
42 14 score Raw score
43 15 nident Number of identical matches
44 16 positive Number of positive-scoring matches
45 17 gaps Total number of gaps
46 18 ppos Percentage of positive-scoring matches
47 19 qframe Query frame
48 20 sframe Subject frame
49 21 qseq Aligned part of query sequence
50 22 sseq Aligned part of subject sequence
51 23 qlen Query sequence length
52 24 slen Subject sequence length
53 ====== ============= ===========================================
54
55 Very slight modifications were made to the "BLAST XML to tabular" tool that
56 ships with Galaxy to output two more column columns:
57
58 ====== ============= ===========================================
59 Column NCBI name Description
60 ------ ------------- -------------------------------------------
61 25 pcov Percentage coverage
62 26 sallseqdescr All subject Seq-descr(s), separated by a ','
63 ====== ============= ===========================================
64
65 Most of these fields are given explicitly in the XML file, others some like
66 the percentage identity and the number of gap openings must be calculated.
67
68 In addition an option exists to select particular columns for the output
69 report. Reference bin columns will be added if they have been included in
70 search.
71
72 A command line version can be used. Type blast_reporting.py -h for help.
73
74 Be aware that the sequence in the extended tabular output or XML direct from
75 BLAST+ may or may not use XXXX masking on regions of low complexity. This
76 can throw the off the calculation of percentage identity and gap openings.
77 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
78 with these numbers changing depending on whether or not the low complexity
79 filter is used.]
80
81 This script attempts to produce identical output to what BLAST+ would have done.
82 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
83 space character (probably a bug).
84
85 python blast_reporting.py in_file out_tabular_file out_html_file out_format
86 """
87 import sys
88 import re
89 import os.path
90 import common
91 import reference_bins
92 #import templates.html_report
93
94 if __name__ == '__main__' and __package__ is None:
95 from os import path
96 sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
97
98 if sys.version_info[:2] >= ( 2, 5 ):
99 import xml.etree.cElementTree as ElementTree
100 else:
101 from galaxy import eggs
102 import pkg_resources; pkg_resources.require( "elementtree" )
103 from elementtree import ElementTree
104
105 class GenericRecord(object): pass
106
107 class XMLRecordScan(object):
108 """
109 XML Input file usually looks like:
110
111 <BlastOutput>
112 <BlastOutput_program>blastn</BlastOutput_program>
113 <BlastOutput_param>
114 <Parameters>
115 <Parameters_expect>0.001</Parameters_expect>
116 <Parameters_sc-match>1</Parameters_sc-match>
117 <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
118 <Parameters_gap-open>0</Parameters_gap-open>
119 <Parameters_gap-extend>0</Parameters_gap-extend>
120 <Parameters_filter>L;m;</Parameters_filter>
121 </Parameters>
122 </BlastOutput_param>
123 <Iteration>
124 <Iteration_iter-num>1</Iteration_iter-num>
125 <Iteration_query-ID>Query_1</Iteration_query-ID>
126 <Iteration_query-def>ENA|EF604038|EF604038.1 Uncultured bacterium clone 16saw43-2g09.q1k 16S ribosomal RNA gene, partial sequence</Iteration_query-def>
127 <Iteration_query-len>1364</Iteration_query-len>
128 <Iteration_hits>
129
130 <Hit>
131 <Hit_num>1</Hit_num>
132 <Hit_id>gi|444439670|ref|NR_074985.1|</Hit_id>
133 <Hit_hsps>...
134 <Hsp>...
135 """
136 def __init__(self, options, output_format):
137 """ Creates a record object that holds field data for each <hit> iteration in Blastn XML data
138
139 .record object: holds values read in from <XML> <hit> record mainly.
140 .tags dictionary: XML tags and the record.[x] fields/attributes that should be set to tag values.
141 .column_format dictionary: Name to field count dictionary used for selecting # of output fields
142 .fieldSpec dictionary: Specification of each possible field's type (for validation), full name, and suitability for sorting, filtering, etc.
143 .custom_columns array takes list of custom columns to output. (If sorting by a column it must be in this list)
144 .reference_bins dictionary
145
146 """
147 self.record = GenericRecord() # Set up so we can use object attributes.
148
149 #This is a list of all incomming blast generated XML fields that we want to capture
150 # self.record gets all underscored variables values as well as new derived ones in process() below
151 self.tags = {
152 "BlastOutput_program": '_blast_program',
153 "Iteration_query-ID": '_qseqid',
154 "Iteration_query-def": '_qdef',
155 "Iteration_query-len": '_qlen', #extended+ calc
156
157 "Hit_id": '_hit_id',
158 "Hit_def": '_hit_def', #extended+ calc
159 "Hit_accession": '_hit_acc',
160 "Hit_len": '_hit_len',
161
162 "Hsp_bit-score": '_bitscore', #basic
163 "Hsp_score": '_score', #extended
164 "Hsp_evalue": '_evalue', #basic
165 "Hsp_query-from": '_qstart', #basic, extended+ calc
166 "Hsp_query-to": '_qend', #basic, extended+ calc
167 "Hsp_hit-from": '_sstart', #basic
168 "Hsp_hit-to": '_send', #basic
169 "Hsp_query-frame": '_qframe', #extended only
170 "Hsp_hit-frame": '_sframe', #extended only
171 "Hsp_identity": '_nident', #extended
172 "Hsp_positive": '_positive', #extended
173 "Hsp_gaps": '_gaps', #extended
174 "Hsp_align-len": '_length', #basic
175 "Hsp_qseq": '_qseq', #extended
176 "Hsp_hseq": '_sseq', #extended
177 "Hsp_midline": '_mseq' #basic
178 }
179
180 self.column_format = {
181 'std':12,
182 'ext':24,
183 'std+seqs':12,
184 'ext+':26,
185 'custom':1
186 }
187
188 if not output_format in self.column_format:
189 common.stop_err("Format argument should be std (12 column) or ext (extended 24 columns) or ext+ (extended 26+ columns) or custom (you choose fields). Format argument x22 has been replaced with ext (extended 24 columns)")
190
191 # Array of columns destined for tab-delimited output - This defines default ORDER of fields too.
192 # Raw data fields that never get output: _bitscore, _evalue, _qframe, _sframe,
193 # and this that has no m_frame equivalent: _mseq
194 self.columns_in = 'qseqid sseqid pident _length mismatch gapopen _qstart _qend _sstart _send evalue bitscore \
195 sallseqid _score _nident _positive _gaps ppos qframe sframe _qseq _sseq qlen slen \
196 pcov sallseqdescr accessionid sseqdescr _mseq'.split()
197
198 fieldSpecFile = os.path.join(os.path.dirname(__file__), 'blast_reporting_fields.tab')
199 self.field_spec = common.FieldSpec(fieldSpecFile, self.columns_in)
200
201 # Include first N fields from .columns according to format.
202 # In all cases qseqid is included.
203 # Default everything to "column".
204 columns_out = self.columns_in[0:self.column_format[output_format]]
205
206 # This column list is designed for creating phylogeny reports.
207 if output_format == 'std+seqs': columns_out.extend(['_qseq','_sseq'])
208
209 self.columns = self.field_spec.initColumns(columns_out, options.custom_fields)
210
211 # We're making these columns hidden for this particular HTML report format
212 # UNLESS they are mentioned in options.custom_fields
213 if output_format == 'std+seqs':
214 for (ptr, target) in enumerate(self.columns):
215 if target['field'] in ['_qseq','_sseq'] and options.custom_fields and not target['field'] in options.custom_fields :
216 target['group'] = 'hidden'
217
218 # ADD SELECTED BINS TO COLUMN LIST;
219 self.binManager = reference_bins.ReferenceBins()
220 self.binManager.build_bins(options.reference_bins, self.columns)
221
222 def setRecordAttr(self, tag, text):
223 #self.record is a class object (not a dictionary) so using setattr()
224 setattr(self.record, self.tags[tag], text)
225
226 # Called after set() has processed a bunch of <hit> ...</hit> tags
227 def processRecord(self) :
228
229 bline = self.record
230
231 # NCBI notes: Expecting either this,
232 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
233 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
234 # <Hit_accession>P56514</Hit_accession>
235 #or,
236 # <Hit_id>Subject_1</Hit_id>
237 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
238 # <Hit_accession>Subject_1</Hit_accession>
239 #or,
240 # <Hit_id>Subject_1</Hit_id>
241 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
242 # <Hit_accession>Subject_1</Hit_accession>
243 #apparently depending on the parse_deflines switch
244
245 sseqid = self.record._hit_id.split(None,1)[0]
246
247 # If Hit_id == Hit_accession AND it is a default "Subject_1" ...
248 # OR Hit_accession IN Hit_id and BL_ORD_ID|XXXX contains hit_accession
249 if common.re_default_subject_id.match(sseqid) and sseqid.find(bline._hit_acc):
250 # and sseqid == bline._hit_acc:
251 #Place holder ID, take the first word of the subject definition
252 hit_def = bline._hit_def
253 sseqid = hit_def.split(None,1)[0]
254 else:
255 hit_def = sseqid + " " + bline._hit_def
256
257 self.record.sseqid = sseqid
258
259 if common.re_default_ncbi_id.match(sseqid):
260 self.record.accessionid = sseqid.split('|')[3]
261 elif common.re_default_ref_id.match(sseqid):
262 self.record.accessionid = sseqid.split('|')[1]
263 else:
264 # Have to use the whole string.
265 self.record.accessionid = sseqid
266
267
268 # NCBI notes: Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
269 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
270 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
271 # <Iteration_query-len>406</Iteration_query-len>
272 # <Iteration_hits></Iteration_hits>
273 #
274 #Or, from BLAST 2.2.24+ run online
275 # <Iteration_query-ID>Query_1</Iteration_query-ID>
276 # <Iteration_query-def>Sample</Iteration_query-def>
277 # <Iteration_query-len>516</Iteration_query-len>
278 # <Iteration_hits>...
279
280 # Note BioPython's approach http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO.blast_xml-pysrc.html
281 # ... if hit_id.startswith('gnl|BL_ORD_ID|'): ...
282
283 if common.re_default_query_id.match(bline._qseqid):
284 #Place holder ID, take the first word of the query definition
285 qseqid = bline._qdef.split(None,1)[0]
286 else:
287 qseqid = bline._qseqid
288
289 self.record.qseqid = qseqid
290
291 self.record.evalue = "0.0" if bline._evalue == "0" else "%0.0e" % float(bline._evalue)
292
293 # NCBI notes:
294 # if bline._bitscore < 100:
295 # #Seems to show one decimal place for lower scores
296 # bitscore = "%0.1f" % bline._bitscore
297 # else:
298 # #Note BLAST does not round to nearest int, it truncates
299 # bitscore = "%i" % bline._bitscore
300 bitscore = float(bline._bitscore)
301 self.record.bitscore = "%0.1f" % bitscore if bitscore < 100 else "%i" % bitscore
302
303 self.record.pident = "%0.2f" % (100*float(bline._nident)/float(bline._length))
304
305 self.record.gapopen = str(len(bline._qseq.replace('-', ' ').split())-1 + \
306 len(bline._sseq.replace('-', ' ').split())-1)
307
308 mismatch = bline._mseq.count(' ') + bline._mseq.count('+') \
309 - bline._qseq.count('-') - bline._sseq.count('-')
310 #assert len(bline._qseq) == len(bline._sseq) == len(bline._mseq) == int(bline._length)
311 self.record.mismatch = str(mismatch)
312
313 # Extended fields
314 #sallseqid gets ";" delimited list of first words in each hit_def "x>y>z" expression.
315 #Nov 7 2013 fix: https://github.com/peterjc/galaxy_blast/blob/master/tools/ncbi_blast_plus/blastxml_to_tabular.py
316 hit_def_array = hit_def.split(" >") #Note: elem.text below converts escaped "&gt;" back to ">"
317 try:
318 self.record.sallseqid = ";".join(name.split(None,1)[0] for name in hit_def_array)
319 except IndexError as e:
320 common.stop_err("Problem splitting multiple hit ids?\n%r\n--> %s" % (hit_def, e))
321
322 # Calculate accession ids, and check bin(s) for them, update record accordingly.
323 self.binManager.setStatus(self.record)
324
325 self.record.ppos = "%0.2f" % (100*float(bline._positive)/float(bline._length))
326 qframe = bline._qframe
327 sframe = bline._sframe
328 if bline._blast_program == "blastp":
329 #Probably a bug in BLASTP that they use 0 or 1 depending on format
330 if qframe == "0": qframe = "1"
331 if sframe == "0": sframe = "1"
332
333 self.record.qframe = qframe
334 self.record.sframe = sframe
335 self.record.slen = str(int(bline._hit_len))
336 self.record.qlen = str(int(bline._qlen))
337
338 #extended+
339 self.record.pcov = "%0.2f" % (float(int(bline._qend) - int(bline._qstart) + 1)/int(bline._qlen) * 100)
340 sallseqdescr = []
341 try:
342 for name in hit_def_array:
343 id_desc = name.split(None,1)
344 if len(id_desc) == 1: sallseqdescr.append('missing description - database issue')
345 else: sallseqdescr.append(id_desc[1])
346 except IndexError as e:
347 common.stop_err("Problem splitting multiple hits?\n%r\n--> %s" % (hit_def, e))
348 # Example sallseqdescr is "Mus musculus ribosomal protein S8 (Rps8), mRNA ;Mus musculus ES cells cDNA, RIKEN full-length enriched library, clone:2410041L12 product:ribosomal protein S8, full insert sequence"
349
350 self.record.sallseqdescr = ";".join(sallseqdescr)
351 self.record.sseqdescr = sallseqdescr[0]
352
353 return True # One may return false anywhere above to filter out current <Hsp> record.
354
355
356 # Tab-delimited order is important, so we can't just cycle through (unordered) self.record attributes.
357 #
358 # @uses .record object with field attributes
359 # @uses .prelim_columns (used before final column selection)
360 def outputTabDelimited(self):
361 values = []
362
363 for col in self.columns:
364 values.append(getattr(self.record, col['field']))
365
366 return '\t'.join(values) + '\n'
367
368
369
370 class ReportEngine(object):
371
372 def __init__(self): pass
373
374 def __main__(self):
375
376
377 ## *************************** Parse Command Line *****************************
378 parser = common.MyParser(
379 description = 'Generates tab-delimited table report based on BLAST XML results.',
380 usage = 'python blast_reporting.py [blastxml_input_file] [out_format] [tabular_output_file] [option: html_output_file] [option: selection_output_file:id_1:id_2:id_3] [options]',
381 epilog="""Details:
382
383 This tool can be used both via command line and via a local Galaxy install.
384 Galaxy uses .loc files (blast_reporting_fields.loc, fasta_reference_dbs.loc)
385 as indicated by the tool's tool_data_table_conf.xml.sample. The command line script
386 uses .tab versions (located in the script's folder) which need to reflect any changes
387 made in the .loc versions.
388
389 Note: the selection file option is used mainly by the galaxy blast reporting tool.
390
391 [out_format] is one of:
392 "std" : standard 12 column
393 "std+seqs" : standard 12 column plus search and matched sequences
394 "ext" : extended 24 column
395 "ext+": 26+ column
396 "custom": Use only given field selections.
397
398 Use -i to see possible field (column) selections as defined by blast_reporting_fields.tab.
399
400 REFERENCE_BINS: Selected bins have their columns shown in output table for clarity, even when custom fields are selected, unless selecting the bin "exclude" option.
401
402 FILTERS:
403 Format: ([field_name]:[comparator] [value];)*
404 e.g. "pident: gt 97; sallseqdescr: excludes bovine|clone|environmental|swine|uncultivated|uncultured|unidentified"
405 [comparator] =
406 == numeric equal
407 != numeric not equal
408 gt numeric greater than
409 gte numeric greater than or equal to
410 lt numeric less than
411 lte numeric less than or equal to
412 includes (search text fields for included words/phrases)
413 excludes (same as above but exclude result if text found)
414
415 Textual comparisons may have a value consisting of phrases to search for separated by "|" (disjunction).
416
417
418 """)
419
420 parser.set_defaults(row_limit=0)
421 # Don't use "-h" , it is reserved for --help!
422
423 parser.add_option('-b', '--bins', type='string', dest='reference_bins',
424 help='Provide a comma-delimited list of reference databases to check, along with their sort order, and a flag to exclude them if desired, e.g. "16Sncbi desc,euzby desc,16Srdp exclude". See -i option for a list of available databases.')
425
426 parser.add_option('-c', '--columns', type='string', dest='custom_fields',
427 help='To modify sorting and formatting, specify a comma-delimited list of field specifications of the form: "[field_name]:[column|table|section]:[asc|desc|none]:[new label text];..." .')
428
429 parser.add_option('-f', '--filter', type='string', dest='filters',
430 help='Provide a semicolon-delimited list of fields and their criteria to filter by.')
431
432 parser.add_option('-i', '--info', dest='info', default=False, action='store_true',
433 help='Provides list of columns and their descriptions, for use in filter, sort and custom column lists. Shows a list of available sequence type reference bins as well')
434
435 parser.add_option('-l', '--label', type='string', dest='column_labels',
436 help='Include field labels in first row of tab-delimited result table as short names or data field names (or none)')
437
438 parser.add_option('-n', '--number', type='int', dest='row_limit',
439 help='Provide a limit to the number of rows of returned data. The default 0=unlimited.')
440
441 #TESTING
442 parser.add_option('-B', '--refbins', type='string', dest='refbins',
443 help='Testing library_data form input.')
444
445 parser.add_option('-r', '--redundant', dest='drop_redundant_hits', default=False, action='store_true',
446 help='Return only first match to a gene bank id result.')
447
448 parser.add_option('-t', '--tests', dest='test_ids', help='Enter "all" or comma-separated id(s) of tests to run.')
449
450 options, args = parser.parse_args()
451
452 if options.test_ids:
453
454 # Future: read this spec from test-data folder itself?
455 tests = {
456 '1a': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1a.tabular','options':''},
457 '1a1':{'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1a1.tabular blast_reporting_1a1.html','options':'-r'},
458 '1b': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1b.tabular','options':'-r -f "pident:gte 97"'},
459 '1c': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1c.tabular','options':'-r -f "pident:gte 97" -c "pident:column:asc" -l label'},
460 '1d': {
461 'input':'blast_reporting_1.blastxml ext+',
462 'outputs':'blast_reporting_1d.tabular blast_reporting_1d.html blast_reporting_1d1.tabular',
463 'options':'-r -f "pident:gte 99" -l label -n 5'
464 }
465 }
466 common.testSuite(options.test_ids, tests, '/tmp/')
467
468 import time
469 time_start = time.time()
470
471 # "info" command provides a dump of all the fields that can be displayed from the Blast search.
472 if options.info:
473 # CAN WE LOCATE THIS FILE AS GALAXY SEES IT??????
474 print 'FIELDS:\n'
475 field_spec_path = os.path.join(os.path.dirname(__file__), 'blast_reporting_fields.tab')
476 fields = common.FieldSpec(field_spec_path)
477 for field in sorted(fields.dict.keys()):
478 print field + "\t" + fields.getAttribute(field,'type') + "\t" + fields.getAttribute(field,'name')
479
480 print '\nREFERENCE BINS:\n'
481 field_spec_path = os.path.join(os.path.dirname(__file__), 'fasta_reference_dbs.tab')
482 fields = common.FieldSpec(field_spec_path)
483 for field in sorted(fields.dict.keys()):
484 print field + "\t" + fields.getAttribute(field, 'path') + field + '/accession_ids.tab' + '\t' + fields.getAttribute(field, 'name')
485
486 sys.exit(1)
487
488 try:
489 in_file, output_format, out_tabular_file = args[0:3]
490
491 except:
492 common.stop_err("Expecting 3 arguments: input BLAST XML file, out format (std | std+seqs | ext | ext+ | custom), and output tabular file")
493
494 try:
495 # Get an iterable, see http://effbot.org/zone/element-iterparse.htm
496 context = ElementTree.iterparse(in_file, events=("start","end")) # By default only does end events.
497 context = iter(context)
498 event, root = context.next() # Creates reference to root element on 'start' event, for housecleaning below.
499 except:
500 common.stop_err("Invalid data format. !!")
501
502 tagGroup = XMLRecordScan(options, output_format)
503 fieldFilter = common.FieldFilter(tagGroup, options) # .filter list field names are changed above.
504
505 if options.reference_bins: print 'Database bins: %s' % str([bin.name for (ptr, bin) in enumerate(tagGroup.binManager.reference_bins) ]).translate(None, "[']")
506 if options.custom_fields: print 'Customized Fields: %s' % options.custom_fields
507 if options.filters: print 'Filters: ' + options.filters
508 if options.drop_redundant_hits: print 'Throwing out redundant hits...'
509
510 # ************************ FILE OUTPUT *****************************
511 # IT IS CRITICAL THAT EVERY <HIT>/<HSP> RETURN A COMPLETE XML SET OF TAGS OTHERWISE PREV. RECORD VALUES PERSIST
512 # NOTE: GALAXY 2012 has bug in html data display - it will show duplicate records OCCASIONALLY (at least on some browsers). You have to download data file to verify there are no duplicates
513
514 row_count = 0
515 row_count_filtered = 0
516 outfile = open(out_tabular_file, 'w')
517 query_stats = []
518
519 for event, elem in context:
520
521 # Alternative is to wipe Hit/Hsp fields on event == "start".
522 tag = elem.tag
523 if event == 'end':
524 if tag in tagGroup.tags : #Content of these tags fills a tabular line with column info.
525 tagGroup.setRecordAttr(tag, elem.text)
526 if tag == 'Iteration_query-def':
527 row_count = 0
528 row_count_filtered = 0
529 query_stats.append({'id':elem.text, 'rows' : 0, 'filtered_rows' : 0})
530
531 # Process each </hsp> record
532 elif tag == 'Hsp':
533 row_count += 1
534 query_stats[-1]['rows'] = row_count # real rows, not clipped
535 if options.row_limit == 0 or row_count_filtered < options.row_limit:
536
537 # Transform <Hsp> record & add field info.
538 if tagGroup.processRecord():
539
540 #if tagGroup.processFilters():
541 if fieldFilter.process(tagGroup.record):
542 row_count_filtered +=1
543 query_stats[-1]['filtered_rows'] = row_count_filtered
544 outfile.write(tagGroup.outputTabDelimited())
545
546 root.clear() # Clears references from root to (now unused) children to keep iterated datastructure small ???
547
548 elem.clear() # I think root.clear() cover this case.
549
550 root.clear()
551 outfile.close()
552
553
554 # Use fast Linux "sort" after filtering & file write
555 common.fileSort(out_tabular_file, tagGroup.columns)
556
557 """
558 The "Selection file" option is meant for galaxy UI use in conjunction
559 with the "Select Subsets on data" tool. If a selection_file is called
560 for, then we need to extract its id as well. For that we have to test
561 for somewhat odd expression from xml-generated command line, the
562 [$selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id]
563 Selection list doesn't necessarily need the HTML selectable report template,
564 but that template was designed to feed the galaxy "Select subsets" tool with its data.
565 """
566
567 if len(args) > 4 and args[4] != 'None:None:None:None':
568 selection_file_data = args[4]
569
570 #print ('selection file data:' + selection_file_data)
571 sel_file_fields = selection_file_data.split(':')
572 selection_file = sel_file_fields[0]
573 # From galaxy, incoming format is $selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id
574 # From command line, user won't have specified any of this, so ignore.
575 if len(sel_file_fields) > 3:
576 options.dataset_selection_id = sel_file_fields[3]
577
578 else:
579 options.dataset_selection_id = 0
580
581 common.fileSelections(out_tabular_file, selection_file, tagGroup, options)
582
583
584 """
585 We must have a template in order to write anything to above html output file.
586 All report templates need to be listed in the module's tabular data "blast_reporting_templates" folder.
587 # There are two possible HTML Report template locations:
588 # 1) The stock reports included in the module in the "templates/" subfolder, e.g. html_report.py
589 # 2) User customized templates. To set this up:
590 - add a custom template folder in a location of your choice.
591 - Copy this module's templates folder into it.
592 - The new folder must be in python's sys.path, which is achieved by adding a .pth file to python's site-packages folder.. E.g. set up /usr/lib/python2.6/site-packages/galaxy-custom-modules.pth to contain "/usr/local/galaxy/shared/python2.6_galaxy_custom_modules"
593 , and place 'templates_custom/html_report.py' in there.
594 """
595 if len(args) > 3:
596 out_html_file = args[3] #Galaxy-generated
597 # args[5] = html_template, default from galaxy xml is 'templates.html_report', but testing can receive 'None' value
598 if len(args) > 5 and len(args[5].strip()) > 0 and not args[5].strip() == 'None':
599
600 html_template = args[5] #User-selected
601 if not html_template.translate(None, "._-" ).isalnum():
602 common.stop_err("The HTML Report template name is not correct. It should be a python class path like templates.html_report)! : " + html_template)
603
604 else:
605 html_template = 'templates.html_report'
606
607 try:
608 # See http://stackoverflow.com/questions/769534/dynamic-loading-of-python-modules
609 HTMLReportModule = __import__(html_template, fromlist=['does not in fact matter what goes here!'])
610 # Now create final tabular, html (or future: xml) data
611 htmlManager = HTMLReportModule.HTMLReport(tagGroup, options, query_stats)
612 # htmlManager might not be initialized if the caller couldn't provide all the data the particular template needed.
613 htmlManager.render(out_tabular_file, out_html_file)
614
615 except ImportError:
616 common.stop_err("Unable to locate HTML Report template! : " + html_template)
617
618
619 common.fileTabular(out_tabular_file, tagGroup, options)
620
621 print('Execution time (seconds): ' + str(int(time.time()-time_start)))
622
623
624 if __name__ == '__main__':
625 # Command line access
626 reportEngine = ReportEngine()
627 reportEngine.__main__()