Mercurial > repos > damion > blast_reporting
comparison blast_reporting.py @ 0:7db7ecc78ad6 draft
Uploaded
author | damion |
---|---|
date | Mon, 02 Mar 2015 20:46:00 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7db7ecc78ad6 |
---|---|
1 #!/usr/bin/python | |
2 """Convert a BLAST XML file to 12 column tabular output | |
3 | |
4 This tool can be used both via command line and via a local Galaxy install. | |
5 Galaxy uses .loc files as indicated by the tool_data_table_conf.xml.sample. | |
6 The command line version uses .tab versions of the above files: | |
7 blast_reporting_fields.loc | |
8 fasta_reference_dbs.loc | |
9 So for command-line use, ensure the .tab files are updated to their .loc counterparts. | |
10 | |
11 Takes three command line options, input BLAST XML filename, output tabular | |
12 BLAST filename, output format (std for standard 12 columns, or ext for the | |
13 extended 24 columns offered in the BLAST+ wrappers). | |
14 | |
15 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart | |
16 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which | |
17 mean: | |
18 | |
19 ====== ========= ============================================ | |
20 Column NCBI name Description | |
21 ------ --------- -------------------------------------------- | |
22 1 qseqid Query Seq-id (ID of your sequence) | |
23 2 sseqid Subject Seq-id (ID of the database hit) | |
24 3 pident Percentage of identical matches | |
25 4 length Alignment length | |
26 5 mismatch Number of mismatches | |
27 6 gapopen Number of gap openings | |
28 7 qstart Start of alignment in query | |
29 8 qend End of alignment in query | |
30 9 sstart Start of alignment in subject (database hit) | |
31 10 send End of alignment in subject (database hit) | |
32 11 evalue Expectation value (E-value) | |
33 12 bitscore Bit score | |
34 ====== ========= ============================================ | |
35 | |
36 The additional columns offered in the Galaxy BLAST+ wrappers are: | |
37 | |
38 ====== ============= =========================================== | |
39 Column NCBI name Description | |
40 ------ ------------- ------------------------------------------- | |
41 13 sallseqid All subject Seq-id(s), separated by a ';' | |
42 14 score Raw score | |
43 15 nident Number of identical matches | |
44 16 positive Number of positive-scoring matches | |
45 17 gaps Total number of gaps | |
46 18 ppos Percentage of positive-scoring matches | |
47 19 qframe Query frame | |
48 20 sframe Subject frame | |
49 21 qseq Aligned part of query sequence | |
50 22 sseq Aligned part of subject sequence | |
51 23 qlen Query sequence length | |
52 24 slen Subject sequence length | |
53 ====== ============= =========================================== | |
54 | |
55 Very slight modifications were made to the "BLAST XML to tabular" tool that | |
56 ships with Galaxy to output two more column columns: | |
57 | |
58 ====== ============= =========================================== | |
59 Column NCBI name Description | |
60 ------ ------------- ------------------------------------------- | |
61 25 pcov Percentage coverage | |
62 26 sallseqdescr All subject Seq-descr(s), separated by a ',' | |
63 ====== ============= =========================================== | |
64 | |
65 Most of these fields are given explicitly in the XML file, others some like | |
66 the percentage identity and the number of gap openings must be calculated. | |
67 | |
68 In addition an option exists to select particular columns for the output | |
69 report. Reference bin columns will be added if they have been included in | |
70 search. | |
71 | |
72 A command line version can be used. Type blast_reporting.py -h for help. | |
73 | |
74 Be aware that the sequence in the extended tabular output or XML direct from | |
75 BLAST+ may or may not use XXXX masking on regions of low complexity. This | |
76 can throw the off the calculation of percentage identity and gap openings. | |
77 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard, | |
78 with these numbers changing depending on whether or not the low complexity | |
79 filter is used.] | |
80 | |
81 This script attempts to produce identical output to what BLAST+ would have done. | |
82 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra | |
83 space character (probably a bug). | |
84 | |
85 python blast_reporting.py in_file out_tabular_file out_html_file out_format | |
86 """ | |
87 import sys | |
88 import re | |
89 import os.path | |
90 import common | |
91 import reference_bins | |
92 #import templates.html_report | |
93 | |
94 if __name__ == '__main__' and __package__ is None: | |
95 from os import path | |
96 sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) | |
97 | |
98 if sys.version_info[:2] >= ( 2, 5 ): | |
99 import xml.etree.cElementTree as ElementTree | |
100 else: | |
101 from galaxy import eggs | |
102 import pkg_resources; pkg_resources.require( "elementtree" ) | |
103 from elementtree import ElementTree | |
104 | |
105 class GenericRecord(object): pass | |
106 | |
107 class XMLRecordScan(object): | |
108 """ | |
109 XML Input file usually looks like: | |
110 | |
111 <BlastOutput> | |
112 <BlastOutput_program>blastn</BlastOutput_program> | |
113 <BlastOutput_param> | |
114 <Parameters> | |
115 <Parameters_expect>0.001</Parameters_expect> | |
116 <Parameters_sc-match>1</Parameters_sc-match> | |
117 <Parameters_sc-mismatch>-2</Parameters_sc-mismatch> | |
118 <Parameters_gap-open>0</Parameters_gap-open> | |
119 <Parameters_gap-extend>0</Parameters_gap-extend> | |
120 <Parameters_filter>L;m;</Parameters_filter> | |
121 </Parameters> | |
122 </BlastOutput_param> | |
123 <Iteration> | |
124 <Iteration_iter-num>1</Iteration_iter-num> | |
125 <Iteration_query-ID>Query_1</Iteration_query-ID> | |
126 <Iteration_query-def>ENA|EF604038|EF604038.1 Uncultured bacterium clone 16saw43-2g09.q1k 16S ribosomal RNA gene, partial sequence</Iteration_query-def> | |
127 <Iteration_query-len>1364</Iteration_query-len> | |
128 <Iteration_hits> | |
129 | |
130 <Hit> | |
131 <Hit_num>1</Hit_num> | |
132 <Hit_id>gi|444439670|ref|NR_074985.1|</Hit_id> | |
133 <Hit_hsps>... | |
134 <Hsp>... | |
135 """ | |
136 def __init__(self, options, output_format): | |
137 """ Creates a record object that holds field data for each <hit> iteration in Blastn XML data | |
138 | |
139 .record object: holds values read in from <XML> <hit> record mainly. | |
140 .tags dictionary: XML tags and the record.[x] fields/attributes that should be set to tag values. | |
141 .column_format dictionary: Name to field count dictionary used for selecting # of output fields | |
142 .fieldSpec dictionary: Specification of each possible field's type (for validation), full name, and suitability for sorting, filtering, etc. | |
143 .custom_columns array takes list of custom columns to output. (If sorting by a column it must be in this list) | |
144 .reference_bins dictionary | |
145 | |
146 """ | |
147 self.record = GenericRecord() # Set up so we can use object attributes. | |
148 | |
149 #This is a list of all incomming blast generated XML fields that we want to capture | |
150 # self.record gets all underscored variables values as well as new derived ones in process() below | |
151 self.tags = { | |
152 "BlastOutput_program": '_blast_program', | |
153 "Iteration_query-ID": '_qseqid', | |
154 "Iteration_query-def": '_qdef', | |
155 "Iteration_query-len": '_qlen', #extended+ calc | |
156 | |
157 "Hit_id": '_hit_id', | |
158 "Hit_def": '_hit_def', #extended+ calc | |
159 "Hit_accession": '_hit_acc', | |
160 "Hit_len": '_hit_len', | |
161 | |
162 "Hsp_bit-score": '_bitscore', #basic | |
163 "Hsp_score": '_score', #extended | |
164 "Hsp_evalue": '_evalue', #basic | |
165 "Hsp_query-from": '_qstart', #basic, extended+ calc | |
166 "Hsp_query-to": '_qend', #basic, extended+ calc | |
167 "Hsp_hit-from": '_sstart', #basic | |
168 "Hsp_hit-to": '_send', #basic | |
169 "Hsp_query-frame": '_qframe', #extended only | |
170 "Hsp_hit-frame": '_sframe', #extended only | |
171 "Hsp_identity": '_nident', #extended | |
172 "Hsp_positive": '_positive', #extended | |
173 "Hsp_gaps": '_gaps', #extended | |
174 "Hsp_align-len": '_length', #basic | |
175 "Hsp_qseq": '_qseq', #extended | |
176 "Hsp_hseq": '_sseq', #extended | |
177 "Hsp_midline": '_mseq' #basic | |
178 } | |
179 | |
180 self.column_format = { | |
181 'std':12, | |
182 'ext':24, | |
183 'std+seqs':12, | |
184 'ext+':26, | |
185 'custom':1 | |
186 } | |
187 | |
188 if not output_format in self.column_format: | |
189 common.stop_err("Format argument should be std (12 column) or ext (extended 24 columns) or ext+ (extended 26+ columns) or custom (you choose fields). Format argument x22 has been replaced with ext (extended 24 columns)") | |
190 | |
191 # Array of columns destined for tab-delimited output - This defines default ORDER of fields too. | |
192 # Raw data fields that never get output: _bitscore, _evalue, _qframe, _sframe, | |
193 # and this that has no m_frame equivalent: _mseq | |
194 self.columns_in = 'qseqid sseqid pident _length mismatch gapopen _qstart _qend _sstart _send evalue bitscore \ | |
195 sallseqid _score _nident _positive _gaps ppos qframe sframe _qseq _sseq qlen slen \ | |
196 pcov sallseqdescr accessionid sseqdescr _mseq'.split() | |
197 | |
198 fieldSpecFile = os.path.join(os.path.dirname(__file__), 'blast_reporting_fields.tab') | |
199 self.field_spec = common.FieldSpec(fieldSpecFile, self.columns_in) | |
200 | |
201 # Include first N fields from .columns according to format. | |
202 # In all cases qseqid is included. | |
203 # Default everything to "column". | |
204 columns_out = self.columns_in[0:self.column_format[output_format]] | |
205 | |
206 # This column list is designed for creating phylogeny reports. | |
207 if output_format == 'std+seqs': columns_out.extend(['_qseq','_sseq']) | |
208 | |
209 self.columns = self.field_spec.initColumns(columns_out, options.custom_fields) | |
210 | |
211 # We're making these columns hidden for this particular HTML report format | |
212 # UNLESS they are mentioned in options.custom_fields | |
213 if output_format == 'std+seqs': | |
214 for (ptr, target) in enumerate(self.columns): | |
215 if target['field'] in ['_qseq','_sseq'] and options.custom_fields and not target['field'] in options.custom_fields : | |
216 target['group'] = 'hidden' | |
217 | |
218 # ADD SELECTED BINS TO COLUMN LIST; | |
219 self.binManager = reference_bins.ReferenceBins() | |
220 self.binManager.build_bins(options.reference_bins, self.columns) | |
221 | |
222 def setRecordAttr(self, tag, text): | |
223 #self.record is a class object (not a dictionary) so using setattr() | |
224 setattr(self.record, self.tags[tag], text) | |
225 | |
226 # Called after set() has processed a bunch of <hit> ...</hit> tags | |
227 def processRecord(self) : | |
228 | |
229 bline = self.record | |
230 | |
231 # NCBI notes: Expecting either this, | |
232 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> | |
233 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> | |
234 # <Hit_accession>P56514</Hit_accession> | |
235 #or, | |
236 # <Hit_id>Subject_1</Hit_id> | |
237 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | |
238 # <Hit_accession>Subject_1</Hit_accession> | |
239 #or, | |
240 # <Hit_id>Subject_1</Hit_id> | |
241 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | |
242 # <Hit_accession>Subject_1</Hit_accession> | |
243 #apparently depending on the parse_deflines switch | |
244 | |
245 sseqid = self.record._hit_id.split(None,1)[0] | |
246 | |
247 # If Hit_id == Hit_accession AND it is a default "Subject_1" ... | |
248 # OR Hit_accession IN Hit_id and BL_ORD_ID|XXXX contains hit_accession | |
249 if common.re_default_subject_id.match(sseqid) and sseqid.find(bline._hit_acc): | |
250 # and sseqid == bline._hit_acc: | |
251 #Place holder ID, take the first word of the subject definition | |
252 hit_def = bline._hit_def | |
253 sseqid = hit_def.split(None,1)[0] | |
254 else: | |
255 hit_def = sseqid + " " + bline._hit_def | |
256 | |
257 self.record.sseqid = sseqid | |
258 | |
259 if common.re_default_ncbi_id.match(sseqid): | |
260 self.record.accessionid = sseqid.split('|')[3] | |
261 elif common.re_default_ref_id.match(sseqid): | |
262 self.record.accessionid = sseqid.split('|')[1] | |
263 else: | |
264 # Have to use the whole string. | |
265 self.record.accessionid = sseqid | |
266 | |
267 | |
268 # NCBI notes: Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | |
269 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | |
270 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
271 # <Iteration_query-len>406</Iteration_query-len> | |
272 # <Iteration_hits></Iteration_hits> | |
273 # | |
274 #Or, from BLAST 2.2.24+ run online | |
275 # <Iteration_query-ID>Query_1</Iteration_query-ID> | |
276 # <Iteration_query-def>Sample</Iteration_query-def> | |
277 # <Iteration_query-len>516</Iteration_query-len> | |
278 # <Iteration_hits>... | |
279 | |
280 # Note BioPython's approach http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO.blast_xml-pysrc.html | |
281 # ... if hit_id.startswith('gnl|BL_ORD_ID|'): ... | |
282 | |
283 if common.re_default_query_id.match(bline._qseqid): | |
284 #Place holder ID, take the first word of the query definition | |
285 qseqid = bline._qdef.split(None,1)[0] | |
286 else: | |
287 qseqid = bline._qseqid | |
288 | |
289 self.record.qseqid = qseqid | |
290 | |
291 self.record.evalue = "0.0" if bline._evalue == "0" else "%0.0e" % float(bline._evalue) | |
292 | |
293 # NCBI notes: | |
294 # if bline._bitscore < 100: | |
295 # #Seems to show one decimal place for lower scores | |
296 # bitscore = "%0.1f" % bline._bitscore | |
297 # else: | |
298 # #Note BLAST does not round to nearest int, it truncates | |
299 # bitscore = "%i" % bline._bitscore | |
300 bitscore = float(bline._bitscore) | |
301 self.record.bitscore = "%0.1f" % bitscore if bitscore < 100 else "%i" % bitscore | |
302 | |
303 self.record.pident = "%0.2f" % (100*float(bline._nident)/float(bline._length)) | |
304 | |
305 self.record.gapopen = str(len(bline._qseq.replace('-', ' ').split())-1 + \ | |
306 len(bline._sseq.replace('-', ' ').split())-1) | |
307 | |
308 mismatch = bline._mseq.count(' ') + bline._mseq.count('+') \ | |
309 - bline._qseq.count('-') - bline._sseq.count('-') | |
310 #assert len(bline._qseq) == len(bline._sseq) == len(bline._mseq) == int(bline._length) | |
311 self.record.mismatch = str(mismatch) | |
312 | |
313 # Extended fields | |
314 #sallseqid gets ";" delimited list of first words in each hit_def "x>y>z" expression. | |
315 #Nov 7 2013 fix: https://github.com/peterjc/galaxy_blast/blob/master/tools/ncbi_blast_plus/blastxml_to_tabular.py | |
316 hit_def_array = hit_def.split(" >") #Note: elem.text below converts escaped ">" back to ">" | |
317 try: | |
318 self.record.sallseqid = ";".join(name.split(None,1)[0] for name in hit_def_array) | |
319 except IndexError as e: | |
320 common.stop_err("Problem splitting multiple hit ids?\n%r\n--> %s" % (hit_def, e)) | |
321 | |
322 # Calculate accession ids, and check bin(s) for them, update record accordingly. | |
323 self.binManager.setStatus(self.record) | |
324 | |
325 self.record.ppos = "%0.2f" % (100*float(bline._positive)/float(bline._length)) | |
326 qframe = bline._qframe | |
327 sframe = bline._sframe | |
328 if bline._blast_program == "blastp": | |
329 #Probably a bug in BLASTP that they use 0 or 1 depending on format | |
330 if qframe == "0": qframe = "1" | |
331 if sframe == "0": sframe = "1" | |
332 | |
333 self.record.qframe = qframe | |
334 self.record.sframe = sframe | |
335 self.record.slen = str(int(bline._hit_len)) | |
336 self.record.qlen = str(int(bline._qlen)) | |
337 | |
338 #extended+ | |
339 self.record.pcov = "%0.2f" % (float(int(bline._qend) - int(bline._qstart) + 1)/int(bline._qlen) * 100) | |
340 sallseqdescr = [] | |
341 try: | |
342 for name in hit_def_array: | |
343 id_desc = name.split(None,1) | |
344 if len(id_desc) == 1: sallseqdescr.append('missing description - database issue') | |
345 else: sallseqdescr.append(id_desc[1]) | |
346 except IndexError as e: | |
347 common.stop_err("Problem splitting multiple hits?\n%r\n--> %s" % (hit_def, e)) | |
348 # Example sallseqdescr is "Mus musculus ribosomal protein S8 (Rps8), mRNA ;Mus musculus ES cells cDNA, RIKEN full-length enriched library, clone:2410041L12 product:ribosomal protein S8, full insert sequence" | |
349 | |
350 self.record.sallseqdescr = ";".join(sallseqdescr) | |
351 self.record.sseqdescr = sallseqdescr[0] | |
352 | |
353 return True # One may return false anywhere above to filter out current <Hsp> record. | |
354 | |
355 | |
356 # Tab-delimited order is important, so we can't just cycle through (unordered) self.record attributes. | |
357 # | |
358 # @uses .record object with field attributes | |
359 # @uses .prelim_columns (used before final column selection) | |
360 def outputTabDelimited(self): | |
361 values = [] | |
362 | |
363 for col in self.columns: | |
364 values.append(getattr(self.record, col['field'])) | |
365 | |
366 return '\t'.join(values) + '\n' | |
367 | |
368 | |
369 | |
370 class ReportEngine(object): | |
371 | |
372 def __init__(self): pass | |
373 | |
374 def __main__(self): | |
375 | |
376 | |
377 ## *************************** Parse Command Line ***************************** | |
378 parser = common.MyParser( | |
379 description = 'Generates tab-delimited table report based on BLAST XML results.', | |
380 usage = 'python blast_reporting.py [blastxml_input_file] [out_format] [tabular_output_file] [option: html_output_file] [option: selection_output_file:id_1:id_2:id_3] [options]', | |
381 epilog="""Details: | |
382 | |
383 This tool can be used both via command line and via a local Galaxy install. | |
384 Galaxy uses .loc files (blast_reporting_fields.loc, fasta_reference_dbs.loc) | |
385 as indicated by the tool's tool_data_table_conf.xml.sample. The command line script | |
386 uses .tab versions (located in the script's folder) which need to reflect any changes | |
387 made in the .loc versions. | |
388 | |
389 Note: the selection file option is used mainly by the galaxy blast reporting tool. | |
390 | |
391 [out_format] is one of: | |
392 "std" : standard 12 column | |
393 "std+seqs" : standard 12 column plus search and matched sequences | |
394 "ext" : extended 24 column | |
395 "ext+": 26+ column | |
396 "custom": Use only given field selections. | |
397 | |
398 Use -i to see possible field (column) selections as defined by blast_reporting_fields.tab. | |
399 | |
400 REFERENCE_BINS: Selected bins have their columns shown in output table for clarity, even when custom fields are selected, unless selecting the bin "exclude" option. | |
401 | |
402 FILTERS: | |
403 Format: ([field_name]:[comparator] [value];)* | |
404 e.g. "pident: gt 97; sallseqdescr: excludes bovine|clone|environmental|swine|uncultivated|uncultured|unidentified" | |
405 [comparator] = | |
406 == numeric equal | |
407 != numeric not equal | |
408 gt numeric greater than | |
409 gte numeric greater than or equal to | |
410 lt numeric less than | |
411 lte numeric less than or equal to | |
412 includes (search text fields for included words/phrases) | |
413 excludes (same as above but exclude result if text found) | |
414 | |
415 Textual comparisons may have a value consisting of phrases to search for separated by "|" (disjunction). | |
416 | |
417 | |
418 """) | |
419 | |
420 parser.set_defaults(row_limit=0) | |
421 # Don't use "-h" , it is reserved for --help! | |
422 | |
423 parser.add_option('-b', '--bins', type='string', dest='reference_bins', | |
424 help='Provide a comma-delimited list of reference databases to check, along with their sort order, and a flag to exclude them if desired, e.g. "16Sncbi desc,euzby desc,16Srdp exclude". See -i option for a list of available databases.') | |
425 | |
426 parser.add_option('-c', '--columns', type='string', dest='custom_fields', | |
427 help='To modify sorting and formatting, specify a comma-delimited list of field specifications of the form: "[field_name]:[column|table|section]:[asc|desc|none]:[new label text];..." .') | |
428 | |
429 parser.add_option('-f', '--filter', type='string', dest='filters', | |
430 help='Provide a semicolon-delimited list of fields and their criteria to filter by.') | |
431 | |
432 parser.add_option('-i', '--info', dest='info', default=False, action='store_true', | |
433 help='Provides list of columns and their descriptions, for use in filter, sort and custom column lists. Shows a list of available sequence type reference bins as well') | |
434 | |
435 parser.add_option('-l', '--label', type='string', dest='column_labels', | |
436 help='Include field labels in first row of tab-delimited result table as short names or data field names (or none)') | |
437 | |
438 parser.add_option('-n', '--number', type='int', dest='row_limit', | |
439 help='Provide a limit to the number of rows of returned data. The default 0=unlimited.') | |
440 | |
441 #TESTING | |
442 parser.add_option('-B', '--refbins', type='string', dest='refbins', | |
443 help='Testing library_data form input.') | |
444 | |
445 parser.add_option('-r', '--redundant', dest='drop_redundant_hits', default=False, action='store_true', | |
446 help='Return only first match to a gene bank id result.') | |
447 | |
448 parser.add_option('-t', '--tests', dest='test_ids', help='Enter "all" or comma-separated id(s) of tests to run.') | |
449 | |
450 options, args = parser.parse_args() | |
451 | |
452 if options.test_ids: | |
453 | |
454 # Future: read this spec from test-data folder itself? | |
455 tests = { | |
456 '1a': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1a.tabular','options':''}, | |
457 '1a1':{'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1a1.tabular blast_reporting_1a1.html','options':'-r'}, | |
458 '1b': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1b.tabular','options':'-r -f "pident:gte 97"'}, | |
459 '1c': {'input':'blast_reporting_1.blastxml std','outputs':'blast_reporting_1c.tabular','options':'-r -f "pident:gte 97" -c "pident:column:asc" -l label'}, | |
460 '1d': { | |
461 'input':'blast_reporting_1.blastxml ext+', | |
462 'outputs':'blast_reporting_1d.tabular blast_reporting_1d.html blast_reporting_1d1.tabular', | |
463 'options':'-r -f "pident:gte 99" -l label -n 5' | |
464 } | |
465 } | |
466 common.testSuite(options.test_ids, tests, '/tmp/') | |
467 | |
468 import time | |
469 time_start = time.time() | |
470 | |
471 # "info" command provides a dump of all the fields that can be displayed from the Blast search. | |
472 if options.info: | |
473 # CAN WE LOCATE THIS FILE AS GALAXY SEES IT?????? | |
474 print 'FIELDS:\n' | |
475 field_spec_path = os.path.join(os.path.dirname(__file__), 'blast_reporting_fields.tab') | |
476 fields = common.FieldSpec(field_spec_path) | |
477 for field in sorted(fields.dict.keys()): | |
478 print field + "\t" + fields.getAttribute(field,'type') + "\t" + fields.getAttribute(field,'name') | |
479 | |
480 print '\nREFERENCE BINS:\n' | |
481 field_spec_path = os.path.join(os.path.dirname(__file__), 'fasta_reference_dbs.tab') | |
482 fields = common.FieldSpec(field_spec_path) | |
483 for field in sorted(fields.dict.keys()): | |
484 print field + "\t" + fields.getAttribute(field, 'path') + field + '/accession_ids.tab' + '\t' + fields.getAttribute(field, 'name') | |
485 | |
486 sys.exit(1) | |
487 | |
488 try: | |
489 in_file, output_format, out_tabular_file = args[0:3] | |
490 | |
491 except: | |
492 common.stop_err("Expecting 3 arguments: input BLAST XML file, out format (std | std+seqs | ext | ext+ | custom), and output tabular file") | |
493 | |
494 try: | |
495 # Get an iterable, see http://effbot.org/zone/element-iterparse.htm | |
496 context = ElementTree.iterparse(in_file, events=("start","end")) # By default only does end events. | |
497 context = iter(context) | |
498 event, root = context.next() # Creates reference to root element on 'start' event, for housecleaning below. | |
499 except: | |
500 common.stop_err("Invalid data format. !!") | |
501 | |
502 tagGroup = XMLRecordScan(options, output_format) | |
503 fieldFilter = common.FieldFilter(tagGroup, options) # .filter list field names are changed above. | |
504 | |
505 if options.reference_bins: print 'Database bins: %s' % str([bin.name for (ptr, bin) in enumerate(tagGroup.binManager.reference_bins) ]).translate(None, "[']") | |
506 if options.custom_fields: print 'Customized Fields: %s' % options.custom_fields | |
507 if options.filters: print 'Filters: ' + options.filters | |
508 if options.drop_redundant_hits: print 'Throwing out redundant hits...' | |
509 | |
510 # ************************ FILE OUTPUT ***************************** | |
511 # IT IS CRITICAL THAT EVERY <HIT>/<HSP> RETURN A COMPLETE XML SET OF TAGS OTHERWISE PREV. RECORD VALUES PERSIST | |
512 # NOTE: GALAXY 2012 has bug in html data display - it will show duplicate records OCCASIONALLY (at least on some browsers). You have to download data file to verify there are no duplicates | |
513 | |
514 row_count = 0 | |
515 row_count_filtered = 0 | |
516 outfile = open(out_tabular_file, 'w') | |
517 query_stats = [] | |
518 | |
519 for event, elem in context: | |
520 | |
521 # Alternative is to wipe Hit/Hsp fields on event == "start". | |
522 tag = elem.tag | |
523 if event == 'end': | |
524 if tag in tagGroup.tags : #Content of these tags fills a tabular line with column info. | |
525 tagGroup.setRecordAttr(tag, elem.text) | |
526 if tag == 'Iteration_query-def': | |
527 row_count = 0 | |
528 row_count_filtered = 0 | |
529 query_stats.append({'id':elem.text, 'rows' : 0, 'filtered_rows' : 0}) | |
530 | |
531 # Process each </hsp> record | |
532 elif tag == 'Hsp': | |
533 row_count += 1 | |
534 query_stats[-1]['rows'] = row_count # real rows, not clipped | |
535 if options.row_limit == 0 or row_count_filtered < options.row_limit: | |
536 | |
537 # Transform <Hsp> record & add field info. | |
538 if tagGroup.processRecord(): | |
539 | |
540 #if tagGroup.processFilters(): | |
541 if fieldFilter.process(tagGroup.record): | |
542 row_count_filtered +=1 | |
543 query_stats[-1]['filtered_rows'] = row_count_filtered | |
544 outfile.write(tagGroup.outputTabDelimited()) | |
545 | |
546 root.clear() # Clears references from root to (now unused) children to keep iterated datastructure small ??? | |
547 | |
548 elem.clear() # I think root.clear() cover this case. | |
549 | |
550 root.clear() | |
551 outfile.close() | |
552 | |
553 | |
554 # Use fast Linux "sort" after filtering & file write | |
555 common.fileSort(out_tabular_file, tagGroup.columns) | |
556 | |
557 """ | |
558 The "Selection file" option is meant for galaxy UI use in conjunction | |
559 with the "Select Subsets on data" tool. If a selection_file is called | |
560 for, then we need to extract its id as well. For that we have to test | |
561 for somewhat odd expression from xml-generated command line, the | |
562 [$selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id] | |
563 Selection list doesn't necessarily need the HTML selectable report template, | |
564 but that template was designed to feed the galaxy "Select subsets" tool with its data. | |
565 """ | |
566 | |
567 if len(args) > 4 and args[4] != 'None:None:None:None': | |
568 selection_file_data = args[4] | |
569 | |
570 #print ('selection file data:' + selection_file_data) | |
571 sel_file_fields = selection_file_data.split(':') | |
572 selection_file = sel_file_fields[0] | |
573 # From galaxy, incoming format is $selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id | |
574 # From command line, user won't have specified any of this, so ignore. | |
575 if len(sel_file_fields) > 3: | |
576 options.dataset_selection_id = sel_file_fields[3] | |
577 | |
578 else: | |
579 options.dataset_selection_id = 0 | |
580 | |
581 common.fileSelections(out_tabular_file, selection_file, tagGroup, options) | |
582 | |
583 | |
584 """ | |
585 We must have a template in order to write anything to above html output file. | |
586 All report templates need to be listed in the module's tabular data "blast_reporting_templates" folder. | |
587 # There are two possible HTML Report template locations: | |
588 # 1) The stock reports included in the module in the "templates/" subfolder, e.g. html_report.py | |
589 # 2) User customized templates. To set this up: | |
590 - add a custom template folder in a location of your choice. | |
591 - Copy this module's templates folder into it. | |
592 - The new folder must be in python's sys.path, which is achieved by adding a .pth file to python's site-packages folder.. E.g. set up /usr/lib/python2.6/site-packages/galaxy-custom-modules.pth to contain "/usr/local/galaxy/shared/python2.6_galaxy_custom_modules" | |
593 , and place 'templates_custom/html_report.py' in there. | |
594 """ | |
595 if len(args) > 3: | |
596 out_html_file = args[3] #Galaxy-generated | |
597 # args[5] = html_template, default from galaxy xml is 'templates.html_report', but testing can receive 'None' value | |
598 if len(args) > 5 and len(args[5].strip()) > 0 and not args[5].strip() == 'None': | |
599 | |
600 html_template = args[5] #User-selected | |
601 if not html_template.translate(None, "._-" ).isalnum(): | |
602 common.stop_err("The HTML Report template name is not correct. It should be a python class path like templates.html_report)! : " + html_template) | |
603 | |
604 else: | |
605 html_template = 'templates.html_report' | |
606 | |
607 try: | |
608 # See http://stackoverflow.com/questions/769534/dynamic-loading-of-python-modules | |
609 HTMLReportModule = __import__(html_template, fromlist=['does not in fact matter what goes here!']) | |
610 # Now create final tabular, html (or future: xml) data | |
611 htmlManager = HTMLReportModule.HTMLReport(tagGroup, options, query_stats) | |
612 # htmlManager might not be initialized if the caller couldn't provide all the data the particular template needed. | |
613 htmlManager.render(out_tabular_file, out_html_file) | |
614 | |
615 except ImportError: | |
616 common.stop_err("Unable to locate HTML Report template! : " + html_template) | |
617 | |
618 | |
619 common.fileTabular(out_tabular_file, tagGroup, options) | |
620 | |
621 print('Execution time (seconds): ' + str(int(time.time()-time_start))) | |
622 | |
623 | |
624 if __name__ == '__main__': | |
625 # Command line access | |
626 reportEngine = ReportEngine() | |
627 reportEngine.__main__() |