3
|
1 """
|
|
2 BlastXml class
|
|
3 """
|
|
4
|
|
5 from galaxy.datatypes.data import get_file_peek
|
4
|
6 from galaxy.datatypes.data import Text, Data
|
3
|
7 from galaxy.datatypes.xml import GenericXml
|
4
|
8 from galaxy.datatypes.metadata import MetadataElement
|
3
|
9
|
|
class BlastXml( GenericXml ):
    """NCBI Blast XML Output data"""
    file_ext = "blastxml"

    # The two DOCTYPE declarations accepted on the second line of a BLAST XML
    # file (remote DTD URL, or bare local DTD file name). Shared by sniff()
    # and merge() so the two checks cannot drift apart.
    _doctype_lines = (
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">',
    )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text.

        For a dataset that has not been purged the peek comes from
        get_file_peek() on the dataset's file; a purged dataset gets fixed
        placeholder strings instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'NCBI Blast XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """
        Determines whether the file is blastxml

        The first three lines (ignoring surrounding whitespace) must be the
        XML declaration, one of the known BlastOutput DOCTYPE lines, and the
        opening <BlastOutput> tag, in that order.

        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval.interval' )
        >>> BlastXml().sniff( fname )
        False
        """
        handle = open(filename)
        # try/finally guarantees the handle is closed on every return path
        # and also if a read raises.
        try:
            if handle.readline().strip() != '<?xml version="1.0"?>':
                return False
            if handle.readline().strip() not in BlastXml._doctype_lines:
                return False
            return handle.readline().strip() == '<BlastOutput>'
        finally:
            handle.close()

    def merge(split_files, output_file):
        """Merge several BLAST XML files into a single BLAST XML file.

        The header (everything up to and including the first <Iteration>
        line) of the first file is copied to the output. Every later file
        must have a header whose first 300 characters match the first
        file's; its header is dropped and only its iterations are appended.
        The closing tags are written once at the end.

        split_files -- list of input file names, in output order
        output_file -- name of the merged file to write

        A single input file is delegated to Text.merge (move/copy).

        Raises ValueError if no files are given, or if any input is empty,
        is not BLAST XML, ends before its first <Iteration>, has a header
        over 10000 characters, or has a header that does not match the
        first file's. In several of these cases the offending header is
        written to output_file first to help diagnosis.
        """
        # Check for "nothing to do" first so None/[] give a clear message.
        if not split_files:
            raise ValueError("Given no BLAST XML files, %r, to merge into %s" \
                             % (split_files, output_file))
        if len(split_files) == 1:
            #For one file only, use base class method (move/copy)
            return Text.merge(split_files, output_file)
        out = open(output_file, "w")
        try:
            old_header = None
            # Track position rather than comparing names, so a file name that
            # is repeated later in the list is not mistaken for the first.
            for index, f in enumerate(split_files):
                h = open(f)
                try:
                    header = h.readline()
                    if not header:
                        raise ValueError("BLAST XML file %s was empty" % f)
                    if header.strip() != '<?xml version="1.0"?>':
                        out.write(header) #for diagnosis
                        raise ValueError("%s is not an XML file!" % f)
                    line = h.readline()
                    header += line
                    if line.strip() not in BlastXml._doctype_lines:
                        out.write(header) #for diagnosis
                        raise ValueError("%s is not a BLAST XML file!" % f)
                    # Accumulate the header up to the first <Iteration> line.
                    while True:
                        line = h.readline()
                        if not line:
                            out.write(header) #for diagnosis
                            raise ValueError("BLAST XML file %s ended prematurely" % f)
                        header += line
                        if "<Iteration>" in line:
                            break
                        if len(header) > 10000:
                            #Something has gone wrong, don't load too much into memory!
                            #Write what we have to the merged file for diagnostics
                            out.write(header)
                            raise ValueError("BLAST XML file %s has too long a header!" % f)
                    if "<BlastOutput>" not in header:
                        raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
                    if index == 0:
                        out.write(header)
                        old_header = header
                    elif old_header[:300] != header[:300]:
                        #Enough to check <BlastOutput_program> and <BlastOutput_version> match
                        raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
                                         % (split_files[0], f, old_header[:300], header[:300]))
                    else:
                        # Header dropped for later files; re-emit only the
                        # <Iteration> opening tag that ended it.
                        out.write(" <Iteration>\n")
                    for line in h:
                        if "</BlastOutput_iterations>" in line:
                            break
                        #TODO - Increment <Iteration_iter-num> and if required automatic query names
                        #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                        out.write(line)
                finally:
                    h.close()
            out.write(" </BlastOutput_iterations>\n")
            out.write("</BlastOutput>\n")
        finally:
            out.close()
    merge = staticmethod(merge)
|
|
129
|
4
|
130
|
|
class _BlastDb(object):
    """Base class for BLAST database datatype."""

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text.

        Both are fixed strings: a generic description when the dataset is
        present, placeholder text when it has been purged. is_multi_byte is
        accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "BLAST database (multiple files)"
            dataset.blurb = "BLAST database (multiple files)"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Create HTML content, used for displaying peek.

        Returns dataset.peek, falling back to a generic description if
        reading the attribute fails.
        """
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback, but do not swallow KeyboardInterrupt or
            # SystemExit the way a bare except would.
            return "BLAST database (multiple files)"

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Apparently an old display method, but still gets called.

        This allows us to format the data shown in the central pane via the "eye" icon.
        """
        return "This is a BLAST database."

    def get_mime(self):
        """Returns the mime type of the datatype (pretend it is text for peek)"""
        return 'text/plain'

    def merge(split_files, output_file):
        """Merge BLAST databases (not implemented for now).

        Always raises NotImplementedError.
        """
        raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)")
    # The signature has no self, so wrap it explicitly; otherwise calling it
    # on an instance passes the instance as split_files and fails with a
    # TypeError instead of the intended NotImplementedError.
    merge = staticmethod(merge)

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a BLAST database (not implemented for now).

        Returns None when split_params is None (no split requested);
        otherwise raises NotImplementedError.
        """
        if split_params is None:
            return None
        raise NotImplementedError("Can't split BLAST databases")
    # First parameter is cls, so bind it as a classmethod; this keeps both
    # instance-level and class-level calls working.
    split = classmethod(split)
|
|
171
|
|
172
|
|
class BlastNucDb( _BlastDb, Data ):
    """Class for nucleotide BLAST database files."""
    file_ext = 'blastdbn'
    composite_type ='basic'
    MetadataElement( readonly=True, optional=True, visible=False, no_value=0 )

    def __init__(self,**kwd):
        """Initialise the datatype and register its component files.

        The .nhr/.nin/.nsq files are registered with default options; the
        remaining five are registered with optional=True.
        """
        Data.__init__(self, **kwd)
        # Registered in the same order as before: defaults first, then the
        # optional extras.
        for component in ('blastdb.nhr', 'blastdb.nin', 'blastdb.nsq'):
            self.add_composite_file(component)
        for extra in ('blastdb.nhd', 'blastdb.nsi', 'blastdb.nhi',
                      'blastdb.nog', 'blastdb.nsd'):
            self.add_composite_file(extra, optional=True)

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Return the fixed text shown for this datatype.

        Apparently an old display method, but still gets called; it supplies
        what appears in the central pane via the "eye" icon. All arguments
        are ignored.
        """
        message = "This is a BLAST nucleotide database."
        return message
|
|
197
|
|
class BlastProtDb( _BlastDb, Data ):
    """Class for protein BLAST database files."""
    file_ext = 'blastdbp'
    composite_type ='basic'
    MetadataElement( readonly=True, optional=True, visible=False, no_value=0 )

    def __init__(self,**kwd):
        """Initialise the datatype and register its component files.

        The .phr/.pin/.psq files are registered with default options; the
        remaining files are registered with optional=True.
        """
        Data.__init__(self, **kwd)
        self.add_composite_file('blastdb.phr')
        self.add_composite_file('blastdb.pin')
        self.add_composite_file('blastdb.psq')
        self.add_composite_file('blastdb.pnd', optional=True)
        self.add_composite_file('blastdb.pni', optional=True)
        self.add_composite_file('blastdb.psd', optional=True)
        self.add_composite_file('blastdb.psi', optional=True)
        # NOTE: 'blastdb.psq' used to be registered a second time here with
        # optional=True. That duplicate was dropped: it was redundant and
        # presumably overrode the non-optional registration above
        # (add_composite_file is defined elsewhere - confirm).
        self.add_composite_file('blastdb.phd', optional=True)
        self.add_composite_file('blastdb.phi', optional=True)
        self.add_composite_file('blastdb.pog', optional=True)

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Apparently an old display method, but still gets called.

        This allows us to format the data shown in the central pane via the "eye" icon.
        """
        return "This is a BLAST protein database."
|