2
|
1 """
|
|
2 Proteomics format classes
|
|
3 """
|
|
4 import logging
|
|
5 import re
|
|
6 from galaxy.datatypes.data import *
|
|
7 from galaxy.datatypes.xml import *
|
|
8 from galaxy.datatypes.sniff import *
|
|
9 from galaxy.datatypes.binary import *
|
|
10
|
|
11 log = logging.getLogger(__name__)
|
|
12
|
|
13
|
|
class Xls(Binary):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        ``is_multi_byte`` is not used here; it is kept so the signature
        matches the ``set_peek`` methods of the other datatypes in this file.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return ``dataset.peek``, falling back to a size-based description.

        The fallback is best-effort: any ordinary error while reading the
        peek yields the generic description instead of propagating.
        """
        try:
            return dataset.peek
        except Exception:
            # Narrower than a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return "Binary xls file (%s)" % (data.nice_size(dataset.get_size()))
|
|
30
|
|
class PepXml(GenericXml):
    """pepXML data"""
    file_ext = "pepxml"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'pepXML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is pepXML

        Returns True when one of the first three lines (after stripping
        surrounding whitespace) contains the text ``pepXML"``, otherwise
        False.
        """
        xmlns_re = re.compile(r'.*pepXML"')
        # The context manager closes the handle on every exit path,
        # including when a read raises.
        with open(filename) as handle:
            for _ in range(3):
                if xmlns_re.match(handle.readline().strip()):
                    return True
        return False
|
|
58
|
|
class MzML(GenericXml):
    """mzML data"""
    file_ext = "mzml"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mzML Mass Spectrometry data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is mzML

        Returns True when one of the first three lines, stripped of
        surrounding whitespace, starts with ``<mzML``; otherwise False.
        """
        xmlns_re = re.compile(r"^<mzML")
        # The context manager closes the handle on every exit path,
        # including when a read raises.
        with open(filename) as handle:
            for _ in range(3):
                if xmlns_re.match(handle.readline().strip()):
                    return True
        return False
|
|
83
|
|
84
|
|
class ProtXML(Text):
    """protXML data"""
    file_ext = "protxml"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'prot XML Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is protXML

        Each of the leading lines must contain the corresponding entry of
        ``protxml_header``. True is returned on reaching the first line
        after the header, so the file needs at least one line beyond it.
        Returns False on the first mismatch or when the file is too short.
        """
        protxml_header = ['<?xml version="1.0" encoding="ISO-8859-1"?>',
                          'xmlns="http://regis-web.systemsbiology.net/protXML"']

        with open(filename) as handle:
            for i, line in enumerate(handle):
                # The original compared against an undefined name
                # (``pepxml_header``), which raised NameError here.
                if i >= len(protxml_header):
                    return True
                if protxml_header[i] not in line.rstrip('\n\r'):
                    return False
        # Ran out of lines before getting past the header.
        return False
|
|
107
|
|
108
|
|
109
|
|
class MzXML(Text):
    """mzXML data"""
    file_ext = "mzXML"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mzXML Mass Spectrometry data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is mzXML

        The leading lines (line endings removed) must equal the entries of
        ``mzxml_header`` exactly. True is returned on reaching the first
        line after the header, so the file needs at least one line beyond
        it. Returns False on the first mismatch or when the file is too
        short.
        """
        mzxml_header = ['<?xml version="1.0" encoding="ISO-8859-1"?>',
                        '<mzXML xmlns="http://sashimi.sourceforge.net/schema_revision/mzXML_2.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/mzXML_2.1 http://sashimi.sourceforge.net/schema_revision/mzXML_2.1/mzXML_idx_2.1.xsd">']
        # ``with`` guarantees the handle is closed; the original left it to
        # the garbage collector.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if i >= len(mzxml_header):
                    return True
                if line.rstrip('\n\r') != mzxml_header[i]:
                    return False
        # Ran out of lines before getting past the header.
        return False
|
|
131
|
|
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is MGF

        Returns True when a line near the top of the file (line endings
        removed) is exactly ``BEGIN IONS``. Scanning stops, returning
        False, once the zero-based line index exceeds ``max_lines`` or the
        file ends.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # ``with`` guarantees the handle is closed; the original left it to
        # the garbage collector.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # End of file reached without finding the marker.
        return False
|
|
156
|
|
157
|
|
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is a Mascot .dat result file

        Returns True when a line near the top of the file (line endings
        removed) is exactly the Mascot MIME-Version line below. Scanning
        stops, returning False, once the zero-based line index exceeds
        ``max_lines`` or the file ends.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # ``with`` guarantees the handle is closed; the original left it to
        # the garbage collector.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                if i > max_lines:
                    return False
        # End of file reached without finding the marker.
        return False
|
|
182
|
|
183
|
|
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Return True when the first 20 bytes contain the Finnigan marker.

        Any ordinary error while opening or reading the file is treated as
        "not a RAW file" and yields False.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode avoids newline translation of the header, and
            # ``with`` closes the handle on every path.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except Exception:
            return False
        # Compare raw bytes directly; searching a hex string could match at
        # an odd nibble offset that is not a real byte boundary.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        ``is_multi_byte`` is not used here; it is kept so the signature
        matches the ``set_peek`` methods of the other datatypes in this file.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return ``dataset.peek``, falling back to a size-based description."""
        try:
            return dataset.peek
        except Exception:
            # Narrower than a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
|
|
213
|
|
214
|
|
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as handle:
            # The second line is only consumed when the first one matched.
            if not Msp.next_line_starts_with(handle, "Name:"):
                return False
            return Msp.next_line_starts_with(handle, "MW:")
|
|
236
|
|
class Ms2(Text):
    """ms2 data"""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of header lines (those starting with ``H`` and a
        tab) must include a line for each of CreationDate, Extractor,
        ExtractorVersion and ExtractorOptions.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file ends at EOF. The original ``while True``
            # loop did nothing on an empty read, so it never terminated for
            # an empty file or a file made only of header lines.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        # Prefix match, as before: the 'Extractor' check is also satisfied
        # by an 'ExtractorVersion' or 'ExtractorOptions' line.
        required_fields = ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']
        for header_field in required_fields:
            wanted = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted) for header_line in header_lines):
                return False

        return True
|
|
271
|
|
272 # unsniffable binary format, should do something about this
|
|
273 class XHunterAslFormat(Binary):
|
|
274 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
|
|
275 file_ext = "hlf"
|