0
|
1 """
|
|
2 Proteomics format classes
|
|
3 """
|
|
4 import logging
|
|
5 import re
|
5
|
6 import binascii
|
|
7
|
0
|
8 from galaxy.datatypes.sniff import *
|
7
|
9 from galaxy.datatypes import data
|
5
|
10 from galaxy.datatypes.data import Text
|
|
11 from galaxy.datatypes.xml import GenericXml
|
|
12 from galaxy.datatypes.binary import Binary
|
|
13 from galaxy.datatypes.tabular import Tabular
|
|
14 from galaxy.datatypes.interval import Gff
|
0
|
15
|
|
16 log = logging.getLogger(__name__)
|
|
17
|
4
|
class ProtGff( Gff ):
    """Tab delimited data in Gff format"""
    file_ext = "prot_gff"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        When the underlying dataset is not purged, the peek is obtained from
        ``data.get_file_peek`` on ``dataset.file_name``; otherwise fixed
        placeholder strings are stored.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'Proteogenomics GFF'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True when one of the first three lines of ``filename``
        (after stripping surrounding whitespace) starts with ``##gff-version``.
        """
        gff_version_re = re.compile("^##gff-version")
        # The context manager closes the handle even if reading raises.
        with open(filename) as handle:
            for _ in range(3):
                if gff_version_re.match(handle.readline().strip()):
                    return True
        return False
|
|
41
|
0
|
42
|
|
class Xls( Binary ):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        A non-purged dataset gets a fixed peek string and a blurb holding
        the size formatted by ``data.nice_size``; a purged one gets
        placeholder strings. ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return ``dataset.peek``, falling back to a generic description
        with the file size if reading the peek raises.
        """
        try:
            return dataset.peek
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit the way a bare ``except:`` does.
            return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )
|
|
59
|
5
|
class IdpDB( Binary ):
    """Binary datatype using the ``idpDB`` file extension."""
    file_ext = "idpDB"


# Register the extension as unsniffable only when Binary exposes that hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext(IdpDB.file_ext)
|
|
65
|
|
66
|
|
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the Tabular base and set the report's column headers."""
        Tabular.__init__( self, **kwd )
        # The last header was previously misspelled "Interprophet Probabaility".
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
|
|
78
|
|
79
|
|
class ProtXmlReport( Tabular ):
    """Tabular report obtained by converting protxml."""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the Tabular base, then store the column headers."""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]

    def display_peek( self, dataset ):
        """Return the output of ``Tabular.make_html_table`` for ``dataset``,
        passing this report's column names as headers.
        """
        headers = self.column_names
        return Tabular.make_html_table( self, dataset, column_names=headers )
|
|
92
|
|
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regex fragment naming the root element)
    and ``set_peek`` reads ``self.blurb``; neither attribute is defined on
    this class, so subclasses are expected to provide them.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` are skipped; the first other
        line must start with ``<root`` or ``<ns:root`` where ``root`` is
        ``self.root``. Returns a bool.
        """
        with open(filename, 'r') as contents:
            while True:
                line = contents.readline()
                # readline() returns '' at EOF (never None); '' fails the
                # startswith test too, so the loop always terminates.
                if not line.startswith('<?'):
                    break
        # pattern match <root or <ns:root for any ns string; raw string so
        # that \w is not treated as a string escape.
        pattern = r'^<(\w*:)?%s' % self.root
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        A non-purged dataset gets its peek from ``data.get_file_peek`` and
        its blurb from ``self.blurb``; a purged one gets placeholder strings.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
|
|
115
|
5
|
116
|
0
|
class PepXml(ProteomicsXml):
    """pepXML datatype; the expected root element is ``msms_pipeline_analysis``."""
    root = "msms_pipeline_analysis"
    blurb = "pepXML data"
    file_ext = "pepxml"
|
5
|
122
|
0
|
123
|
|
class MzML(ProteomicsXml):
    """mzML datatype; the root pattern accepts ``mzML`` or ``indexedmzML``."""
    root = "(mzML|indexedmzML)"
    blurb = "mzML Mass Spectrometry data"
    file_ext = "mzml"
|
|
129
|
|
130
|
|
class ProtXML(ProteomicsXml):
    """protXML datatype; the expected root element is ``protein_summary``."""
    root = "protein_summary"
    blurb = "prot XML Search Results"
    file_ext = "protxml"
|
|
136
|
|
137
|
|
class MzXML(ProteomicsXml):
    """mzXML datatype; the expected root element is ``mzXML``."""
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
    file_ext = 'mzxml'
|
|
143
|
|
144 ## PSI datatypes
|
|
class MzIdentML(ProteomicsXml):
    """mzid datatype; the expected root element is ``MzIdentML``."""
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
|
5
|
149
|
0
|
150
|
|
class TraML(ProteomicsXml):
    """traml datatype; the expected root element is ``TraML``."""
    root = 'TraML'
    blurb = 'TraML transition list'
    file_ext = 'traml'
|
|
155
|
|
156
|
|
class MzQuantML(ProteomicsXml):
    """mzq datatype; the expected root element is ``MzQuantML``."""
    root = 'MzQuantML'
    blurb = 'XML quantification data'
    file_ext = 'mzq'
|
|
161
|
5
|
162
|
|
class ConsensusXML(ProteomicsXml):
    """consensusxml datatype; the expected root element is ``consensusXML``."""
    root = 'consensusXML'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    file_ext = 'consensusxml'
|
|
167
|
|
168
|
|
class FeatureXML(ProteomicsXml):
    """featurexml datatype; the expected root element is ``featureMap``."""
    root = 'featureMap'
    blurb = 'OpenMS feature file'
    file_ext = 'featurexml'
|
|
173
|
|
174
|
|
class IdXML(ProteomicsXml):
    """idxml datatype; the expected root element is ``IdXML``."""
    root = 'IdXML'
    blurb = 'OpenMS identification file'
    file_ext = 'idxml'
|
|
179
|
|
180
|
0
|
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        A non-purged dataset gets its peek from ``data.get_file_peek``;
        a purged one gets placeholder strings.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True when a line near the top of ``filename`` is exactly
        ``BEGIN IONS`` (ignoring the line terminator), else False.

        Scanning stops once the zero-based line index exceeds ``max_lines``.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    break
        # Explicit False also for files that end before the line limit
        # (previously an implicit None).
        return False
|
5
|
204
|
|
205
|
0
|
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        A non-purged dataset gets its peek from ``data.get_file_peek``;
        a purged one gets placeholder strings.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True when a line near the top of ``filename`` is exactly
        the Mascot MIME-Version header (ignoring the line terminator),
        else False.

        Scanning stops once the zero-based line index exceeds ``max_lines``.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    break
        # Explicit False also for files that end before the line limit
        # (previously an implicit None).
        return False
|
|
230
|
|
231
|
|
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True when the first 20 bytes of ``filename`` contain the
        UTF-16LE-style ``Finnigan`` marker, else False.

        Any error while opening or reading the file yields False.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, not text.
            with open( filename, 'rb' ) as handle:
                header = handle.read(20)
        except Exception:
            # Sniffing is best-effort: an unreadable file is simply "not RAW".
            return False
        # Searching the bytes directly avoids the half-byte-offset matches
        # that a search over the hex string could produce.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        A non-purged dataset gets a fixed peek string and a blurb holding
        the size formatted by ``data.nice_size``; a purged one gets
        placeholder strings. ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return ``dataset.peek``, falling back to a generic description
        with the file size if reading the peek raises.
        """
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )


# Register the sniffer only when Binary exposes the registration hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
|
0
|
265
|
|
266
|
5
|
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins
        with ``prefix``.
        """
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
|
|
288
|
|
class Ms2(Text):
    """ms2 datatype, recognised by its leading ``H<TAB>`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        the file is accepted only when a collected line starts with each of
        the required header fields.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at EOF. The previous readline() loop
            # never left the loop on '' and so hung on empty or header-only
            # files.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        # NOTE: this is a prefix test, so an "ExtractorVersion" line also
        # satisfies the "Extractor" requirement (behaviour unchanged).
        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            wanted = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted) for header_line in header_lines):
                return False

        return True
|
|
323
|
|
324 # unsniffable binary format, should do something about this
|
5
|
class XHunterAslFormat( Binary ):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = "hlf"


# Register the extension as unsniffable only when Binary exposes that hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext(XHunterAslFormat.file_ext)
|