9
|
1 """
|
|
2 Proteomics format classes
|
|
3 """
|
|
4 import logging
|
|
5 import re
|
|
6 import binascii
|
|
7
|
|
8 from galaxy.datatypes.sniff import *
|
|
9 from galaxy.datatypes import data
|
|
10 from galaxy.datatypes.data import Text
|
|
11 from galaxy.datatypes.xml import GenericXml
|
|
12 from galaxy.datatypes.binary import Binary
|
|
13 from galaxy.datatypes.tabular import Tabular
|
|
14 from galaxy.datatypes.interval import Gff
|
|
15
|
|
16 log = logging.getLogger(__name__)
|
|
17
|
|
18
|
|
class Wiff(Binary):
    """Class for wiff files.

    Composite datatype that registers a ``wiff`` member and a ``wiff_scan``
    member flagged optional, and renders an HTML primary file listing them.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its composite members."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True)
        # NOTE(review): ``optional`` is passed as the string 'True' rather
        # than a boolean; it is truthy either way and is kept unchanged here
        # because the consumers of that flag are not visible in this module.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional='True', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Build the HTML index page for the composite dataset.

        :param dataset: dataset forwarded to ``get_composite_files``.
        :returns: newline-joined HTML with one list item per composite
            member, linking to it by composite name and showing its
            description (when present) and an "(optional)" marker.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (fn, fn, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (fn, fn, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)
|
|
48
|
|
49
|
|
50
|
|
# Register 'wiff' as an unsniffable binary extension, but only when Binary
# actually provides the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
|
|
53
|
|
54
|
|
class IdpDB(Binary):
    """Binary datatype for files with the ``idpDB`` extension."""
    file_ext = 'idpDB'
|
|
57
|
|
# Register 'idpDB' as an unsniffable binary extension, but only when Binary
# actually provides the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
|
|
60
|
|
61
|
|
class PepXmlReport(Tabular):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            # Header previously misspelled as 'Interprophet Probabaility'.
            'Interprophet Probability',
        ]

    def display_peek(self, dataset):
        """Return the formatted HTML peek table, labelled with ``column_names``."""
        return Tabular.make_html_table(self, dataset, column_names=self.column_names)
|
|
73
|
|
74
|
|
class ProtXmlReport(Tabular):
    """Tabular report produced from a protxml file."""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek(self, dataset):
        """Return the HTML peek table, labelled with ``column_names``."""
        names = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=names)
|
|
87
|
|
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regular-expression fragment naming the
    expected root element) and ``set_peek`` reads ``self.blurb``; the
    subclasses in this module define both.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Skips leading lines that start with ``<?`` and then checks that the
        next line opens with ``<root`` or ``<ns:root``.

        :param filename: path of the file to inspect.
        :returns: True when that line matches ``self.root``, else False.
        """
        with open(filename, 'r') as contents:
            while True:
                line = contents.readline()
                # readline() returns '' at end of file, which also fails the
                # startswith test, so the loop always terminates.
                if not line.startswith('<?'):
                    break
        # Raw string so that \w is a regex class rather than an invalid
        # string escape; matches <root or <ns:root for any ns string.
        pattern = r'^<(\w*:)?%s' % self.root
        return re.match(pattern, line) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
|
|
110
|
|
111
|
|
class PepXml(ProteomicsXml):
    """pepXML data, recognised by the ``msms_pipeline_analysis`` root element."""
    root = 'msms_pipeline_analysis'
    file_ext = 'pepxml'
    blurb = "pepXML data"
|
|
117
|
|
118
|
|
class MzML(ProteomicsXml):
    """mzML data; the root pattern accepts ``mzML`` or ``indexedmzML``."""
    root = '(mzML|indexedmzML)'
    file_ext = 'mzml'
    blurb = "mzML Mass Spectrometry data"
|
|
124
|
|
125
|
|
class ProtXML(ProteomicsXml):
    """protXML data, recognised by the ``protein_summary`` root element."""
    root = 'protein_summary'
    file_ext = 'protxml'
    blurb = "prot XML Search Results"
|
|
131
|
|
132
|
|
class MzXML(ProteomicsXml):
    """mzXML data, recognised by the ``mzXML`` root element."""
    root = 'mzXML'
    file_ext = 'mzxml'
    blurb = 'mzXML Mass Spectrometry data'
|
|
138
|
|
139 ## PSI datatypes
|
|
class MzIdentML(ProteomicsXml):
    """mzIdentML data, recognised by the ``MzIdentML`` root element."""
    root = 'MzIdentML'
    file_ext = 'mzid'
    blurb = 'XML identified peptides and proteins.'
|
|
144
|
|
145
|
|
class TraML(ProteomicsXml):
    """TraML transition list, recognised by the ``TraML`` root element."""
    root = 'TraML'
    file_ext = 'traml'
    blurb = 'TraML transition list'
|
|
150
|
|
151
|
|
class MzQuantML(ProteomicsXml):
    """mzQuantML quantification data, recognised by the ``MzQuantML`` root element."""
    root = 'MzQuantML'
    file_ext = 'mzq'
    blurb = 'XML quantification data'
|
|
156
|
|
157
|
|
class ConsensusXML(ProteomicsXml):
    """OpenMS consensusXML file, recognised by the ``consensusXML`` root element."""
    root = 'consensusXML'
    file_ext = 'consensusxml'
    blurb = 'OpenMS multiple LC-MS map alignment file'
|
|
162
|
|
163
|
|
class FeatureXML(ProteomicsXml):
    """OpenMS feature file, recognised by the ``featureMap`` root element."""
    root = 'featureMap'
    file_ext = 'featurexml'
    blurb = 'OpenMS feature file'
|
|
168
|
|
169
|
|
class IdXML(ProteomicsXml):
    """OpenMS identification file, recognised by the ``IdXML`` root element."""
    root = 'IdXML'
    file_ext = 'idxml'
    blurb = 'OpenMS identification file'
|
|
174
|
10
|
class TandemXML(ProteomicsXml):
    """X!Tandem search results.

    X!Tandem writes its output as BIOML, so the document root element is
    ``<bioml ...>``. The previous value ``TandemXML`` is not an element
    X!Tandem emits, which meant sniffing could never recognise a real
    results file.
    """
    file_ext = "tandem"
    blurb = "X!Tandem search results file"
    root = "bioml"
|
9
|
179
|
|
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Report whether a ``BEGIN IONS`` line appears near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True if a line equal to ``BEGIN IONS`` (ignoring the
            trailing newline) is found within the scanned window, else False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() in a with-block replaces the Python-2-only file() builtin
        # and guarantees the handle is closed.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                # Same cut-off as before: the line at index max_lines + 1 is
                # still compared before scanning stops.
                if i > max_lines:
                    break
        # Explicit False also for files shorter than the window, where the
        # loop used to fall through and return None.
        return False
|
|
203
|
|
204
|
|
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Report whether the Mascot MIME-Version line appears near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True if a line exactly equal to the expected MIME-Version
            header (ignoring the trailing newline) is found within the
            scanned window, else False.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() in a with-block replaces the Python-2-only file() builtin
        # and guarantees the handle is closed.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                # Same cut-off as before: the line at index max_lines + 1 is
                # still compared before scanning stops.
                if i > max_lines:
                    break
        # Explicit False also for files shorter than the window, where the
        # loop used to fall through and return None.
        return False
|
|
229
|
|
230
|
|
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Report whether the file header carries the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True if the first 20 bytes contain ``F\\0i\\0n\\0n\\0i\\0g\\0a\\0n``;
            False otherwise, including when the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is not text, and text mode would try to
            # decode it on Python 3.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except Exception:
            # Sniffing stays best-effort: an unreadable file is simply not RAW.
            return False
        # Compare raw bytes directly; searching a hex dump could also match
        # on a half-byte boundary.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and use the dataset size as blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a size-based description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
|
|
260
|
|
261
|
|
# Register the RAW class as a sniffable binary format under 'raw', but only
# when Binary actually provides the registration hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
|
|
264
|
|
265
|
|
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and tell whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as handle:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(handle, "Name:"):
                return False
            return Msp.next_line_starts_with(handle, "MW:")
|
|
287
|
|
class Ms2(Text):
    """Text datatype for files with the ``ms2`` extension."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the leading run of lines starting with ``H<TAB>`` and
        requires that, for each of CreationDate, Extractor, ExtractorVersion
        and ExtractorOptions, some header line starts with ``H<TAB><field>``.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at end of file. The previous
            # readline() loop did nothing on the empty string returned at
            # EOF, so an empty or header-only file made it spin forever.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            # Prefix match, as before: 'Extractor' is also satisfied by an
            # 'ExtractorVersion' or 'ExtractorOptions' line.
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
|
|
322
|
|
323 # unsniffable binary format, should do something about this
|
|
class XHunterAslFormat(Binary):
    """Annotated Spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """
    file_ext = 'hlf'
|
|
327
|
|
# Register 'hlf' as an unsniffable binary extension, but only when Binary
# actually provides the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')
|