comparison proteomics.py @ 0:c10a62c886b8

Uploaded
author iracooke
date Sun, 06 Jan 2013 19:07:22 -0500
parents
children 09b89b345de2
comparison
equal deleted inserted replaced
-1:000000000000 0:c10a62c886b8
1 """
2 Proteomics format classes
3 """
4 import logging
5 import re
6 from galaxy.datatypes.data import *
7 from galaxy.datatypes.xml import *
8 from galaxy.datatypes.sniff import *
9 from galaxy.datatypes.binary import *
10
11 log = logging.getLogger(__name__)
12
13
class Xls( Binary ):
    """Datatype for binary Excel spreadsheet (``xls``) files."""
    file_ext = "xls"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Store a fixed peek text and a size-based blurb on ``dataset``.

        A purged dataset gets placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = "Excel Spreadsheet file"
        dataset.blurb = data.nice_size( dataset.get_size() )

    def display_peek( self, dataset ):
        """Return the dataset's peek, or a size description if reading it fails."""
        try:
            shown = dataset.peek
        except:
            # Any failure while reading the peek falls back to a generic text.
            size_text = data.nice_size( dataset.get_size() )
            shown = "Binary xls file (%s)" % size_text
        return shown
30
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    This class reads ``self.root`` (a regular-expression fragment for the
    root element name) and ``self.blurb`` (the text stored by ``set_peek``)
    but defines neither; the subclasses in this module supply them.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Lines starting with ``<?`` (XML declaration / processing
        instructions) and blank lines are skipped; the first remaining line
        must open with ``<root`` or ``<ns:root`` for any namespace prefix.

        Returns True on a match, False otherwise (including for a file with
        no such line).
        """
        # Raw string so that ``\w`` reaches the regex engine untouched.
        pattern = r'^<(\w*:)?%s' % self.root
        with open(filename, 'r') as contents:
            for line in contents:
                # A blank line can never hold the root element, so skipping
                # it only avoids rejecting files with spacing after the prolog.
                if line.startswith('<?') or not line.strip():
                    continue
                return re.match(pattern, line) is not None
        # Reached end of file without seeing a candidate root line.
        return False

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
53
class PepXml(ProteomicsXml):
    """pepXML datatype; sniffed by the ``msms_pipeline_analysis`` root element."""
    root = 'msms_pipeline_analysis'
    blurb = "pepXML data"
    file_ext = 'pepxml'
59
60
class MzML(ProteomicsXml):
    """mzML datatype; the root pattern accepts ``mzML`` or ``indexedmzML``."""
    root = '(mzML|indexedmzML)'
    blurb = "mzML Mass Spectrometry data"
    file_ext = 'mzml'
66
67
class ProtXML(ProteomicsXml):
    """protXML datatype; sniffed by the ``protein_summary`` root element."""
    root = 'protein_summary'
    blurb = "prot XML Search Results"
    file_ext = 'protxml'
73
74
class MzXML(ProteomicsXml):
    """mzXML datatype; sniffed by the ``mzXML`` root element."""
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
    file_ext = 'mzXML'
80
81 ## PSI datatypes
class MzIdentML(ProteomicsXml):
    """mzIdentML datatype (``mzid``); sniffed by the ``MzIdentML`` root element."""
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
86
87
class TraML(ProteomicsXml):
    """TraML datatype; sniffed by the ``TraML`` root element."""
    root = 'TraML'
    blurb = 'TraML transition list'
    file_ext = 'traML'
92
93
class MzQuantML(ProteomicsXml):
    """mzQuantML datatype (``mzq``); sniffed by the ``MzQuantML`` root element."""
    root = 'MzQuantML'
    blurb = 'XML quantification data'
    file_ext = 'mzq'
98
99
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a ``BEGIN IONS`` line appears near the top of the file.

        Lines are compared after stripping trailing newline characters.
        Scanning stops with False once the zero-based line index exceeds
        ``max_lines``; a file that ends earlier without a match also
        yields False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # ``with`` guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # Short file without the marker: answer explicitly instead of None.
        return False
124
125
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if the Mascot MIME-Version line appears near the top of the file.

        Lines are compared, after stripping trailing newline characters,
        against the exact MIME-Version header text below. Scanning stops
        with False once the zero-based line index exceeds ``max_lines``; a
        file that ends earlier without a match also yields False.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # ``with`` guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    return False
        # Short file without the marker: answer explicitly instead of None.
        return False
150
151
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True if the file header contains the Finnigan signature.

        Returns False when the signature is absent or the file cannot be
        read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        finnigan = b'F\0i\0n\0n\0i\0g\0a\0n'
        try:
            # Binary mode: the header is raw bytes and must not be decoded
            # or newline-translated.
            with open( filename, 'rb' ) as handle:
                header = handle.read( 20 )
        except ( IOError, OSError ):
            return False
        # Searching the bytes directly (rather than a hex dump of them)
        # cannot produce a match that straddles half-byte boundaries.
        return finnigan in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Store a fixed peek text and a size-based blurb on ``dataset``."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the dataset's peek, or a size description if reading it fails."""
        try:
            return dataset.peek
        except:
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )
181
182
# Register RAW for binary sniffing only when this Binary class offers the
# registration hook.
_register_sniffable = getattr(Binary, 'register_sniffable_binary_format', None)
if _register_sniffable is not None:
    _register_sniffable('RAW', 'RAW', RAW)
185
186
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it starts with ``prefix``.

        Consumes exactly one line via ``contents.readline()``. An empty
        string (end of file on a regular file object) gives False.
        """
        next_line = contents.readline()
        # Identity test is the idiomatic None check; it guards objects whose
        # readline() may hand back None.
        return next_line is not None and next_line.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:")
208
class Ms2(Text):
    """Datatype for ``ms2`` files, recognised by their ``H``-prefixed header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the leading run of lines starting with ``H<TAB>`` and
        requires that, for each of CreationDate, Extractor, ExtractorVersion
        and ExtractorOptions, at least one collected line starts with
        ``H<TAB><field>``. An empty file yields False.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file ends at EOF, so an empty or header-only
            # file terminates instead of spinning on readline() == ''.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
243
244 # unsniffable binary format, should do something about this
class XHunterAslFormat(Binary):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """
    file_ext = 'hlf'
248
249
# Register the hlf extension as unsniffable only when this Binary class
# offers the registration hook.
_register_unsniffable = getattr(Binary, 'register_unsniffable_binary_ext', None)
if _register_unsniffable is not None:
    _register_unsniffable('hlf')