Mercurial > repos > iracooke > proteomics_datatypes
comparison proteomics.py @ 0:c10a62c886b8
Uploaded
author | iracooke |
---|---|
date | Sun, 06 Jan 2013 19:07:22 -0500 |
parents | |
children | 09b89b345de2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c10a62c886b8 |
---|---|
1 """ | |
2 Proteomics format classes | |
3 """ | |
4 import logging | |
5 import re | |
6 from galaxy.datatypes.data import * | |
7 from galaxy.datatypes.xml import * | |
8 from galaxy.datatypes.sniff import * | |
9 from galaxy.datatypes.binary import * | |
10 | |
11 log = logging.getLogger(__name__) | |
12 | |
13 | |
class Xls( Binary ):
    """Class describing a binary excel spreadsheet file.

    This class only supplies peek/blurb text; it defines no ``sniff`` of its
    own.
    """
    file_ext = "xls"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek string and a human-readable size as the blurb.

        For a purged dataset both fields are set to "missing file" notices
        instead.  ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a size description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed by the fallback.
            return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )
30 | |
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` and ``set_peek`` reads ``self.blurb``;
    neither attribute is defined on this class, so subclasses must supply
    them.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` (XML declaration / processing
        instructions) are skipped.  The first other line is then matched
        against ``<root`` or ``<ns:root`` where ``root`` is the regex
        fragment in ``self.root``.  An empty file yields False.
        """
        with open(filename, 'r') as contents:
            line = contents.readline()
            # readline() returns '' at EOF, which also ends this loop.
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string so ``\w`` is a regex class, not a string escape.
        pattern = r'^<(\w*:)?%s' % self.root  # pattern match <root or <ns:root for any ns string
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text.

        The peek comes from ``data.get_file_peek`` and the blurb from the
        class-level ``blurb``; purged datasets get "missing file" notices.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
53 | |
class PepXml(ProteomicsXml):
    """pepXML datatype, identified by its root element name."""
    # Root element name matched by the inherited sniff().
    root = "msms_pipeline_analysis"
    # Text used as the dataset blurb by the inherited set_peek().
    blurb = "pepXML data"
    file_ext = "pepxml"
59 | |
60 | |
class MzML(ProteomicsXml):
    """mzML datatype; the root element may be plain or indexed."""
    # Regex alternation: either root element name is accepted when sniffing.
    root = "(mzML|indexedmzML)"
    # Text used as the dataset blurb by the inherited set_peek().
    blurb = "mzML Mass Spectrometry data"
    file_ext = "mzml"
66 | |
67 | |
class ProtXML(ProteomicsXml):
    """protXML datatype, identified by its root element name."""
    # Root element name matched by the inherited sniff().
    root = "protein_summary"
    # Text used as the dataset blurb by the inherited set_peek().
    blurb = "prot XML Search Results"
    file_ext = "protxml"
73 | |
74 | |
class MzXML(ProteomicsXml):
    """mzXML datatype, identified by its root element name."""
    # Root element name matched by the inherited sniff().
    root = 'mzXML'
    # Text used as the dataset blurb by the inherited set_peek().
    blurb = 'mzXML Mass Spectrometry data'
    # Note the mixed-case extension, unlike the lowercase "mzml"/"pepxml".
    file_ext = 'mzXML'
80 | |
81 ## PSI datatypes | |
class MzIdentML(ProteomicsXml):
    """mzIdentML datatype (extension ``mzid``), identified by its root element."""
    # Root element name matched by the inherited sniff().
    root = 'MzIdentML'
    # Text used as the dataset blurb by the inherited set_peek().
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
86 | |
87 | |
class TraML(ProteomicsXml):
    """TraML datatype, identified by its root element name."""
    # Root element name matched by the inherited sniff().
    root = 'TraML'
    # Text used as the dataset blurb by the inherited set_peek().
    blurb = 'TraML transition list'
    file_ext = 'traML'
92 | |
93 | |
class MzQuantML(ProteomicsXml):
    """mzQuantML datatype (extension ``mzq``), identified by its root element."""
    # Root element name matched by the inherited sniff().
    root = 'MzQuantML'
    # Text used as the dataset blurb by the inherited set_peek().
    blurb = 'XML quantification data'
    file_ext = 'mzq'
98 | |
99 | |
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text.

        The peek comes from ``data.get_file_peek``; purged datasets get
        "missing file" notices instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a line equal to ``BEGIN IONS`` appears near the top.

        Lines are compared after stripping trailing newline characters.
        Scanning gives up once the zero-based line index exceeds
        ``max_lines``; a file that ends earlier without a match is False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # ``open`` in a ``with`` replaces the Python 2-only ``file()`` builtin
        # and guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # Short file with no marker: explicit False rather than implicit None.
        return False
124 | |
125 | |
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text.

        The peek comes from ``data.get_file_peek``; purged datasets get
        "missing file" notices instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if the Mascot MIME-Version line appears near the top.

        Lines are compared (after stripping trailing newline characters)
        against one exact string.  Scanning gives up once the zero-based
        line index exceeds ``max_lines``; a file that ends earlier without
        a match is False.
        """
        # NOTE(review): exact match pins "Mascot version 1.0" -- confirm this
        # is the only header variant that should be recognised.
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # ``open`` in a ``with`` replaces the Python 2-only ``file()`` builtin
        # and guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    return False
        # Short file with no marker: explicit False rather than implicit None.
        return False
150 | |
151 | |
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True if the first 20 bytes contain the "Finnigan" signature.

        Returns False when the signature is absent or the file cannot be
        read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        finnigan = b'F\0i\0n\0n\0i\0g\0a\0n'
        try:
            # Binary mode: the header is raw bytes, not text.
            with open( filename, 'rb' ) as handle:
                header = handle.read( 20 )
        except (IOError, OSError):
            return False
        # Searching the bytes directly (instead of their hex dump) cannot
        # match at a half-byte offset and needs no ``binascii``.
        return finnigan in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek string and a human-readable size as the blurb.

        For a purged dataset both fields are set to "missing file" notices
        instead.  ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a size description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed by the fallback.
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )
181 | |
182 | |
# Only call the registration hook if this Binary class provides it.
_register_sniffable = getattr(Binary, 'register_sniffable_binary_format', None)
if _register_sniffable is not None:
    _register_sniffable('RAW', 'RAW', RAW)
185 | |
186 | |
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and test it against ``prefix``.

        ``contents`` must provide ``readline()``.  Returns False at end of
        file, where ``readline()`` yields an empty string.
        """
        next_line = contents.readline()
        return next_line is not None and next_line.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        expected_prefixes = ("Name:", "MW:")
        with open(filename, 'r') as contents:
            # all() stops at the first failing line, like the original ``and``.
            return all(Msp.next_line_starts_with(contents, prefix) for prefix in expected_prefixes)
208 | |
class Ms2(Text):
    """ms2 file datatype, recognised by its leading ``H`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        every required header field must prefix at least one of them.  An
        empty file, or one with no header lines, yields False.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = ('CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions')

        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at EOF.  The previous readline() loop
            # spun forever there, because '' was treated as "keep reading".
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        # Prefix match, so 'Extractor' is also satisfied by an
        # 'ExtractorVersion' line -- same as the original check.
        return all(
            any(header_line.startswith('H\t%s' % field) for header_line in header_lines)
            for field in required_fields
        )
243 | |
244 # unsniffable binary format, should do something about this | |
class XHunterAslFormat(Binary):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """
    # No sniff() is defined on this class.
    file_ext = 'hlf'
248 | |
249 | |
# Only call the registration hook if this Binary class provides it.
_register_unsniffable = getattr(Binary, 'register_unsniffable_binary_ext', None)
if _register_unsniffable is not None:
    _register_unsniffable('hlf')