Mercurial > repos > iracooke > protk
comparison lib/galaxy/datatypes/proteomics.py @ 2:418f42b34049 draft
Reuploading
author | iracooke |
---|---|
date | Mon, 23 Jul 2012 00:20:58 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:deaedec14cc8 | 2:418f42b34049 |
---|---|
1 """ | |
2 Proteomics format classes | |
3 """ | |
4 import logging | |
5 import re | |
6 from galaxy.datatypes.data import * | |
7 from galaxy.datatypes.xml import * | |
8 from galaxy.datatypes.sniff import * | |
9 from galaxy.datatypes.binary import * | |
10 | |
11 log = logging.getLogger(__name__) | |
12 | |
13 | |
class Xls(Binary):
    """Datatype for binary Excel spreadsheet files."""
    file_ext = "xls"

    def set_peek(self, dataset, is_multi_byte=False):
        """Populate the peek and blurb text of *dataset*.

        Purged datasets get placeholder text. Otherwise the peek is a fixed
        label and the blurb is the result of ``data.nice_size`` applied to
        the dataset size. ``is_multi_byte`` is accepted but not used here.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = "Excel Spreadsheet file"
        dataset.blurb = data.nice_size(dataset.get_size())

    def display_peek(self, dataset):
        """Return the dataset's peek, or a size-based label if reading it fails."""
        try:
            peek_text = dataset.peek
        except:
            # Any failure reading the peek falls back to a generic description.
            size_text = data.nice_size(dataset.get_size())
            return "Binary xls file (%s)" % (size_text)
        return peek_text
30 | |
class PepXml(GenericXml):
    """pepXML data"""
    file_ext = "pepxml"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        Purged datasets get placeholder text; otherwise the peek comes from
        ``data.get_file_peek`` and the blurb is a fixed label.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'pepXML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Determine whether the file is pepXML.

        Reads at most the first three lines and returns True as soon as one
        of them (stripped of surrounding whitespace) matches the pattern
        ``.*pepXML"``; returns False otherwise.
        """
        xmlns_re = re.compile(".*pepXML\"")
        # The context manager closes the handle on every exit path,
        # including when reading raises.
        with open(filename) as handle:
            for _ in range(3):
                if xmlns_re.match(handle.readline().strip()):
                    return True
        return False
58 | |
class MzML(GenericXml):
    """mzML data"""
    file_ext = "mzml"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        Purged datasets get placeholder text; otherwise the peek comes from
        ``data.get_file_peek`` and the blurb is a fixed label.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mzML Mass Spectrometry data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Determine whether the file is mzML.

        Reads at most the first three lines and returns True as soon as one
        of them (stripped of surrounding whitespace) starts with ``<mzML``;
        returns False otherwise.
        """
        xmlns_re = re.compile("^<mzML")
        # The context manager closes the handle on every exit path,
        # including when reading raises.
        with open(filename) as handle:
            for _ in range(3):
                if xmlns_re.match(handle.readline().strip()):
                    return True
        return False
83 | |
84 | |
class ProtXML(Text):
    """protXML data"""
    file_ext = "protxml"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        Purged datasets get placeholder text; otherwise the peek comes from
        ``data.get_file_peek`` and the blurb is a fixed label.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'prot XML Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Determine whether the file looks like protXML.

        Each of the leading lines must contain the corresponding entry of
        ``protxml_header`` as a substring. True is returned once a line
        beyond the header is reached with all header entries matched; a
        mismatch, or a file that ends before that point, yields False.
        """
        protxml_header = ['<?xml version="1.0" encoding="ISO-8859-1"?>',
                          'xmlns="http://regis-web.systemsbiology.net/protXML"']

        with open(filename) as handle:
            for i, line in enumerate(handle):
                # Compare against this class's own header list; the previous
                # code referenced an undefined name here and raised NameError.
                if i >= len(protxml_header):
                    return True
                line = line.rstrip('\n\r')
                if protxml_header[i] not in line:
                    return False
        # File ended before any line past the header was seen.
        return False
107 | |
108 | |
109 | |
class MzXML(Text):
    """mzXML data"""
    file_ext = "mzXML"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        Purged datasets get placeholder text; otherwise the peek comes from
        ``data.get_file_peek`` and the blurb is a fixed label.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mzXML Mass Spectrometry data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Determine whether the file looks like mzXML.

        The leading lines (minus line terminators) must equal the entries of
        ``mzxml_header`` exactly. True is returned once a line beyond the
        header is reached; a mismatch, or a file that ends before that
        point, yields False.
        """
        mzxml_header = ['<?xml version="1.0" encoding="ISO-8859-1"?>',
                        '<mzXML xmlns="http://sashimi.sourceforge.net/schema_revision/mzXML_2.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/mzXML_2.1 http://sashimi.sourceforge.net/schema_revision/mzXML_2.1/mzXML_idx_2.1.xsd">']

        # The context manager closes the handle on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if i >= len(mzxml_header):
                    return True
                line = line.rstrip('\n\r')
                if line != mzxml_header[i]:
                    return False
        # File ended before any line past the header was seen.
        return False
131 | |
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        Purged datasets get placeholder text; otherwise the peek comes from
        ``data.get_file_peek`` and the blurb is a fixed label.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Determine whether the file looks like MGF.

        Returns True when a line equal to ``BEGIN IONS`` (ignoring the line
        terminator) is found near the top of the file. Scanning stops with
        False once the zero-based line index exceeds ``max_lines``, or when
        the file ends without the marker.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # The context manager closes the handle on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                line = line.rstrip('\n\r')
                if line == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # Short file without the marker: explicit False instead of None.
        return False
156 | |
157 | |
class MascotDat(Text):
    """Mascot search results"""
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        Purged datasets get placeholder text; otherwise the peek comes from
        ``data.get_file_peek`` and the blurb is a fixed label.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Determine whether the file looks like a Mascot results file.

        Returns True when a line equal to the expected ``MIME-Version``
        header (ignoring the line terminator) is found near the top of the
        file. Scanning stops with False once the zero-based line index
        exceeds ``max_lines``, or when the file ends without the header.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # The context manager closes the handle on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                line = line.rstrip('\n\r')
                if line == mime_version:
                    return True
                if i > max_lines:
                    return False
        # Short file without the header: explicit False instead of None.
        return False
182 | |
183 | |
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Determine whether the file looks like a Thermo Finnigan RAW file.

        Returns True when the first 20 bytes contain the byte sequence
        ``F\\0i\\0n\\0n\\0i\\0g\\0a\\0n``; returns False otherwise, including
        when the file cannot be opened or read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode avoids newline translation; the context manager
            # guarantees the handle is closed.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except Exception:
            # Sniffing is best-effort: an unreadable file is simply not RAW.
            return False
        # Search the raw bytes directly so the signature can only match on
        # byte boundaries (a hex-string search could match mid-byte).
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        Purged datasets get placeholder text. Otherwise the peek is a fixed
        label and the blurb is the result of ``data.nice_size`` applied to
        the dataset size.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the dataset's peek, or a size-based label if reading it fails."""
        try:
            return dataset.peek
        except:
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
213 | |
214 | |
class Msp(Text):
    """Output of the NIST MS Search Program.

    See chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf
    """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from *contents* and report whether it begins with *prefix*."""
        upcoming = contents.readline()
        if upcoming is None:
            return False
        return upcoming.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only consulted when the first one matches.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
236 | |
class Ms2(Text):
    """MS2 data, recognised by its leading ``H<TAB>`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the run of lines at the top of the file that start with
        ``H<TAB>`` and returns True only if, for each required header field,
        some collected line starts with ``H<TAB><field>``.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']

        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle terminates at end of file. The previous
            # readline() loop ignored the empty string returned at EOF and
            # spun forever on empty files or files made only of header lines.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in required_fields:
            wanted_prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted_prefix) for header_line in header_lines):
                return False

        return True
271 | |
# Binary format with no sniffer defined here; detection still needs work.
class XHunterAslFormat(Binary):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """
    file_ext = "hlf"