Mercurial > repos > iracooke > proteomics_datatypes
annotate proteomics.py @ 18:d1ea609e57d4 draft
Make splib a composite datatype
author | iracooke |
---|---|
date | Wed, 20 May 2015 01:39:39 -0400 |
parents | 29c43b953c1c |
children | e5551a35e508 |
rev | line source |
---|---|
9 | 1 """ |
2 Proteomics format classes | |
3 """ | |
4 import logging | |
5 import re | |
6 import binascii | |
7 | |
8 from galaxy.datatypes.sniff import * | |
9 from galaxy.datatypes import data | |
10 from galaxy.datatypes.data import Text | |
11 from galaxy.datatypes.xml import GenericXml | |
12 from galaxy.datatypes.binary import Binary | |
13 from galaxy.datatypes.tabular import Tabular | |
14 from galaxy.datatypes.interval import Gff | |
15 | |
16 log = logging.getLogger(__name__) | |
17 | |
18 | |
class Wiff( Binary ):
    """Composite datatype for AB SCIEX wiff files.

    The dataset consists of a ``wiff`` component plus an optional
    ``wiff_scan`` component; the primary file is an HTML index generated by
    :meth:`generate_primary_file`.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its two composite files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file( 'wiff',
            description = 'AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary = True )
        # NOTE(review): ``optional`` is passed as the string 'True' (truthy),
        # not a bool -- confirm which form add_composite_file expects.
        self.add_composite_file( 'wiff_scan',
            description = 'AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional = 'True', is_binary = True )

    def generate_primary_file( self, dataset = None ):
        """Return an HTML page listing the composite files of ``dataset``.

        Each composite file becomes one ``<li>`` entry linking to the file by
        name, followed by its description (when set) and an "(optional)"
        marker (when the file is optional).
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # items() instead of the Python-2-only iteritems(): same pairs,
        # available on both Python 2 and Python 3 mappings.
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )


# Only register when this Galaxy's Binary class provides the hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
53 | |
54 | |
class IdpDB( Binary ):
    """Binary datatype registered under the ``idpDB`` extension."""
    file_ext = 'idpDB'


# The extension is declared unsniffable, and only when the running Galaxy's
# Binary class exposes the registration hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'idpDB' )
60 | |
61 | |
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__( self, **kwd )
        # Headers shown above the peek table; the last entry previously read
        # 'Interprophet Probabaility' (misspelt).
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek( self, dataset ):
        """Return the formatted HTML peek table, labelled with ``self.column_names``."""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
73 | |
74 | |
class ProtXmlReport( Tabular ):
    """Tabular report produced by converting protxml."""
    file_ext = 'tsv'
    comment_lines = 1

    def __init__( self, **kwd ):
        """Set up the tabular base class, then attach the column headers."""
        Tabular.__init__( self, **kwd )
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek( self, dataset ):
        """Render the peek as an HTML table headed by ``self.column_names``."""
        html = Tabular.make_html_table( self, dataset, column_names=self.column_names )
        return html
87 | |
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regex fragment naming the root element)
    and ``set_peek`` reads ``self.blurb``; neither is defined on this class.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Skips leading blank lines and lines starting with ``<?`` (XML
        declaration / processing instructions), then checks whether the first
        remaining line opens with ``<root`` or ``<ns:root``.

        Returns True on a match, False otherwise (including when the file
        holds no such line).
        """
        # pattern match <root or <ns:root for any ns string
        pattern = r'^<(\w*:)?%s' % self.root
        with open(filename, 'r') as contents:
            for line in contents:
                candidate = line.strip()
                # Whitespace-only lines may legally sit between the XML
                # declaration and the root element, so skip them rather than
                # rejecting the file.
                if not candidate or candidate.startswith('<?'):
                    continue
                return re.match(pattern, candidate) is not None
        # Reached EOF without seeing any element line.
        return False

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
110 | |
111 | |
class PepXml( ProteomicsXml ):
    """pepXML datatype; ``root`` names the ``msms_pipeline_analysis`` element."""
    file_ext = 'pepxml'
    blurb = "pepXML data"
    root = 'msms_pipeline_analysis'
117 | |
118 | |
class MzML( ProteomicsXml ):
    """mzML datatype; ``root`` accepts either ``mzML`` or ``indexedmzML``."""
    file_ext = 'mzml'
    blurb = "mzML Mass Spectrometry data"
    # Regex alternation, not a literal element name.
    root = '(mzML|indexedmzML)'
124 | |
125 | |
class ProtXML( ProteomicsXml ):
    """protXML datatype; ``root`` names the ``protein_summary`` element."""
    file_ext = 'protxml'
    blurb = "prot XML Search Results"
    root = 'protein_summary'
131 | |
132 | |
class MzXML( ProteomicsXml ):
    """mzXML datatype; ``root`` names the ``mzXML`` element."""
    file_ext = 'mzxml'
    blurb = 'mzXML Mass Spectrometry data'
    root = 'mzXML'
138 | |
139 ## PSI datatypes | |
class MzIdentML( ProteomicsXml ):
    """XML datatype with extension ``mzid`` and root element ``MzIdentML``."""
    file_ext = 'mzid'
    blurb = 'XML identified peptides and proteins.'
    root = 'MzIdentML'
144 | |
145 | |
class TraML( ProteomicsXml ):
    """XML datatype with extension ``traml`` and root element ``TraML``."""
    file_ext = 'traml'
    blurb = 'TraML transition list'
    root = 'TraML'
150 | |
151 | |
class MzQuantML( ProteomicsXml ):
    """XML datatype with extension ``mzq`` and root element ``MzQuantML``."""
    file_ext = 'mzq'
    blurb = 'XML quantification data'
    root = 'MzQuantML'
156 | |
157 | |
class ConsensusXML( ProteomicsXml ):
    """XML datatype with extension ``consensusxml`` and root element ``consensusXML``."""
    file_ext = 'consensusxml'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    root = 'consensusXML'
162 | |
163 | |
class FeatureXML( ProteomicsXml ):
    """XML datatype with extension ``featurexml`` and root element ``featureMap``."""
    file_ext = 'featurexml'
    blurb = 'OpenMS feature file'
    root = 'featureMap'
168 | |
169 | |
class IdXML( ProteomicsXml ):
    """XML datatype with extension ``idxml`` and root element ``IdXML``."""
    file_ext = 'idxml'
    blurb = 'OpenMS identification file'
    root = 'IdXML'
174 | |
class TandemXML( ProteomicsXml ):
    """XML datatype with extension ``tandem`` and root element ``bioml``."""
    file_ext = 'tandem'
    blurb = 'X!Tandem search results file'
    root = 'bioml'
9 | 179 |
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a ``BEGIN IONS`` line appears near the top of the file.

        Lines are compared after stripping trailing CR/LF only. Scanning stops
        with False once the zero-based line index exceeds ``max_lines``, and
        False is also returned when the file ends before a match.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() in a with-block replaces the Python-2-only file() builtin
        # and guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # Short file without the marker: answer explicitly instead of
        # falling off the end and returning None.
        return False
203 | |
204 | |
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if the Mascot MIME-Version header appears near the top of the file.

        Lines are compared after stripping trailing CR/LF only. Scanning stops
        with False once the zero-based line index exceeds ``max_lines``, and
        False is also returned when the file ends before a match.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() in a with-block replaces the Python-2-only file() builtin
        # and guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    return False
        # Short file without the header: answer explicitly instead of
        # falling off the end and returning None.
        return False
229 | |
230 | |
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True if the first 20 bytes contain the ``Finnigan`` signature.

        Returns False when the signature is absent or the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, not text. The with-block
            # closes the handle even if the read fails.
            with open( filename, 'rb' ) as handle:
                header = handle.read( 20 )
        except (IOError, OSError):
            return False
        # Byte-level containment; the earlier hex-string search could also
        # match at an odd nibble offset, i.e. across byte boundaries.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek string and use the human-readable size as blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a description with the file size if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )


# Only register when this Galaxy's Binary class provides the hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
264 | |
265 | |
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = 'msp'

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and report whether it begins with ``prefix``."""
        line = contents.readline()
        if line is None:
            return False
        return line.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
287 | |
16
e6a02a387448
planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents:
12
diff
changeset
|
288 |
e6a02a387448
planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents:
12
diff
changeset
|
class SPLib( Msp ):
    """SpectraST Spectral Library. Closely related to msp format

    Composite datatype made of ``library.splib``, ``library.spidx`` and
    ``library.pepidx``; the primary file is an HTML index generated by
    :meth:`generate_primary_file`.
    """
    file_ext = "splib"
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its three composite files."""
        Msp.__init__(self, **kwd)
        self.add_composite_file( 'library.splib',
            description = 'Spectral Library. Contains actual library spectra',
            is_binary = False )
        self.add_composite_file( 'library.spidx',
            description = 'Spectrum index', is_binary = False )
        self.add_composite_file( 'library.pepidx',
            description = 'Peptide index', is_binary = False)

    def generate_primary_file( self, dataset = None ):
        """Return an HTML page listing the composite files of ``dataset``.

        Each composite file becomes one ``<li>`` entry linking to the file by
        name, followed by its description (when set) and an "(optional)"
        marker (when the file is optional).
        """
        rval = ['<html><head><title>Spectral Library Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # items() instead of the Python-2-only iteritems(): same pairs,
        # available on both Python 2 and Python 3 mappings.
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'splib Spectral Library Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """ Determines whether the file is a SpectraST generated file.

        The first line must start with ``Name:`` and the second with
        ``LibID:``; the second line is only read when the first matched.
        """
        with open(filename, 'r') as contents:
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "LibID:")
337 | |
338 | |
class Ms2(Text):
    """Text datatype registered under the ``ms2`` extension."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the leading run of lines starting with ``H<TAB>`` and
        requires a header line for each of CreationDate, Extractor,
        ExtractorVersion and ExtractorOptions. Returns False if any is
        missing (including for an empty file), True otherwise.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at EOF. The earlier readline() loop
            # did `pass` on the empty string returned at EOF and therefore
            # never terminated for empty or header-only files.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            wanted = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted) for header_line in header_lines):
                return False

        return True
373 | |
374 # unsniffable binary format, should do something about this | |
class XHunterAslFormat( Binary ):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = 'hlf'


# The extension is declared unsniffable, and only when the running Galaxy's
# Binary class exposes the registration hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'hlf' )