Mercurial > repos > iracooke > proteomics_datatypes
annotate proteomics.py @ 19:e5551a35e508 draft
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
author | iracooke |
---|---|
date | Wed, 03 Jun 2015 18:24:50 -0400 |
parents | d1ea609e57d4 |
children |
rev | line source |
---|---|
9 | 1 """ |
2 Proteomics format classes | |
3 """ | |
4 import logging | |
5 import re | |
6 import binascii | |
7 | |
8 from galaxy.datatypes.sniff import * | |
9 from galaxy.datatypes import data | |
10 from galaxy.datatypes.data import Text | |
11 from galaxy.datatypes.xml import GenericXml | |
12 from galaxy.datatypes.binary import Binary | |
13 from galaxy.datatypes.tabular import Tabular | |
14 from galaxy.datatypes.interval import Gff | |
15 | |
16 log = logging.getLogger(__name__) | |
17 | |
18 | |
class Wiff( Binary ):
    """Class for wiff files.

    Composite datatype: a mandatory ``wiff`` component, an optional
    ``wiff_scan`` component, and an auto-generated HTML primary file that
    lists them.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its component files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True )
        # ``optional`` is passed as a real boolean; the former string 'True'
        # only worked because any non-empty string is truthy.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional=True, is_binary=True )

    def generate_primary_file( self, dataset=None ):
        """Build the HTML page used as primary file of the composite dataset.

        :param dataset: dataset forwarded to ``get_composite_files``.
        :returns: HTML text with one list entry per component file; optional
            components are flagged and descriptions are shown when present.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # ``items()`` instead of ``iteritems()`` so the loop runs on Python 2 and 3.
        for composite_name, composite_file in self.get_composite_files( dataset=dataset ).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )


# Only register when this Binary class offers the hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
53 | |
54 | |
class IdpDB( Binary ):
    """Binary datatype stored under the ``idpDB`` extension."""
    file_ext = 'idpDB'


# No sniffer is defined for this format, so it is registered as an
# unsniffable extension when the Binary class provides that hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'idpDB' )
60 | |
61 | |
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__( self, **kwd )
        # Headers shown by display_peek(); the last one was previously
        # misspelled "Interprophet Probabaility".
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek( self, dataset ):
        """Return the formatted HTML peek, rendered with this report's column names."""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
73 | |
74 | |
class ProtXmlReport( Tabular ):
    """Tabular report produced from protxml."""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Set up the tabular datatype, then attach the report's column headers."""
        Tabular.__init__( self, **kwd )
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek( self, dataset ):
        """Render the peek as an HTML table headed by ``self.column_names``."""
        html = Tabular.make_html_table( self, dataset, column_names=self.column_names )
        return html
87 | |
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regular-expression fragment naming the
    expected root element) and ``set_peek`` reads ``self.blurb``; neither is
    defined on this class, so subclasses are expected to provide them.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines starting with ``<?`` are skipped; the first other line
        must open the element described by ``self.root``, optionally with a
        namespace prefix.

        :param filename: path of the file to inspect.
        :returns: True when the root element matches, otherwise False.
        """
        with open(filename, 'r') as contents:
            # readline() returns '' at end of file, which does not start with
            # '<?', so the loop also stops on files holding nothing else.
            line = contents.readline()
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string keeps ``\w`` a regex class rather than an invalid string
        # escape. Pattern matches <root or <ns:root for any ns string.
        pattern = r'^<(\w*:)?%s' % self.root
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
110 | |
111 | |
class PepXml( ProteomicsXml ):
    """pepXML datatype; recognised by its ``msms_pipeline_analysis`` root element."""
    root = 'msms_pipeline_analysis'
    blurb = "pepXML data"
    file_ext = 'pepxml'
117 | |
118 | |
class MzML( ProteomicsXml ):
    """mzML datatype; the root may be either ``mzML`` or ``indexedmzML``."""
    # ``root`` is interpolated into a regular expression, hence the alternation.
    root = '(mzML|indexedmzML)'
    blurb = "mzML Mass Spectrometry data"
    file_ext = 'mzml'
124 | |
125 | |
class ProtXML( ProteomicsXml ):
    """protXML datatype; recognised by its ``protein_summary`` root element."""
    root = 'protein_summary'
    blurb = "prot XML Search Results"
    file_ext = 'protxml'
131 | |
132 | |
class MzXML( ProteomicsXml ):
    """mzXML datatype; recognised by its ``mzXML`` root element."""
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
    file_ext = 'mzxml'
138 | |
139 ## PSI datatypes | |
class MzIdentML( ProteomicsXml ):
    """XML datatype with extension ``mzid`` and root element ``MzIdentML``."""
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
144 | |
145 | |
class TraML( ProteomicsXml ):
    """XML datatype with extension ``traml`` and root element ``TraML``."""
    root = 'TraML'
    blurb = 'TraML transition list'
    file_ext = 'traml'
150 | |
151 | |
class MzQuantML( ProteomicsXml ):
    """XML datatype with extension ``mzq`` and root element ``MzQuantML``."""
    root = 'MzQuantML'
    blurb = 'XML quantification data'
    file_ext = 'mzq'
156 | |
157 | |
class ConsensusXML( ProteomicsXml ):
    """XML datatype with extension ``consensusxml`` and root element ``consensusXML``."""
    root = 'consensusXML'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    file_ext = 'consensusxml'
162 | |
163 | |
class FeatureXML( ProteomicsXml ):
    """XML datatype with extension ``featurexml`` and root element ``featureMap``."""
    root = 'featureMap'
    blurb = 'OpenMS feature file'
    file_ext = 'featurexml'
168 | |
169 | |
class IdXML( ProteomicsXml ):
    """XML datatype with extension ``idxml`` and root element ``IdXML``."""
    root = 'IdXML'
    blurb = 'OpenMS identification file'
    file_ext = 'idxml'
174 | |
class TandemXML( ProteomicsXml ):
    """XML datatype with extension ``tandem`` and root element ``bioml``."""
    root = 'bioml'
    blurb = 'X!Tandem search results file'
    file_ext = 'tandem'
9 | 179 |
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Report whether a ``BEGIN IONS`` line occurs near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True if a line equal to ``BEGIN IONS`` (ignoring the line
            terminator) is found within the inspected lines, otherwise False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # ``open`` in a ``with`` block replaces the Python-2-only ``file()``
        # builtin and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                # Same bound as before: the check runs after the comparison.
                if i > max_lines:
                    break
        # Explicit False also for files shorter than the limit, which
        # previously fell through and returned None.
        return False
203 | |
204 | |
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Report whether the Mascot MIME-Version header occurs near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True if one of the inspected lines equals the expected
            header (ignoring the line terminator), otherwise False.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # ``open`` in a ``with`` block replaces the Python-2-only ``file()``
        # builtin and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                # Same bound as before: the check runs after the comparison.
                if i > max_lines:
                    break
        # Explicit False also for files shorter than the limit, which
        # previously fell through and returned None.
        return False
229 | |
230 | |
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Report whether the file header carries the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True if the signature is found in the first 20 bytes,
            False otherwise or if the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode avoids newline translation or decoding of the
            # header; the ``with`` block closes the handle.
            with open( filename, 'rb' ) as handle:
                header = handle.read( 20 )
        except Exception:
            # Sniffing stays best-effort: an unreadable file is "not RAW".
            return False
        # Searching the raw bytes cannot match on a half-byte boundary, which
        # the former search over hex strings could.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and use the dataset size as blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generic description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )


# Only register when this Binary class offers the hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
264 | |
265 | |
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and tell whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
287 | |
19
e5551a35e508
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents:
18
diff
changeset
|
class SPLibNoIndex( Text ):
    """Spectral library (``splib``) stored without its index files."""
    file_ext = 'splib'

    def set_peek( self, dataset, is_multi_byte=False ):
        """Fill in ``dataset.peek`` and ``dataset.blurb``.

        Purged datasets get fixed placeholder texts; otherwise the peek is
        taken from the file content.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = 'Spectral Library without index files'
e5551a35e508
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents:
18
diff
changeset
|
300 |
16
e6a02a387448
planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents:
12
diff
changeset
|
301 |
e6a02a387448
planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents:
12
diff
changeset
|
class SPLib( Msp ):
    """SpectraST Spectral Library. Closely related to msp format

    Composite datatype made of the library itself plus a spectrum index and
    a peptide index, with an auto-generated HTML primary file.
    """
    file_ext = "splib"
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its three component files."""
        Msp.__init__(self, **kwd)
        self.add_composite_file( 'library.splib',
            description = 'Spectral Library. Contains actual library spectra',
            is_binary = False )
        self.add_composite_file( 'library.spidx',
            description = 'Spectrum index', is_binary = False )
        self.add_composite_file( 'library.pepidx',
            description = 'Peptide index', is_binary = False)

    def generate_primary_file( self, dataset = None ):
        """Build the HTML page used as primary file of the composite dataset.

        :param dataset: dataset forwarded to ``get_composite_files``.
        :returns: HTML text with one list entry per component file; optional
            components are flagged and descriptions are shown when present.
        """
        rval = ['<html><head><title>Spectral Library Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # ``items()`` instead of ``iteritems()`` so the loop runs on Python 2 and 3.
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            description = composite_file.get('description')
            if description:
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'splib Spectral Library Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """ Determines whether the file is a SpectraST generated file.

        The first line must start with ``Name:`` and the second with ``LibID:``.
        """
        with open(filename, 'r') as contents:
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "LibID:")
350 | |
351 | |
class Ms2(Text):
    """Text datatype with extension ``ms2``, recognised by its ``H`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        each required header field must prefix at least one of them.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at end of file. The former
            # ``while True`` / ``readline()`` loop ignored the empty string
            # returned at EOF and so never terminated on an empty file or on
            # a file made only of header lines.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            # Prefix match, as before (so 'Extractor' is also satisfied by an
            # 'ExtractorVersion' line).
            wanted = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted) for header_line in header_lines):
                return False

        return True
386 | |
387 # unsniffable binary format, should do something about this | |
class XHunterAslFormat( Binary ):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = 'hlf'


# No sniffer is defined for this format, so it is registered as an
# unsniffable extension when the Binary class provides that hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'hlf' )
19
e5551a35e508
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents:
18
diff
changeset
|
394 |
e5551a35e508
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents:
18
diff
changeset
|
395 |
e5551a35e508
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents:
18
diff
changeset
|
class Sf3( Binary ):
    """Binary datatype for Scaffold SF3 files."""
    file_ext = 'sf3'


# No sniffer is defined for this format, so it is registered as an
# unsniffable extension when the Binary class provides that hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'sf3' )
e5551a35e508
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents:
18
diff
changeset
|
402 |
e5551a35e508
planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents:
18
diff
changeset
|
403 |