annotate proteomics.py @ 9:6ca516faacfc draft

Uploaded
author iracooke
date Thu, 05 Jun 2014 18:08:35 -0400
parents
children ef74edade8be
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
1 """
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
2 Proteomics format classes
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
3 """
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
4 import logging
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
5 import re
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
6 import binascii
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
7
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
8 from galaxy.datatypes.sniff import *
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
9 from galaxy.datatypes import data
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
10 from galaxy.datatypes.data import Text
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
11 from galaxy.datatypes.xml import GenericXml
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
12 from galaxy.datatypes.binary import Binary
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
13 from galaxy.datatypes.tabular import Tabular
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
14 from galaxy.datatypes.interval import Gff
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
15
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
16 log = logging.getLogger(__name__)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
17
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
18
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Wiff( Binary ):
    """Composite datatype for AB SCIEX wiff files.

    The dataset is made of a mandatory ``wiff`` file and an optional
    ``wiff_scan`` file; the primary file is an auto-generated HTML index
    listing the components.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the Binary base class and declare both component files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file( 'wiff',
            description = 'AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary = True )
        # NOTE(review): ``optional`` is passed as the string 'True', not a
        # boolean. It is truthy either way, so it is left as in the original;
        # confirm against add_composite_file before normalising it.
        self.add_composite_file( 'wiff_scan',
            description = 'AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional = 'True', is_binary = True )

    def generate_primary_file( self, dataset = None ):
        """Build the HTML index page used as the primary file.

        :param dataset: dataset forwarded to ``get_composite_files``.
        :returns: the HTML document as a single newline-joined string.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # items() instead of iteritems(): the latter only exists on Python 2
        # dictionaries, while items() is available on both major versions.
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            description = composite_file.get('description')
            if description:
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
48
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
49
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
50
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Register the 'wiff' extension through Binary.register_unsniffable_binary_ext,
# but only when the Binary class in use actually provides that hook
# (presumably it is missing from some Galaxy releases - TODO confirm).
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
53
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
54
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class IdpDB( Binary ):
    """Binary datatype stored under the ``idpDB`` file extension."""

    # Extension under which this datatype is known.
    file_ext = "idpDB"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
57
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Register the 'idpDB' extension through Binary.register_unsniffable_binary_ext,
# but only when the Binary class in use actually provides that hook
# (presumably it is missing from some Galaxy releases - TODO confirm).
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
60
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
61
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the Tabular base class and set the fixed column headers."""
        Tabular.__init__( self, **kwd )
        # The last header used to read 'Interprophet Probabaility'; the
        # misspelling was shown to users in the peek table and is corrected.
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
73
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
74
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProtXmlReport( Tabular ):
    """Tabular report produced by converting a protXML file."""
    comment_lines = 1
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the Tabular base class and set the fixed column headers."""
        Tabular.__init__( self, **kwd )
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek( self, dataset ):
        """Return the peek as an HTML table headed by ``column_names``."""
        headers = self.column_names
        return Tabular.make_html_table( self, dataset, column_names=headers )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
87
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    Subclasses supply ``root`` (a regular-expression fragment matching the
    document's root element name) and ``blurb`` (the text stored as the
    dataset blurb).
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` (XML declaration / processing
        instructions) are skipped; the first other line must begin with
        ``<root`` or ``<ns:root``.

        :param filename: path of the file to inspect.
        :returns: True when the first non-``<?`` line opens the root element.
        """
        with open(filename, 'r') as contents:
            while True:
                line = contents.readline()
                # readline() yields '' at end of file, which also fails the
                # startswith test, so the loop always terminates.
                if not line.startswith('<?'):
                    break
        # Raw string: '\w' in a plain literal is an invalid escape sequence
        # that newer Python versions warn about.
        pattern = r'^<(\w*:)?%s' % self.root  # pattern match <root or <ns:root for any ns string
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
110
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
111
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class PepXml(ProteomicsXml):
    """Datatype for pepXML documents."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "msms_pipeline_analysis"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = 'pepXML data'
    file_ext = "pepxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
117
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
118
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzML(ProteomicsXml):
    """Datatype for mzML documents."""
    # Regex alternation: the inherited sniffer accepts either root element.
    root = "(mzML|indexedmzML)"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = 'mzML Mass Spectrometry data'
    file_ext = "mzml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
124
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
125
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProtXML(ProteomicsXml):
    """Datatype for protXML documents."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "protein_summary"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = 'prot XML Search Results'
    file_ext = "protxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
131
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
132
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzXML(ProteomicsXml):
    """Datatype for mzXML documents."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "mzXML"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = "mzXML Mass Spectrometry data"
    file_ext = "mzxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
138
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
139 ## PSI datatypes
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzIdentML(ProteomicsXml):
    """Datatype for XML documents whose root element is ``MzIdentML``."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "MzIdentML"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = "XML identified peptides and proteins."
    file_ext = "mzid"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
144
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
145
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class TraML(ProteomicsXml):
    """Datatype for XML documents whose root element is ``TraML``."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "TraML"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = "TraML transition list"
    file_ext = "traml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
150
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
151
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzQuantML(ProteomicsXml):
    """Datatype for XML documents whose root element is ``MzQuantML``."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "MzQuantML"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = "XML quantification data"
    file_ext = "mzq"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
156
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
157
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ConsensusXML(ProteomicsXml):
    """Datatype for XML documents whose root element is ``consensusXML``."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "consensusXML"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = "OpenMS multiple LC-MS map alignment file"
    file_ext = "consensusxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
162
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
163
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class FeatureXML(ProteomicsXml):
    """Datatype for XML documents whose root element is ``featureMap``."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "featureMap"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = "OpenMS feature file"
    file_ext = "featurexml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
168
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
169
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class IdXML(ProteomicsXml):
    """Datatype for XML documents whose root element is ``IdXML``."""
    # Root element name interpolated into the inherited sniffer's pattern.
    root = "IdXML"
    # Text the inherited set_peek stores as the dataset blurb.
    blurb = "OpenMS identification file"
    file_ext = "idxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
174
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
175
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Report whether a ``BEGIN IONS`` line occurs near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True as soon as a line equal to ``BEGIN IONS`` is read,
            False once the line limit is passed or the file ends first.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() in a with-block replaces the Python 2-only file() builtin
        # and guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                line = line.rstrip( '\n\r' )
                if line == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # File ended before the limit without a match; be explicit instead
        # of falling off the end and returning None.
        return False
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
199
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
200
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Report whether the Mascot MIME header line occurs near the top of the file.

        :param filename: path of the file to inspect.
        :returns: True as soon as a line equal to the expected MIME-Version
            header is read, False once the line limit is passed or the file
            ends first.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() in a with-block replaces the Python 2-only file() builtin
        # and guarantees the handle is closed on every return path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                line = line.rstrip( '\n\r' )
                if line == mime_version:
                    return True
                if i > max_lines:
                    return False
        # File ended before the limit without a match; be explicit instead
        # of falling off the end and returning None.
        return False
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
225
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
226
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Report whether the file header carries the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True when the signature is found in the first 20 bytes,
            False otherwise (including when the file cannot be read).
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, and text mode may
            # translate or decode them. The with-block closes the handle.
            with open( filename, 'rb' ) as handle:
                header = handle.read(20)
        except Exception:
            # Sniffing is best-effort: an unreadable file is simply "not RAW".
            return False
        # Searching the bytes directly (rather than their hex text) cannot
        # produce a match that starts in the middle of a byte.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and use the dataset size as the blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generic description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
256
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
257
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Register the RAW class for the 'raw' extension through
# Binary.register_sniffable_binary_format, but only when the Binary class in
# use actually provides that hook (presumably it is missing from some Galaxy
# releases - TODO confirm).
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
260
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
261
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and test whether it starts with ``prefix``.

        :param contents: object providing ``readline()``.
        :param prefix: string the line must begin with.
        :returns: True when the line read begins with ``prefix``.
        """
        next_line = contents.readline()
        # Identity test is the idiomatic None check; file objects return ''
        # (never None) at end of file, which simply fails startswith.
        return next_line is not None and next_line.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:")
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
283
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Ms2(Text):
    """Text datatype stored under the ``ms2`` file extension."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        every required header field must appear in it.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at end of file. The previous
            # readline() loop treated the '' returned at EOF as a no-op and
            # therefore never terminated on empty or header-only files.
            for line in contents:
                if line.startswith('H\t'):
                    header_lines.append(line)
                else:
                    break

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            wanted = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted) for header_line in header_lines):
                return False

        return True
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
318
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# No sniffer is defined for this binary format yet; one should be added.
class XHunterAslFormat( Binary ):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """

    # Extension under which this datatype is known.
    file_ext = "hlf"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
323
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Register the 'hlf' extension through Binary.register_unsniffable_binary_ext,
# but only when the Binary class in use actually provides that hook
# (presumably it is missing from some Galaxy releases - TODO confirm).
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')