annotate proteomics.py @ 7:b82d4034e0f8 draft

Uploaded
author bgruening
date Tue, 11 Feb 2014 17:51:48 -0500
parents df8b867ab71a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
1 """
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
2 Proteomics format classes
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
3 """
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
4 import logging
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
5 import re
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
6 import binascii
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
7
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
8 from galaxy.datatypes.sniff import *
7
b82d4034e0f8 Uploaded
bgruening
parents: 5
diff changeset
9 from galaxy.datatypes import data
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
10 from galaxy.datatypes.data import Text
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
11 from galaxy.datatypes.xml import GenericXml
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
12 from galaxy.datatypes.binary import Binary
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
13 from galaxy.datatypes.tabular import Tabular
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
14 from galaxy.datatypes.interval import Gff
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
15
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
16 log = logging.getLogger(__name__)
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
17
4
Ira Cooke <iracooke@gmail.com>
parents: 0
diff changeset
class ProtGff(Gff):
    """Tab delimited data in Gff format"""
    file_ext = "prot_gff"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        A purged dataset gets fixed placeholder text; otherwise the peek is
        obtained from ``data.get_file_peek`` on the dataset's file.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'Proteogenomics GFF'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if one of the first three lines starts with ``##gff-version``.

        Each line is stripped of surrounding whitespace before the check.
        """
        # The context manager closes the handle even if reading raises.
        with open(filename) as handle:
            for _ in range(3):
                if handle.readline().strip().startswith("##gff-version"):
                    return True
        return False
Ira Cooke <iracooke@gmail.com>
parents: 0
diff changeset
41
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
42
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class Xls(Binary):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and a size-based blurb, or placeholders if purged."""
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a size description on error."""
        try:
            return dataset.peek
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate.
            return "Binary xls file (%s)" % (data.nice_size(dataset.get_size()))
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
59
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class IdpDB(Binary):
    """Binary datatype stored under the ``idpDB`` file extension."""

    file_ext = 'idpDB'
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
62
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
# Register 'idpDB' as an unsniffable binary extension, but only when the
# imported Binary class actually exposes that registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
65
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
66
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class PepXmlReport(Tabular):
    """Tabular report produced by converting pepxml."""

    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the Tabular base and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        # NOTE(review): the last header is misspelled ('Probabaility'); it is
        # kept unchanged because it is a runtime string.
        headers = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probabaility',
        ]
        self.column_names = headers

    def display_peek(self, dataset):
        """Return the HTML table built by ``Tabular.make_html_table`` with this datatype's column names."""
        names = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=names)
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
78
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
79
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class ProtXmlReport(Tabular):
    """Tabular report produced by converting protxml."""

    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the Tabular base and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek(self, dataset):
        """Return the HTML table built by ``Tabular.make_html_table`` with this datatype's column names."""
        names = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=names)
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
92
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    The methods below read ``self.root`` (a regular-expression fragment for the
    root element name) and ``self.blurb``; neither is defined on this class, so
    subclasses are expected to provide them.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` are skipped; the first other line
        must begin with ``<root`` or ``<ns:root``. Returns False if the file
        ends before such a line is found.
        """
        # Raw string so that ``\w`` reaches the regex engine unchanged.
        # pattern match <root or <ns:root for any ns string
        pattern = re.compile(r'^<(\w*:)?%s' % self.root)
        with open(filename, 'r') as contents:
            for line in contents:
                if not line.startswith('<?'):
                    return pattern.match(line) is not None
        return False

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
115
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
116
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class PepXml(ProteomicsXml):
    """pepXML data; the root element name is ``msms_pipeline_analysis``."""

    file_ext = 'pepxml'
    blurb = 'pepXML data'
    root = 'msms_pipeline_analysis'
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
122
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
123
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzML(ProteomicsXml):
    """mzML data; ``root`` is a regex alternation covering ``mzML`` and ``indexedmzML``."""

    file_ext = 'mzml'
    blurb = 'mzML Mass Spectrometry data'
    root = '(mzML|indexedmzML)'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
129
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
130
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class ProtXML(ProteomicsXml):
    """protXML data; the root element name is ``protein_summary``."""

    file_ext = 'protxml'
    blurb = 'prot XML Search Results'
    root = 'protein_summary'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
136
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
137
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzXML(ProteomicsXml):
    """mzXML data; the root element name is ``mzXML``."""

    file_ext = 'mzxml'
    blurb = 'mzXML Mass Spectrometry data'
    root = 'mzXML'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
143
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
144 ## PSI datatypes
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzIdentML(ProteomicsXml):
    """mzIdentML data (``.mzid``); the root element name is ``MzIdentML``."""

    file_ext = 'mzid'
    blurb = 'XML identified peptides and proteins.'
    root = 'MzIdentML'
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
149
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
150
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class TraML(ProteomicsXml):
    """TraML transition list; the root element name is ``TraML``."""

    file_ext = 'traml'
    blurb = 'TraML transition list'
    root = 'TraML'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
155
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
156
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzQuantML(ProteomicsXml):
    """mzQuantML data (``.mzq``); the root element name is ``MzQuantML``."""

    file_ext = 'mzq'
    blurb = 'XML quantification data'
    root = 'MzQuantML'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
161
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
162
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class ConsensusXML(ProteomicsXml):
    """OpenMS consensusXML data; the root element name is ``consensusXML``."""

    file_ext = 'consensusxml'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    root = 'consensusXML'
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
167
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
168
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class FeatureXML(ProteomicsXml):
    """OpenMS featureXML data; the root element name is ``featureMap``."""

    file_ext = 'featurexml'
    blurb = 'OpenMS feature file'
    root = 'featureMap'
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
173
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
174
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class IdXML(ProteomicsXml):
    """OpenMS idXML data; the root element name is ``IdXML``."""

    file_ext = 'idxml'
    blurb = 'OpenMS identification file'
    root = 'IdXML'
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
179
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
180
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if a line equal to ``BEGIN IONS`` occurs near the top of the file.

        Lines are compared after stripping trailing CR/LF characters. Scanning
        stops once the zero-based line index exceeds ``max_lines``.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() works on Python 2 and 3 (file() is Python 2 only), and the
        # context manager closes the handle.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                if i > max_lines:
                    break
        # Explicit False (not an implicit None) also when the file ends before the limit.
        return False
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
204
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
205
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if the Mascot MIME-Version line occurs near the top of the file.

        Lines are compared after stripping trailing CR/LF characters. Scanning
        stops once the zero-based line index exceeds ``max_lines``.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() works on Python 2 and 3 (file() is Python 2 only), and the
        # context manager closes the handle.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                if i > max_lines:
                    break
        # Explicit False (not an implicit None) also when the file ends before the limit.
        return False
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
230
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
231
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Return True if the first 20 bytes contain the UTF-16-style ``Finnigan`` marker.

        Returns False when the marker is absent or the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, not text to be decoded.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except (IOError, OSError):
            return False
        # Search the bytes directly so a match can only occur on a byte
        # boundary (a search over the hex string could match at a half-byte offset).
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and a size-based blurb, or placeholders if purged."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a size description on error."""
        try:
            return dataset.peek
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate.
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
261
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
262
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
# Register RAW as a sniffable binary format under type/extension 'raw', but
# only when the imported Binary class actually exposes that registration hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
265
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
266
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """

    file_ext = 'msp'

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
288
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class Ms2(Text):
    """ms2 text data, identified by its leading ``H``-prefixed header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` must contain a line
        for each of CreationDate, Extractor, ExtractorVersion and
        ExtractorOptions.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']

        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at EOF, so an empty or header-only file
            # terminates (a readline() loop that ignores '' spins forever).
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in required_fields:
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
323
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
324 # unsniffable binary format, should do something about this
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class XHunterAslFormat(Binary):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """

    file_ext = 'hlf'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
328
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
# Register 'hlf' as an unsniffable binary extension, but only when the
# imported Binary class actually exposes that registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')