annotate proteomics.py @ 18:d1ea609e57d4 draft

Make splib a composite datatype
author iracooke
date Wed, 20 May 2015 01:39:39 -0400
parents 29c43b953c1c
children e5551a35e508
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
1 """
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
2 Proteomics format classes
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
3 """
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
4 import logging
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
5 import re
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
6 import binascii
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
7
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
8 from galaxy.datatypes.sniff import *
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
9 from galaxy.datatypes import data
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
10 from galaxy.datatypes.data import Text
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
11 from galaxy.datatypes.xml import GenericXml
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
12 from galaxy.datatypes.binary import Binary
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
13 from galaxy.datatypes.tabular import Tabular
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
14 from galaxy.datatypes.interval import Gff
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
15
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
16 log = logging.getLogger(__name__)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
17
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
18
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Wiff( Binary ):
    """Composite datatype for AB SCIEX wiff files.

    Two component files are declared: ``wiff`` and the optional
    ``wiff_scan``.  The primary file is an HTML index listing them
    (see ``generate_primary_file``).
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__( self, **kwd ):
        """Initialise the binary datatype and declare its component files."""
        Binary.__init__( self, **kwd )
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True )
        # NOTE(review): ``optional`` is passed as the string 'True', not a
        # boolean.  It is truthy either way; kept unchanged until it is
        # confirmed that no caller compares against the string.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional='True',
            is_binary=True )

    def generate_primary_file( self, dataset=None ):
        """Build the HTML index used as the primary file of the dataset.

        :param dataset: forwarded to ``get_composite_files``.
        :returns: HTML text with one ``<li>`` entry per composite file.
        """
        rval = [ '<html><head><title>Wiff Composite Dataset </title></head><p/>' ]
        rval.append( '<div>This composite dataset is composed of the following files:<p/><ul>' )
        # items() works on Python 2 and 3; iteritems() exists on Python 2 only.
        for composite_name, composite_file in self.get_composite_files( dataset=dataset ).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get( 'description' )
            if description:
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )


# Register the extension only when Binary exposes the registration hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'wiff' )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
53
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
54
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class IdpDB( Binary ):
    """Binary datatype for files carrying the ``idpDB`` extension."""
    file_ext = 'idpDB'


# The hook is probed first; the call is skipped when Binary does not offer it.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'idpDB' )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
60
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
61
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__( self, **kwd ):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            # Spelling corrected; the header used to read 'Probabaility'.
            'Interprophet Probability',
        ]

    def display_peek( self, dataset ):
        """Return the peek as an HTML table headed by ``self.column_names``."""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
73
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
74
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProtXmlReport( Tabular ):
    """Tabular report produced from a protxml file."""
    file_ext = "tsv"
    comment_lines = 1

    def __init__( self, **kwd ):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__( self, **kwd )
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek( self, dataset ):
        """Return the peek as an HTML table headed by ``self.column_names``."""
        html_table = Tabular.make_html_table( self, dataset, column_names=self.column_names )
        return html_table
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
87
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regular-expression fragment naming the
    root element) and ``set_peek`` reads ``self.blurb``; the subclasses in
    this file define both.
    """

    def sniff( self, filename ):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` are skipped; the first line
        after them must begin with ``<root`` or ``<ns:root`` (any namespace
        prefix), where ``root`` is ``self.root``.

        :param filename: path of the file to inspect.
        :returns: True when that line opens the expected root element.
        """
        # Raw string so the ``\w`` escape is handed to the regex engine as-is.
        pattern = r'^<(\w*:)?%s' % self.root
        with open( filename, 'r' ) as contents:
            line = contents.readline()
            while line.startswith( '<?' ):
                line = contents.readline()
        # readline() returns '' (never None) at end of file, and '' cannot
        # match the pattern, so no separate end-of-file check is needed.
        return re.match( pattern, line ) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
110
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
111
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class PepXml( ProteomicsXml ):
    """pepXML datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'msms_pipeline_analysis'
    blurb = 'pepXML data'
    file_ext = 'pepxml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
117
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
118
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzML( ProteomicsXml ):
    """mzML datatype; ``root`` is a regex alternation covering plain and indexed mzML."""
    root = '(mzML|indexedmzML)'
    blurb = 'mzML Mass Spectrometry data'
    file_ext = 'mzml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
124
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
125
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProtXML( ProteomicsXml ):
    """protXML datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'protein_summary'
    blurb = 'prot XML Search Results'
    file_ext = 'protxml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
131
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
132
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzXML( ProteomicsXml ):
    """mzXML datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
    file_ext = 'mzxml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
138
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
139 ## PSI datatypes
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzIdentML( ProteomicsXml ):
    """mzid datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
144
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
145
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class TraML( ProteomicsXml ):
    """traml datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'TraML'
    blurb = 'TraML transition list'
    file_ext = 'traml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
150
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
151
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzQuantML( ProteomicsXml ):
    """mzq datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'MzQuantML'
    blurb = 'XML quantification data'
    file_ext = 'mzq'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
156
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
157
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ConsensusXML( ProteomicsXml ):
    """consensusxml datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'consensusXML'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    file_ext = 'consensusxml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
162
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
163
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class FeatureXML( ProteomicsXml ):
    """featurexml datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'featureMap'
    blurb = 'OpenMS feature file'
    file_ext = 'featurexml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
168
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
169
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class IdXML( ProteomicsXml ):
    """idxml datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'IdXML'
    blurb = 'OpenMS identification file'
    file_ext = 'idxml'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
174
10
ef74edade8be Uploaded
iracooke
parents: 9
diff changeset
class TandemXML( ProteomicsXml ):
    """tandem datatype; ``root`` names the element the inherited sniffer looks for."""
    root = 'bioml'
    blurb = 'X!Tandem search results file'
    file_ext = 'tandem'
9
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
179
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True when a ``BEGIN IONS`` line appears near the top of the file.

        A line must equal ``BEGIN IONS`` once its trailing newline characters
        are removed.  Scanning stops after the line whose zero-based index
        exceeds ``max_lines``.

        :param filename: path of the file to inspect.
        :returns: True on a match, otherwise False (including short files
            that end before any match).
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() inside ``with`` replaces the Python 2 only file() builtin
        # and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    break
        return False
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
203
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
204
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True when the Mascot MIME-Version line appears near the top of the file.

        A line must equal the expected header once its trailing newline
        characters are removed.  Scanning stops after the line whose
        zero-based index exceeds ``max_lines``.

        :param filename: path of the file to inspect.
        :returns: True on a match, otherwise False (including short files
            that end before any match).
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() inside ``with`` replaces the Python 2 only file() builtin
        # and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    break
        return False
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
229
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
230
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True when the file header contains the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True when the first 20 bytes contain ``F\\0i\\0n\\0n\\0i\\0g\\0a\\0n``;
            False otherwise, or when the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        finnigan = b'F\0i\0n\0n\0i\0g\0a\0n'
        try:
            # Binary mode: the header is raw bytes, not text.
            with open( filename, 'rb' ) as handle:
                header = handle.read( 20 )
        except ( IOError, OSError ):
            return False
        # Searching the bytes directly (rather than their hex dump) cannot
        # produce a match that starts in the middle of a byte.
        return finnigan in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and use the dataset size as the blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generated description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )


# Register the sniffer only when Binary exposes the registration hook.
if hasattr( Binary, 'register_sniffable_binary_format' ):
    Binary.register_sniffable_binary_format( 'raw', 'raw', RAW )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
264
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
265
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with( contents, prefix ):
        """Read one line from ``contents`` and report whether it starts with ``prefix``.

        :param contents: object providing ``readline()``.
        :param prefix: string the line is expected to start with.
        :returns: True when the line read starts with ``prefix``.
        """
        next_line = contents.readline()
        # Identity test is the idiomatic None check; for file objects
        # readline() returns '' at end of file, which fails startswith().
        return next_line is not None and next_line.startswith( prefix )

    def sniff( self, filename ):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open( filename, 'r' ) as contents:
            return Msp.next_line_starts_with( contents, "Name:" ) and Msp.next_line_starts_with( contents, "MW:" )
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
287
16
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
288
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
class SPLib( Msp ):
    """SpectraST Spectral Library. Closely related to msp format

    Declared as a composite datatype with three component files
    (``library.splib``, ``library.spidx``, ``library.pepidx``); the primary
    file is an HTML index listing them.
    """
    file_ext = "splib"
    composite_type = 'auto_primary_file'

    def __init__( self, **kwd ):
        """Initialise the parent datatype and declare the component files."""
        Msp.__init__( self, **kwd )
        self.add_composite_file(
            'library.splib',
            description='Spectral Library. Contains actual library spectra',
            is_binary=False )
        self.add_composite_file(
            'library.spidx',
            description='Spectrum index',
            is_binary=False )
        self.add_composite_file(
            'library.pepidx',
            description='Peptide index',
            is_binary=False )

    def generate_primary_file( self, dataset=None ):
        """Build the HTML index used as the primary file of the dataset.

        :param dataset: forwarded to ``get_composite_files``.
        :returns: HTML text with one ``<li>`` entry per composite file.
        """
        rval = [ '<html><head><title>Spectral Library Composite Dataset </title></head><p/>' ]
        rval.append( '<div>This composite dataset is composed of the following files:<p/><ul>' )
        # items() works on Python 2 and 3; iteritems() exists on Python 2 only.
        for composite_name, composite_file in self.get_composite_files( dataset=dataset ).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get( 'description' )
            if description:
                rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'splib Spectral Library Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """ Determines whether the file is a SpectraST generated file.

        The first line must start with ``Name:`` and the second with ``LibID:``.
        """
        with open( filename, 'r' ) as contents:
            return Msp.next_line_starts_with( contents, "Name:" ) and Msp.next_line_starts_with( contents, "LibID:" )
29c43b953c1c Fix splib sniffer
iracooke
parents: 16
diff changeset
337
29c43b953c1c Fix splib sniffer
iracooke
parents: 16
diff changeset
338
9
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Ms2( Text ):
    """Text datatype for files carrying the ``ms2`` extension."""
    file_ext = "ms2"

    def sniff( self, filename ):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        each of the four required header fields must prefix-match at least
        one of them.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = [ 'CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions' ]
        header_lines = []
        with open( filename, 'r' ) as contents:
            # Iterating the handle ends at end of file.  The earlier
            # readline() loop never left its ``pass`` branch once readline()
            # returned '', so empty or header-only files hung the sniffer.
            for line in contents:
                if not line.startswith( 'H\t' ):
                    break
                header_lines.append( line )

        for header_field in required_fields:
            prefix = 'H\t%s' % header_field
            if not any( header_line.startswith( prefix ) for header_line in header_lines ):
                return False
        return True
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
373
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
374 # unsniffable binary format, should do something about this
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class XHunterAslFormat( Binary ):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = 'hlf'


# The hook is probed first; the call is skipped when Binary does not offer it.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( 'hlf' )