annotate proteomics.py @ 20:300fc3aa6954 draft default tip

planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit 298f29ddf450b5821c22cae53615f03d96334d32
author iracooke
date Thu, 04 Jun 2015 08:10:37 -0400
parents e5551a35e508
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
1 """
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
2 Proteomics format classes
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
3 """
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
4 import logging
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
5 import re
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
6 import binascii
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
7
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
8 from galaxy.datatypes.sniff import *
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
9 from galaxy.datatypes import data
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
10 from galaxy.datatypes.data import Text
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
11 from galaxy.datatypes.xml import GenericXml
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
12 from galaxy.datatypes.binary import Binary
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
13 from galaxy.datatypes.tabular import Tabular
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
14 from galaxy.datatypes.interval import Gff
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
15
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
16 log = logging.getLogger(__name__)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
17
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
18
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Wiff(Binary):
    """Class for wiff files.

    A composite datatype: the dataset consists of a ``wiff`` file plus an
    optional ``wiff_scan`` file, and the primary file is an HTML index
    produced by :meth:`generate_primary_file`.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its two component files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True)
        # NOTE(review): ``optional`` is passed as the *string* 'True' (kept
        # as-is); it is only truthy by virtue of being a non-empty string.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional='True', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Return the HTML index page listing every component file.

        :param dataset: forwarded to ``get_composite_files``.
        :returns: the page as a single newline-joined string.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # ``items()`` instead of the Python 2-only ``iteritems()`` so the
        # method works on both Python 2 and Python 3.
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (fn, fn, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (fn, fn, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
48
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
49
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
50
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Register 'wiff' as an unsniffable binary extension, but only when the
# Binary class in use actually provides that registration hook.
_register_unsniffable = getattr(Binary, 'register_unsniffable_binary_ext', None)
if _register_unsniffable is not None:
    _register_unsniffable('wiff')
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
53
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
54
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class IdpDB(Binary):
    """Binary datatype for files carrying the ``idpDB`` extension."""
    file_ext = 'idpDB'
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
57
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Register 'idpDB' as an unsniffable binary extension, but only when the
# Binary class in use actually provides that registration hook.
_register_unsniffable = getattr(Binary, 'register_unsniffable_binary_ext', None)
if _register_unsniffable is not None:
    _register_unsniffable('idpDB')
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
60
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
61
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class PepXmlReport(Tabular):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        # NOTE(review): the last label is misspelt ('Probabaility'); it is a
        # runtime string and is left untouched here.
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probabaility',
        ]

    def display_peek(self, dataset):
        """Return the peek as an HTML table headed by ``self.column_names``."""
        headers = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=headers)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
73
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
74
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProtXmlReport(Tabular):
    """protxml converted to tabular report"""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]

    def display_peek(self, dataset):
        """Return the peek as an HTML table headed by ``self.column_names``."""
        headers = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=headers)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
87
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` and ``set_peek`` reads ``self.blurb``;
    neither attribute is assigned in this class body.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Skips leading lines that start with ``<?`` and then checks that the
        first remaining line opens the element named by ``self.root``,
        optionally qualified by a namespace prefix.

        :param filename: path of the file to inspect.
        :returns: True when the line matches, False otherwise (including
                  when the end of the file is reached first).
        """
        with open(filename, 'r') as contents:
            line = contents.readline()
            # readline() yields '' at end of file, which also ends the loop.
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string so ``\w`` is a regex class rather than a string escape.
        # ``self.root`` is interpolated as a regex fragment: <root or <ns:root.
        pattern = r'^<(\w*:)?%s' % self.root
        return re.match(pattern, line) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = self.blurb
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
110
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
111
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class PepXml(ProteomicsXml):
    """pepXML data"""
    # Root element name used by the inherited sniffer.
    root = "msms_pipeline_analysis"
    blurb = 'pepXML data'
    file_ext = "pepxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
117
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
118
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzML(ProteomicsXml):
    """mzML data"""
    # Regex alternation: either root element name is accepted by the
    # inherited sniffer.
    root = "(mzML|indexedmzML)"
    blurb = 'mzML Mass Spectrometry data'
    file_ext = "mzml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
124
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
125
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ProtXML(ProteomicsXml):
    """protXML data"""
    # Root element name used by the inherited sniffer.
    root = "protein_summary"
    blurb = 'prot XML Search Results'
    file_ext = "protxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
131
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
132
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzXML(ProteomicsXml):
    """mzXML data"""
    # Root element name used by the inherited sniffer.
    root = "mzXML"
    blurb = "mzXML Mass Spectrometry data"
    file_ext = "mzxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
138
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
139 ## PSI datatypes
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzIdentML(ProteomicsXml):
    """XML datatype with root element ``MzIdentML`` (extension ``mzid``)."""
    root = "MzIdentML"
    blurb = "XML identified peptides and proteins."
    file_ext = "mzid"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
144
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
145
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class TraML(ProteomicsXml):
    """XML datatype with root element ``TraML`` (extension ``traml``)."""
    root = "TraML"
    blurb = "TraML transition list"
    file_ext = "traml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
150
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
151
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MzQuantML(ProteomicsXml):
    """XML datatype with root element ``MzQuantML`` (extension ``mzq``)."""
    root = "MzQuantML"
    blurb = "XML quantification data"
    file_ext = "mzq"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
156
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
157
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class ConsensusXML(ProteomicsXml):
    """XML datatype with root element ``consensusXML`` (extension ``consensusxml``)."""
    root = "consensusXML"
    blurb = "OpenMS multiple LC-MS map alignment file"
    file_ext = "consensusxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
162
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
163
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class FeatureXML(ProteomicsXml):
    """XML datatype with root element ``featureMap`` (extension ``featurexml``)."""
    root = "featureMap"
    blurb = "OpenMS feature file"
    file_ext = "featurexml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
168
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
169
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class IdXML(ProteomicsXml):
    """XML datatype with root element ``IdXML`` (extension ``idxml``)."""
    root = "IdXML"
    blurb = "OpenMS identification file"
    file_ext = "idxml"
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
174
10
ef74edade8be Uploaded
iracooke
parents: 9
diff changeset
class TandemXML(ProteomicsXml):
    """XML datatype with root element ``bioml`` (extension ``tandem``)."""
    root = "bioml"
    blurb = "X!Tandem search results file"
    file_ext = "tandem"
9
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
179
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = 'mgf Mascot Generic Format'

    def sniff(self, filename):
        """Report whether a line near the top of the file is exactly ``BEGIN IONS``.

        :param filename: path of the file to inspect.
        :returns: True as soon as a matching line is found; False once the
                  line limit is passed or the file ends without a match.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # ``open`` in a ``with`` block replaces the Python 2-only ``file()``
        # builtin and guarantees the handle is closed on every return path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                # Same bound as before: the scan stops after index max_lines.
                if i > max_lines:
                    return False
        # Short file without the marker: return an explicit False, not None.
        return False
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
203
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
204
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = 'mascotdat Mascot Search Results'

    def sniff(self, filename):
        """Report whether a line near the top of the file is the Mascot MIME header.

        :param filename: path of the file to inspect.
        :returns: True as soon as a line equals the expected header; False
                  once the line limit is passed or the file ends without a
                  match.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # ``open`` in a ``with`` block replaces the Python 2-only ``file()``
        # builtin and guarantees the handle is closed on every return path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                # Same bound as before: the scan stops after index max_lines.
                if i > max_lines:
                    return False
        # Short file without the header: return an explicit False, not None.
        return False
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
229
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
230
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Report whether the file header carries the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True when the signature occurs within the first 20 bytes,
                  False otherwise or when the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode plus ``with``: no text decoding of binary data and
            # the handle is always closed.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except Exception:
            # Best effort, as before: an unreadable file simply does not match.
            return False
        # Search the raw bytes directly; searching hex strings could match at
        # an odd nibble offset that does not correspond to real bytes.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek string and a human-readable size as the blurb."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = "Thermo Finnigan RAW file"
        dataset.blurb = data.nice_size(dataset.get_size())

    def display_peek(self, dataset):
        """Return the stored peek, or a generated description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
260
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
261
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Register the RAW sniffer for the 'raw' extension, but only when the Binary
# class in use actually provides that registration hook.
_register_sniffable = getattr(Binary, 'register_sniffable_binary_format', None)
if _register_sniffable is not None:
    _register_sniffable('raw', 'raw', RAW)
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
264
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
265
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and test its prefix.

        :param contents: an open file-like object providing ``readline()``.
        :param prefix: string the next line must start with.
        :returns: True if the line read starts with ``prefix``; False
                  otherwise, including at end of file where ``readline()``
                  yields an empty string.
        """
        # readline() never returns None, so no None check is needed.
        return contents.readline().startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:")
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
287
19
e5551a35e508 planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents: 18
diff changeset
class SPLibNoIndex(Text):
    """SPlib without index file """
    file_ext = "splib"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            # Nothing left on disk to preview.
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = 'Spectral Library without index files'
e5551a35e508 planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents: 18
diff changeset
300
16
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
301
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
302 class SPLib( Msp ):
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
303 """SpectraST Spectral Library. Closely related to msp format"""
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
304 file_ext = "splib"
18
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
305 composite_type = 'auto_primary_file'
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
306
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
307 def __init__(self, **kwd):
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
308 Msp.__init__(self, **kwd)
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
309 self.add_composite_file( 'library.splib',
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
310 description = 'Spectral Library. Contains actual library spectra',
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
311 is_binary = False )
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
312 self.add_composite_file( 'library.spidx',
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
313 description = 'Spectrum index', is_binary = False )
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
314 self.add_composite_file( 'library.pepidx',
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
315 description = 'Peptide index', is_binary = False)
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
316
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
317
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
def generate_primary_file( self, dataset = None ):
    """Build the HTML index page that lists the composite dataset's files.

    Every entry returned by ``self.get_composite_files`` becomes one list
    item linking to the file name. The entry's description is shown in
    parentheses when it has one, and ``(optional)`` is appended when the
    entry is flagged optional.

    :param dataset: passed through unchanged to ``get_composite_files``.
    :returns: the HTML fragments joined with newlines, as one string.
    """
    rval = ['<html><head><title>Spectral Library Composite Dataset </title></head><p/>']
    rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
    # .items() rather than .iteritems(): the latter only exists on
    # Python 2 mappings, while .items() works on both Python 2 and 3.
    for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
        fn = composite_name
        opt_text = ' (optional)' if composite_file.optional else ''
        # Look the description up once; it is used for both the branch
        # decision and the rendered text.
        description = composite_file.get('description')
        if description:
            rval.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
        else:
            rval.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
    rval.append( '</ul></div></html>' )
    return "\n".join( rval )
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
332
d1ea609e57d4 Make splib a composite datatype
iracooke
parents: 17
diff changeset
333
16
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
334
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
def set_peek( self, dataset, is_multi_byte=False ):
    """Populate ``dataset.peek`` and ``dataset.blurb``.

    For a purged dataset both fields receive fixed placeholder messages.
    Otherwise the peek is taken from ``data.get_file_peek`` on
    ``dataset.file_name`` and the blurb is a fixed format label.

    :param dataset: object exposing ``dataset.purged``, ``file_name``,
        ``peek`` and ``blurb``.
    :param is_multi_byte: forwarded to ``data.get_file_peek``.
    """
    # Handle the purged case first: there is no file left to peek into.
    if dataset.dataset.purged:
        dataset.peek = 'file does not exist'
        dataset.blurb = 'file purged from disk'
        return

    dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
    dataset.blurb = 'splib Spectral Library Format'
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
343
e6a02a387448 planemo upload commit 4f23896d1a519a87f189c6238a95b8f5b5933e9a-dirty
iracooke
parents: 12
diff changeset
344
17
29c43b953c1c Fix splib sniffer
iracooke
parents: 16
diff changeset
def sniff(self, filename):
    """Determine whether the file is a SpectraST generated file.

    Two consecutive checks are made through ``Msp.next_line_starts_with``
    on the same open handle: first for ``"Name:"``, then for ``"LibID:"``.
    The second check only runs when the first one succeeds.
    """
    with open(filename, 'r') as handle:
        has_name = Msp.next_line_starts_with(handle, "Name:")
        # `and` short-circuits, so the handle is not touched for the
        # LibID check unless the Name check passed.
        return has_name and Msp.next_line_starts_with(handle, "LibID:")
29c43b953c1c Fix splib sniffer
iracooke
parents: 16
diff changeset
350
29c43b953c1c Fix splib sniffer
iracooke
parents: 16
diff changeset
351
9
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
class Ms2(Text):
    """Text datatype for ms2 files."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected as
        the header; reading stops at the first line that is not a header
        line or at end of file. The file is accepted only when every
        required header field is present in that run.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = ('CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions')

        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file ends cleanly at EOF. The previous
            # readline() loop ignored the empty string returned at EOF and
            # so never terminated for empty or header-only files.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        # NOTE: this is a prefix match, so the 'Extractor' requirement is
        # also satisfied by an 'ExtractorVersion' or 'ExtractorOptions' line.
        for header_field in required_fields:
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
386
6ca516faacfc Uploaded
iracooke
parents:
diff changeset
# Binary format with no sniffer; ideally this should be made sniffable.
class XHunterAslFormat( Binary ):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """
    file_ext = "hlf"


# Only register when this Binary base class offers the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext(XHunterAslFormat.file_ext)
19
e5551a35e508 planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents: 18
diff changeset
394
e5551a35e508 planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents: 18
diff changeset
395
e5551a35e508 planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents: 18
diff changeset
class Sf3(Binary):
    """Datatype for Scaffold SF3 files."""
    file_ext = "sf3"


# Only register when this Binary base class offers the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext(Sf3.file_ext)
e5551a35e508 planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents: 18
diff changeset
402
e5551a35e508 planemo upload for repository https://github.com/iracooke/proteomics-datatypes commit a71820c91d9d7629415a27526bbf700800d12f3f
iracooke
parents: 18
diff changeset
403