comparison proteomics.py @ 5:df8b867ab71a draft

Uploaded
author bgruening
date Fri, 07 Feb 2014 09:21:23 -0500
parents 09b89b345de2
children b82d4034e0f8
comparison
equal deleted inserted replaced
4:09b89b345de2 5:df8b867ab71a
1 """ 1 """
2 Proteomics format classes 2 Proteomics format classes
3 """ 3 """
4 import logging 4 import logging
5 import re 5 import re
6 from galaxy.datatypes.data import * 6 import binascii
7 from galaxy.datatypes.xml import * 7
8 from galaxy.datatypes.sniff import * 8 from galaxy.datatypes.sniff import *
9 from galaxy.datatypes.binary import * 9 from galaxy.datatypes.data import Text
10 from galaxy.datatypes.interval import * 10 from galaxy.datatypes.xml import GenericXml
11 from galaxy.datatypes.binary import Binary
12 from galaxy.datatypes.tabular import Tabular
13 from galaxy.datatypes.interval import Gff
11 14
12 log = logging.getLogger(__name__) 15 log = logging.getLogger(__name__)
13 16
14 class ProtGff( Gff ): 17 class ProtGff( Gff ):
15 """Tab delimited data in Gff format""" 18 """Tab delimited data in Gff format"""
51 try: 54 try:
52 return dataset.peek 55 return dataset.peek
53 except: 56 except:
54 return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) 57 return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )
55 58
56 class ProteomicsXml(GenericXml): 59 class IdpDB( Binary ):
60 file_ext = "idpDB"
61
62 if hasattr(Binary, 'register_unsniffable_binary_ext'):
63 Binary.register_unsniffable_binary_ext('idpDB')
64
65
66 class PepXmlReport( Tabular ):
67 """pepxml converted to tabular report"""
68 file_ext = "tsv"
69
70 def __init__(self, **kwd):
71 Tabular.__init__( self, **kwd )
72 self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probabaility']
73
74 def display_peek( self, dataset ):
75 """Returns formatted html of peek"""
76 return Tabular.make_html_table( self, dataset, column_names=self.column_names )
77
78
79 class ProtXmlReport( Tabular ):
80 """protxml converted to tabular report"""
81 file_ext = "tsv"
82 comment_lines = 1
83
84 def __init__(self, **kwd):
85 Tabular.__init__( self, **kwd )
86 self.column_names = ["Entry Number", "Group Probability", "Protein", "Protein Link", "Protein Probability", "Percent Coverage", "Number of Unique Peptides", "Total Independent Spectra", "Percent Share of Spectrum ID's", "Description", "Protein Molecular Weight", "Protein Length", "Is Nondegenerate Evidence", "Weight", "Precursor Ion Charge", "Peptide sequence", "Peptide Link", "NSP Adjusted Probability", "Initial Probability", "Number of Total Termini", "Number of Sibling Peptides Bin", "Number of Instances", "Peptide Group Designator", "Is Evidence?"]
87
88 def display_peek( self, dataset ):
89 """Returns formatted html of peek"""
90 return Tabular.make_html_table( self, dataset, column_names=self.column_names )
91
92 class ProteomicsXml( GenericXml ):
57 """ An enhanced XML datatype used to reuse code across several 93 """ An enhanced XML datatype used to reuse code across several
58 proteomic/mass-spec datatypes. """ 94 proteomic/mass-spec datatypes. """
59 95
60 def sniff(self, filename): 96 def sniff(self, filename):
61 """ Determines whether the file is the correct XML type. """ 97 """ Determines whether the file is the correct XML type. """
62 with open(filename, 'r') as contents: 98 with open(filename, 'r') as contents:
63 while True: 99 while True:
64 line = contents.readline() 100 line = contents.readline()
65 if line == None or not line.startswith('<?'): 101 if line == None or not line.startswith('<?'):
66 break 102 break
67 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string 103 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string
74 dataset.blurb = self.blurb 110 dataset.blurb = self.blurb
75 else: 111 else:
76 dataset.peek = 'file does not exist' 112 dataset.peek = 'file does not exist'
77 dataset.blurb = 'file purged from disk' 113 dataset.blurb = 'file purged from disk'
78 114
115
79 class PepXml(ProteomicsXml): 116 class PepXml(ProteomicsXml):
80 """pepXML data""" 117 """pepXML data"""
81 file_ext = "pepxml" 118 file_ext = "pepxml"
82 blurb = 'pepXML data' 119 blurb = 'pepXML data'
83 root = "msms_pipeline_analysis" 120 root = "msms_pipeline_analysis"
84 121
85 122
86 class MzML(ProteomicsXml): 123 class MzML(ProteomicsXml):
87 """mzML data""" 124 """mzML data"""
88 file_ext = "mzml" 125 file_ext = "mzml"
89 blurb = 'mzML Mass Spectrometry data' 126 blurb = 'mzML Mass Spectrometry data'
97 root = "protein_summary" 134 root = "protein_summary"
98 135
99 136
100 class MzXML(ProteomicsXml): 137 class MzXML(ProteomicsXml):
101 """mzXML data""" 138 """mzXML data"""
102 file_ext = "mzXML" 139 file_ext = "mzxml"
103 blurb = "mzXML Mass Spectrometry data" 140 blurb = "mzXML Mass Spectrometry data"
104 root = "mzXML" 141 root = "mzXML"
105 142
106 ## PSI datatypes 143 ## PSI datatypes
107 class MzIdentML(ProteomicsXml): 144 class MzIdentML(ProteomicsXml):
108 file_ext = "mzid" 145 file_ext = "mzid"
109 blurb = "XML identified peptides and proteins." 146 blurb = "XML identified peptides and proteins."
110 root = "MzIdentML" 147 root = "MzIdentML"
111 148
112 149
113 class TraML(ProteomicsXml): 150 class TraML(ProteomicsXml):
114 file_ext = "traML" 151 file_ext = "traml"
115 blurb = "TraML transition list" 152 blurb = "TraML transition list"
116 root = "TraML" 153 root = "TraML"
117 154
118 155
119 class MzQuantML(ProteomicsXml): 156 class MzQuantML(ProteomicsXml):
120 file_ext = "mzq" 157 file_ext = "mzq"
121 blurb = "XML quantification data" 158 blurb = "XML quantification data"
122 root = "MzQuantML" 159 root = "MzQuantML"
123 160
124 161
162 class ConsensusXML(ProteomicsXml):
163 file_ext = "consensusxml"
164 blurb = "OpenMS multiple LC-MS map alignment file"
165 root = "consensusXML"
166
167
168 class FeatureXML(ProteomicsXml):
169 file_ext = "featurexml"
170 blurb = "OpenMS feature file"
171 root = "featureMap"
172
173
174 class IdXML(ProteomicsXml):
175 file_ext = "idxml"
176 blurb = "OpenMS identification file"
177 root = "IdXML"
178
179
125 class Mgf( Text ): 180 class Mgf( Text ):
126 """Mascot Generic Format data""" 181 """Mascot Generic Format data"""
127 file_ext = "mgf" 182 file_ext = "mgf"
128 183
129 def set_peek( self, dataset, is_multi_byte=False ): 184 def set_peek( self, dataset, is_multi_byte=False ):
132 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) 187 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
133 dataset.blurb = 'mgf Mascot Generic Format' 188 dataset.blurb = 'mgf Mascot Generic Format'
134 else: 189 else:
135 dataset.peek = 'file does not exist' 190 dataset.peek = 'file does not exist'
136 dataset.blurb = 'file purged from disk' 191 dataset.blurb = 'file purged from disk'
137
138 192
139 def sniff( self, filename ): 193 def sniff( self, filename ):
140 mgf_begin_ions = "BEGIN IONS" 194 mgf_begin_ions = "BEGIN IONS"
141 max_lines=100 195 max_lines=100
142 196
144 line = line.rstrip( '\n\r' ) 198 line = line.rstrip( '\n\r' )
145 if line==mgf_begin_ions: 199 if line==mgf_begin_ions:
146 return True 200 return True
147 if i>max_lines: 201 if i>max_lines:
148 return False 202 return False
149 203
150 204
151 class MascotDat( Text ): 205 class MascotDat( Text ):
152 """Mascot search results """ 206 """Mascot search results """
153 file_ext = "mascotdat" 207 file_ext = "mascotdat"
154 208
155 def set_peek( self, dataset, is_multi_byte=False ): 209 def set_peek( self, dataset, is_multi_byte=False ):
204 except: 258 except:
205 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) ) 259 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )
206 260
207 261
208 if hasattr(Binary, 'register_sniffable_binary_format'): 262 if hasattr(Binary, 'register_sniffable_binary_format'):
209 Binary.register_sniffable_binary_format('RAW', 'RAW', RAW) 263 Binary.register_sniffable_binary_format('raw', 'raw', RAW)
210 264
211 265
212 class Msp(Text): 266 class Msp( Text ):
213 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ 267 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
214 file_ext = "msp" 268 file_ext = "msp"
215 269
216 @staticmethod 270 @staticmethod
217 def next_line_starts_with(contents, prefix): 271 def next_line_starts_with(contents, prefix):
265 return False 319 return False
266 320
267 return True 321 return True
268 322
269 # unsniffable binary format, should do something about this 323 # unsniffable binary format, should do something about this
270 class XHunterAslFormat(Binary): 324 class XHunterAslFormat( Binary ):
271 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ 325 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
272 file_ext = "hlf" 326 file_ext = "hlf"
273 327
274
275 if hasattr(Binary, 'register_unsniffable_binary_ext'): 328 if hasattr(Binary, 'register_unsniffable_binary_ext'):
276 Binary.register_unsniffable_binary_ext('hlf') 329 Binary.register_unsniffable_binary_ext('hlf')