Mercurial > repos > iracooke > proteomics_datatypes
comparison proteomics.py @ 5:df8b867ab71a draft
Uploaded
author | bgruening |
---|---|
date | Fri, 07 Feb 2014 09:21:23 -0500 |
parents | 09b89b345de2 |
children | b82d4034e0f8 |
comparison
equal
deleted
inserted
replaced
4:09b89b345de2 | 5:df8b867ab71a |
---|---|
1 """ | 1 """ |
2 Proteomics format classes | 2 Proteomics format classes |
3 """ | 3 """ |
4 import logging | 4 import logging |
5 import re | 5 import re |
6 from galaxy.datatypes.data import * | 6 import binascii |
7 from galaxy.datatypes.xml import * | 7 |
8 from galaxy.datatypes.sniff import * | 8 from galaxy.datatypes.sniff import * |
9 from galaxy.datatypes.binary import * | 9 from galaxy.datatypes.data import Text |
10 from galaxy.datatypes.interval import * | 10 from galaxy.datatypes.xml import GenericXml |
11 from galaxy.datatypes.binary import Binary | |
12 from galaxy.datatypes.tabular import Tabular | |
13 from galaxy.datatypes.interval import Gff | |
11 | 14 |
12 log = logging.getLogger(__name__) | 15 log = logging.getLogger(__name__) |
13 | 16 |
14 class ProtGff( Gff ): | 17 class ProtGff( Gff ): |
15 """Tab delimited data in Gff format""" | 18 """Tab delimited data in Gff format""" |
51 try: | 54 try: |
52 return dataset.peek | 55 return dataset.peek |
53 except: | 56 except: |
54 return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) | 57 return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) |
55 | 58 |
56 class ProteomicsXml(GenericXml): | 59 class IdpDB( Binary ): |
60 file_ext = "idpDB" | |
61 | |
62 if hasattr(Binary, 'register_unsniffable_binary_ext'): | |
63 Binary.register_unsniffable_binary_ext('idpDB') | |
64 | |
65 | |
66 class PepXmlReport( Tabular ): | |
67 """pepxml converted to tabular report""" | |
68 file_ext = "tsv" | |
69 | |
70 def __init__(self, **kwd): | |
71 Tabular.__init__( self, **kwd ) | |
72 self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probabaility'] | |
73 | |
74 def display_peek( self, dataset ): | |
75 """Returns formatted html of peek""" | |
76 return Tabular.make_html_table( self, dataset, column_names=self.column_names ) | |
77 | |
78 | |
79 class ProtXmlReport( Tabular ): | |
80 """protxml converted to tabular report""" | |
81 file_ext = "tsv" | |
82 comment_lines = 1 | |
83 | |
84 def __init__(self, **kwd): | |
85 Tabular.__init__( self, **kwd ) | |
86 self.column_names = ["Entry Number", "Group Probability", "Protein", "Protein Link", "Protein Probability", "Percent Coverage", "Number of Unique Peptides", "Total Independent Spectra", "Percent Share of Spectrum ID's", "Description", "Protein Molecular Weight", "Protein Length", "Is Nondegenerate Evidence", "Weight", "Precursor Ion Charge", "Peptide sequence", "Peptide Link", "NSP Adjusted Probability", "Initial Probability", "Number of Total Termini", "Number of Sibling Peptides Bin", "Number of Instances", "Peptide Group Designator", "Is Evidence?"] | |
87 | |
88 def display_peek( self, dataset ): | |
89 """Returns formatted html of peek""" | |
90 return Tabular.make_html_table( self, dataset, column_names=self.column_names ) | |
91 | |
92 class ProteomicsXml( GenericXml ): | |
57 """ An enhanced XML datatype used to reuse code across several | 93 """ An enhanced XML datatype used to reuse code across several |
58 proteomic/mass-spec datatypes. """ | 94 proteomic/mass-spec datatypes. """ |
59 | 95 |
60 def sniff(self, filename): | 96 def sniff(self, filename): |
61 """ Determines whether the file is the correct XML type. """ | 97 """ Determines whether the file is the correct XML type. """ |
62 with open(filename, 'r') as contents: | 98 with open(filename, 'r') as contents: |
63 while True: | 99 while True: |
64 line = contents.readline() | 100 line = contents.readline() |
65 if line == None or not line.startswith('<?'): | 101 if line == None or not line.startswith('<?'): |
66 break | 102 break |
67 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string | 103 pattern = '^<(\w*:)?%s' % self.root # pattern match <root or <ns:root for any ns string |
74 dataset.blurb = self.blurb | 110 dataset.blurb = self.blurb |
75 else: | 111 else: |
76 dataset.peek = 'file does not exist' | 112 dataset.peek = 'file does not exist' |
77 dataset.blurb = 'file purged from disk' | 113 dataset.blurb = 'file purged from disk' |
78 | 114 |
115 | |
79 class PepXml(ProteomicsXml): | 116 class PepXml(ProteomicsXml): |
80 """pepXML data""" | 117 """pepXML data""" |
81 file_ext = "pepxml" | 118 file_ext = "pepxml" |
82 blurb = 'pepXML data' | 119 blurb = 'pepXML data' |
83 root = "msms_pipeline_analysis" | 120 root = "msms_pipeline_analysis" |
84 | 121 |
85 | 122 |
86 class MzML(ProteomicsXml): | 123 class MzML(ProteomicsXml): |
87 """mzML data""" | 124 """mzML data""" |
88 file_ext = "mzml" | 125 file_ext = "mzml" |
89 blurb = 'mzML Mass Spectrometry data' | 126 blurb = 'mzML Mass Spectrometry data' |
97 root = "protein_summary" | 134 root = "protein_summary" |
98 | 135 |
99 | 136 |
100 class MzXML(ProteomicsXml): | 137 class MzXML(ProteomicsXml): |
101 """mzXML data""" | 138 """mzXML data""" |
102 file_ext = "mzXML" | 139 file_ext = "mzxml" |
103 blurb = "mzXML Mass Spectrometry data" | 140 blurb = "mzXML Mass Spectrometry data" |
104 root = "mzXML" | 141 root = "mzXML" |
105 | 142 |
106 ## PSI datatypes | 143 ## PSI datatypes |
107 class MzIdentML(ProteomicsXml): | 144 class MzIdentML(ProteomicsXml): |
108 file_ext = "mzid" | 145 file_ext = "mzid" |
109 blurb = "XML identified peptides and proteins." | 146 blurb = "XML identified peptides and proteins." |
110 root = "MzIdentML" | 147 root = "MzIdentML" |
111 | 148 |
112 | 149 |
113 class TraML(ProteomicsXml): | 150 class TraML(ProteomicsXml): |
114 file_ext = "traML" | 151 file_ext = "traml" |
115 blurb = "TraML transition list" | 152 blurb = "TraML transition list" |
116 root = "TraML" | 153 root = "TraML" |
117 | 154 |
118 | 155 |
119 class MzQuantML(ProteomicsXml): | 156 class MzQuantML(ProteomicsXml): |
120 file_ext = "mzq" | 157 file_ext = "mzq" |
121 blurb = "XML quantification data" | 158 blurb = "XML quantification data" |
122 root = "MzQuantML" | 159 root = "MzQuantML" |
123 | 160 |
124 | 161 |
162 class ConsensusXML(ProteomicsXml): | |
163 file_ext = "consensusxml" | |
164 blurb = "OpenMS multiple LC-MS map alignment file" | |
165 root = "consensusXML" | |
166 | |
167 | |
168 class FeatureXML(ProteomicsXml): | |
169 file_ext = "featurexml" | |
170 blurb = "OpenMS feature file" | |
171 root = "featureMap" | |
172 | |
173 | |
174 class IdXML(ProteomicsXml): | |
175 file_ext = "idxml" | |
176 blurb = "OpenMS identification file" | |
177 root = "IdXML" | |
178 | |
179 | |
125 class Mgf( Text ): | 180 class Mgf( Text ): |
126 """Mascot Generic Format data""" | 181 """Mascot Generic Format data""" |
127 file_ext = "mgf" | 182 file_ext = "mgf" |
128 | 183 |
129 def set_peek( self, dataset, is_multi_byte=False ): | 184 def set_peek( self, dataset, is_multi_byte=False ): |
132 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) | 187 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) |
133 dataset.blurb = 'mgf Mascot Generic Format' | 188 dataset.blurb = 'mgf Mascot Generic Format' |
134 else: | 189 else: |
135 dataset.peek = 'file does not exist' | 190 dataset.peek = 'file does not exist' |
136 dataset.blurb = 'file purged from disk' | 191 dataset.blurb = 'file purged from disk' |
137 | |
138 | 192 |
139 def sniff( self, filename ): | 193 def sniff( self, filename ): |
140 mgf_begin_ions = "BEGIN IONS" | 194 mgf_begin_ions = "BEGIN IONS" |
141 max_lines=100 | 195 max_lines=100 |
142 | 196 |
144 line = line.rstrip( '\n\r' ) | 198 line = line.rstrip( '\n\r' ) |
145 if line==mgf_begin_ions: | 199 if line==mgf_begin_ions: |
146 return True | 200 return True |
147 if i>max_lines: | 201 if i>max_lines: |
148 return False | 202 return False |
149 | 203 |
150 | 204 |
151 class MascotDat( Text ): | 205 class MascotDat( Text ): |
152 """Mascot search results """ | 206 """Mascot search results """ |
153 file_ext = "mascotdat" | 207 file_ext = "mascotdat" |
154 | 208 |
155 def set_peek( self, dataset, is_multi_byte=False ): | 209 def set_peek( self, dataset, is_multi_byte=False ): |
204 except: | 258 except: |
205 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) ) | 259 return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) ) |
206 | 260 |
207 | 261 |
208 if hasattr(Binary, 'register_sniffable_binary_format'): | 262 if hasattr(Binary, 'register_sniffable_binary_format'): |
209 Binary.register_sniffable_binary_format('RAW', 'RAW', RAW) | 263 Binary.register_sniffable_binary_format('raw', 'raw', RAW) |
210 | 264 |
211 | 265 |
212 class Msp(Text): | 266 class Msp( Text ): |
213 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ | 267 """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ |
214 file_ext = "msp" | 268 file_ext = "msp" |
215 | 269 |
216 @staticmethod | 270 @staticmethod |
217 def next_line_starts_with(contents, prefix): | 271 def next_line_starts_with(contents, prefix): |
265 return False | 319 return False |
266 | 320 |
267 return True | 321 return True |
268 | 322 |
269 # unsniffable binary format, should do something about this | 323 # unsniffable binary format, should do something about this |
270 class XHunterAslFormat(Binary): | 324 class XHunterAslFormat( Binary ): |
271 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ | 325 """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ |
272 file_ext = "hlf" | 326 file_ext = "hlf" |
273 | 327 |
274 | |
275 if hasattr(Binary, 'register_unsniffable_binary_ext'): | 328 if hasattr(Binary, 'register_unsniffable_binary_ext'): |
276 Binary.register_unsniffable_binary_ext('hlf') | 329 Binary.register_unsniffable_binary_ext('hlf') |