Mercurial > repos > iracooke > proteomics_datatypes
changeset 5:df8b867ab71a draft
Uploaded
author | bgruening |
---|---|
date | Fri, 07 Feb 2014 09:21:23 -0500 |
parents | 09b89b345de2 |
children | 1f484bf888ca |
files | datatypes_conf.xml proteomics.py proteomics_datatypes.diff |
diffstat | 3 files changed, 291 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/datatypes_conf.xml Sun Jun 09 08:16:08 2013 -0500 +++ b/datatypes_conf.xml Fri Feb 07 09:21:23 2014 -0500 @@ -2,7 +2,7 @@ <datatypes> <datatype_files> <datatype_file name="proteomics.py"/> - </datatype_files> + </datatype_files> <registration display_path="display_applications"> <datatype extension="prot_gff" type="galaxy.datatypes.proteomics:ProtGff" mimetype="application/xml" display_in_upload="true"> <display file="proteomics/ProtGff.xml" /> @@ -22,16 +22,23 @@ <datatype extension="protxml" type="galaxy.datatypes.proteomics:ProtXML" display_in_upload="true" > <display file="proteomics/ProtXml.xml"/> </datatype> + <datatype extension="pepxml.tsv" type="galaxy.datatypes.proteomics:PepXmlReport" display_in_upload="true" /> + <datatype extension="protxml.tsv" type="galaxy.datatypes.proteomics:ProtXmlReport" display_in_upload="true" /> <datatype extension="mascotdat" type="galaxy.datatypes.proteomics:MascotDat" display_in_upload="false" /> <datatype extension="mzml" type="galaxy.datatypes.proteomics:MzML" mimetype="application/xml" display_in_upload="true"> <display file="proteomics/mzML.xml"/> + <display file="proteomics/protvis_mzml.xml"/> </datatype> - <datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true" /> + <datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true" /> <datatype extension="xls" type="galaxy.datatypes.proteomics:Xls" display_in_upload="true" /> + <datatype extension="idpdb" type="galaxy.datatypes.proteomics:IdpDB" display_in_upload="true" /> <datatype extension="mzxml" type="galaxy.datatypes.proteomics:MzXML" mimetype="application/xml" display_in_upload="true" /> <datatype extension="mzq" type="galaxy.datatypes.proteomics:MzQuantML" mimetype="application/xml" display_in_upload="true" /> <datatype extension="mzid" type="galaxy.datatypes.proteomics:MzIdentML" mimetype="application/xml" display_in_upload="true" /> - <datatype extension="traML" type="galaxy.datatypes.proteomics:TraML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="traml" type="galaxy.datatypes.proteomics:TraML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="featurexml" type="galaxy.datatypes.proteomics:FeatureXML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="consensusxml" type="galaxy.datatypes.proteomics:ConsensusXML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="idxml" type="galaxy.datatypes.proteomics:IdXML" mimetype="application/xml" display_in_upload="true" /> <datatype extension="raw" type="galaxy.datatypes.proteomics:RAW" display_in_upload="true" /> <datatype extension="msp" type="galaxy.datatypes.proteomics:Msp" display_in_upload="true" /> <datatype extension="ms2" type="galaxy.datatypes.proteomics:Ms2" display_in_upload="true" /> @@ -39,7 +46,7 @@ </registration> <sniffers> <sniffer type="galaxy.datatypes.proteomics:ProtGff"/> - <sniffer type="galaxy.datatypes.proteomics:MzML"/> + <sniffer type="galaxy.datatypes.proteomics:MzML"/> <sniffer type="galaxy.datatypes.proteomics:PepXml"/> <sniffer type="galaxy.datatypes.proteomics:Mgf"/> <sniffer type="galaxy.datatypes.proteomics:ProtXML"/>
--- a/proteomics.py Sun Jun 09 08:16:08 2013 -0500 +++ b/proteomics.py Fri Feb 07 09:21:23 2014 -0500 @@ -3,11 +3,14 @@ """ import logging import re -from galaxy.datatypes.data import * -from galaxy.datatypes.xml import * +import binascii + from galaxy.datatypes.sniff import * -from galaxy.datatypes.binary import * -from galaxy.datatypes.interval import * +from galaxy.datatypes.data import Text +from galaxy.datatypes.xml import GenericXml +from galaxy.datatypes.binary import Binary +from galaxy.datatypes.tabular import Tabular +from galaxy.datatypes.interval import Gff log = logging.getLogger(__name__) @@ -53,13 +56,46 @@ except: return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) -class ProteomicsXml(GenericXml): +class IdpDB( Binary ): + file_ext = "idpDB" + +if hasattr(Binary, 'register_unsniffable_binary_ext'): + Binary.register_unsniffable_binary_ext('idpDB') + + +class PepXmlReport( Tabular ): + """pepxml converted to tabular report""" + file_ext = "tsv" + + def __init__(self, **kwd): + Tabular.__init__( self, **kwd ) + self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probabaility'] + + def display_peek( self, dataset ): + """Returns formated html of peek""" + return Tabular.make_html_table( self, dataset, column_names=self.column_names ) + + +class ProtXmlReport( Tabular ): + """protxml converted to tabular report""" + file_ext = "tsv" + comment_lines = 1 + + def __init__(self, **kwd): + Tabular.__init__( self, **kwd ) + self.column_names = ["Entry Number", "Group Probability", "Protein", "Protein Link", "Protein Probability", "Percent Coverage", "Number of Unique Peptides", "Total Independent Spectra", "Percent Share of Spectrum ID's", "Description", "Protein Molecular Weight", "Protein Length", "Is Nondegenerate Evidence", "Weight", "Precursor Ion Charge", "Peptide sequence", "Peptide Link", "NSP Adjusted Probability", "Initial Probability", "Number of Total Termini", "Number of Sibling Peptides Bin", "Number of Instances", "Peptide Group Designator", "Is Evidence?"] + + def display_peek( self, dataset ): + """Returns formated html of peek""" + return Tabular.make_html_table( self, dataset, column_names=self.column_names ) + +class ProteomicsXml( GenericXml ): """ An enhanced XML datatype used to reuse code across several proteomic/mass-spec datatypes. """ def sniff(self, filename): """ Determines whether the file is the correct XML type. """ - with open(filename, 'r') as contents: + with open(filename, 'r') as contents: while True: line = contents.readline() if line == None or not line.startswith('<?'): @@ -76,12 +112,13 @@ dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + class PepXml(ProteomicsXml): """pepXML data""" file_ext = "pepxml" blurb = 'pepXML data' root = "msms_pipeline_analysis" - + class MzML(ProteomicsXml): """mzML data""" @@ -99,7 +136,7 @@ class MzXML(ProteomicsXml): """mzXML data""" - file_ext = "mzXML" + file_ext = "mzxml" blurb = "mzXML Mass Spectrometry data" root = "mzXML" @@ -108,10 +145,10 @@ file_ext = "mzid" blurb = "XML identified peptides and proteins." root = "MzIdentML" - + class TraML(ProteomicsXml): - file_ext = "traML" + file_ext = "traml" blurb = "TraML transition list" root = "TraML" @@ -121,7 +158,25 @@ blurb = "XML quantification data" root = "MzQuantML" - + +class ConsensusXML(ProteomicsXml): + file_ext = "consensusxml" + blurb = "OpenMS multiple LC-MS map alignment file" + root = "consensusXML" + + +class FeatureXML(ProteomicsXml): + file_ext = "featurexml" + blurb = "OpenMS feature file" + root = "featureMap" + + +class IdXML(ProteomicsXml): + file_ext = "idxml" + blurb = "OpenMS identification file" + root = "IdXML" + + class Mgf( Text ): """Mascot Generic Format data""" file_ext = "mgf" @@ -135,7 +190,6 @@ dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - def sniff( self, filename ): mgf_begin_ions = "BEGIN IONS" max_lines=100 @@ -146,8 +200,8 @@ return True if i>max_lines: return False - - + + class MascotDat( Text ): """Mascot search results """ file_ext = "mascotdat" @@ -206,10 +260,10 @@ if hasattr(Binary, 'register_sniffable_binary_format'): - Binary.register_sniffable_binary_format('RAW', 'RAW', RAW) + Binary.register_sniffable_binary_format('raw', 'raw', RAW) -class Msp(Text): +class Msp( Text ): """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """ file_ext = "msp" @@ -267,10 +321,9 @@ return True # unsniffable binary format, should do something about this -class XHunterAslFormat(Binary): +class XHunterAslFormat( Binary ): """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ file_ext = "hlf" - if hasattr(Binary, 'register_unsniffable_binary_ext'): Binary.register_unsniffable_binary_ext('hlf')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/proteomics_datatypes.diff Fri Feb 07 09:21:23 2014 -0500 @@ -0,0 +1,209 @@ +diff -r 09b89b345de2 datatypes_conf.xml +--- a/datatypes_conf.xml Sun Jun 09 08:16:08 2013 -0500 ++++ b/datatypes_conf.xml Fri Feb 07 00:05:57 2014 +0100 +@@ -2,7 +2,7 @@ + <datatypes> + <datatype_files> + <datatype_file name="proteomics.py"/> +- </datatype_files> ++ </datatype_files> + <registration display_path="display_applications"> + <datatype extension="prot_gff" type="galaxy.datatypes.proteomics:ProtGff" mimetype="application/xml" display_in_upload="true"> + <display file="proteomics/ProtGff.xml" /> +@@ -22,16 +22,22 @@ + <datatype extension="protxml" type="galaxy.datatypes.proteomics:ProtXML" display_in_upload="true" > + <display file="proteomics/ProtXml.xml"/> + </datatype> ++ <datatype extension="pepxml.tsv" type="galaxy.datatypes.proteomics:PepXmlReport" display_in_upload="true" /> ++ <datatype extension="protxml.tsv" type="galaxy.datatypes.proteomics:ProtXmlReport" display_in_upload="true" /> + <datatype extension="mascotdat" type="galaxy.datatypes.proteomics:MascotDat" display_in_upload="false" /> + <datatype extension="mzml" type="galaxy.datatypes.proteomics:MzML" mimetype="application/xml" display_in_upload="true"> + <display file="proteomics/mzML.xml"/> ++ <display file="proteomics/protvis_mzml.xml"/> + </datatype> +- <datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true" /> ++ <datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true" /> + <datatype extension="xls" type="galaxy.datatypes.proteomics:Xls" display_in_upload="true" /> + <datatype extension="mzxml" type="galaxy.datatypes.proteomics:MzXML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="mzq" type="galaxy.datatypes.proteomics:MzQuantML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="mzid" type="galaxy.datatypes.proteomics:MzIdentML" mimetype="application/xml" display_in_upload="true" /> +- <datatype extension="traML" type="galaxy.datatypes.proteomics:TraML" mimetype="application/xml" display_in_upload="true" /> ++ <datatype extension="traml" type="galaxy.datatypes.proteomics:TraML" mimetype="application/xml" display_in_upload="true" /> ++ <datatype extension="featurexml" type="galaxy.datatypes.proteomics:FeatureXML" mimetype="application/xml" display_in_upload="true" /> ++ <datatype extension="consensusxml" type="galaxy.datatypes.proteomics:ConsensusXML" mimetype="application/xml" display_in_upload="true" /> ++ <datatype extension="idxml" type="galaxy.datatypes.proteomics:IdXML" mimetype="application/xml" display_in_upload="true" /> + <datatype extension="raw" type="galaxy.datatypes.proteomics:RAW" display_in_upload="true" /> + <datatype extension="msp" type="galaxy.datatypes.proteomics:Msp" display_in_upload="true" /> + <datatype extension="ms2" type="galaxy.datatypes.proteomics:Ms2" display_in_upload="true" /> +@@ -39,7 +45,7 @@ + </registration> + <sniffers> + <sniffer type="galaxy.datatypes.proteomics:ProtGff"/> +- <sniffer type="galaxy.datatypes.proteomics:MzML"/> ++ <sniffer type="galaxy.datatypes.proteomics:MzML"/> + <sniffer type="galaxy.datatypes.proteomics:PepXml"/> + <sniffer type="galaxy.datatypes.proteomics:Mgf"/> + <sniffer type="galaxy.datatypes.proteomics:ProtXML"/> +diff -r 09b89b345de2 proteomics.py +--- a/proteomics.py Sun Jun 09 08:16:08 2013 -0500 ++++ b/proteomics.py Fri Feb 07 00:05:57 2014 +0100 +@@ -3,11 +3,13 @@ + """ + import logging + import re ++import binascii ++ + from galaxy.datatypes.data import * + from galaxy.datatypes.xml import * + from galaxy.datatypes.sniff import * + from galaxy.datatypes.binary import * +-from galaxy.datatypes.interval import * ++from galaxy.datatypes.tabular import Tabular + + log = logging.getLogger(__name__) + +@@ -53,13 +55,45 @@ + except: + return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) ) + ++class IdpDB(Binary): ++ file_ext = "idpDB" ++ ++if hasattr(Binary, 'register_unsniffable_binary_ext'): ++ Binary.register_unsniffable_binary_ext('idpDB') ++ ++class PepXmlReport(Tabular): ++ """pepxml converted to tabular report""" ++ file_ext = "tsv" ++ ++ def __init__(self, **kwd): ++ Tabular.__init__( self, **kwd ) ++ self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probabaility'] ++ ++ def display_peek( self, dataset ): ++ """Returns formated html of peek""" ++ return Tabular.make_html_table( self, dataset, column_names=self.column_names ) ++ ++ ++class ProtXmlReport(Tabular): ++ """protxml converted to tabular report""" ++ file_ext = "tsv" ++ comment_lines = 1 ++ ++ def __init__(self, **kwd): ++ Tabular.__init__( self, **kwd ) ++ self.column_names = ["Entry Number", "Group Probability", "Protein", "Protein Link", "Protein Probability", "Percent Coverage", "Number of Unique Peptides", "Total Independent Spectra", "Percent Share of Spectrum ID's", "Description", "Protein Molecular Weight", "Protein Length", "Is Nondegenerate Evidence", "Weight", "Precursor Ion Charge", "Peptide sequence", "Peptide Link", "NSP Adjusted Probability", "Initial Probability", "Number of Total Termini", "Number of Sibling Peptides Bin", "Number of Instances", "Peptide Group Designator", "Is Evidence?"] ++ ++ def display_peek( self, dataset ): ++ """Returns formated html of peek""" ++ return Tabular.make_html_table( self, dataset, column_names=self.column_names ) ++ + class ProteomicsXml(GenericXml): + """ An enhanced XML datatype used to reuse code across several + proteomic/mass-spec datatypes. """ + + def sniff(self, filename): + """ Determines whether the file is the correct XML type. """ +- with open(filename, 'r') as contents: ++ with open(filename, 'r') as contents: + while True: + line = contents.readline() + if line == None or not line.startswith('<?'): +@@ -81,7 +115,7 @@ + file_ext = "pepxml" + blurb = 'pepXML data' + root = "msms_pipeline_analysis" +- ++ + + class MzML(ProteomicsXml): + """mzML data""" +@@ -99,7 +133,7 @@ + + class MzXML(ProteomicsXml): + """mzXML data""" +- file_ext = "mzXML" ++ file_ext = "mzxml" + blurb = "mzXML Mass Spectrometry data" + root = "mzXML" + +@@ -108,10 +142,10 @@ + file_ext = "mzid" + blurb = "XML identified peptides and proteins." + root = "MzIdentML" +- ++ + + class TraML(ProteomicsXml): +- file_ext = "traML" ++ file_ext = "traml" + blurb = "TraML transition list" + root = "TraML" + +@@ -121,7 +155,25 @@ + blurb = "XML quantification data" + root = "MzQuantML" + +- ++ ++class ConsensusXML(ProteomicsXml): ++ file_ext = "consensusxml" ++ blurb = "OpenMS multiple LC-MS map alignment file" ++ root = "consensusXML" ++ ++ ++class FeatureXML(ProteomicsXml): ++ file_ext = "featurexml" ++ blurb = "OpenMS feature file" ++ root = "featureMap" ++ ++ ++class IdXML(ProteomicsXml): ++ file_ext = "idxml" ++ blurb = "OpenMS identification file" ++ root = "IdXML" ++ ++ + class Mgf( Text ): + """Mascot Generic Format data""" + file_ext = "mgf" +@@ -135,7 +187,6 @@ + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + +- + def sniff( self, filename ): + mgf_begin_ions = "BEGIN IONS" + max_lines=100 +@@ -146,8 +197,8 @@ + return True + if i>max_lines: + return False +- +- ++ ++ + class MascotDat( Text ): + """Mascot search results """ + file_ext = "mascotdat" +@@ -206,7 +257,7 @@ + + + if hasattr(Binary, 'register_sniffable_binary_format'): +- Binary.register_sniffable_binary_format('RAW', 'RAW', RAW) ++ Binary.register_sniffable_binary_format('raw', 'raw', RAW) + + + class Msp(Text): +@@ -267,10 +318,9 @@ + return True + + # unsniffable binary format, should do something about this +-class XHunterAslFormat(Binary): ++class XHunterAslFormat( Binary ): + """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """ + file_ext = "hlf" + +- + if hasattr(Binary, 'register_unsniffable_binary_ext'): + Binary.register_unsniffable_binary_ext('hlf')