changeset 0:e1c29f302301 draft

Uploaded
author devteam
date Fri, 17 Aug 2012 09:10:31 -0400
parents
children 10dce68b584b
files datatypes_conf.xml xml.py
diffstat 2 files changed, 137 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Fri Aug 17 09:10:31 2012 -0400
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="xml.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="blastxml" type="galaxy.datatypes.xml:BlastXml" mimetype="application/xml" display_in_upload="true"/>
+    </registration>
+    <sniffers>
+        <sniffer type="galaxy.datatypes.xml:BlastXml"/>
+    </sniffers>
+</datatypes>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xml.py	Fri Aug 17 09:10:31 2012 -0400
@@ -0,0 +1,124 @@
+"""
+BlastXml class
+"""
+
+from galaxy.datatypes.data import get_file_peek
+from galaxy.datatypes.data import Text
+from galaxy.datatypes.xml import GenericXml
+
+class BlastXml( GenericXml ):
+    """NCBI Blast XML Output data"""
+    file_ext = "blastxml"
+    #DOCTYPE declarations accepted on line two (full NCBI URL, or bare filename)
+    _blast_doctypes = ('<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
+                       '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">')
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        """Set the peek and blurb text"""
+        if not dataset.dataset.purged:
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = 'NCBI Blast XML data'
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff( self, filename ):
+        """
+        Determines whether the file is blastxml
+
+        The first three lines (stripped of surrounding whitespace) must be
+        the XML declaration, a BlastOutput DOCTYPE, and <BlastOutput>.
+
+        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
+        >>> BlastXml().sniff( fname )
+        True
+        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
+        >>> BlastXml().sniff( fname )
+        True
+        >>> fname = get_test_fname( 'interval.interval' )
+        >>> BlastXml().sniff( fname )
+        False
+        """
+        #try/finally closes the handle on every path, and unlike 'with'
+        #it also works on Python older than 2.5
+        handle = open(filename)
+        try:
+            if handle.readline().strip() != '<?xml version="1.0"?>':
+                return False
+            if handle.readline().strip() not in self._blast_doctypes:
+                return False
+            return handle.readline().strip() == '<BlastOutput>'
+        finally:
+            handle.close()
+
+    def merge(split_files, output_file):
+        """Merge several BLAST XML files into a single BLAST XML file.
+
+        A single input is passed to Text.merge. Otherwise the first file's
+        header (everything up to and including its first <Iteration> line)
+        is written once, followed by the iterations of every file, then
+        the closing tags. Raises ValueError if a file is empty, truncated,
+        not BLAST XML, has a header over 10000 characters, or if the first
+        300 characters of its header differ from those of the first file.
+        """
+        if len(split_files) == 1:
+            #For one file only, use base class method (move/copy)
+            return Text.merge(split_files, output_file)
+        out = open(output_file, "w")
+        #The try/finally blocks close both handles on every exit path,
+        #including the ValueError raised by any of the checks below
+        try:
+            old_header = None
+            for f in split_files:
+                h = open(f)
+                try:
+                    header = h.readline()
+                    if not header:
+                        raise ValueError("BLAST XML file %s was empty" % f)
+                    if header.strip() != '<?xml version="1.0"?>':
+                        out.write(header) #for diagnosis
+                        raise ValueError("%s is not an XML file!" % f)
+                    line = h.readline()
+                    header += line
+                    if line.strip() not in BlastXml._blast_doctypes:
+                        out.write(header) #for diagnosis
+                        raise ValueError("%s is not a BLAST XML file!" % f)
+                    while True:
+                        line = h.readline()
+                        if not line:
+                            out.write(header) #for diagnosis
+                            raise ValueError("BLAST XML file %s ended prematurely" % f)
+                        header += line
+                        if "<Iteration>" in line:
+                            break
+                        if len(header) > 10000:
+                            #Something has gone wrong, don't load too much into memory!
+                            #Write what we have to the merged file for diagnostics
+                            out.write(header)
+                            raise ValueError("BLAST XML file %s has too long a header!" % f)
+                    if "<BlastOutput>" not in header:
+                        raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
+                    if old_header is None:
+                        #First file, its header opens the merged output
+                        out.write(header)
+                        old_header = header
+                    elif old_header[:300] != header[:300]:
+                        #Enough to check <BlastOutput_program> and <BlastOutput_version> match
+                        raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
+                                         % (split_files[0], f, old_header[:300], header[:300]))
+                    else:
+                        out.write("    <Iteration>\n")
+                    for line in h:
+                        if "</BlastOutput_iterations>" in line:
+                            break
+                        #TODO - Increment <Iteration_iter-num> and if required automatic query names
+                        #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
+                        out.write(line)
+                finally:
+                    h.close()
+            out.write("  </BlastOutput_iterations>\n")
+            out.write("</BlastOutput>\n")
+        finally:
+            out.close()
+    merge = staticmethod(merge)
+