changeset 0:0e941a69a6fa draft default tip

Uploaded
author chrisb
date Wed, 23 Mar 2016 14:34:50 -0400
parents
children
files datatypes/README.md datatypes/datatypes_conf.xml datatypes/glycan.py
diffstat 3 files changed, 2013 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/README.md	Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,19 @@
+
+Custom glycan data types for Galaxy
+===================================
+
+New glycan data types for Galaxy, included as part of the glycan tools repository rather than being added to the Galaxy code base by hand (as was done previously).
+Based on ideas from http://gregvonkuster.org/galaxy-tool-shed-including-custom-datatypes-repositories/
+
+Supported data types (sniffer entries copied from datatypes_conf.xml):
+
+    <sniffer type="galaxy.datatypes.glycan:kcf"/>
+    <sniffer type="galaxy.datatypes.glycan:glycoct"/>
+    <sniffer type="galaxy.datatypes.glycan:glycoct_xml"/>
+    <sniffer type="galaxy.datatypes.glycan:glydeii"/>
+    <sniffer type="galaxy.datatypes.glycan:linucs"/>
+    <sniffer type="galaxy.datatypes.glycan:iupac"/>
+    <sniffer type="galaxy.datatypes.glycan:linearcode"/>
+    <sniffer type="galaxy.datatypes.glycan:msa"/>
+    <sniffer type="galaxy.datatypes.glycan:wurcs"/>
+
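+To reference these types from a Galaxy tool, list the extensions in a data
+parameter's `format` attribute just as with built-in datatypes. A minimal
+sketch (the parameter name and label below are placeholders):
+
+    <param name="input" type="data" format="kcf,glycoct,wurcs" label="Glycan structure"/>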
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/datatypes_conf.xml	Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="glycan.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="kcf" type="galaxy.datatypes.glycan:kcf" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="glycoct" type="galaxy.datatypes.glycan:glycoct" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="glycoct_xml" type="galaxy.datatypes.glycan:glycoct_xml" mimetype="text/xml" display_in_upload="true"/>
+        <datatype extension="glydeii" type="galaxy.datatypes.glycan:glydeii" mimetype="text/xml" display_in_upload="true"/>
+        <datatype extension="linucs" type="galaxy.datatypes.glycan:linucs" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="iupac" type="galaxy.datatypes.glycan:iupac" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="linearcode" type="galaxy.datatypes.glycan:linearcode" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="msa" type="galaxy.datatypes.glycan:msa" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="wurcs" type="galaxy.datatypes.glycan:wurcs" mimetype="text/plain" display_in_upload="true"/>
+
+    </registration>
+    <sniffers>
+        <sniffer type="galaxy.datatypes.glycan:kcf"/>
+        <sniffer type="galaxy.datatypes.glycan:glycoct"/>
+        <sniffer type="galaxy.datatypes.glycan:glycoct_xml"/>
+        <sniffer type="galaxy.datatypes.glycan:glydeii"/>
+        <sniffer type="galaxy.datatypes.glycan:linucs"/>
+        <sniffer type="galaxy.datatypes.glycan:iupac"/>
+        <sniffer type="galaxy.datatypes.glycan:linearcode"/>
+        <sniffer type="galaxy.datatypes.glycan:msa"/>
+        <sniffer type="galaxy.datatypes.glycan:wurcs"/>
+    </sniffers>
+</datatypes>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/glycan.py	Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,1964 @@
+__license__ = "MIT"
+
+import logging
+from galaxy.datatypes import metadata
+import mimetypes
+import os
+import shutil
+import sys
+import traceback
+import tempfile
+import zipfile
+from cgi import escape
+from inspect import isclass
+import galaxy.util as util
+from galaxy.datatypes import data
+from galaxy.datatypes.metadata import \
+    MetadataElement  # import directly to maintain ease of use in Datatype class definitions
+from galaxy.util import inflector
+from galaxy.util.bunch import Bunch
+from galaxy.util.odict import odict
+from galaxy.util.sanitize_html import sanitize_html
+
+from galaxy.datatypes import dataproviders
+
+from galaxy import eggs
+
+eggs.require("Paste")
+import paste
+
+log = logging.getLogger(__name__)
+
+
+class kcf(data.Data):
+    file_ext = 'kcf'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # write it twice for now
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/plain'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This method is used by various subclasses of Text.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    # This can happen when the file is larger than max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All KCF Files simply put a 'ENTRY' in its first line.
+        This applies to all possible kcfs. In this case check
+        for  'Glycan' to confirm it's a glycan """
+        try:
+            from suds.client import Client
+
+            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+            client = Client(url)
+            kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
+            if kcfresponse.array[0] == "KCF":
+                return True
+            else:
+                return False
+        except ImportError:
+            # cannot use import suds so use simple checker
+            print "using KCF simple checker"
+            f = open(filename, "r")
+            firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
+            f.close()
+
+            if "ENTRY" in firstline and "GLYCAN" in firstline:
+                return True
+            else:
+                return False
+        except Exception, e:
+            # Note: do not raise here; return False and let another sniffer try to type this data
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+
+class glycoct(data.Data):
+    file_ext = 'glycoct'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # write it twice for now
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/plain'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This method is used by various subclasses of Text.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    # This can happen when the file is larger than max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All glycoct_condensed files simply put a 'RES' in its first line and a LIN later. """
+        try:
+            f = open(filename, "r")
+            firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
+            lines = f.read()
+            f.close()
+
+            # if "RES" in firstline and "LIN" in lines:
+            if "RES" in firstline and "LIN" in lines:
+                return True
+            else:
+                return False
+        except Exception, e:
+            # Note: do not raise here; return False and let another sniffer try to type this data
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+# ------------- Utility methods --------------
+
+# nice_size used to be here, but to resolve cyclical dependencies it's been
+# moved to galaxy.util.  It belongs there anyway since it's used outside
+# datatypes.
+nice_size = util.nice_size
+
+
+def get_test_fname(fname):
+    """Returns test data filename"""
+    path, name = os.path.split(__file__)
+    full_path = os.path.join(path, 'test', fname)
+    return full_path
+
+
+def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
+    """
+    Returns the first LINE_COUNT lines wrapped to WIDTH
+
+    ## >>> fname = get_test_fname('4.bed')
+    ## >>> get_file_peek(fname)
+    ## 'chr22    30128507    31828507    uc003bnx.1_cds_2_0_chr22_29227_f    0    +\n'
+
+    """
+    # Set size for file.readline() to a negative number to force it to
+    # read until either a newline or EOF.  Needed for datasets with very
+    # long lines.
+    if WIDTH == 'unlimited':
+        WIDTH = -1
+    lines = []
+    count = 0
+    file_type = None
+    data_checked = False
+    temp = open(file_name, "U")
+    while count <= LINE_COUNT:
+        line = temp.readline(WIDTH)
+        if line and not is_multi_byte and not data_checked:
+            # See if we have a compressed or binary file
+            if line[0:2] == util.gzip_magic:
+                file_type = 'gzipped'
+                break
+            else:
+                for char in line:
+                    if ord(char) > 128:
+                        file_type = 'binary'
+                        break
+            data_checked = True
+        if file_type in ['gzipped', 'binary']:
+            break
+        skip_line = False
+        for skipchar in skipchars:
+            if line.startswith(skipchar):
+                skip_line = True
+                break
+        if not skip_line:
+            lines.append(line)
+            count += 1
+    temp.close()
+    if file_type in ['gzipped', 'binary']:
+        text = "%s file" % file_type
+    else:
+        try:
+            text = unicode('\n'.join(lines), 'utf-8')
+        except UnicodeDecodeError:
+            text = "binary/unknown file"
+    return text
+
+
+class glycoct_xml(data.Data):
+    file_ext = 'glycoct_xml'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # write it twice for now
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/xml'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This method is used by various subclasses of Text.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    # This can happen when the file is larger than max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All glycoct XML files should use the rings form determination script """
+        try:
+            from suds.client import Client
+
+            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+            client = Client(url)
+            response = client.service.DeterminingForm(file(filename, 'r').read())
+            if response.array[0] == "GlycoCT":
+                return True
+            else:
+                return False
+        except ImportError:
+            # cannot use import suds so use simple checker
+            print "using glycoct XML simple checker"
+            import xml.etree.cElementTree as ET
+
+            tree = ET.parse(filename)
+            root = tree.getroot()
+            if root.tag == 'sugar':
+                print root.tag, root.attrib
+                return True
+            else:
+                return False
+        except Exception, e:
+            # Note: do not raise here; return False and let another sniffer try to type this data
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+
+class glydeii(data.Data):
+    file_ext = 'glydeii'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # write it twice for now
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/xml'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This method is used by various subclasses of Text.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    # This can happen when the file is larger than max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All GlydeII XML files should use the rings form determination script """
+        try:
+            from suds.client import Client
+
+            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+            client = Client(url)
+            response = client.service.DeterminingForm(file(filename, 'r').read())
+            if response.array[0] == "GLYDEII":
+                return True
+            else:
+                return False
+        except ImportError:
+            # cannot use import suds so use simple checker
+            print "using GlydeII simple checker"
+            import xml.etree.cElementTree as ET
+
+            tree = ET.parse(filename)
+            root = tree.getroot()
+            if root.tag == 'GlydeII':
+                print root.tag
+                return True
+            else:
+                return False
+        except Exception, e:
+            # Note: do not raise here; return False and let another sniffer try to type this data
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+
+class linucs(data.Data):
+    file_ext = 'linucs'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # write it twice for now
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/plain'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This method is used by various subclasses of Text.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    # This can happen when the file is larger than max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All LINUCS files should use the rings form determination script """
+        try:
+            from suds.client import Client
+
+            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+            client = Client(url)
+            response = client.service.DeterminingForm(file(filename, 'r').read())
+            if response.array[0] == "LINUCS":
+                return True
+            else:
+                return False
+        except ImportError:
+            # cannot use import suds so use simple checker
+            print "using LINUCS simple checker"
+
+            f = open(filename, "r")
+            firstline = f.readline()
+            f.close()
+
+            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
+                return True
+            else:
+                return False
+        except Exception, e:
+            # Note: do not raise here; return False and let another sniffer try to type this data
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+
+class iupac(data.Data):
+    file_ext = 'iupac'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # write it twice for now
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/plain'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This method is used by various subclasses of Text.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    # This can happen when the file is larger than max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All IUPAC files should use the rings form determination script """
+        try:
+            from suds.client import Client
+
+            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+            client = Client(url)
+            response = client.service.DeterminingForm(file(filename, 'r').read())
+            if response.array[0] == "IUPAC":
+                return True
+            else:
+                return False
+        except ImportError:
+            # cannot use import suds so use simple checker
+            print "using IUPAC simple checker"
+            f = open(filename, "r")
+            firstline = f.readline()
+            f.close()
+
+            if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
+                if "{" in firstline or "}" in firstline:
+                    return False
+                else:
+                    return True
+            else:
+                return False
+        except Exception, e:
+            # Note: do not raise here; return False and let another sniffer try to type this data
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+
+class linearcode(data.Data):
+    file_ext = 'linearcode'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # buffer to a temp file first, then rewrite with unix newlines (the data is written twice)
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
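+            # read the incoming stream in 1 MiB chunks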
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/plain'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
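+        # Extrapolate: lines seen in the sample, scaled by total size / sample size.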
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This mirrors the set_peek implementation of the Text datatype.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # The number of lines is not stored in the metadata, so it must be
+                    # determined here; this can happen when the file is larger than
+                    # max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All linear code files should use the rings form determination script """
+        try:
+            from suds.client import Client
+
+            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+            client = Client(url)
+            lcresponse = client.service.DeterminingForm(file(filename, 'r').read())
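+            # DeterminingForm returns an array whose first element names the detected
+            # format (e.g. "LinearCode", "IUPAC"); the field layout is assumed from the
+            # suds response object.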
+            if lcresponse.array[0] == "LinearCode":
+                print "LinearCode"
+                return True
+            else:
+                print "Unable to guess format"
+                return False
+        except ImportError:
+            # suds is not installed and no simple LinearCode checker exists yet
+            print "unable to sniff LinearCode without the suds client"
+            return False
+        except Exception:
+            # Do not raise here; return False and let another sniffer try to type this data.
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+
+class msa(data.Data):
+    file_ext = 'msa'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # buffer to a temp file first, then rewrite with unix newlines (the data is written twice)
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/plain'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This mirrors the set_peek implementation of the Text datatype.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # The number of lines is not stored in the metadata, so it must be
+                    # determined here; this can happen when the file is larger than
+                    # max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All msa Files simply put a '# .msa' in the first line.  """
+        try:
+            f = open(filename, "r")
+            firstline = f.readline().upper()  # uppercase so the check is case-insensitive
+            f.close()
+
+            if "# .MSA" in firstline:
+                return True
+            else:
+                return False
+        except Exception:
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
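+            # 'to_size' mode: each part receives at most split_size lines.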
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+
+class wurcs(data.Data):
+    file_ext = 'wurcs'
+    line_class = 'line'
+
+    """Add metadata elements"""
+    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+                    visible=False, no_value=0)
+
+    def write_from_stream(self, dataset, stream):
+        """Writes data from a stream"""
+        # buffer to a temp file first, then rewrite with unix newlines (the data is written twice)
+        fd, temp_name = tempfile.mkstemp()
+        while 1:
+            chunk = stream.read(1048576)
+            if not chunk:
+                break
+            os.write(fd, chunk)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def set_raw_data(self, dataset, data):
+        """Saves the data on the disc"""
+        fd, temp_name = tempfile.mkstemp()
+        os.write(fd, data)
+        os.close(fd)
+        # rewrite the file with unix newlines
+        fp = open(dataset.file_name, 'wt')
+        for line in file(temp_name, "U"):
+            line = line.strip() + '\n'
+            fp.write(line)
+        fp.close()
+        os.remove(temp_name)
+
+    def get_mime(self):
+        """Returns the mime type of the datatype"""
+        return 'text/plain'
+
+    def set_meta(self, dataset, **kwd):
+        """
+        Set the number of lines of data in dataset.
+        """
+        dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+    def estimate_file_lines(self, dataset):
+        """
+        Perform a rough estimate by extrapolating number of lines from a small read.
+        """
+        sample_size = 1048576
+        dataset_fh = open(dataset.file_name)
+        dataset_read = dataset_fh.read(sample_size)
+        dataset_fh.close()
+        sample_lines = dataset_read.count('\n')
+        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+        return est_lines
+
+    def count_data_lines(self, dataset):
+        """
+        Count the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file(dataset.file_name):
+            line = line.strip()
+            if line and not line.startswith('#'):
+                data_lines += 1
+        return data_lines
+
+    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+        """
+        Set the peek.  This mirrors the set_peek implementation of the Text datatype.
+        """
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+                                         skipchars=skipchars)
+            if line_count is None:
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+                else:
+                    # The number of lines is not stored in the metadata, so it must be
+                    # determined here; this can happen when the file is larger than
+                    # max_optional_metadata_filesize.
+                    if int(dataset.get_size()) <= 1048576:
+                        # Small dataset, recount all lines and reset peek afterward.
+                        lc = self.count_data_lines(dataset)
+                        dataset.metadata.data_lines = lc
+                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+                    else:
+                        est_lines = self.estimate_file_lines(dataset)
+                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+                                                    inflector.cond_plural(est_lines, self.line_class) )
+            else:
+                dataset.blurb = "%s %s" % (
+                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+    def sniff(self, filename):
+        """All WURCS Files start with WURCS= then the version number. see http://www.wurcs-wg.org/definition.php and  http://rings.t.soka.ac.jp/
+WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1"""
+        try:
+            f = open(filename, "r")
+            firstline = f.readline().upper()  # uppercase so the check is case-insensitive
+            f.close()
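+            # The substring check is deliberately lenient; a stricter test would be
+            # firstline.startswith("WURCS=").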
+            if "WURCS" in firstline:
+                return True
+            else:
+                return False
+        except Exception:
+            traceback.print_exc(file=sys.stdout)
+            return False
+
+    def split(cls, input_datasets, subdir_generator_function, split_params):
+        """
+        Split the input files by line.
+        """
+        if split_params is None:
+            return
+
+        if len(input_datasets) > 1:
+            raise Exception("Text file splitting does not support multiple files")
+        input_files = [ds.file_name for ds in input_datasets]
+
+        lines_per_file = None
+        chunk_size = None
+        if split_params['split_mode'] == 'number_of_parts':
+            lines_per_file = []
+            # Computing the length is expensive!
+            def _file_len(fname):
+                i = 0
+                f = open(fname)
+                for i, l in enumerate(f):
+                    pass
+                f.close()
+                return i + 1
+
+            length = _file_len(input_files[0])
+            parts = int(split_params['split_size'])
+            if length < parts:
+                parts = length
+            len_each, remainder = divmod(length, parts)
+            while length > 0:
+                chunk = len_each
+                if remainder > 0:
+                    chunk += 1
+                lines_per_file.append(chunk)
+                remainder -= 1
+                length -= chunk
+        elif split_params['split_mode'] == 'to_size':
+            chunk_size = int(split_params['split_size'])
+        else:
+            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+        f = open(input_files[0], 'rt')
+        try:
+            chunk_idx = 0
+            file_done = False
+            part_file = None
+            while not file_done:
+                if lines_per_file is None:
+                    this_chunk_size = chunk_size
+                elif chunk_idx < len(lines_per_file):
+                    this_chunk_size = lines_per_file[chunk_idx]
+                    chunk_idx += 1
+                lines_remaining = this_chunk_size
+                part_file = None
+                while lines_remaining > 0:
+                    a_line = f.readline()
+                    if a_line == '':
+                        file_done = True
+                        break
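+                    # Lazily create the part file in a fresh subdirectory supplied by
+                    # the framework, keeping the original file name.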
+                    if part_file is None:
+                        part_dir = subdir_generator_function()
+                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+                        part_file = open(part_path, 'w')
+                    part_file.write(a_line)
+                    lines_remaining -= 1
+                if part_file is not None:
+                    part_file.close()
+        except Exception, e:
+            log.error('Unable to split files: %s' % str(e))
+            f.close()
+            if part_file is not None:
+                part_file.close()
+            raise
+        f.close()
+
+    split = classmethod(split)
+
+