Mercurial > repos > chrisb > gap_datatypes

__license__ = "MIT"

import logging
from galaxy.datatypes import metadata
import mimetypes
import os
import shutil
import sys
import traceback
import tempfile
import zipfile
from cgi import escape
from inspect import isclass
import galaxy.util as util
from galaxy.datatypes import data
from galaxy.datatypes.metadata import \
    MetadataElement  # import directly to maintain ease of use in Datatype class definitions
from galaxy.util import inflector
from galaxy.util.bunch import Bunch
from galaxy.util.odict import odict
from galaxy.util.sanitize_html import sanitize_html

from galaxy.datatypes import dataproviders

from galaxy import eggs

eggs.require("Paste")
import paste


class kcf(data.Data):
    file_ext = 'kcf'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All KCF Files simply put a 'ENTRY' in its first line.
        This applies to all possible kcfs. In this case check
        for  'Glycan' to confirm it's a glycan """
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if kcfresponse.array[0] == "KCF":
                return True
            else:
                return False
        except ImportError:
            # cannot use import suds so use simple checker
            print "using KCF simple checker"
            f = open(filename, "r")
            firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
            f.close()

            if "ENTRY" in firstline and "GLYCAN" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # note I am not raising an error rather return False  and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class glycoct(data.Data):
    file_ext = 'glycoct'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All glycoct_condensed files simply put a 'RES' in its first line and a LIN later. """
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
            lines = f.read()
            f.close()

            # if "RES" in firstline and "LIN" in lines:
            if "RES" in firstline and "LIN" in lines:
                return True
            else:
                return False
        except Exception, e:
            # note I am not raising an error rather return False  and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)

# ------------- Utility methods --------------

# nice_size used to be here, but to resolve cyclical dependencies it's been
# moved to galaxy.util.  It belongs there anyway since it's used outside
# datatypes.
nice_size = util.nice_size


def get_test_fname(fname):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join(path, 'test', fname)
    return full_path


def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22    30128507    31828507    uc003bnx.1_cds_2_0_chr22_29227_f    0    +\n'

    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open(file_name, "U")
    while count <= LINE_COUNT:
        line = temp.readline(WIDTH)
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in ['gzipped', 'binary']:
            break
        skip_line = False
        for skipchar in skipchars:
            if line.startswith(skipchar):
                skip_line = True
                break
        if not skip_line:
            lines.append(line)
            count += 1
    temp.close()
    if file_type in ['gzipped', 'binary']:
        text = "%s file" % file_type
    else:
        try:
            text = unicode('\n'.join(lines), 'utf-8')
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text


class glycoct_xml(data.Data):
    file_ext = 'glycoct_xml'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All glycoct XML files should use the rings form determination script """
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GlycoCT":
                return True
            else:
                return False
        except ImportError:
            # cannot use import suds so use simple checker
            print "using glycoct XML simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'sugar':
                print root.tag, root.attrib
                return True
            else:
                return False
        except Exception, e:
            # note I am not raising an error rather return False  and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class glydeii(data.Data):
    file_ext = 'glydeii'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlydeII XML files should use the rings form determination script """
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GLYDEII":
                return True
            else:
                return False
        except ImportError:
            # cannot use import suds so use simple checker
            print "using GlydeII simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'GlydeII':
                print root.tag
                return True
            else:
                return False
        except Exception, e:
            # note I am not raising an error rather return False  and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class linucs(data.Data):
    file_ext = 'linucs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LINUCS files should use the rings form determination script """
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "LINUCS":
                return True
            else:
                return False
        except ImportError:
            # cannot use import suds so use simple checker
            print "using LINUCS simple checker"

            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # note I am not raising an error rather return False  and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class iupac(data.Data):
    file_ext = 'iupac'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All IUPAC files should use the rings form determination script """
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "IUPAC":
                return True
            else:
                return False
        except ImportError:
            # cannot use import suds so use simple checker
            print "using IUPAC simple checker"
            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
                if "{" in firstline or "}" in firstline:
                    return False
                else:
                    return True
            else:
                return False
        except Exception, e:
            # note I am not raising an error rather return False  and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class linearcode(data.Data):
    file_ext = 'linearcode'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All linear code files should use the rings form determination script """
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            lcresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if lcresponse.array[0] == "LinearCode":
                print "LinearCode"
                return True
            else:
                print "Unable to guess format"
                return False
        except ImportError:
            # cannot use import suds so use simple checker
            print "using LinearCode simple checker - nope it does not exist yet"
            return False
        except Exception, e:
            # note I am not raising an error rather return False  and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class msa(data.Data):
    file_ext = 'msa'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All msa Files simply put a '# .msa' in the first line.  """
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
            f.close()

            if "# .MSA" in firstline:
                return True
            else:
                return False
        except:
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class wurcs(data.Data):
    file_ext = 'wurcs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All WURCS Files start with WURCS= then the version number. see http://www.wurcs-wg.org/definition.php and  http://rings.t.soka.ac.jp/
WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1"""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
            f.close()
            if "WURCS" in firstline:
                return True
            else:
                return False
        except:
            traceback.print_exc(file=sys.stdout)
            return False


    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder = - 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
author	chrisb
date	Wed, 23 Mar 2016 14:34:50 -0400
parents
children