Mercurial > repos > chrisb > gap_datatypes
changeset 0:0e941a69a6fa draft default tip
Uploaded
author   | chrisb
date     | Wed, 23 Mar 2016 14:34:50 -0400
parents  |
children |
files    | datatypes/README.md datatypes/datatypes_conf.xml datatypes/glycan.py
diffstat | 3 files changed, 2013 insertions(+), 0 deletions(-)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/README.md Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,19 @@
+
+Custom glycan data types for Galaxy
+===================================
+
+New glycan data types for Galaxy, included as part of the glycan tools repo instead of being added manually (as was done previously).
+Ideas from http://gregvonkuster.org/galaxy-tool-shed-including-custom-datatypes-repositories/
+
+Supported data types include (copied from datatypes_conf.xml):
+
+    <sniffer type="galaxy.datatypes.glycan:kcf"/>
+    <sniffer type="galaxy.datatypes.glycan:glycoct"/>
+    <sniffer type="galaxy.datatypes.glycan:glycoct_xml"/>
+    <sniffer type="galaxy.datatypes.glycan:glydeii"/>
+    <sniffer type="galaxy.datatypes.glycan:linucs"/>
+    <sniffer type="galaxy.datatypes.glycan:iupac"/>
+    <sniffer type="galaxy.datatypes.glycan:linearcode"/>
+    <sniffer type="galaxy.datatypes.glycan:msa"/>
+    <sniffer type="galaxy.datatypes.glycan:wurcs"/>
+
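The blog post referenced in the README describes pulling a datatypes repository into tool repositories through a Tool Shed repository dependency rather than copying files by hand. As a minimal sketch (not part of this changeset), a consuming tool repository could carry a repository_dependencies.xml along these lines; the toolshed URL is a placeholder, and the revision shown is simply this upload's changeset:

<?xml version="1.0"?>
<!-- Hypothetical repository_dependencies.xml for a tool repository that wants
     these glycan datatypes installed automatically by the Tool Shed. -->
<repositories description="Glycan datatype definitions required by these tools">
    <!-- toolshed URL is a placeholder; name, owner and changeset_revision match this changeset -->
    <repository toolshed="https://toolshed.g2.bx.psu.edu"
                name="gap_datatypes"
                owner="chrisb"
                changeset_revision="0e941a69a6fa"/>
</repositories>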
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/datatypes_conf.xml Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="glycan.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="kcf" type="galaxy.datatypes.glycan:kcf" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="glycoct" type="galaxy.datatypes.glycan:glycoct" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="glycoct_xml" type="galaxy.datatypes.glycan:glycoct_xml" mimetype="text/xml" display_in_upload="true"/>
+        <datatype extension="glydeii" type="galaxy.datatypes.glycan:glydeii" mimetype="text/xml" display_in_upload="true"/>
+        <datatype extension="linucs" type="galaxy.datatypes.glycan:linucs" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="iupac" type="galaxy.datatypes.glycan:iupac" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="linearcode" type="galaxy.datatypes.glycan:linearcode" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="msa" type="galaxy.datatypes.glycan:msa" mimetype="text/plain" display_in_upload="true"/>
+        <datatype extension="wurcs" type="galaxy.datatypes.glycan:wurcs" mimetype="text/plain" display_in_upload="true"/>
+
+    </registration>
+    <sniffers>
+        <sniffer type="galaxy.datatypes.glycan:kcf"/>
+        <sniffer type="galaxy.datatypes.glycan:glycoct"/>
+        <sniffer type="galaxy.datatypes.glycan:glycoct_xml"/>
+        <sniffer type="galaxy.datatypes.glycan:glydeii"/>
+        <sniffer type="galaxy.datatypes.glycan:linucs"/>
+        <sniffer type="galaxy.datatypes.glycan:iupac"/>
+        <sniffer type="galaxy.datatypes.glycan:linearcode"/>
+        <sniffer type="galaxy.datatypes.glycan:msa"/>
+        <sniffer type="galaxy.datatypes.glycan:wurcs"/>
+    </sniffers>
+</datatypes>
+
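Once registered this way, the extensions behave like any other Galaxy format in tool wrappers. The fragment below is a hypothetical illustration only (the tool id, command, and dataset names are invented), showing two of the registered extensions used for a tool input and output:

<!-- Hypothetical fragment of a tool wrapper using the registered glycan extensions -->
<tool id="glycan_convert_example" name="Glycan convert (example)" version="0.1.0">
    <command>echo "conversion happens here"</command>
    <inputs>
        <!-- "kcf" is accepted in uploads and tool forms because it is registered above -->
        <param name="input_glycan" type="data" format="kcf" label="Glycan in KCF format"/>
    </inputs>
    <outputs>
        <!-- the output dataset is typed with the registered "glycoct" extension -->
        <data name="output_glycan" format="glycoct" label="Converted glycan (GlycoCT)"/>
    </outputs>
</tool>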
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/glycan.py Wed Mar 23 14:34:50 2016 -0400 @@ -0,0 +1,1964 @@ +__license__ = "MIT" + +import logging +from galaxy.datatypes import metadata +import mimetypes +import os +import shutil +import sys +import traceback +import tempfile +import zipfile +from cgi import escape +from inspect import isclass +import galaxy.util as util +from galaxy.datatypes import data +from galaxy.datatypes.metadata import \ + MetadataElement # import directly to maintain ease of use in Datatype class definitions +from galaxy.util import inflector +from galaxy.util.bunch import Bunch +from galaxy.util.odict import odict +from galaxy.util.sanitize_html import sanitize_html + +from galaxy.datatypes import dataproviders + +from galaxy import eggs + +eggs.require("Paste") +import paste + + +class kcf(data.Data): + file_ext = 'kcf' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/plain' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. + """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. 
+ """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All KCF Files simply put a 'ENTRY' in its first line. + This applies to all possible kcfs. In this case check + for 'Glycan' to confirm it's a glycan """ + try: + from suds.client import Client + + url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' + client = Client(url) + kcfresponse = client.service.DeterminingForm(file(filename, 'r').read()) + if kcfresponse.array[0] == "KCF": + return True + else: + return False + except ImportError: + # cannot use import suds so use simple checker + print "using KCF simple checker" + f = open(filename, "r") + firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity + f.close() + + if "ENTRY" in firstline and "GLYCAN" in firstline: + return True + else: + return False + except Exception, e: + # note I am not raising an error rather return False and let another sniffer try to type this data + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + + +class glycoct(data.Data): + file_ext = 'glycoct' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/plain' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. 
+ """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. + """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All glycoct_condensed files simply put a 'RES' in its first line and a LIN later. """ + try: + f = open(filename, "r") + firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity + lines = f.read() + f.close() + + # if "RES" in firstline and "LIN" in lines: + if "RES" in firstline and "LIN" in lines: + return True + else: + return False + except Exception, e: + # note I am not raising an error rather return False and let another sniffer try to type this data + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + +# ------------- Utility methods -------------- + +# nice_size used to be here, but to resolve cyclical dependencies it's been +# moved to galaxy.util. It belongs there anyway since it's used outside +# datatypes. +nice_size = util.nice_size + + +def get_test_fname(fname): + """Returns test data filename""" + path, name = os.path.split(__file__) + full_path = os.path.join(path, 'test', fname) + return full_path + + +def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]): + """ + Returns the first LINE_COUNT lines wrapped to WIDTH + + ## >>> fname = get_test_fname('4.bed') + ## >>> get_file_peek(fname) + ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n' + + """ + # Set size for file.readline() to a negative number to force it to + # read until either a newline or EOF. Needed for datasets with very + # long lines. 
+ if WIDTH == 'unlimited': + WIDTH = -1 + lines = [] + count = 0 + file_type = None + data_checked = False + temp = open(file_name, "U") + while count <= LINE_COUNT: + line = temp.readline(WIDTH) + if line and not is_multi_byte and not data_checked: + # See if we have a compressed or binary file + if line[0:2] == util.gzip_magic: + file_type = 'gzipped' + break + else: + for char in line: + if ord(char) > 128: + file_type = 'binary' + break + data_checked = True + if file_type in ['gzipped', 'binary']: + break + skip_line = False + for skipchar in skipchars: + if line.startswith(skipchar): + skip_line = True + break + if not skip_line: + lines.append(line) + count += 1 + temp.close() + if file_type in ['gzipped', 'binary']: + text = "%s file" % file_type + else: + try: + text = unicode('\n'.join(lines), 'utf-8') + except UnicodeDecodeError: + text = "binary/unknown file" + return text + + +class glycoct_xml(data.Data): + file_ext = 'glycoct_xml' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/xml' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. + """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. 
+ """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All glycoct XML files should use the rings form determination script """ + try: + from suds.client import Client + + url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' + client = Client(url) + response = client.service.DeterminingForm(file(filename, 'r').read()) + if response.array[0] == "GlycoCT": + return True + else: + return False + except ImportError: + # cannot use import suds so use simple checker + print "using glycoct XML simple checker" + import xml.etree.cElementTree as ET + + tree = ET.parse(filename) + root = tree.getroot() + if root.tag == 'sugar': + print root.tag, root.attrib + return True + else: + return False + except Exception, e: + # note I am not raising an error rather return False and let another sniffer try to type this data + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + + +class glydeii(data.Data): + file_ext = 'glydeii' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/xml' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. 
+ """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. + """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All GlydeII XML files should use the rings form determination script """ + try: + from suds.client import Client + + url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' + client = Client(url) + response = client.service.DeterminingForm(file(filename, 'r').read()) + if response.array[0] == "GLYDEII": + return True + else: + return False + except ImportError: + # cannot use import suds so use simple checker + print "using GlydeII simple checker" + import xml.etree.cElementTree as ET + + tree = ET.parse(filename) + root = tree.getroot() + if root.tag == 'GlydeII': + print root.tag + return True + else: + return False + except Exception, e: + # note I am not raising an error rather return False and let another sniffer try to type this data + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + + +class linucs(data.Data): + file_ext = 'linucs' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/plain' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. 
+ """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. + """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All LINUCS files should use the rings form determination script """ + try: + from suds.client import Client + + url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' + client = Client(url) + response = client.service.DeterminingForm(file(filename, 'r').read()) + if response.array[0] == "LINUCS": + return True + else: + return False + except ImportError: + # cannot use import suds so use simple checker + print "using LINUCS simple checker" + + f = open(filename, "r") + firstline = f.readline() + f.close() + + if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline: + return True + else: + return False + except Exception, e: + # note I am not raising an error rather return False and let another sniffer try to type this data + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + + +class iupac(data.Data): + file_ext = 'iupac' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/plain' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. 
+ """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. + """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All IUPAC files should use the rings form determination script """ + try: + from suds.client import Client + + url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' + client = Client(url) + response = client.service.DeterminingForm(file(filename, 'r').read()) + if response.array[0] == "IUPAC": + return True + else: + return False + except ImportError: + # cannot use import suds so use simple checker + print "using IUPAC simple checker" + f = open(filename, "r") + firstline = f.readline() + f.close() + + if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline: + if "{" in firstline or "}" in firstline: + return False + else: + return True + else: + return False + except Exception, e: + # note I am not raising an error rather return False and let another sniffer try to type this data + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + + +class linearcode(data.Data): + file_ext = 'linearcode' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/plain' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. 
+ """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. + """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All linear code files should use the rings form determination script """ + try: + from suds.client import Client + + url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' + client = Client(url) + lcresponse = client.service.DeterminingForm(file(filename, 'r').read()) + if lcresponse.array[0] == "LinearCode": + print "LinearCode" + return True + else: + print "Unable to guess format" + return False + except ImportError: + # cannot use import suds so use simple checker + print "using LinearCode simple checker - nope it does not exist yet" + return False + except Exception, e: + # note I am not raising an error rather return False and let another sniffer try to type this data + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + + +class msa(data.Data): + file_ext = 'msa' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/plain' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. 
+ """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. + """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All msa Files simply put a '# .msa' in the first line. """ + try: + f = open(filename, "r") + firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity + f.close() + + if "# .MSA" in firstline: + return True + else: + return False + except: + traceback.print_exc(file=sys.stdout) + return False + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + + +class wurcs(data.Data): + file_ext = 'wurcs' + line_class = 'line' + + """Add metadata elements""" + MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, + visible=False, no_value=0) + + def write_from_stream(self, dataset, stream): + """Writes data from a stream""" + # write it twice for now + fd, temp_name = tempfile.mkstemp() + while 1: + chunk = stream.read(1048576) + if not chunk: + break + os.write(fd, chunk) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + + def set_raw_data(self, dataset, data): + """Saves the data on the disc""" + fd, temp_name = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + # rewrite the file with unix newlines + fp = open(dataset.file_name, 'wt') + for line in file(temp_name, "U"): + line = line.strip() + '\n' + fp.write(line) + fp.close() + os.remove(temp_name) + + def get_mime(self): + """Returns the mime type of the datatype""" + return 'text/plain' + + def set_meta(self, dataset, **kwd): + """ + Set the number of lines of data in dataset. + """ + dataset.metadata.data_lines = self.count_data_lines(dataset) + + def estimate_file_lines(self, dataset): + """ + Perform a rough estimate by extrapolating number of lines from a small read. + """ + sample_size = 1048576 + dataset_fh = open(dataset.file_name) + dataset_read = dataset_fh.read(sample_size) + dataset_fh.close() + sample_lines = dataset_read.count('\n') + est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) + return est_lines + + def count_data_lines(self, dataset): + """ + Count the number of lines of data in dataset, + skipping all blank lines and comments. 
+ """ + data_lines = 0 + for line in file(dataset.file_name): + line = line.strip() + if line and not line.startswith('#'): + data_lines += 1 + return data_lines + + def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): + """ + Set the peek. This method is used by various subclasses of Text. + """ + if not dataset.dataset.purged: + # The file must exist on disk for the get_file_peek() method + dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, + skipchars=skipchars) + if line_count is None: + # See if line_count is stored in the metadata + if dataset.metadata.data_lines: + dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), + inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) + else: + # Number of lines is not known ( this should not happen ), and auto-detect is + # needed to set metadata + # This can happen when the file is larger than max_optional_metadata_filesize. + if int(dataset.get_size()) <= 1048576: + # Small dataset, recount all lines and reset peek afterward. + lc = self.count_data_lines(dataset) + dataset.metadata.data_lines = lc + dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) + else: + est_lines = self.estimate_file_lines(dataset) + dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), + inflector.cond_plural(est_lines, self.line_class) ) + else: + dataset.blurb = "%s %s" % ( + util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + + def sniff(self, filename): + """All WURCS Files start with WURCS= then the version number. see http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/ +WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1""" + try: + f = open(filename, "r") + firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity + f.close() + if "WURCS" in firstline: + return True + else: + return False + except: + traceback.print_exc(file=sys.stdout) + return False + + + def split(cls, input_datasets, subdir_generator_function, split_params): + """ + Split the input files by line. + """ + if split_params is None: + return + + if len(input_datasets) > 1: + raise Exception("Text file splitting does not support multiple files") + input_files = [ds.file_name for ds in input_datasets] + + lines_per_file = None + chunk_size = None + if split_params['split_mode'] == 'number_of_parts': + lines_per_file = [] + # Computing the length is expensive! 
+ def _file_len(fname): + i = 0 + f = open(fname) + for i, l in enumerate(f): + pass + f.close() + return i + 1 + + length = _file_len(input_files[0]) + parts = int(split_params['split_size']) + if length < parts: + parts = length + len_each, remainder = divmod(length, parts) + while length > 0: + chunk = len_each + if remainder > 0: + chunk += 1 + lines_per_file.append(chunk) + remainder = - 1 + length -= chunk + elif split_params['split_mode'] == 'to_size': + chunk_size = int(split_params['split_size']) + else: + raise Exception('Unsupported split mode %s' % split_params['split_mode']) + + f = open(input_files[0], 'rt') + try: + chunk_idx = 0 + file_done = False + part_file = None + while not file_done: + if lines_per_file is None: + this_chunk_size = chunk_size + elif chunk_idx < len(lines_per_file): + this_chunk_size = lines_per_file[chunk_idx] + chunk_idx += 1 + lines_remaining = this_chunk_size + part_file = None + while lines_remaining > 0: + a_line = f.readline() + if a_line == '': + file_done = True + break + if part_file is None: + part_dir = subdir_generator_function() + part_path = os.path.join(part_dir, os.path.basename(input_files[0])) + part_file = open(part_path, 'w') + part_file.write(a_line) + lines_remaining -= 1 + if part_file is not None: + part_file.close() + except Exception, e: + log.error('Unable to split files: %s' % str(e)) + f.close() + if part_file is not None: + part_file.close() + raise + f.close() + + split = classmethod(split) + +
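Galaxy applies these sniffers in the order they are listed in datatypes_conf.xml and assigns the first extension whose sniff() returns True; note that several of the sniffers above first query the RINGS DeterminingForm web service via suds and only fall back to the simple textual checks when suds is unavailable. A rough stand-alone sketch of that ordering, assuming a Galaxy environment is on the Python path so that glycan.py imports cleanly, might look like this (illustration only, not part of the repository):

# Hypothetical stand-alone driver: try each sniffer in the order declared in
# datatypes_conf.xml and report the first extension whose sniff() accepts the file.
import sys
import glycan  # assumes datatypes/glycan.py (and its Galaxy dependencies) are importable

SNIFF_ORDER = [glycan.kcf, glycan.glycoct, glycan.glycoct_xml, glycan.glydeii,
               glycan.linucs, glycan.iupac, glycan.linearcode, glycan.msa,
               glycan.wurcs]

def guess_extension(filename):
    # Mirrors Galaxy's behaviour: first sniffer to return True determines the extension.
    for datatype_class in SNIFF_ORDER:
        datatype = datatype_class()
        if datatype.sniff(filename):
            return datatype.file_ext
    return None  # no glycan sniffer matched; Galaxy would fall back to its defaults

if __name__ == '__main__':
    print guess_extension(sys.argv[1])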