# HG changeset patch
# User chrisb
# Date 1458758090 14400
# Node ID 0e941a69a6fa3eac4680efe85a220244de396ad7
Uploaded
diff -r 000000000000 -r 0e941a69a6fa datatypes/README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/README.md Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,19 @@
+
+Custom glycan data types for Galaxy
+===================================
+
+New glycan data types for Galaxy, included as part of the glycan tools repository instead of being added manually (as was done previously).
+Ideas from http://gregvonkuster.org/galaxy-tool-shed-including-custom-datatypes-repositories/
+
+Supported data types (each corresponding to a class in glycan.py and registered in datatypes_conf.xml):
+
+* kcf (KEGG Chemical Function format)
+* glycoct (GlycoCT condensed)
+* glycoct_xml (GlycoCT XML)
+* glydeii (GLYDE-II XML)
+* linucs (LINUCS)
+* iupac (IUPAC)
+* linearcode (LinearCode)
+* msa
+* wurcs (WURCS)
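+
+Each type is implemented as a class in `datatypes/glycan.py` and registered in `datatypes/datatypes_conf.xml`, following the layout described in the blog post above. As a rough sketch (attribute values here are illustrative, not copied from the actual file), a registration plus sniffer entry for the `kcf` type would look something like:
+
+```xml
+<datatypes>
+    <datatype_files>
+        <datatype_file name="glycan.py"/>
+    </datatype_files>
+    <registration>
+        <!-- one <datatype> entry per extension defined in glycan.py -->
+        <datatype extension="kcf" type="galaxy.datatypes.glycan:kcf" display_in_upload="true"/>
+    </registration>
+    <sniffers>
+        <!-- lets Galaxy auto-detect the format on upload via each class's sniff() method -->
+        <sniffer type="galaxy.datatypes.glycan:kcf"/>
+    </sniffers>
+</datatypes>
+```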
diff -r 000000000000 -r 0e941a69a6fa datatypes/datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/datatypes_conf.xml Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,30 @@
diff -r 000000000000 -r 0e941a69a6fa datatypes/glycan.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/glycan.py Wed Mar 23 14:34:50 2016 -0400
@@ -0,0 +1,1964 @@
+__license__ = "MIT"
+
+import logging
+from galaxy.datatypes import metadata
+import mimetypes
+import os
+import shutil
+import sys
+import traceback
+import tempfile
+import zipfile
+from cgi import escape
+from inspect import isclass
+import galaxy.util as util
+from galaxy.datatypes import data
+from galaxy.datatypes.metadata import \
+ MetadataElement # import directly to maintain ease of use in Datatype class definitions
+from galaxy.util import inflector
+from galaxy.util.bunch import Bunch
+from galaxy.util.odict import odict
+from galaxy.util.sanitize_html import sanitize_html
+
+from galaxy.datatypes import dataproviders
+
+from galaxy import eggs
+
+eggs.require("Paste")
+import paste
+
+log = logging.getLogger(__name__)
+
+
+class kcf(data.Data):
+ file_ext = 'kcf'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/plain'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """All KCF files put 'ENTRY' on their first line; glycan entries
+        also carry 'Glycan' there, so check for it to confirm the file
+        describes a glycan."""
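+        # Illustrative only: a KEGG GLYCAN entry in KCF form typically begins with
+        # a header line such as
+        #   ENTRY     G00001                      Glycan
+        # which is why both 'ENTRY' and 'Glycan' are expected on the first line.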
+ try:
+ from suds.client import Client
+
+ url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+ client = Client(url)
+ kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
+ if kcfresponse.array[0] == "KCF":
+ return True
+ else:
+ return False
+ except ImportError:
+ # cannot use import suds so use simple checker
+ print "using KCF simple checker"
+ f = open(filename, "r")
+ firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity
+ f.close()
+
+ if "ENTRY" in firstline and "GLYCAN" in firstline:
+ return True
+ else:
+ return False
+ except Exception, e:
+ # note I am not raising an error rather return False and let another sniffer try to type this data
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+
+class glycoct(data.Data):
+ file_ext = 'glycoct'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/plain'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """All GlycoCT condensed files put 'RES' on their first line and a 'LIN' section further down."""
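+        # Illustrative only: a GlycoCT condensed file (here GlcNAc) starts roughly like
+        #   RES
+        #   1b:b-dglc-HEX-1:5
+        #   2s:n-acetyl
+        #   LIN
+        #   1:1d(2+1)2n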
+ try:
+ f = open(filename, "r")
+ firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity
+ lines = f.read()
+ f.close()
+
+            if "RES" in firstline and "LIN" in lines:
+ return True
+ else:
+ return False
+ except Exception, e:
+ # note I am not raising an error rather return False and let another sniffer try to type this data
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+# ------------- Utility methods --------------
+
+# nice_size used to be here, but to resolve cyclical dependencies it's been
+# moved to galaxy.util. It belongs there anyway since it's used outside
+# datatypes.
+nice_size = util.nice_size
+
+
+def get_test_fname(fname):
+ """Returns test data filename"""
+ path, name = os.path.split(__file__)
+ full_path = os.path.join(path, 'test', fname)
+ return full_path
+
+
+def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
+ """
+ Returns the first LINE_COUNT lines wrapped to WIDTH
+
+ ## >>> fname = get_test_fname('4.bed')
+ ## >>> get_file_peek(fname)
+ ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'
+
+ """
+ # Set size for file.readline() to a negative number to force it to
+ # read until either a newline or EOF. Needed for datasets with very
+ # long lines.
+ if WIDTH == 'unlimited':
+ WIDTH = -1
+ lines = []
+ count = 0
+ file_type = None
+ data_checked = False
+ temp = open(file_name, "U")
+ while count <= LINE_COUNT:
+ line = temp.readline(WIDTH)
+ if line and not is_multi_byte and not data_checked:
+ # See if we have a compressed or binary file
+ if line[0:2] == util.gzip_magic:
+ file_type = 'gzipped'
+ break
+ else:
+ for char in line:
+ if ord(char) > 128:
+ file_type = 'binary'
+ break
+ data_checked = True
+ if file_type in ['gzipped', 'binary']:
+ break
+ skip_line = False
+ for skipchar in skipchars:
+ if line.startswith(skipchar):
+ skip_line = True
+ break
+ if not skip_line:
+ lines.append(line)
+ count += 1
+ temp.close()
+ if file_type in ['gzipped', 'binary']:
+ text = "%s file" % file_type
+ else:
+ try:
+ text = unicode('\n'.join(lines), 'utf-8')
+ except UnicodeDecodeError:
+ text = "binary/unknown file"
+ return text
+
+
+class glycoct_xml(data.Data):
+ file_ext = 'glycoct_xml'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/xml'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """GlycoCT XML files are identified with the RINGS form-determination service, with a local XML root-tag check as fallback."""
+ try:
+ from suds.client import Client
+
+ url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+ client = Client(url)
+ response = client.service.DeterminingForm(file(filename, 'r').read())
+ if response.array[0] == "GlycoCT":
+ return True
+ else:
+ return False
+ except ImportError:
+ # cannot use import suds so use simple checker
+ print "using glycoct XML simple checker"
+ import xml.etree.cElementTree as ET
+
+ tree = ET.parse(filename)
+ root = tree.getroot()
+ if root.tag == 'sugar':
+ print root.tag, root.attrib
+ return True
+ else:
+ return False
+ except Exception, e:
+ # note I am not raising an error rather return False and let another sniffer try to type this data
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+
+class glydeii(data.Data):
+ file_ext = 'glydeii'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/xml'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """GLYDE-II XML files are identified with the RINGS form-determination service, with a local XML root-tag check as fallback."""
+ try:
+ from suds.client import Client
+
+ url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+ client = Client(url)
+ response = client.service.DeterminingForm(file(filename, 'r').read())
+ if response.array[0] == "GLYDEII":
+ return True
+ else:
+ return False
+ except ImportError:
+ # cannot use import suds so use simple checker
+ print "using GlydeII simple checker"
+ import xml.etree.cElementTree as ET
+
+ tree = ET.parse(filename)
+ root = tree.getroot()
+ if root.tag == 'GlydeII':
+ print root.tag
+ return True
+ else:
+ return False
+ except Exception, e:
+ # note I am not raising an error rather return False and let another sniffer try to type this data
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+
+class linucs(data.Data):
+ file_ext = 'linucs'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/plain'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """LINUCS files are identified with the RINGS form-determination service, with a simple bracket/brace check as fallback."""
+ try:
+ from suds.client import Client
+
+ url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+ client = Client(url)
+ response = client.service.DeterminingForm(file(filename, 'r').read())
+ if response.array[0] == "LINUCS":
+ return True
+ else:
+ return False
+ except ImportError:
+ # cannot use import suds so use simple checker
+ print "using LINUCS simple checker"
+
+ f = open(filename, "r")
+ firstline = f.readline()
+ f.close()
+
+ if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
+ return True
+ else:
+ return False
+ except Exception, e:
+ # note I am not raising an error rather return False and let another sniffer try to type this data
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+
+class iupac(data.Data):
+ file_ext = 'iupac'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/plain'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """IUPAC files are identified with the RINGS form-determination service, with a simple bracket check as fallback."""
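+        # Illustrative only: an IUPAC condensed sequence such as Gal(b1-4)GlcNAc
+        # contains parentheses or brackets but no curly braces, which is what the
+        # fallback check below relies on.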
+ try:
+ from suds.client import Client
+
+ url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+ client = Client(url)
+ response = client.service.DeterminingForm(file(filename, 'r').read())
+ if response.array[0] == "IUPAC":
+ return True
+ else:
+ return False
+ except ImportError:
+ # cannot use import suds so use simple checker
+ print "using IUPAC simple checker"
+ f = open(filename, "r")
+ firstline = f.readline()
+ f.close()
+
+ if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
+ if "{" in firstline or "}" in firstline:
+ return False
+ else:
+ return True
+ else:
+ return False
+ except Exception, e:
+ # note I am not raising an error rather return False and let another sniffer try to type this data
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+
+class linearcode(data.Data):
+ file_ext = 'linearcode'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/plain'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """LinearCode files are identified with the RINGS form-determination service; no local fallback check exists yet."""
+ try:
+ from suds.client import Client
+
+ url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
+ client = Client(url)
+ lcresponse = client.service.DeterminingForm(file(filename, 'r').read())
+ if lcresponse.array[0] == "LinearCode":
+ print "LinearCode"
+ return True
+ else:
+ print "Unable to guess format"
+ return False
+ except ImportError:
+ # cannot use import suds so use simple checker
+ print "using LinearCode simple checker - nope it does not exist yet"
+ return False
+ except Exception, e:
+ # note I am not raising an error rather return False and let another sniffer try to type this data
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+
+class msa(data.Data):
+ file_ext = 'msa'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/plain'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """All .msa files put '# .msa' on their first line."""
+ try:
+ f = open(filename, "r")
+ firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity
+ f.close()
+
+ if "# .MSA" in firstline:
+ return True
+ else:
+ return False
+ except:
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+
+class wurcs(data.Data):
+ file_ext = 'wurcs'
+ line_class = 'line'
+
+ """Add metadata elements"""
+ MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
+ visible=False, no_value=0)
+
+ def write_from_stream(self, dataset, stream):
+ """Writes data from a stream"""
+ # write it twice for now
+ fd, temp_name = tempfile.mkstemp()
+ while 1:
+ chunk = stream.read(1048576)
+ if not chunk:
+ break
+ os.write(fd, chunk)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+
+ def set_raw_data(self, dataset, data):
+ """Saves the data on the disc"""
+ fd, temp_name = tempfile.mkstemp()
+ os.write(fd, data)
+ os.close(fd)
+ # rewrite the file with unix newlines
+ fp = open(dataset.file_name, 'wt')
+ for line in file(temp_name, "U"):
+ line = line.strip() + '\n'
+ fp.write(line)
+ fp.close()
+ os.remove(temp_name)
+
+ def get_mime(self):
+ """Returns the mime type of the datatype"""
+ return 'text/plain'
+
+ def set_meta(self, dataset, **kwd):
+ """
+ Set the number of lines of data in dataset.
+ """
+ dataset.metadata.data_lines = self.count_data_lines(dataset)
+
+ def estimate_file_lines(self, dataset):
+ """
+ Perform a rough estimate by extrapolating number of lines from a small read.
+ """
+ sample_size = 1048576
+ dataset_fh = open(dataset.file_name)
+ dataset_read = dataset_fh.read(sample_size)
+ dataset_fh.close()
+ sample_lines = dataset_read.count('\n')
+ est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
+ return est_lines
+
+ def count_data_lines(self, dataset):
+ """
+ Count the number of lines of data in dataset,
+ skipping all blank lines and comments.
+ """
+ data_lines = 0
+ for line in file(dataset.file_name):
+ line = line.strip()
+ if line and not line.startswith('#'):
+ data_lines += 1
+ return data_lines
+
+ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
+ """
+ Set the peek. This method is used by various subclasses of Text.
+ """
+ if not dataset.dataset.purged:
+ # The file must exist on disk for the get_file_peek() method
+ dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
+ skipchars=skipchars)
+ if line_count is None:
+ # See if line_count is stored in the metadata
+ if dataset.metadata.data_lines:
+ dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
+ inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
+ else:
+ # Number of lines is not known ( this should not happen ), and auto-detect is
+ # needed to set metadata
+ # This can happen when the file is larger than max_optional_metadata_filesize.
+ if int(dataset.get_size()) <= 1048576:
+ # Small dataset, recount all lines and reset peek afterward.
+ lc = self.count_data_lines(dataset)
+ dataset.metadata.data_lines = lc
+ dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) )
+ else:
+ est_lines = self.estimate_file_lines(dataset)
+ dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
+ inflector.cond_plural(est_lines, self.line_class) )
+ else:
+ dataset.blurb = "%s %s" % (
+ util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def sniff(self, filename):
+        """All WURCS files start with 'WURCS=' followed by the version number; see
+        http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/. For example:
+WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1"""
+ try:
+ f = open(filename, "r")
+ firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity
+ f.close()
+ if "WURCS" in firstline:
+ return True
+ else:
+ return False
+ except:
+ traceback.print_exc(file=sys.stdout)
+ return False
+
+
+ def split(cls, input_datasets, subdir_generator_function, split_params):
+ """
+ Split the input files by line.
+ """
+ if split_params is None:
+ return
+
+ if len(input_datasets) > 1:
+ raise Exception("Text file splitting does not support multiple files")
+ input_files = [ds.file_name for ds in input_datasets]
+
+ lines_per_file = None
+ chunk_size = None
+ if split_params['split_mode'] == 'number_of_parts':
+ lines_per_file = []
+ # Computing the length is expensive!
+ def _file_len(fname):
+ i = 0
+ f = open(fname)
+ for i, l in enumerate(f):
+ pass
+ f.close()
+ return i + 1
+
+ length = _file_len(input_files[0])
+ parts = int(split_params['split_size'])
+ if length < parts:
+ parts = length
+ len_each, remainder = divmod(length, parts)
+ while length > 0:
+ chunk = len_each
+ if remainder > 0:
+ chunk += 1
+ lines_per_file.append(chunk)
+                remainder -= 1
+ length -= chunk
+ elif split_params['split_mode'] == 'to_size':
+ chunk_size = int(split_params['split_size'])
+ else:
+ raise Exception('Unsupported split mode %s' % split_params['split_mode'])
+
+ f = open(input_files[0], 'rt')
+ try:
+ chunk_idx = 0
+ file_done = False
+ part_file = None
+ while not file_done:
+ if lines_per_file is None:
+ this_chunk_size = chunk_size
+ elif chunk_idx < len(lines_per_file):
+ this_chunk_size = lines_per_file[chunk_idx]
+ chunk_idx += 1
+ lines_remaining = this_chunk_size
+ part_file = None
+ while lines_remaining > 0:
+ a_line = f.readline()
+ if a_line == '':
+ file_done = True
+ break
+ if part_file is None:
+ part_dir = subdir_generator_function()
+ part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
+ part_file = open(part_path, 'w')
+ part_file.write(a_line)
+ lines_remaining -= 1
+ if part_file is not None:
+ part_file.close()
+ except Exception, e:
+ log.error('Unable to split files: %s' % str(e))
+ f.close()
+ if part_file is not None:
+ part_file.close()
+ raise
+ f.close()
+
+ split = classmethod(split)
+
+