Mercurial > repos > chrisb > gap_datatypes
view datatypes/glycan.py @ 0:0e941a69a6fa draft default tip
Uploaded
author | chrisb |
---|---|
date | Wed, 23 Mar 2016 14:34:50 -0400 |
parents | |
children |
line wrap: on
line source
__license__ = "MIT" import logging from galaxy.datatypes import metadata import mimetypes import os import shutil import sys import traceback import tempfile import zipfile from cgi import escape from inspect import isclass import galaxy.util as util from galaxy.datatypes import data from galaxy.datatypes.metadata import \ MetadataElement # import directly to maintain ease of use in Datatype class definitions from galaxy.util import inflector from galaxy.util.bunch import Bunch from galaxy.util.odict import odict from galaxy.util.sanitize_html import sanitize_html from galaxy.datatypes import dataproviders from galaxy import eggs eggs.require("Paste") import paste class kcf(data.Data): file_ext = 'kcf' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/plain' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All KCF Files simply put a 'ENTRY' in its first line. This applies to all possible kcfs. In this case check for 'Glycan' to confirm it's a glycan """ try: from suds.client import Client url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' client = Client(url) kcfresponse = client.service.DeterminingForm(file(filename, 'r').read()) if kcfresponse.array[0] == "KCF": return True else: return False except ImportError: # cannot use import suds so use simple checker print "using KCF simple checker" f = open(filename, "r") firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity f.close() if "ENTRY" in firstline and "GLYCAN" in firstline: return True else: return False except Exception, e: # note I am not raising an error rather return False and let another sniffer try to type this data traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) class glycoct(data.Data): file_ext = 'glycoct' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/plain' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All glycoct_condensed files simply put a 'RES' in its first line and a LIN later. """ try: f = open(filename, "r") firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity lines = f.read() f.close() # if "RES" in firstline and "LIN" in lines: if "RES" in firstline and "LIN" in lines: return True else: return False except Exception, e: # note I am not raising an error rather return False and let another sniffer try to type this data traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) # ------------- Utility methods -------------- # nice_size used to be here, but to resolve cyclical dependencies it's been # moved to galaxy.util. It belongs there anyway since it's used outside # datatypes. nice_size = util.nice_size def get_test_fname(fname): """Returns test data filename""" path, name = os.path.split(__file__) full_path = os.path.join(path, 'test', fname) return full_path def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]): """ Returns the first LINE_COUNT lines wrapped to WIDTH ## >>> fname = get_test_fname('4.bed') ## >>> get_file_peek(fname) ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n' """ # Set size for file.readline() to a negative number to force it to # read until either a newline or EOF. Needed for datasets with very # long lines. if WIDTH == 'unlimited': WIDTH = -1 lines = [] count = 0 file_type = None data_checked = False temp = open(file_name, "U") while count <= LINE_COUNT: line = temp.readline(WIDTH) if line and not is_multi_byte and not data_checked: # See if we have a compressed or binary file if line[0:2] == util.gzip_magic: file_type = 'gzipped' break else: for char in line: if ord(char) > 128: file_type = 'binary' break data_checked = True if file_type in ['gzipped', 'binary']: break skip_line = False for skipchar in skipchars: if line.startswith(skipchar): skip_line = True break if not skip_line: lines.append(line) count += 1 temp.close() if file_type in ['gzipped', 'binary']: text = "%s file" % file_type else: try: text = unicode('\n'.join(lines), 'utf-8') except UnicodeDecodeError: text = "binary/unknown file" return text class glycoct_xml(data.Data): file_ext = 'glycoct_xml' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/xml' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All glycoct XML files should use the rings form determination script """ try: from suds.client import Client url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' client = Client(url) response = client.service.DeterminingForm(file(filename, 'r').read()) if response.array[0] == "GlycoCT": return True else: return False except ImportError: # cannot use import suds so use simple checker print "using glycoct XML simple checker" import xml.etree.cElementTree as ET tree = ET.parse(filename) root = tree.getroot() if root.tag == 'sugar': print root.tag, root.attrib return True else: return False except Exception, e: # note I am not raising an error rather return False and let another sniffer try to type this data traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) class glydeii(data.Data): file_ext = 'glydeii' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/xml' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All GlydeII XML files should use the rings form determination script """ try: from suds.client import Client url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' client = Client(url) response = client.service.DeterminingForm(file(filename, 'r').read()) if response.array[0] == "GLYDEII": return True else: return False except ImportError: # cannot use import suds so use simple checker print "using GlydeII simple checker" import xml.etree.cElementTree as ET tree = ET.parse(filename) root = tree.getroot() if root.tag == 'GlydeII': print root.tag return True else: return False except Exception, e: # note I am not raising an error rather return False and let another sniffer try to type this data traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) class linucs(data.Data): file_ext = 'linucs' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/plain' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All LINUCS files should use the rings form determination script """ try: from suds.client import Client url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' client = Client(url) response = client.service.DeterminingForm(file(filename, 'r').read()) if response.array[0] == "LINUCS": return True else: return False except ImportError: # cannot use import suds so use simple checker print "using LINUCS simple checker" f = open(filename, "r") firstline = f.readline() f.close() if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline: return True else: return False except Exception, e: # note I am not raising an error rather return False and let another sniffer try to type this data traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) class iupac(data.Data): file_ext = 'iupac' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/plain' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All IUPAC files should use the rings form determination script """ try: from suds.client import Client url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' client = Client(url) response = client.service.DeterminingForm(file(filename, 'r').read()) if response.array[0] == "IUPAC": return True else: return False except ImportError: # cannot use import suds so use simple checker print "using IUPAC simple checker" f = open(filename, "r") firstline = f.readline() f.close() if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline: if "{" in firstline or "}" in firstline: return False else: return True else: return False except Exception, e: # note I am not raising an error rather return False and let another sniffer try to type this data traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) class linearcode(data.Data): file_ext = 'linearcode' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/plain' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All linear code files should use the rings form determination script """ try: from suds.client import Client url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' client = Client(url) lcresponse = client.service.DeterminingForm(file(filename, 'r').read()) if lcresponse.array[0] == "LinearCode": print "LinearCode" return True else: print "Unable to guess format" return False except ImportError: # cannot use import suds so use simple checker print "using LinearCode simple checker - nope it does not exist yet" return False except Exception, e: # note I am not raising an error rather return False and let another sniffer try to type this data traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) class msa(data.Data): file_ext = 'msa' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/plain' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All msa Files simply put a '# .msa' in the first line. """ try: f = open(filename, "r") firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity f.close() if "# .MSA" in firstline: return True else: return False except: traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split) class wurcs(data.Data): file_ext = 'wurcs' line_class = 'line' """Add metadata elements""" MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0) def write_from_stream(self, dataset, stream): """Writes data from a stream""" # write it twice for now fd, temp_name = tempfile.mkstemp() while 1: chunk = stream.read(1048576) if not chunk: break os.write(fd, chunk) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() def set_raw_data(self, dataset, data): """Saves the data on the disc""" fd, temp_name = tempfile.mkstemp() os.write(fd, data) os.close(fd) # rewrite the file with unix newlines fp = open(dataset.file_name, 'wt') for line in file(temp_name, "U"): line = line.strip() + '\n' fp.write(line) fp.close() os.remove(temp_name) def get_mime(self): """Returns the mime type of the datatype""" return 'text/plain' def set_meta(self, dataset, **kwd): """ Set the number of lines of data in dataset. """ dataset.metadata.data_lines = self.count_data_lines(dataset) def estimate_file_lines(self, dataset): """ Perform a rough estimate by extrapolating number of lines from a small read. """ sample_size = 1048576 dataset_fh = open(dataset.file_name) dataset_read = dataset_fh.read(sample_size) dataset_fh.close() sample_lines = dataset_read.count('\n') est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) return est_lines def count_data_lines(self, dataset): """ Count the number of lines of data in dataset, skipping all blank lines and comments. """ data_lines = 0 for line in file(dataset.file_name): line = line.strip() if line and not line.startswith('#'): data_lines += 1 return data_lines def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): """ Set the peek. This method is used by various subclasses of Text. """ if not dataset.dataset.purged: # The file must exist on disk for the get_file_peek() method dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars) if line_count is None: # See if line_count is stored in the metadata if dataset.metadata.data_lines: dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) else: # Number of lines is not known ( this should not happen ), and auto-detect is # needed to set metadata # This can happen when the file is larger than max_optional_metadata_filesize. if int(dataset.get_size()) <= 1048576: # Small dataset, recount all lines and reset peek afterward. lc = self.count_data_lines(dataset) dataset.metadata.data_lines = lc dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) else: est_lines = self.estimate_file_lines(dataset) dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) ) else: dataset.blurb = "%s %s" % ( util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def sniff(self, filename): """All WURCS Files start with WURCS= then the version number. see http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/ WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1""" try: f = open(filename, "r") firstline = f.readline().upper() # note we are uppercasing here to avoid CasE SenSitIVity f.close() if "WURCS" in firstline: return True else: return False except: traceback.print_exc(file=sys.stdout) return False def split(cls, input_datasets, subdir_generator_function, split_params): """ Split the input files by line. """ if split_params is None: return if len(input_datasets) > 1: raise Exception("Text file splitting does not support multiple files") input_files = [ds.file_name for ds in input_datasets] lines_per_file = None chunk_size = None if split_params['split_mode'] == 'number_of_parts': lines_per_file = [] # Computing the length is expensive! def _file_len(fname): i = 0 f = open(fname) for i, l in enumerate(f): pass f.close() return i + 1 length = _file_len(input_files[0]) parts = int(split_params['split_size']) if length < parts: parts = length len_each, remainder = divmod(length, parts) while length > 0: chunk = len_each if remainder > 0: chunk += 1 lines_per_file.append(chunk) remainder = - 1 length -= chunk elif split_params['split_mode'] == 'to_size': chunk_size = int(split_params['split_size']) else: raise Exception('Unsupported split mode %s' % split_params['split_mode']) f = open(input_files[0], 'rt') try: chunk_idx = 0 file_done = False part_file = None while not file_done: if lines_per_file is None: this_chunk_size = chunk_size elif chunk_idx < len(lines_per_file): this_chunk_size = lines_per_file[chunk_idx] chunk_idx += 1 lines_remaining = this_chunk_size part_file = None while lines_remaining > 0: a_line = f.readline() if a_line == '': file_done = True break if part_file is None: part_dir = subdir_generator_function() part_path = os.path.join(part_dir, os.path.basename(input_files[0])) part_file = open(part_path, 'w') part_file.write(a_line) lines_remaining -= 1 if part_file is not None: part_file.close() except Exception, e: log.error('Unable to split files: %s' % str(e)) f.close() if part_file is not None: part_file.close() raise f.close() split = classmethod(split)