annotate datatypes/glycan.py @ 0:0e941a69a6fa draft default tip

Uploaded
author chrisb
date Wed, 23 Mar 2016 14:34:50 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1 __license__ = "MIT"
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
2
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
3 import logging
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
4 from galaxy.datatypes import metadata
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
5 import mimetypes
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
6 import os
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
7 import shutil
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
8 import sys
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
9 import traceback
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
10 import tempfile
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
11 import zipfile
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
12 from cgi import escape
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
13 from inspect import isclass
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
14 import galaxy.util as util
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
15 from galaxy.datatypes import data
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
16 from galaxy.datatypes.metadata import \
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
17 MetadataElement # import directly to maintain ease of use in Datatype class definitions
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
18 from galaxy.util import inflector
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
19 from galaxy.util.bunch import Bunch
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
20 from galaxy.util.odict import odict
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
21 from galaxy.util.sanitize_html import sanitize_html
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
22
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
23 from galaxy.datatypes import dataproviders
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
24
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
25 from galaxy import eggs
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
26
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
27 eggs.require("Paste")
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
28 import paste
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
29
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
30
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
class kcf(data.Data):
    """Galaxy datatype for glycan structures stored as KCF text files.

    Maintains a ``data_lines`` metadata element (number of non-blank,
    non-comment lines) and uses it to build the dataset blurb.
    """
    file_ext = 'kcf'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    @staticmethod
    def _rewrite_with_unix_newlines(source_path, target_path):
        """Copy source_path to target_path, stripping every line and ending it with a unix newline."""
        # Python 2 must ask for universal newlines with "U"; on Python 3 they
        # are the text-mode default and recent versions reject the "U" flag.
        read_mode = "rU" if sys.version_info[0] < 3 else "r"
        with open(source_path, read_mode) as source:
            with open(target_path, 'wt') as target:
                for line in source:
                    target.write(line.strip() + '\n')

    def write_from_stream(self, dataset, stream):
        """Write the contents of ``stream`` to ``dataset.file_name`` with unix newlines.

        The stream is spooled to a temporary file in 1 MiB chunks and then
        rewritten line by line (each line stripped) into the dataset file.
        """
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                while True:
                    chunk = stream.read(1048576)
                    if not chunk:
                        break
                    os.write(fd, chunk)
            finally:
                os.close(fd)
            self._rewrite_with_unix_newlines(temp_name, dataset.file_name)
        finally:
            # Always delete the spool file so uploads do not accumulate temp files.
            os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Save ``data`` to ``dataset.file_name``, normalising it to unix newlines."""
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                os.write(fd, data)
            finally:
                os.close(fd)
            self._rewrite_with_unix_newlines(temp_name, dataset.file_name)
        finally:
            os.remove(temp_name)

    def get_mime(self):
        """Return the mime type of the datatype."""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """Store the number of data lines of ``dataset`` in its ``data_lines`` metadata."""
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """Roughly estimate the line count by extrapolating from the first 1 MiB.

        Returns the exact newline count when the whole file fits in the sample.
        """
        sample_size = 1048576
        # Read bytes so the sample length is comparable with dataset.get_size().
        with open(dataset.file_name, 'rb') as dataset_fh:
            sample = dataset_fh.read(sample_size)
        if not sample:
            return 0
        sample_lines = sample.count(b'\n')
        total_size = float(dataset.get_size())
        if total_size <= len(sample):
            # The sample covered the whole file; scaling would under-count.
            return sample_lines
        return int(sample_lines * (total_size / len(sample)))

    def count_data_lines(self, dataset):
        """Count the lines of ``dataset``, skipping blank lines and lines starting with '#'."""
        data_lines = 0
        with open(dataset.file_name) as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith('#'):
                    data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """Set ``dataset.peek`` and ``dataset.blurb``.

        The blurb uses, in order of preference: the ``line_count`` argument,
        the stored ``data_lines`` metadata, an exact recount (files up to
        1 MiB, which also updates the metadata) or an estimate prefixed with
        '~'. Purged datasets get fixed placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # The file must exist on disk for the get_file_peek() function.
        # It is not imported into this module, so reach it through ``data``.
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                          skipchars=skipchars)
        if line_count is not None:
            dataset.blurb = "%s %s" % (
                util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        elif dataset.metadata.data_lines:
            # line count is stored in the metadata
            dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                       inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
        elif int(dataset.get_size()) <= 1048576:
            # Number of lines is not known ( this should not happen ), and auto-detect is
            # needed to set metadata. This can happen when the file is larger than
            # max_optional_metadata_filesize.
            # Small dataset, recount all lines.
            lc = self.count_data_lines(dataset)
            dataset.metadata.data_lines = lc
            dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
        else:
            est_lines = self.estimate_file_lines(dataset)
            dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                        inflector.cond_plural(est_lines, self.line_class))

    def sniff(self, filename):
        """Return True when ``filename`` looks like a KCF glycan file.

        When ``suds`` is importable the remote ``DeterminingForm`` web service
        at rings.t.soka.ac.jp is asked first. Without ``suds``, or when the
        service call fails, the first line is checked locally for both
        'ENTRY' and 'GLYCAN' (case-insensitive). Never raises: any error
        yields False so that another sniffer can try to type the data.
        """
        try:
            try:
                from suds.client import Client
            except ImportError:
                # cannot use import suds so use simple checker
                Client = None
                print("using KCF simple checker")

            if Client is not None:
                try:
                    # NOTE(review): this sends the whole file to a third-party
                    # service over plain HTTP -- confirm that is acceptable.
                    url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
                    client = Client(url)
                    with open(filename, 'r') as handle:
                        contents = handle.read()
                    kcfresponse = client.service.DeterminingForm(contents)
                    return kcfresponse.array[0] == "KCF"
                except Exception:
                    # Service unreachable or unexpected reply: report it and
                    # fall through to the local check instead of rejecting.
                    traceback.print_exc(file=sys.stdout)

            with open(filename, "r") as handle:
                # note we are uppercasing here to avoid CasE SenSitIVity
                firstline = handle.readline().upper()
            return "ENTRY" in firstline and "GLYCAN" in firstline
        except Exception:
            # note I am not raising an error rather return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """Split the single input file by line into part files.

        ``split_params['split_mode']`` is either 'number_of_parts'
        (``split_size`` parts of near-equal line count) or 'to_size'
        (``split_size`` lines per part). Each part is written under a fresh
        directory returned by ``subdir_generator_function()`` using the input
        file's base name. Does nothing when ``split_params`` is None.

        Raises Exception for more than one input dataset or an unknown mode;
        I/O errors are logged and re-raised.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                last_index = 0
                with open(fname) as handle:
                    for last_index, _ in enumerate(handle):
                        pass
                # An empty file is reported as one line.
                return last_index + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    # Hand out the leftover lines one per part, so exactly
                    # ``parts`` chunks are produced.
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        part_file = None
        try:
            chunk_idx = 0
            file_done = False
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        # Create the part lazily so no empty part files appear.
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
                    part_file = None
        except Exception as e:
            logging.getLogger(__name__).error('Unable to split files: %s' % str(e))
            raise
        finally:
            f.close()
            if part_file is not None:
                part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
242
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
243
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
244 class glycoct(data.Data):
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
245 file_ext = 'glycoct'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
246 line_class = 'line'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
247
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
248 """Add metadata elements"""
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
249 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
250 visible=False, no_value=0)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
251
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def write_from_stream(self, dataset, stream):
    """Write the contents of ``stream`` to ``dataset.file_name`` with unix newlines.

    The stream is spooled to a temporary file in 1 MiB chunks and then
    rewritten line by line (each line stripped) into the dataset file.
    The temporary file is always removed afterwards.
    """
    # write it twice for now
    fd, temp_name = tempfile.mkstemp()
    try:
        try:
            while True:
                chunk = stream.read(1048576)
                if not chunk:
                    break
                os.write(fd, chunk)
        finally:
            os.close(fd)
        # rewrite the file with unix newlines
        # Python 2 must ask for universal newlines with "U"; on Python 3 they
        # are the text-mode default and recent versions reject the "U" flag.
        read_mode = "rU" if sys.version_info[0] < 3 else "r"
        with open(temp_name, read_mode) as source:
            with open(dataset.file_name, 'wt') as target:
                for line in source:
                    target.write(line.strip() + '\n')
    finally:
        # Always delete the spool file so uploads do not accumulate temp files.
        os.remove(temp_name)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
268
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_raw_data(self, dataset, data):
    """Save ``data`` to ``dataset.file_name``, normalising it to unix newlines.

    The data is written to a temporary file first and then copied line by
    line (each line stripped) into the dataset file; the temporary file is
    removed even when writing fails.
    """
    fd, temp_name = tempfile.mkstemp()
    try:
        try:
            os.write(fd, data)
        finally:
            os.close(fd)
        # rewrite the file with unix newlines
        # Python 2 must ask for universal newlines with "U"; on Python 3 they
        # are the text-mode default and recent versions reject the "U" flag.
        read_mode = "rU" if sys.version_info[0] < 3 else "r"
        with open(temp_name, read_mode) as source:
            with open(dataset.file_name, 'wt') as target:
                for line in source:
                    target.write(line.strip() + '\n')
    finally:
        os.remove(temp_name)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
281
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def get_mime(self):
    """Return the mime type of the datatype (always plain text)."""
    return 'text/plain'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
285
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_meta(self, dataset, **kwd):
    """Store the number of data lines of ``dataset`` in its ``data_lines`` metadata.

    Extra keyword arguments are accepted and ignored.
    """
    dataset.metadata.data_lines = self.count_data_lines(dataset)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
291
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def estimate_file_lines(self, dataset):
    """Roughly estimate the line count by extrapolating from the first 1 MiB.

    Counts the newlines in the sample and scales by the ratio of
    ``dataset.get_size()`` to the sample length. When the whole file fits
    in the sample the exact newline count is returned instead.
    """
    sample_size = 1048576
    # Read bytes so the sample length is comparable with dataset.get_size().
    with open(dataset.file_name, 'rb') as dataset_fh:
        sample = dataset_fh.read(sample_size)
    if not sample:
        return 0
    sample_lines = sample.count(b'\n')
    total_size = float(dataset.get_size())
    if total_size <= len(sample):
        # The sample covered the whole file; scaling would under-count.
        return sample_lines
    return int(sample_lines * (total_size / len(sample)))
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
303
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def count_data_lines(self, dataset):
    """Count the lines of ``dataset.file_name``, skipping blank lines and comments.

    A line is a comment when, after stripping surrounding whitespace, it
    starts with '#'.
    """
    data_lines = 0
    with open(dataset.file_name) as handle:
        for line in handle:
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
    return data_lines
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
315
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
    """Set ``dataset.peek`` and ``dataset.blurb``.

    The blurb uses, in order of preference: the ``line_count`` argument,
    the stored ``data_lines`` metadata, an exact recount (files up to
    1 MiB, which also updates the metadata) or an estimate prefixed with
    '~'. Purged datasets get fixed placeholder texts instead.
    """
    if dataset.dataset.purged:
        dataset.peek = 'file does not exist'
        dataset.blurb = 'file purged from disk'
        return

    # The file must exist on disk for the get_file_peek() function.
    # It is not imported into this module, so reach it through ``data``.
    dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                      skipchars=skipchars)
    if line_count is not None:
        dataset.blurb = "%s %s" % (
            util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
    elif dataset.metadata.data_lines:
        # line count is stored in the metadata
        dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                   inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
    elif int(dataset.get_size()) <= 1048576:
        # Number of lines is not known ( this should not happen ), and auto-detect is
        # needed to set metadata. This can happen when the file is larger than
        # max_optional_metadata_filesize.
        # Small dataset, recount all lines.
        lc = self.count_data_lines(dataset)
        dataset.metadata.data_lines = lc
        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
    else:
        est_lines = self.estimate_file_lines(dataset)
        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                    inflector.cond_plural(est_lines, self.line_class))
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
348
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def sniff(self, filename):
    """Return True when ``filename`` looks like a condensed GlycoCT file.

    The first line must contain 'RES' (case-insensitive) and the rest of
    the file must contain 'LIN' (case-sensitive). Never raises: on any
    error the traceback is printed to stdout and False is returned so
    that another sniffer can try to type the data.
    """
    try:
        with open(filename, "r") as handle:
            # note we are uppercasing here to avoid CasE SenSitIVity
            firstline = handle.readline().upper()
            lines = handle.read()
        return "RES" in firstline and "LIN" in lines
    except Exception:
        # note I am not raising an error rather return False and let another sniffer try to type this data
        traceback.print_exc(file=sys.stdout)
        return False
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
366
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
@classmethod
def split(cls, input_datasets, subdir_generator_function, split_params):
    """
    Split a single line-oriented input file into several part files.

    Each part is written under a fresh directory obtained by calling
    ``subdir_generator_function()`` and keeps the base name of the input file.

    ``split_params`` is a dict with the keys:

    * ``split_mode`` -- ``'number_of_parts'`` (divide the file into
      ``split_size`` parts whose line counts differ by at most one) or
      ``'to_size'`` (every part holds at most ``split_size`` lines).
    * ``split_size`` -- a positive integer (or a string that parses as one).

    Nothing is done when ``split_params`` is None, and no part is created for
    an empty input file.

    Raises ``Exception`` for more than one input dataset, an unsupported
    split mode or a non-positive split size.  Errors raised while writing the
    parts are logged and re-raised.
    """
    if split_params is None:
        return

    if len(input_datasets) > 1:
        raise Exception("Text file splitting does not support multiple files")
    input_files = [ds.file_name for ds in input_datasets]

    split_mode = split_params['split_mode']
    if split_mode not in ('number_of_parts', 'to_size'):
        raise Exception('Unsupported split mode %s' % split_mode)

    split_size = int(split_params['split_size'])
    if split_size < 1:
        # A size of zero or less used to end in a ZeroDivisionError or in a
        # loop that never terminated; fail early with a clear message.
        raise Exception('Split size must be a positive integer, got %s' % split_size)

    lines_per_file = None
    chunk_size = None
    if split_mode == 'number_of_parts':
        # Computing the length is expensive: it takes a full pass over the file.
        with open(input_files[0]) as counted:
            length = sum(1 for _ in counted)
        if length == 0:
            # Empty input: there is nothing to distribute.
            return
        parts = min(split_size, length)
        len_each, remainder = divmod(length, parts)
        # The first `remainder` parts carry one extra line, so exactly `parts`
        # files are produced.
        lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)
    else:
        chunk_size = split_size

    part_file = None
    f = open(input_files[0], 'rt')
    try:
        chunk_idx = 0
        file_done = False
        this_chunk_size = chunk_size
        while not file_done:
            if lines_per_file is not None and chunk_idx < len(lines_per_file):
                this_chunk_size = lines_per_file[chunk_idx]
                chunk_idx += 1
            # Once the planned chunks are used up the last chunk size is
            # reused, so lines beyond the counted length still get written.
            lines_remaining = this_chunk_size
            while lines_remaining > 0:
                a_line = f.readline()
                if a_line == '':
                    file_done = True
                    break
                if part_file is None:
                    # Create the part lazily so that no empty part is left behind.
                    part_dir = subdir_generator_function()
                    part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                    part_file = open(part_path, 'w')
                part_file.write(a_line)
                lines_remaining -= 1
            if part_file is not None:
                part_file.close()
                part_file = None
    except Exception as e:
        log.error('Unable to split files: %s' % str(e))
        raise
    finally:
        f.close()
        if part_file is not None:
            part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
443
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
# ------------- Utility methods --------------

# nice_size used to be defined in this module, but to resolve cyclical
# dependencies it has been moved to galaxy.util. It belongs there anyway,
# since it is also used outside of datatypes. The assignment below binds the
# galaxy.util implementation to the name `nice_size` in this module.
nice_size = util.nice_size
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
450
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
451
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def get_test_fname(fname):
    """Return the path of `fname` inside the `test` directory next to this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, 'test', fname)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
457
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
458
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None):
    """
    Return a short text preview of the beginning of a file.

    Lines are read with ``readline(WIDTH)``, so a line longer than WIDTH is
    returned in several pieces; pass ``WIDTH='unlimited'`` to read whole
    lines.  Lines starting with any of the prefixes in ``skipchars`` are left
    out of the preview and do not count towards LINE_COUNT.

    Unless ``is_multi_byte`` is true, the first non-empty line is inspected:
    if it starts with the gzip magic number the result is ``'gzipped file'``,
    and if it contains a character with an ordinal above 128 the result is
    ``'binary file'``.  If the collected lines cannot be decoded as UTF-8 the
    result is ``'binary/unknown file'``.

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'

    """
    if skipchars is None:
        skipchars = []
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open(file_name, "U")
    try:
        # NOTE(review): `<=` collects LINE_COUNT + 1 entries (and keeps
        # appending empty strings after EOF), which is one more than the
        # summary above promises.  Kept as is because callers may rely on the
        # current output -- confirm before tightening the bound.
        while count <= LINE_COUNT:
            line = temp.readline(WIDTH)
            if line and not is_multi_byte and not data_checked:
                # See if we have a compressed or binary file
                if line[0:2] == util.gzip_magic:
                    file_type = 'gzipped'
                    break
                if any(ord(char) > 128 for char in line):
                    file_type = 'binary'
                    break
                data_checked = True
            # The empty string returned at EOF is never skipped; otherwise an
            # empty prefix in skipchars would keep this loop running forever.
            if line and any(line.startswith(skipchar) for skipchar in skipchars):
                continue
            lines.append(line)
            count += 1
    finally:
        temp.close()
    if file_type in ('gzipped', 'binary'):
        return "%s file" % file_type
    try:
        text = '\n'.join(lines)
        if isinstance(text, bytes):
            # Python 2 byte string: decode to unicode exactly as
            # unicode(text, 'utf-8') did.
            text = text.decode('utf-8')
    except UnicodeDecodeError:
        text = "binary/unknown file"
    return text
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
510
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
511
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
class glycoct_xml(data.Data):
    """Galaxy datatype for glycan structures stored as GlycoCT XML."""
    file_ext = 'glycoct_xml'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """
        Write the content of `stream` to the dataset file.

        The stream is first spooled to a temporary file and then copied line
        by line, stripping surrounding whitespace from every line and ending
        each one with a unix newline.  The temporary file is always removed.
        """
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                while 1:
                    chunk = stream.read(1048576)
                    if not chunk:
                        break
                    os.write(fd, chunk)
            finally:
                os.close(fd)
            # rewrite the file with unix newlines
            with open(dataset.file_name, 'wt') as fp, open(temp_name, "U") as source:
                for line in source:
                    fp.write(line.strip() + '\n')
        finally:
            # The temporary copy used to be left behind on disk.
            os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """
        Save `data` to the dataset file.

        The data goes through a temporary file and is rewritten line by line
        with surrounding whitespace stripped and unix newlines.  The temporary
        file is always removed.
        """
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                os.write(fd, data)
            finally:
                os.close(fd)
            # rewrite the file with unix newlines
            with open(dataset.file_name, 'wt') as fp, open(temp_name, "U") as source:
                for line in source:
                    fp.write(line.strip() + '\n')
        finally:
            os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.

        The newlines found in the first 1 MiB are scaled by the ratio between
        the dataset size and that sample size.
        """
        sample_size = 1048576
        with open(dataset.file_name) as dataset_fh:
            dataset_read = dataset_fh.read(sample_size)
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments (lines starting with '#').
        """
        data_lines = 0
        with open(dataset.file_name) as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith('#'):
                    data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=None):
        """
        Set the peek and the blurb of the dataset.

        The peek comes from get_file_peek().  The blurb shows `line_count`
        when it is given; otherwise the line count stored in the metadata, a
        fresh count for datasets of at most 1 MiB, or an estimate prefixed
        with '~' for larger ones.  A purged dataset gets fixed placeholder
        texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # The file must exist on disk for the get_file_peek() method
        dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                     skipchars=[] if skipchars is None else skipchars)
        if line_count is not None:
            dataset.blurb = "%s %s" % (
                util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        elif dataset.metadata.data_lines:
            # line_count is stored in the metadata
            dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                       inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
        elif int(dataset.get_size()) <= 1048576:
            # Number of lines is not known ( this should not happen ), and auto-detect is
            # needed to set metadata.
            # This can happen when the file is larger than max_optional_metadata_filesize.
            # Small dataset, recount all lines and store the result.
            lc = self.count_data_lines(dataset)
            dataset.metadata.data_lines = lc
            dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
        else:
            est_lines = self.estimate_file_lines(dataset)
            dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                        inflector.cond_plural(est_lines, self.line_class))

    def sniff(self, filename):
        """
        Return True when `filename` looks like a GlycoCT XML file.

        When the `suds` package can be imported, the file content is sent to
        the RINGS form determination web service.  Otherwise the file is
        parsed locally and accepted when its root element is `sugar`.  Any
        error results in False so that another sniffer can try the data.
        """
        try:
            try:
                from suds.client import Client
            except ImportError:
                Client = None

            if Client is not None:
                # NOTE(review): this uploads the whole file to an external
                # service over plain HTTP on every sniff -- confirm that this
                # is acceptable for the deployment.
                url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
                client = Client(url)
                with open(filename, 'r') as handle:
                    content = handle.read()
                response = client.service.DeterminingForm(content)
                return response.array[0] == "GlycoCT"

            # cannot use import suds so use simple checker
            print("using glycoct XML simple checker")
            import xml.etree.cElementTree as ET

            # Parsing now happens inside the outer try block: a file that is
            # not well-formed XML yields False instead of an escaping error.
            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'sugar':
                print("%s %s" % (root.tag, root.attrib))
                return True
            return False
        except Exception:
            # note I am not raising an error rather return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split a single line-oriented input file into several part files.

        Each part is written under a fresh directory obtained by calling
        ``subdir_generator_function()`` and keeps the base name of the input
        file.

        ``split_params`` is a dict with the keys:

        * ``split_mode`` -- ``'number_of_parts'`` (divide the file into
          ``split_size`` parts whose line counts differ by at most one) or
          ``'to_size'`` (every part holds at most ``split_size`` lines).
        * ``split_size`` -- a positive integer (or a string that parses as one).

        Nothing is done when ``split_params`` is None, and no part is created
        for an empty input file.

        Raises ``Exception`` for more than one input dataset, an unsupported
        split mode or a non-positive split size.  Errors raised while writing
        the parts are logged and re-raised.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        split_mode = split_params['split_mode']
        if split_mode not in ('number_of_parts', 'to_size'):
            raise Exception('Unsupported split mode %s' % split_mode)

        split_size = int(split_params['split_size'])
        if split_size < 1:
            # A size of zero or less used to end in a ZeroDivisionError or in
            # a loop that never terminated; fail early with a clear message.
            raise Exception('Split size must be a positive integer, got %s' % split_size)

        lines_per_file = None
        chunk_size = None
        if split_mode == 'number_of_parts':
            # Computing the length is expensive: it takes a full pass over the file.
            with open(input_files[0]) as counted:
                length = sum(1 for _ in counted)
            if length == 0:
                # Empty input: there is nothing to distribute.
                return
            parts = min(split_size, length)
            len_each, remainder = divmod(length, parts)
            # The first `remainder` parts carry one extra line, so exactly
            # `parts` files are produced.
            lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)
        else:
            chunk_size = split_size

        part_file = None
        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            this_chunk_size = chunk_size
            while not file_done:
                if lines_per_file is not None and chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                # Once the planned chunks are used up the last chunk size is
                # reused, so lines beyond the counted length still get written.
                lines_remaining = this_chunk_size
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        # Create the part lazily so that no empty part is left behind.
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
                    part_file = None
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
        finally:
            f.close()
            if part_file is not None:
                part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
722
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
723
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
724 class glydeii(data.Data):
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
725 file_ext = 'glydeii'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
726 line_class = 'line'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
727
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
728 """Add metadata elements"""
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
729 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
730 visible=False, no_value=0)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
731
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
732 def write_from_stream(self, dataset, stream):
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
733 """Writes data from a stream"""
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
734 # write it twice for now
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
735 fd, temp_name = tempfile.mkstemp()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
736 while 1:
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
737 chunk = stream.read(1048576)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
738 if not chunk:
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
739 break
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
740 os.write(fd, chunk)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
741 os.close(fd)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
742 # rewrite the file with unix newlines
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
743 fp = open(dataset.file_name, 'wt')
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
744 for line in file(temp_name, "U"):
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
745 line = line.strip() + '\n'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
746 fp.write(line)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
747 fp.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
748
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_raw_data(self, dataset, data):
    """
    Save ``data`` to the dataset's file with normalised line endings.

    The raw data is first staged in a temporary file and then copied, line
    by line, to ``dataset.file_name``. Each line is stripped of leading and
    trailing whitespace and terminated with a single ``\\n``.

    :param dataset: object whose ``file_name`` attribute is the destination path
    :param data: raw content handed to ``os.write`` (a byte string)
    """
    fd, temp_name = tempfile.mkstemp()
    try:
        try:
            os.write(fd, data)
        finally:
            # Release the descriptor even when the write itself fails.
            os.close(fd)
        # Rewrite the file with unix newlines. Reading in binary mode and
        # using splitlines() recognises \n, \r\n and bare \r without the
        # Python 2-only file() builtin or the "U" open mode.
        with open(temp_name, 'rb') as staged:
            with open(dataset.file_name, 'wb') as target:
                for raw_line in staged:
                    for line in raw_line.splitlines():
                        target.write(line.strip() + b'\n')
    finally:
        # The staging file must not outlive the call, whatever happened above.
        os.remove(temp_name)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
761
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def get_mime(self):
    """Return the MIME type string for this datatype (always ``text/xml``)."""
    mime_type = 'text/xml'
    return mime_type
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
765
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_meta(self, dataset, **kwd):
    """
    Record the dataset's data-line count in its metadata.

    The count is obtained from ``count_data_lines`` and stored in
    ``dataset.metadata.data_lines``. Extra keyword arguments are accepted
    but not used.
    """
    line_total = self.count_data_lines(dataset)
    dataset.metadata.data_lines = line_total
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
771
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def estimate_file_lines(self, dataset):
    """
    Perform a rough estimate by extrapolating number of lines from a small read.

    Up to ``sample_size`` bytes are read from the start of the file, the
    newlines in that sample are counted, and the count is scaled by the
    ratio of ``dataset.get_size()`` to the number of bytes actually read.

    :param dataset: object providing ``file_name`` and ``get_size()``
    :returns: estimated number of lines as an int (0 for an empty file)
    """
    sample_size = 1048576
    # Binary mode so the sample length is a byte count.
    # NOTE(review): assumes get_size() also reports bytes - confirm.
    with open(dataset.file_name, 'rb') as dataset_fh:
        sample = dataset_fh.read(sample_size)
    if not sample:
        return 0
    sample_lines = sample.count(b'\n')
    # Scale by what was really read, not by the requested sample size:
    # a file shorter than the sample would otherwise be under-estimated.
    return int(sample_lines * (float(dataset.get_size()) / float(len(sample))))
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
783
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.

    A line counts as data when, after stripping surrounding whitespace, it
    is non-empty and does not start with ``#``.

    :param dataset: object whose ``file_name`` attribute is the file to scan
    :returns: number of data lines as an int
    """
    data_lines = 0
    # open() in a with-block replaces the Python 2-only file() builtin and
    # guarantees the handle is closed; bytes avoid any decoding step.
    with open(dataset.file_name, 'rb') as handle:
        for line in handle:
            line = line.strip()
            if line and not line.startswith(b'#'):
                data_lines += 1
    return data_lines
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
795
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
    """
    Set the dataset's peek and blurb.

    For a purged dataset both are set to fixed messages. Otherwise the peek
    comes from ``get_file_peek`` and the blurb states a line count taken,
    in order of preference, from ``line_count``, from the stored
    ``data_lines`` metadata, from a full recount (files up to 1048576 in
    size) or from an estimate (larger files, shown with a ``~`` prefix).
    """
    if dataset.dataset.purged:
        dataset.peek = 'file does not exist'
        dataset.blurb = 'file purged from disk'
        return

    # The file must exist on disk for the get_file_peek() method
    dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                 skipchars=skipchars)

    if line_count is not None:
        # Caller supplied the count explicitly.
        shown = util.commaify(str(line_count))
        dataset.blurb = "%s %s" % (shown, inflector.cond_plural(line_count, self.line_class))
        return

    # See if line_count is stored in the metadata
    if dataset.metadata.data_lines:
        stored = dataset.metadata.data_lines
        shown = util.commaify(str(stored))
        dataset.blurb = "%s %s" % (shown, inflector.cond_plural(stored, self.line_class))
        return

    # Number of lines is not known ( this should not happen ), and auto-detect is
    # needed to set metadata
    # This can happen when the file is larger than max_optional_metadata_filesize.
    if int(dataset.get_size()) <= 1048576:
        # Small dataset, recount all lines and store the result in the metadata.
        counted = self.count_data_lines(dataset)
        dataset.metadata.data_lines = counted
        shown = util.commaify(str(counted))
        dataset.blurb = "%s %s" % (shown, inflector.cond_plural(counted, self.line_class))
    else:
        estimate = self.estimate_file_lines(dataset)
        shown = util.commaify(util.roundify(str(estimate)))
        dataset.blurb = "~%s %s" % (shown, inflector.cond_plural(estimate, self.line_class))
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
828
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def sniff(self, filename):
    """
    Decide whether ``filename`` is a GlydeII XML file.

    When the ``suds`` package can be imported, the file content is passed to
    the RINGS form determination web service and the first entry of the
    response is compared with ``"GLYDEII"``. Otherwise a simple local check
    parses the file as XML and compares the root tag with ``GlydeII``.

    :param filename: path of the file to inspect
    :returns: True when the file is recognised, False otherwise. Any error
        is printed to stdout and reported as False so that another sniffer
        can try to type the data.
    """
    try:
        try:
            from suds.client import Client
        except ImportError:
            Client = None

        if Client is not None:
            # NOTE(review): the whole file content is sent to an external
            # service over plain HTTP - confirm this is acceptable.
            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            with open(filename, 'r') as handle:
                content = handle.read()
            response = client.service.DeterminingForm(content)
            if response.array[0] == "GLYDEII":
                return True
            return False

        # cannot use import suds so use simple checker
        # This runs inside the outer try so that a parse failure returns
        # False instead of escaping from the sniffer.
        print("using GlydeII simple checker")
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            # cElementTree is absent on newer Python 3 releases.
            import xml.etree.ElementTree as ET

        root = ET.parse(filename).getroot()
        if root.tag == 'GlydeII':
            print(root.tag)
            return True
        return False
    except Exception:
        # note I am not raising an error rather return False and let another sniffer try to type this data
        traceback.print_exc(file=sys.stdout)
        return False
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
857
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def split(cls, input_datasets, subdir_generator_function, split_params):
    """
    Split the input files by line.

    Each part is written to a file carrying the input file's base name
    inside a directory returned by ``subdir_generator_function()``; a
    directory is only requested once there is a line to put in it.

    :param input_datasets: sequence holding the single dataset to split;
        its ``file_name`` attribute is the file that is read
    :param subdir_generator_function: callable returning a directory path
        for the next part
    :param split_params: None (nothing is done) or a mapping with
        ``split_mode`` (``'number_of_parts'`` or ``'to_size'``) and
        ``split_size`` (number of parts, or lines per part, respectively)
    :raises Exception: for more than one input dataset, an unsupported
        split mode or a non-positive split size
    """
    if split_params is None:
        return

    if len(input_datasets) > 1:
        raise Exception("Text file splitting does not support multiple files")
    input_files = [ds.file_name for ds in input_datasets]

    lines_per_file = None
    chunk_size = None
    if split_params['split_mode'] == 'number_of_parts':
        parts = int(split_params['split_size'])
        if parts <= 0:
            # Zero would divide by zero below; a negative value never terminates.
            raise Exception('Split size must be a positive integer, got %s' % parts)
        # Computing the length is expensive!
        with open(input_files[0]) as counted:
            length = sum(1 for _ in counted)
        if length == 0:
            # An empty input produces no part files.
            return
        parts = min(parts, length)
        len_each, remainder = divmod(length, parts)
        # The first `remainder` parts take one extra line each, so exactly
        # `parts` files are produced and together they cover every line.
        lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)
    elif split_params['split_mode'] == 'to_size':
        chunk_size = int(split_params['split_size'])
        if chunk_size <= 0:
            # A part of zero lines would never consume the input.
            raise Exception('Split size must be a positive integer, got %s' % chunk_size)
    else:
        raise Exception('Unsupported split mode %s' % split_params['split_mode'])

    f = open(input_files[0], 'rt')
    part_file = None
    try:
        chunk_idx = 0
        file_done = False
        while not file_done:
            if lines_per_file is None:
                this_chunk_size = chunk_size
            elif chunk_idx < len(lines_per_file):
                this_chunk_size = lines_per_file[chunk_idx]
                chunk_idx += 1
            # Once the planned sizes are used up the previous size is kept;
            # the next read then hits end-of-file and ends the loop.
            lines_remaining = this_chunk_size
            part_file = None
            while lines_remaining > 0:
                a_line = f.readline()
                if a_line == '':
                    file_done = True
                    break
                if part_file is None:
                    part_dir = subdir_generator_function()
                    part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                    part_file = open(part_path, 'w')
                part_file.write(a_line)
                lines_remaining -= 1
            if part_file is not None:
                part_file.close()
    except Exception as e:
        log.error('Unable to split files: %s' % str(e))
        raise
    finally:
        # Close both files on every exit path; closing twice is harmless.
        f.close()
        if part_file is not None:
            part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
932
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
933 split = classmethod(split)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
934
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
935
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
class linucs(data.Data):
    """Datatype for LINUCS files (file extension ``linucs``)."""
    file_ext = 'linucs'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """
        Write the content of ``stream`` to the dataset's file.

        The stream is staged in a temporary file in chunks of 1048576, then
        copied to ``dataset.file_name`` with every line stripped of
        surrounding whitespace and terminated by a single ``\\n``.

        :param dataset: object whose ``file_name`` attribute is the destination path
        :param stream: object with a ``read(size)`` method returning byte strings
        """
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                while True:
                    chunk = stream.read(1048576)
                    if not chunk:
                        break
                    os.write(fd, chunk)
            finally:
                # Release the descriptor even when reading or writing fails.
                os.close(fd)
            # rewrite the file with unix newlines
            with open(temp_name, 'rb') as staged:
                with open(dataset.file_name, 'wb') as target:
                    for raw_line in staged:
                        # splitlines() also breaks on bare \r and on \r\n.
                        for line in raw_line.splitlines():
                            target.write(line.strip() + b'\n')
        finally:
            # The staging file was previously left behind; always remove it.
            os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """
        Save ``data`` to the dataset's file with normalised line endings.

        :param dataset: object whose ``file_name`` attribute is the destination path
        :param data: raw content handed to ``os.write`` (a byte string)
        """
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                os.write(fd, data)
            finally:
                os.close(fd)
            # rewrite the file with unix newlines
            with open(temp_name, 'rb') as staged:
                with open(dataset.file_name, 'wb') as target:
                    for raw_line in staged:
                        for line in raw_line.splitlines():
                            target.write(line.strip() + b'\n')
        finally:
            # Remove the staging file on failure as well as on success.
            os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.

        :param dataset: object providing ``file_name`` and ``get_size()``
        :returns: estimated number of lines as an int (0 for an empty file)
        """
        sample_size = 1048576
        # NOTE(review): assumes get_size() reports bytes - confirm.
        with open(dataset.file_name, 'rb') as dataset_fh:
            sample = dataset_fh.read(sample_size)
        if not sample:
            return 0
        sample_lines = sample.count(b'\n')
        # Scale by the bytes really read so that files shorter than the
        # sample size are not under-estimated.
        return int(sample_lines * (float(dataset.get_size()) / float(len(sample))))

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        with open(dataset.file_name, 'rb') as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith(b'#'):
                    data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the dataset's peek and blurb.

        The blurb states a line count taken from ``line_count`` when given,
        else from the stored ``data_lines`` metadata, else from a recount
        (files up to 1048576 in size) or an estimate (larger files).
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # The file must exist on disk for the get_file_peek() method
        dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                     skipchars=skipchars)
        if line_count is not None:
            dataset.blurb = "%s %s" % (
                util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        elif dataset.metadata.data_lines:
            # line_count is stored in the metadata
            dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                       inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
        elif int(dataset.get_size()) <= 1048576:
            # Number of lines is not known ( this should not happen ), and auto-detect is
            # needed to set metadata
            # This can happen when the file is larger than max_optional_metadata_filesize.
            # Small dataset, recount all lines and store them in the metadata.
            lc = self.count_data_lines(dataset)
            dataset.metadata.data_lines = lc
            dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
        else:
            est_lines = self.estimate_file_lines(dataset)
            dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                        inflector.cond_plural(est_lines, self.line_class))

    def sniff(self, filename):
        """
        Decide whether ``filename`` is a LINUCS file.

        When the ``suds`` package can be imported, the file content is
        passed to the RINGS form determination web service and the first
        entry of the response is compared with ``"LINUCS"``. Otherwise a
        simple local check requires ``[``, ``]``, ``{`` and ``}`` to all
        occur in the first line.

        :returns: True when recognised, False otherwise; any error is
            printed to stdout and reported as False.
        """
        try:
            try:
                from suds.client import Client
            except ImportError:
                Client = None

            if Client is not None:
                # NOTE(review): the whole file content is sent to an external
                # service over plain HTTP - confirm this is acceptable.
                url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
                client = Client(url)
                with open(filename, 'r') as handle:
                    content = handle.read()
                response = client.service.DeterminingForm(content)
                if response.array[0] == "LINUCS":
                    return True
                return False

            # cannot use import suds so use simple checker
            # Kept inside the outer try so that a read error returns False
            # instead of escaping from the sniffer.
            print("using LINUCS simple checker")
            with open(filename, "r") as handle:
                firstline = handle.readline()
            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
                return True
            return False
        except Exception:
            # note I am not raising an error rather return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.

        :param input_datasets: sequence holding the single dataset to split
        :param subdir_generator_function: callable returning a directory
            path for the next part
        :param split_params: None (nothing is done) or a mapping with
            ``split_mode`` (``'number_of_parts'`` or ``'to_size'``) and
            ``split_size``
        :raises Exception: for more than one input dataset, an unsupported
            split mode or a non-positive split size
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            parts = int(split_params['split_size'])
            if parts <= 0:
                # Zero would divide by zero below; a negative value never terminates.
                raise Exception('Split size must be a positive integer, got %s' % parts)
            # Computing the length is expensive!
            with open(input_files[0]) as counted:
                length = sum(1 for _ in counted)
            if length == 0:
                # An empty input produces no part files.
                return
            parts = min(parts, length)
            len_each, remainder = divmod(length, parts)
            # The first `remainder` parts take one extra line each, so
            # exactly `parts` files are produced.
            lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
            if chunk_size <= 0:
                # A part of zero lines would never consume the input.
                raise Exception('Split size must be a positive integer, got %s' % chunk_size)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        part_file = None
        try:
            chunk_idx = 0
            file_done = False
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
        finally:
            # Close both files on every exit path; closing twice is harmless.
            f.close()
            if part_file is not None:
                part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1146
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1147
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
class iupac(data.Data):
    """Galaxy datatype for glycan structures written in IUPAC notation.

    Datasets are plain text, one record per line; the number of data lines
    is kept in the ``data_lines`` metadata element.
    """
    file_ext = 'iupac'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Write the content of ``stream`` to ``dataset.file_name`` with unix newlines.

        The stream is first spooled to a temporary file in 1 MiB chunks, then
        copied line by line: each line is stripped of surrounding whitespace
        and terminated with a single newline. The temporary file is always
        removed, even when reading or writing fails.
        """
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                while True:
                    chunk = stream.read(1048576)
                    if not chunk:
                        break
                    os.write(fd, chunk)
            finally:
                os.close(fd)
            # rewrite the file with unix newlines; binary mode plus
            # splitlines() recognises \n, \r\n and \r like the old "U" mode
            # did, without relying on a flag newer Pythons no longer accept
            with open(temp_name, 'rb') as spooled:
                with open(dataset.file_name, 'wb') as fp:
                    for raw in spooled:
                        for piece in raw.splitlines():
                            fp.write(piece.strip() + b'\n')
        finally:
            os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Save the byte string ``data`` to ``dataset.file_name`` with unix newlines.

        Every line (ended by \\n, \\r\\n or \\r) is stripped of surrounding
        whitespace and written back terminated by a single newline.
        """
        # data is already in memory, so no temporary copy is needed
        with open(dataset.file_name, 'wb') as fp:
            for piece in data.splitlines():
                fp.write(piece.strip() + b'\n')

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.

        Newlines are counted in the first 1048576 characters and scaled by
        ``dataset.get_size()`` divided by that sample size.
        """
        sample_size = 1048576
        with open(dataset.file_name) as dataset_fh:
            dataset_read = dataset_fh.read(sample_size)
        sample_lines = dataset_read.count('\n')
        return int(sample_lines * (float(dataset.get_size()) / float(sample_size)))

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        with open(dataset.file_name) as handle:
            stripped = (line.strip() for line in handle)
            return sum(1 for text in stripped if text and not text.startswith('#'))

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek and the blurb of ``dataset``.

        The blurb shows ``line_count`` when given, otherwise the stored
        ``data_lines`` metadata, otherwise a fresh count (files up to 1 MiB)
        or an estimate prefixed with ``~`` (larger files).
        ``skipchars`` is only forwarded to get_file_peek(), never modified here.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # The file must exist on disk for the get_file_peek() method
        dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                     skipchars=skipchars)
        if line_count is not None:
            dataset.blurb = "%s %s" % (
                util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        elif dataset.metadata.data_lines:
            # line count is stored in the metadata
            dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                       inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
        elif int(dataset.get_size()) <= 1048576:
            # Number of lines is not known ( this should not happen ), e.g. when the
            # file is larger than max_optional_metadata_filesize.
            # Small dataset, recount all lines and store the result.
            lc = self.count_data_lines(dataset)
            dataset.metadata.data_lines = lc
            dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
        else:
            est_lines = self.estimate_file_lines(dataset)
            dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                        inflector.cond_plural(est_lines, self.line_class))

    def sniff(self, filename):
        """Return True when ``filename`` looks like an IUPAC glycan file.

        Uses the RINGS form determination web service when ``suds`` can be
        imported; otherwise falls back to a simple check of the first line
        (round or square brackets present, curly braces absent).
        Never raises: any failure is printed to stdout and reported as False
        so that another sniffer can try to type the data.
        """
        try:
            try:
                from suds.client import Client

                url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
                client = Client(url)
                with open(filename, 'r') as handle:
                    contents = handle.read()
                response = client.service.DeterminingForm(contents)
                if response.array[0] == "IUPAC":
                    return True
                return False
            except ImportError:
                # cannot use import suds so use simple checker
                print("using IUPAC simple checker")
                with open(filename, "r") as handle:
                    firstline = handle.readline()
                has_bracket = any(mark in firstline for mark in "[]()")
                has_brace = "{" in firstline or "}" in firstline
                return has_bracket and not has_brace
        except Exception:
            # note I am not raising an error rather return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.

        ``split_params['split_mode']`` selects the strategy:
        ``'number_of_parts'`` cuts the file into ``split_size`` parts whose
        line counts differ by at most one; ``'to_size'`` writes ``split_size``
        lines per part. Each part is written under a directory obtained from
        ``subdir_generator_function()``, using the input file's base name.
        Does nothing when ``split_params`` is None or the input has no lines.

        Raises Exception for several input datasets, an unsupported split
        mode, or a split size smaller than 1.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        split_mode = split_params['split_mode']
        if split_mode == 'number_of_parts':
            parts = int(split_params['split_size'])
            if parts < 1:
                raise Exception('Invalid split size %s' % split_params['split_size'])
            # Computing the length is expensive!
            with open(input_files[0]) as counted:
                length = sum(1 for _ in counted)
            if length == 0:
                # nothing to distribute, so no part is created
                return
            parts = min(parts, length)
            len_each, remainder = divmod(length, parts)
            # the first `remainder` parts take one extra line each
            lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)
        elif split_mode == 'to_size':
            chunk_size = int(split_params['split_size'])
            if chunk_size < 1:
                # a non-positive size would never consume any line
                raise Exception('Invalid split size %s' % split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        part_file = None
        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            this_chunk_size = chunk_size
            lines_remaining = 0
            for a_line in f:
                if lines_remaining == 0:
                    # current part is full (or none started yet): open the next one
                    if part_file is not None:
                        part_file.close()
                    if lines_per_file is not None and chunk_idx < len(lines_per_file):
                        this_chunk_size = lines_per_file[chunk_idx]
                        chunk_idx += 1
                    lines_remaining = this_chunk_size
                    part_dir = subdir_generator_function()
                    part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                    part_file = open(part_path, 'w')
                part_file.write(a_line)
                lines_remaining -= 1
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
        finally:
            f.close()
            if part_file is not None:
                part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1360
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1361
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1362 class linearcode(data.Data):
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1363 file_ext = 'linearcode'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1364 line_class = 'line'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1365
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1366 """Add metadata elements"""
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1367 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1368 visible=False, no_value=0)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1369
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def write_from_stream(self, dataset, stream):
    """Write the content of ``stream`` to ``dataset.file_name`` with unix newlines.

    The stream is first spooled to a temporary file in 1 MiB chunks, then
    copied line by line: each line is stripped of surrounding whitespace
    and terminated with a single newline. The temporary file is always
    removed, even when reading or writing fails.
    """
    # write it twice for now
    fd, temp_name = tempfile.mkstemp()
    try:
        try:
            while True:
                chunk = stream.read(1048576)
                if not chunk:
                    break
                os.write(fd, chunk)
        finally:
            os.close(fd)
        # rewrite the file with unix newlines; binary mode plus splitlines()
        # recognises \n, \r\n and \r like the old "U" mode did, without
        # relying on a flag newer Pythons no longer accept
        with open(temp_name, 'rb') as spooled:
            with open(dataset.file_name, 'wb') as fp:
                for raw in spooled:
                    for piece in raw.splitlines():
                        fp.write(piece.strip() + b'\n')
    finally:
        os.remove(temp_name)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1386
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_raw_data(self, dataset, data):
    """Save the byte string ``data`` to ``dataset.file_name`` with unix newlines.

    Every line (ended by \\n, \\r\\n or \\r) is stripped of surrounding
    whitespace and written back terminated by a single newline.
    """
    # data is already in memory, so no temporary copy is needed; this also
    # removes the risk of leaving a temp file or an open reader behind
    with open(dataset.file_name, 'wb') as fp:
        for piece in data.splitlines():
            fp.write(piece.strip() + b'\n')
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1399
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def get_mime(self):
    """Report the MIME type served for this datatype: plain text."""
    mime_type = 'text/plain'
    return mime_type
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1403
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_meta(self, dataset, **kwd):
    """
    Store the dataset's data-line count in its ``data_lines`` metadata.

    Extra keyword arguments are accepted and ignored.
    """
    counted = self.count_data_lines(dataset)
    dataset.metadata.data_lines = counted
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1409
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def estimate_file_lines(self, dataset):
    """
    Perform a rough estimate by extrapolating number of lines from a small read.

    Newlines are counted in the first 1048576 characters of
    ``dataset.file_name`` and scaled by ``dataset.get_size()`` divided by
    that sample size. The result is truncated to an int.
    """
    sample_size = 1048576
    # the with-block closes the handle even if the read fails
    with open(dataset.file_name) as dataset_fh:
        dataset_read = dataset_fh.read(sample_size)
    sample_lines = dataset_read.count('\n')
    return int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1421
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.

    A line counts as a comment when, after stripping surrounding
    whitespace, it starts with ``#``.
    """
    # the with-block releases the handle that a bare file() iteration leaked
    with open(dataset.file_name) as handle:
        stripped = (line.strip() for line in handle)
        return sum(1 for text in stripped if text and not text.startswith('#'))
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1433
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
    """
    Fill in ``dataset.peek`` and ``dataset.blurb``.

    For a purged dataset both are set to fixed messages. Otherwise the peek
    comes from get_file_peek() and the blurb shows, in order of preference:
    the given ``line_count``, the stored ``data_lines`` metadata, a fresh
    count (files up to 1 MiB, also stored in the metadata) or an estimate
    prefixed with ``~`` (larger files).
    """
    if dataset.dataset.purged:
        dataset.peek = 'file does not exist'
        dataset.blurb = 'file purged from disk'
        return

    # get_file_peek() needs the file to be present on disk
    dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                 skipchars=skipchars)

    if line_count is not None:
        shown = util.commaify(str(line_count))
        noun = inflector.cond_plural(line_count, self.line_class)
        dataset.blurb = "%s %s" % (shown, noun)
    elif dataset.metadata.data_lines:
        # the count is already stored in the metadata
        shown = util.commaify(str(dataset.metadata.data_lines))
        noun = inflector.cond_plural(dataset.metadata.data_lines, self.line_class)
        dataset.blurb = "%s %s" % (shown, noun)
    elif int(dataset.get_size()) <= 1048576:
        # Count unknown (can happen when the file is larger than
        # max_optional_metadata_filesize): small file, so recount and store it.
        recounted = self.count_data_lines(dataset)
        dataset.metadata.data_lines = recounted
        shown = util.commaify(str(recounted))
        noun = inflector.cond_plural(recounted, self.line_class)
        dataset.blurb = "%s %s" % (shown, noun)
    else:
        # large file: extrapolate instead of reading everything
        approx = self.estimate_file_lines(dataset)
        shown = util.commaify(util.roundify(str(approx)))
        noun = inflector.cond_plural(approx, self.line_class)
        dataset.blurb = "~%s %s" % (shown, noun)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1466
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def sniff(self, filename):
    """Return True when the RINGS service identifies ``filename`` as LinearCode.

    The file content is sent to the RINGS form determination web service
    through ``suds``. There is no local fallback: when ``suds`` cannot be
    imported the result is False. Never raises: any other failure is
    printed to stdout and reported as False so that another sniffer can
    try to type the data.
    """
    try:
        from suds.client import Client

        url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
        client = Client(url)
        # read through a with-block so the handle is closed right away
        with open(filename, 'r') as handle:
            contents = handle.read()
        lcresponse = client.service.DeterminingForm(contents)
        if lcresponse.array[0] == "LinearCode":
            print("LinearCode")
            return True
        print("Unable to guess format")
        return False
    except ImportError:
        # cannot use import suds so use simple checker
        print("using LinearCode simple checker - nope it does not exist yet")
        return False
    except Exception:
        # note I am not raising an error rather return False and let another sniffer try to type this data
        traceback.print_exc(file=sys.stdout)
        return False
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1489
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
@classmethod
def split(cls, input_datasets, subdir_generator_function, split_params):
    """
    Split one line-oriented input file into consecutive part files.

    Every part is written into a directory obtained by calling
    ``subdir_generator_function()`` and keeps the base name of the input
    file. Lines are copied unchanged and in order.

    ``input_datasets`` may hold at most one object exposing ``file_name``.
    ``split_params`` is either None (nothing is done) or a mapping with
    ``split_mode`` ('number_of_parts' or 'to_size') and ``split_size``
    (the number of parts, or the number of lines per part).

    Raises Exception for several input datasets or an unknown split mode,
    and ValueError when ``split_size`` is smaller than 1.
    """
    if split_params is None:
        return

    if len(input_datasets) > 1:
        raise Exception("Text file splitting does not support multiple files")
    input_files = [ds.file_name for ds in input_datasets]

    split_mode = split_params['split_mode']
    if split_mode not in ('number_of_parts', 'to_size'):
        raise Exception('Unsupported split mode %s' % split_mode)
    split_size = int(split_params['split_size'])
    if split_size < 1:
        # Zero or negative sizes used to end in ZeroDivisionError or an endless loop.
        raise ValueError('split_size must be at least 1, got %d' % split_size)

    # None means "fixed number of lines per part" (to_size mode).
    lines_per_file = None
    if split_mode == 'number_of_parts':
        # Computing the length is expensive!
        with open(input_files[0], 'rb') as counted:
            length = sum(1 for _ in counted)
        parts = min(split_size, length)
        lines_per_file = []
        if parts:
            len_each, remainder = divmod(length, parts)
            # The first `remainder` parts carry one extra line each.
            lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)

    # Binary mode keeps the copied lines byte-for-byte identical.
    f = open(input_files[0], 'rb')
    part_file = None
    try:
        chunk_idx = 0
        file_done = False
        while not file_done:
            if lines_per_file is None:
                lines_remaining = split_size
            elif chunk_idx < len(lines_per_file):
                lines_remaining = lines_per_file[chunk_idx]
                chunk_idx += 1
            else:
                # Every planned part has been written.
                break
            part_file = None
            while lines_remaining > 0:
                a_line = f.readline()
                if a_line == b'':
                    file_done = True
                    break
                if part_file is None:
                    # Create the part lazily so no empty part is produced at end of file.
                    part_dir = subdir_generator_function()
                    part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                    part_file = open(part_path, 'wb')
                part_file.write(a_line)
                lines_remaining -= 1
            if part_file is not None:
                part_file.close()
                part_file = None
    except Exception as e:
        log.error('Unable to split files: %s' % str(e))
        raise
    finally:
        f.close()
        if part_file is not None:
            part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1566
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1567
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
class msa(data.Data):
    """Line-oriented datatype for files whose first line carries a '# .msa' marker."""
    file_ext = 'msa'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """
        Write the content of ``stream`` to ``dataset.file_name``.

        The stream is first spooled to a temporary file, then rewritten with
        surrounding whitespace stripped from every line and unix newlines.
        The temporary file is always removed.
        """
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        try:
            try:
                while True:
                    chunk = stream.read(1048576)
                    if not chunk:
                        break
                    os.write(fd, chunk)
            finally:
                os.close(fd)
            # rewrite the file with unix newlines
            with open(temp_name, 'rb') as spooled, open(dataset.file_name, 'wb') as fp:
                for raw_line in spooled:
                    # splitlines() also breaks on bare '\r', like universal newlines did.
                    for piece in raw_line.splitlines():
                        fp.write(piece.strip() + b'\n')
        finally:
            os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """
        Save ``data`` to ``dataset.file_name``.

        Every line is stripped of surrounding whitespace and terminated with a
        unix newline. Text input is encoded as UTF-8 before writing.
        """
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        # rewrite the data with unix newlines
        with open(dataset.file_name, 'wb') as fp:
            for piece in data.splitlines():
                fp.write(piece.strip() + b'\n')

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.

        The newline density of the first (at most) 1 MiB is scaled up to
        ``dataset.get_size()``. An empty file yields 0.
        """
        sample_size = 1048576
        with open(dataset.file_name, 'rb') as dataset_fh:
            dataset_read = dataset_fh.read(sample_size)
        if not dataset_read:
            return 0
        sample_lines = dataset_read.count(b'\n')
        # Scale by the bytes actually read so files shorter than the sample are not underestimated.
        return int(sample_lines * (float(dataset.get_size()) / len(dataset_read)))

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments (lines starting with '#').
        """
        data_lines = 0
        with open(dataset.file_name, 'rb') as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith(b'#'):
                    data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=None):
        """
        Set ``dataset.peek`` and ``dataset.blurb``.

        The blurb shows ``line_count`` when given, otherwise the stored
        ``data_lines`` metadata, otherwise a fresh count (files up to 1 MiB,
        which also updates the metadata) or an estimate prefixed with '~'.
        Purged datasets get fixed placeholder texts.
        """
        if skipchars is None:
            # Fresh list per call instead of a shared mutable default.
            skipchars = []
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The file must exist on disk for the get_file_peek() method
        dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                     skipchars=skipchars)
        estimated = False
        if line_count is not None:
            shown = line_count
        elif dataset.metadata.data_lines:
            # line_count is stored in the metadata
            shown = dataset.metadata.data_lines
        elif int(dataset.get_size()) <= 1048576:
            # Number of lines is not known ( this should not happen ); this can happen when the
            # file is larger than max_optional_metadata_filesize.
            # Small dataset, recount all lines and store them.
            shown = self.count_data_lines(dataset)
            dataset.metadata.data_lines = shown
        else:
            shown = self.estimate_file_lines(dataset)
            estimated = True
        if estimated:
            dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(shown))),
                                        inflector.cond_plural(shown, self.line_class))
        else:
            dataset.blurb = "%s %s" % (util.commaify(str(shown)),
                                       inflector.cond_plural(shown, self.line_class))

    def sniff(self, filename):
        """
        All msa Files simply put a '# .msa' in the first line.

        Returns False instead of raising when the file cannot be read, so
        that another sniffer may try to type the data.
        """
        try:
            with open(filename, "rb") as f:
                firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
            return b"# .MSA" in firstline
        except Exception:
            traceback.print_exc(file=sys.stdout)
            return False

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split one line-oriented input file into consecutive part files.

        Every part is written into a directory obtained by calling
        ``subdir_generator_function()`` and keeps the base name of the input
        file. Lines are copied unchanged and in order.

        ``input_datasets`` may hold at most one object exposing ``file_name``.
        ``split_params`` is either None (nothing is done) or a mapping with
        ``split_mode`` ('number_of_parts' or 'to_size') and ``split_size``
        (the number of parts, or the number of lines per part).

        Raises Exception for several input datasets or an unknown split mode,
        and ValueError when ``split_size`` is smaller than 1.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        split_mode = split_params['split_mode']
        if split_mode not in ('number_of_parts', 'to_size'):
            raise Exception('Unsupported split mode %s' % split_mode)
        split_size = int(split_params['split_size'])
        if split_size < 1:
            # Zero or negative sizes used to end in ZeroDivisionError or an endless loop.
            raise ValueError('split_size must be at least 1, got %d' % split_size)

        # None means "fixed number of lines per part" (to_size mode).
        lines_per_file = None
        if split_mode == 'number_of_parts':
            # Computing the length is expensive!
            with open(input_files[0], 'rb') as counted:
                length = sum(1 for _ in counted)
            parts = min(split_size, length)
            lines_per_file = []
            if parts:
                len_each, remainder = divmod(length, parts)
                # The first `remainder` parts carry one extra line each.
                lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)

        # Binary mode keeps the copied lines byte-for-byte identical.
        f = open(input_files[0], 'rb')
        part_file = None
        try:
            chunk_idx = 0
            file_done = False
            while not file_done:
                if lines_per_file is None:
                    lines_remaining = split_size
                elif chunk_idx < len(lines_per_file):
                    lines_remaining = lines_per_file[chunk_idx]
                    chunk_idx += 1
                else:
                    # Every planned part has been written.
                    break
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == b'':
                        file_done = True
                        break
                    if part_file is None:
                        # Create the part lazily so no empty part is produced at end of file.
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'wb')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
                    part_file = None
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
        finally:
            f.close()
            if part_file is not None:
                part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1764
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1765
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1766 class wurcs(data.Data):
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1767 file_ext = 'wurcs'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1768 line_class = 'line'
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1769
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1770 """Add metadata elements"""
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1771 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1772 visible=False, no_value=0)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1773
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def write_from_stream(self, dataset, stream):
    """
    Write the content of ``stream`` to ``dataset.file_name``.

    The stream is first spooled to a temporary file, then rewritten with
    surrounding whitespace stripped from every line and unix newlines.
    The temporary file is always removed.
    """
    # write it twice for now
    fd, temp_name = tempfile.mkstemp()
    try:
        try:
            while True:
                chunk = stream.read(1048576)
                if not chunk:
                    break
                os.write(fd, chunk)
        finally:
            os.close(fd)
        # rewrite the file with unix newlines
        with open(temp_name, 'rb') as spooled, open(dataset.file_name, 'wb') as fp:
            for raw_line in spooled:
                # splitlines() also breaks on bare '\r', like universal newlines did.
                for piece in raw_line.splitlines():
                    fp.write(piece.strip() + b'\n')
    finally:
        # The spool file was previously left behind on every call.
        os.remove(temp_name)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1790
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_raw_data(self, dataset, data):
    """
    Save ``data`` to ``dataset.file_name``.

    Every line is stripped of surrounding whitespace and terminated with a
    unix newline. Text input is encoded as UTF-8 before writing.
    """
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    # rewrite the data with unix newlines; splitlines() breaks on '\n', '\r' and '\r\n'
    with open(dataset.file_name, 'wb') as fp:
        for piece in data.splitlines():
            fp.write(piece.strip() + b'\n')
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1803
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def get_mime(self):
    """Return the MIME type used for this datatype."""
    mime_type = 'text/plain'
    return mime_type
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1807
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_meta(self, dataset, **kwd):
    """
    Store the result of ``count_data_lines`` in ``dataset.metadata.data_lines``.
    """
    counted = self.count_data_lines(dataset)
    dataset.metadata.data_lines = counted
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1813
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def estimate_file_lines(self, dataset):
    """
    Perform a rough estimate by extrapolating number of lines from a small read.

    The newline density of the first (at most) 1 MiB is scaled up to
    ``dataset.get_size()``. An empty file yields 0.
    """
    sample_size = 1048576
    with open(dataset.file_name, 'rb') as dataset_fh:
        dataset_read = dataset_fh.read(sample_size)
    if not dataset_read:
        return 0
    sample_lines = dataset_read.count(b'\n')
    # Scale by the bytes actually read so files shorter than the sample are not underestimated.
    est_lines = int(sample_lines * (float(dataset.get_size()) / len(dataset_read)))
    return est_lines
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1825
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments (lines starting with '#').
    """
    data_lines = 0
    # A with-block closes the handle; the old file() call never did.
    with open(dataset.file_name, 'rb') as handle:
        for line in handle:
            line = line.strip()
            if line and not line.startswith(b'#'):
                data_lines += 1
    return data_lines
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1837
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
    """
    Fill in ``dataset.peek`` and ``dataset.blurb``.

    The blurb reports ``line_count`` when given, otherwise the stored
    ``data_lines`` metadata, otherwise a fresh count (files up to 1 MiB,
    which also updates the metadata) or an estimate prefixed with '~'.
    Purged datasets receive fixed placeholder texts.
    """
    if dataset.dataset.purged:
        dataset.peek = 'file does not exist'
        dataset.blurb = 'file purged from disk'
        return

    # get_file_peek() needs the file to be present on disk.
    dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                 skipchars=skipchars)

    estimated = False
    if line_count is not None:
        shown = line_count
    elif dataset.metadata.data_lines:
        # The count is already stored in the metadata.
        shown = dataset.metadata.data_lines
    elif int(dataset.get_size()) <= 1048576:
        # Unknown count on a small dataset: recount and remember the result.
        shown = self.count_data_lines(dataset)
        dataset.metadata.data_lines = shown
    else:
        # Unknown count on a large dataset: settle for an estimate.
        shown = self.estimate_file_lines(dataset)
        estimated = True

    noun = inflector.cond_plural(shown, self.line_class)
    if estimated:
        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(shown))), noun)
    else:
        dataset.blurb = "%s %s" % (util.commaify(str(shown)), noun)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1870
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def sniff(self, filename):
    """
    Return True when the first line of ``filename`` contains 'WURCS' (case-insensitive).

    All WURCS Files start with WURCS= then the version number. see http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/
    WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1

    Returns False instead of raising when the file cannot be read, so that
    another sniffer may try to type the data.
    """
    try:
        with open(filename, "rb") as f:
            firstline = f.readline().upper()  # note we are uppercasing here to avoid CasE SenSitIVity
        return b"WURCS" in firstline
    except Exception:
        # Do not swallow KeyboardInterrupt/SystemExit as the bare except did.
        traceback.print_exc(file=sys.stdout)
        return False
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1885
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1886
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
def split(cls, input_datasets, subdir_generator_function, split_params):
    """
    Split the input files by line.

    input_datasets -- sequence of objects exposing ``file_name``; at most one
        dataset is supported.
    subdir_generator_function -- called with no arguments each time a new
        part is started; must return the directory the part is written to.
        The part keeps the base name of the input file.
    split_params -- mapping with ``split_mode`` and ``split_size``, or None
        to do nothing.  Mode ``number_of_parts`` spreads the lines over
        ``split_size`` parts as evenly as possible (never more parts than
        lines); mode ``to_size`` writes ``split_size`` lines per part.

    Raises Exception for more than one dataset, an unsupported split mode or
    a ``split_size`` that is not a positive integer.  Errors raised while
    the parts are being written are logged through the module-level ``log``
    and re-raised.
    """
    if split_params is None:
        return

    if len(input_datasets) > 1:
        raise Exception("Text file splitting does not support multiple files")
    input_files = [ds.file_name for ds in input_datasets]

    split_mode = split_params['split_mode']
    if split_mode not in ('number_of_parts', 'to_size'):
        raise Exception('Unsupported split mode %s' % split_mode)

    split_size = int(split_params['split_size'])
    if split_size <= 0:
        # A zero or negative size used to loop forever ('to_size') or divide
        # by zero / grow without bound ('number_of_parts').
        raise Exception('Split size must be a positive integer, got %s' % split_size)

    lines_per_file = None
    chunk_size = None
    if split_mode == 'number_of_parts':
        # Computing the length is expensive!
        def _file_len(fname):
            with open(fname) as handle:
                return sum(1 for _ in handle)

        length = _file_len(input_files[0])
        if length == 0:
            # Nothing to distribute; no part files are created.
            return
        parts = min(split_size, length)
        len_each, remainder = divmod(length, parts)
        # The first ``remainder`` parts take one extra line so that exactly
        # ``parts`` parts are produced.
        lines_per_file = [len_each + 1] * remainder + [len_each] * (parts - remainder)
    else:
        chunk_size = split_size

    f = open(input_files[0], 'rt')
    part_file = None
    try:
        chunk_idx = 0
        file_done = False
        while not file_done:
            if lines_per_file is None:
                this_chunk_size = chunk_size
            elif chunk_idx < len(lines_per_file):
                this_chunk_size = lines_per_file[chunk_idx]
                chunk_idx += 1
            # Otherwise the planned sizes are used up: keep the previous size
            # for any lines that are still left in the input.
            lines_remaining = this_chunk_size
            part_file = None
            while lines_remaining > 0:
                a_line = f.readline()
                if a_line == '':
                    file_done = True
                    break
                if part_file is None:
                    # Open the part lazily so no empty part is created at EOF.
                    part_dir = subdir_generator_function()
                    part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                    part_file = open(part_path, 'w')
                part_file.write(a_line)
                lines_remaining -= 1
            if part_file is not None:
                part_file.close()
                part_file = None
    except Exception as e:
        log.error('Unable to split files: %s', e)
        raise
    finally:
        f.close()
        if part_file is not None:
            part_file.close()
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1961
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1962 split = classmethod(split)
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1963
0e941a69a6fa Uploaded
chrisb
parents:
diff changeset
1964