__license__ = "MIT"

import logging
from galaxy.datatypes import metadata
import mimetypes
import os
import shutil
import sys
import traceback
import tempfile
import zipfile
from cgi import escape
from inspect import isclass
import galaxy.util as util
from galaxy.datatypes import data
from galaxy.datatypes.metadata import \
    MetadataElement  # import directly to maintain ease of use in Datatype class definitions
from galaxy.util import inflector
from galaxy.util.bunch import Bunch
from galaxy.util.odict import odict
from galaxy.util.sanitize_html import sanitize_html

from galaxy.datatypes import dataproviders

from galaxy import eggs

eggs.require("Paste")
import paste

log = logging.getLogger(__name__)


class kcf(data.Data):
    file_ext = 'kcf'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # clean up the temporary copy

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines
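
    # Worked example of the extrapolation above (illustrative numbers, not from
    # the original source): a 5 MB dataset whose first 1 MB sample contains
    # 2,000 newlines is estimated at int(2000 * (5242880.0 / 1048576.0)) =
    # 10,000 lines.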

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and
                    # auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All KCF files put 'ENTRY' on their first line. This applies to all
        possible KCF variants; here we also check for 'Glycan' to confirm the
        entry is a glycan."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if kcfresponse.array[0] == "KCF":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using KCF simple checker"
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            f.close()

            if "ENTRY" in firstline and "GLYCAN" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


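# Minimal usage sketch (not part of the original module; 'example.kcf' is a
# hypothetical file name). Sniffers are normally driven by Galaxy's upload
# machinery, but they can be exercised directly:
#
#     datatype = kcf()
#     datatype.sniff('example.kcf')  # True for KCF glycan files
#
# When suds is installed, sniff() round-trips the file contents through the
# RINGS DeterminingForm web service; otherwise it falls back to the
# first-line 'ENTRY' / 'Glycan' check.

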
class glycoct(data.Data):
    file_ext = 'glycoct'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # clean up the temporary copy

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and
                    # auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All glycoct_condensed files put 'RES' on their first line and a
        'LIN' section later."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            lines = f.read()
            f.close()

            if "RES" in firstline and "LIN" in lines:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)

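# Illustrative split_params values for the split() classmethods above, as
# implied by the code (the exact dictionaries Galaxy passes depend on its job
# splitter configuration):
#
#     {'split_mode': 'number_of_parts', 'split_size': 4}   # four ~equal parts
#     {'split_mode': 'to_size', 'split_size': 1000}        # 1000-line chunks
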
# ------------- Utility methods --------------

# nice_size used to be here, but to resolve cyclical dependencies it's been
# moved to galaxy.util. It belongs there anyway since it's used outside
# datatypes.
nice_size = util.nice_size


def get_test_fname(fname):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join(path, 'test', fname)
    return full_path


def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'

    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF. Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open(file_name, "U")
    while count <= LINE_COUNT:
        line = temp.readline(WIDTH)
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in ['gzipped', 'binary']:
            break
        skip_line = False
        for skipchar in skipchars:
            if line.startswith(skipchar):
                skip_line = True
                break
        if not skip_line:
            lines.append(line)
            count += 1
    temp.close()
    if file_type in ['gzipped', 'binary']:
        text = "%s file" % file_type
    else:
        try:
            text = unicode('\n'.join(lines), 'utf-8')
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text

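# Minimal sketch of get_file_peek() (assumes 'example.txt' is a small
# plain-text file): the returned peek is the first few lines of the file, or
# the strings 'gzipped file' / 'binary file' when gzip magic bytes or
# high-bit characters are detected.
#
#     peek = get_file_peek('example.txt', WIDTH=80, LINE_COUNT=3)

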
class glycoct_xml(data.Data):
    file_ext = 'glycoct_xml'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # clean up the temporary copy

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and
                    # auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlycoCT XML files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GlycoCT":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using glycoct XML simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'sugar':
                print root.tag, root.attrib
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


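# Illustrative fallback behaviour for the XML sniffer above (hypothetical
# file name): without suds, glycoct_xml.sniff() parses the file with
# cElementTree and accepts it only when the document root is <sugar>:
#
#     glycoct_xml().sniff('example_glycoct.xml')

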
class glydeii(data.Data):
    file_ext = 'glydeii'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # clean up the temporary copy

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and
                    # auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlydeII XML files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GLYDEII":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using GlydeII simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'GlydeII':
                print root.tag
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class linucs(data.Data):
    file_ext = 'linucs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # clean up the temporary copy

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and
                    # auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LINUCS files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "LINUCS":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using LINUCS simple checker"

            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class iupac(data.Data):
    file_ext = 'iupac'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # clean up the temporary copy

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and
                    # auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All IUPAC files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "IUPAC":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using IUPAC simple checker"
            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
                if "{" in firstline or "}" in firstline:
                    return False
                else:
                    return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


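# Note on the two first-line heuristics above: linucs.sniff() requires '[',
# ']', '{', and '}' all on the first line, while iupac.sniff() accepts '[',
# ']', '(' or ')' but rejects any first line containing '{' or '}', so the
# two fallback checkers never both match the same file.

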
class linearcode(data.Data):
    file_ext = 'linearcode'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # clean up the temporary copy

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and
                    # auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LinearCode files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            lcresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if lcresponse.array[0] == "LinearCode":
                print "LinearCode"
                return True
            else:
                print "Unable to guess format"
                return False
        except ImportError:
            # suds is not available, and no simple checker exists yet
            print "using LinearCode simple checker - nope it does not exist yet"
            return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


1568 class msa(data.Data):
|
|
1569 file_ext = 'msa'
|
|
1570 line_class = 'line'
|
|
1571
|
|
1572 """Add metadata elements"""
|
|
1573 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
|
|
1574 visible=False, no_value=0)
|
|
1575
|
|
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
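        # Illustrative arithmetic (not from the original source): a 1 MiB
        # sample containing 5,000 newlines on a 10 MiB file extrapolates to
        # int(5000 * (10485760.0 / 1048576.0)) = 50,000 estimated lines.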
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The line count is not stored in the metadata, so auto-detect
                    # it here. This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
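    # Blurb examples (illustrative only): when metadata records 1234 data
    # lines the blurb reads "1,234 lines"; for large files the count is
    # estimated and rounded, yielding something like "~1,200 lines".
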
    def sniff(self, filename):
        """All msa files simply put a '# .msa' in the first line."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # upper-case the line so the check is case-insensitive
            f.close()

            if "# .MSA" in firstline:
                return True
            else:
                return False
        except:
            traceback.print_exc(file=sys.stdout)
            return False
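    # Illustrative behaviour (hypothetical first lines, not from the original
    # source): a file beginning with "# .msa produced by some aligner" sniffs
    # True, while a FASTA header such as ">seq1" sniffs False.
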
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class wurcs(data.Data):
    file_ext = 'wurcs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The line count is not stored in the metadata, so auto-detect
                    # it here. This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All WURCS files start with 'WURCS=' followed by the version number.
        See http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/

        Example first line:
        WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1
        """
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # upper-case the line so the check is case-insensitive
            f.close()
            if "WURCS" in firstline:
                return True
            else:
                return False
        except:
            traceback.print_exc(file=sys.stdout)
            return False


    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)