datatypes/glycan.py @ 0:0e941a69a6fa (draft, default, tip)
Uploaded
author: chrisb
date: Wed, 23 Mar 2016 14:34:50 -0400
__license__ = "MIT"

import logging
from galaxy.datatypes import metadata
import mimetypes
import os
import shutil
import sys
import traceback
import tempfile
import zipfile
from cgi import escape
from inspect import isclass
import galaxy.util as util
from galaxy.datatypes import data
from galaxy.datatypes.metadata import \
    MetadataElement  # import directly to maintain ease of use in Datatype class definitions
from galaxy.util import inflector
from galaxy.util.bunch import Bunch
from galaxy.util.odict import odict
from galaxy.util.sanitize_html import sanitize_html

from galaxy.datatypes import dataproviders

from galaxy import eggs

eggs.require("Paste")
import paste

# module-level logger; split() below reports errors through it
log = logging.getLogger(__name__)


class kcf(data.Data):
    file_ext = 'kcf'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All KCF files simply put 'ENTRY' on their first line.
        This applies to every possible KCF file, so also check
        for 'Glycan' to confirm it is a glycan."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if kcfresponse.array[0] == "KCF":
                return True
            else:
                return False
        except ImportError:
            # cannot import suds, so fall back to the simple checker
            print "using KCF simple checker"
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            f.close()

            if "ENTRY" in firstline and "GLYCAN" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

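    # Illustrative sample (hypothetical, not shipped with this repository): a
    # minimal KEGG-style KCF record that the fallback checker above accepts,
    # because its uppercased first line contains both "ENTRY" and "GLYCAN":
    #
    #     ENTRY     G00047                      Glycan
    #     NODE      2
    #               1   Glc   15.0   7.0
    #               2   Glc    7.0   7.0
    #     EDGE      1
    #               1   2:a1    1:4
    #     ///
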
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class glycoct(data.Data):
    file_ext = 'glycoct'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All glycoct_condensed files put 'RES' on their first line and a 'LIN' section later."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            lines = f.read()
            f.close()

            if "RES" in firstline and "LIN" in lines:
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

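    # Illustrative sample (hypothetical): a GlycoCT condensed snippet the
    # checker above accepts, since the first line is "RES" and "LIN" appears
    # further down:
    #
    #     RES
    #     1b:b-dglc-HEX-1:5
    #     2b:b-dgal-HEX-1:5
    #     LIN
    #     1:1o(4+1)2d
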
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


# ------------- Utility methods --------------

# nice_size used to be here, but to resolve cyclical dependencies it's been
# moved to galaxy.util.  It belongs there anyway since it's used outside
# datatypes.
nice_size = util.nice_size


def get_test_fname(fname):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join(path, 'test', fname)
    return full_path


def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'

    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open(file_name, "U")
    while count <= LINE_COUNT:
        line = temp.readline(WIDTH)
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in ['gzipped', 'binary']:
            break
        skip_line = False
        for skipchar in skipchars:
            if line.startswith(skipchar):
                skip_line = True
                break
        if not skip_line:
            lines.append(line)
            count += 1
    temp.close()
    if file_type in ['gzipped', 'binary']:
        text = "%s file" % file_type
    else:
        try:
            text = unicode('\n'.join(lines), 'utf-8')
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text

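# Usage sketch for get_file_peek() (illustrative; the path is hypothetical and
# assumes a small plain-text dataset on disk):
#
#     peek = get_file_peek('/tmp/example.kcf', WIDTH=80, LINE_COUNT=3)
#     print peek    # first three lines, each truncated to 80 characters
#
# Gzipped or binary input instead yields the strings 'gzipped file' or
# 'binary file'.
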
class glycoct_xml(data.Data):
    file_ext = 'glycoct_xml'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlycoCT XML files should use the RINGS form determination script."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GlycoCT":
                return True
            else:
                return False
        except ImportError:
            # cannot import suds, so fall back to the simple checker
            print "using glycoct XML simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'sugar':
                print root.tag, root.attrib
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

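    # Illustrative sample (hypothetical): the fallback checker above parses the
    # file as XML and accepts any document whose root element is <sugar>, e.g.
    #
    #     <?xml version="1.0" encoding="UTF-8"?>
    #     <sugar version="1.0">
    #         <residues>...</residues>
    #         <linkages>...</linkages>
    #     </sugar>
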
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class glydeii(data.Data):
    file_ext = 'glydeii'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlydeII XML files should use the RINGS form determination script."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GLYDEII":
                return True
            else:
                return False
        except ImportError:
            # cannot import suds, so fall back to the simple checker
            print "using GlydeII simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'GlydeII':
                print root.tag
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

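    # Illustrative sample (hypothetical): the fallback checker above accepts
    # any XML document whose root element is <GlydeII>, e.g.
    #
    #     <?xml version="1.0" encoding="UTF-8"?>
    #     <GlydeII>
    #         <molecule subtype="glycan" id="example">...</molecule>
    #     </GlydeII>
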
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class linucs(data.Data):
    file_ext = 'linucs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LINUCS files should use the RINGS form determination script."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "LINUCS":
                return True
            else:
                return False
        except ImportError:
            # cannot import suds, so fall back to the simple checker
            print "using LINUCS simple checker"

            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

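    # Illustrative sample (hypothetical): a LINUCS string carries all four of
    # "[", "]", "{" and "}" on one line, which is what the fallback checker
    # tests for, e.g.
    #
    #     [][b-D-Glcp]{[(4+1)][b-D-Glcp]{}}
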
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class iupac(data.Data):
    file_ext = 'iupac'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All IUPAC files should use the RINGS form determination script."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "IUPAC":
                return True
            else:
                return False
        except ImportError:
            # cannot import suds, so fall back to the simple checker
            print "using IUPAC simple checker"
            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
                if "{" in firstline or "}" in firstline:
                    return False
                else:
                    return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

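    # Illustrative sample (hypothetical): an IUPAC condensed string uses "()"
    # and possibly "[]" but never "{}", so the fallback checker accepts a first
    # line such as
    #
    #     Galb1-4GlcNAcb1-2Mana1-3(Galb1-4GlcNAcb1-2Mana1-6)Manb1-4GlcNAcb1-4GlcNAc
    #
    # while a LINUCS-style line containing "{" or "}" is rejected.
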
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class linearcode(data.Data):
    file_ext = 'linearcode'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LinearCode files should use the RINGS form determination script."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            lcresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if lcresponse.array[0] == "LinearCode":
                print "LinearCode"
                return True
            else:
                print "Unable to guess format"
                return False
        except ImportError:
            # cannot import suds, and no simple checker exists for LinearCode yet
            print "using LinearCode simple checker - it does not exist yet"
            return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

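    # Illustrative sample (hypothetical): a LinearCode string such as
    #
    #     GNb2Ma3(GNb2Ma6)Mb4GNb4GN;
    #
    # can only be recognized through the RINGS web service above; the local
    # fallback deliberately returns False.
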
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class msa(data.Data):
    file_ext = 'msa'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All MSA files simply put '# .msa' on their first line."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            f.close()

            if "# .MSA" in firstline:
                return True
            else:
                return False
        except Exception:
            traceback.print_exc(file=sys.stdout)
            return False

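    # Illustrative sample (hypothetical): the checker above accepts any file
    # whose first line contains "# .msa" (case-insensitively), e.g.
    #
    #     # .msa
    #     <alignment rows follow>
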
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class wurcs(data.Data):
    file_ext = 'wurcs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating the number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set
                    # metadata.  This can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All WURCS files start with 'WURCS=' followed by the version number.
        See http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/
        WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1"""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            f.close()
            if "WURCS" in firstline:
                return True
            else:
                return False
        except Exception:
            traceback.print_exc(file=sys.stdout)
            return False

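    # Illustrative note: a single-line file holding the WURCS 2.0 string shown
    # in the docstring above is accepted, because "WURCS" appears in its
    # uppercased first line.
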
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


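# Minimal local smoke test (an illustrative sketch, not part of the Galaxy
# datatype API).  It exercises only the offline branches of the sniffers on
# hypothetical sample lines; if suds is installed, kcf.sniff() would instead
# query the RINGS web service.
if __name__ == '__main__':
    samples = [
        (kcf(), "ENTRY     G00047                      Glycan\n"),
        (glycoct(), "RES\n1b:b-dglc-HEX-1:5\nLIN\n"),
        (msa(), "# .msa\n"),
        (wurcs(), "WURCS=2.0/4,3/...\n"),
    ]
    for datatype, text in samples:
        fd, name = tempfile.mkstemp()
        os.write(fd, text)
        os.close(fd)
        print datatype.file_ext, datatype.sniff(name)
        os.remove(name)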