comparison molecules.py @ 0:85eca06eefc6 draft default tip

Uploaded
author bgruening
date Thu, 15 Aug 2013 03:19:26 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:85eca06eefc6
1 # -*- coding: utf-8 -*-
2
3 from galaxy.datatypes import data
4 import logging
5 from galaxy.datatypes.sniff import get_headers, get_test_fname
6 from galaxy.datatypes.data import get_file_peek
7 from galaxy.datatypes.tabular import Tabular
8 from galaxy.datatypes.binary import Binary
9 from galaxy.datatypes.xml import GenericXml
10 import subprocess
11 import os
12 #import pybel
13 #import openbabel
14 #openbabel.obErrorLog.StopLogging()
15
16 from galaxy.datatypes.metadata import MetadataElement
17 from galaxy.datatypes import metadata
18
19 log = logging.getLogger(__name__)
20
21 def count_special_lines( word, filename, invert = False ):
22 """
23 searching for special 'words' using the grep tool
24 grep is used to speed up the searching and counting
25 The number of hits is returned.
26 """
27 try:
28 cmd = ["grep", "-c"]
29 if invert:
30 cmd.append('-v')
31 cmd.extend([word, filename])
32 out = subprocess.Popen(cmd, stdout=subprocess.PIPE)
33 return int(out.communicate()[0].split()[0])
34 except:
35 pass
36 return 0
37
38 def count_lines( filename, non_empty = False):
39 """
40 counting the number of lines from the 'filename' file
41 """
42 try:
43 if non_empty:
44 out = subprocess.Popen(['grep', '-cve', '^\s*$', filename], stdout=subprocess.PIPE)
45 else:
46 out = subprocess.Popen(['wc', '-l', filename], stdout=subprocess.PIPE)
47 return int(out.communicate()[0].split()[0])
48 except:
49 pass
50 return 0
51
52
53 class GenericMolFile( data.Text ):
54 """
55 abstract class for most of the molecule files
56 """
57 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
58
59 def set_peek( self, dataset, is_multi_byte=False ):
60 if not dataset.dataset.purged:
61 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
62 if (dataset.metadata.number_of_molecules == 1):
63 dataset.blurb = "1 molecule"
64 else:
65 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
66 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
67 else:
68 dataset.peek = 'file does not exist'
69 dataset.blurb = 'file purged from disk'
70
71 def get_mime(self):
72 return 'text/plain'
73
74 class MOL( GenericMolFile ):
75 file_ext = "mol"
76 def sniff( self, filename ):
77 if count_special_lines("^M\s*END", filename) == 1:
78 return True
79 else:
80 return False
81
82 def set_meta( self, dataset, **kwd ):
83 """
84 Set the number molecules, in the case of MOL its always one.
85 """
86 dataset.metadata.number_of_molecules = 1
87
88
89 class SDF( GenericMolFile ):
90 file_ext = "sdf"
91 def sniff( self, filename ):
92 if count_special_lines("^\$\$\$\$", filename) > 0:
93 return True
94 else:
95 return False
96
97 def set_meta( self, dataset, **kwd ):
98 """
99 Set the number of molecules in dataset.
100 """
101 dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$", dataset.file_name)
102
103 def split( cls, input_datasets, subdir_generator_function, split_params):
104 """
105 Split the input files by molecule records.
106 """
107 if split_params is None:
108 return None
109
110 if len(input_datasets) > 1:
111 raise Exception("SD-file splitting does not support multiple files")
112 input_files = [ds.file_name for ds in input_datasets]
113
114 chunk_size = None
115 if split_params['split_mode'] == 'number_of_parts':
116 raise Exception('Split mode "%s" is currently not implemented for SD-files.' % split_params['split_mode'])
117 elif split_params['split_mode'] == 'to_size':
118 chunk_size = int(split_params['split_size'])
119 else:
120 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
121
122 def _read_sdf_records( filename ):
123 lines = []
124 with open(filename) as handle:
125 for line in handle:
126 lines.append( line )
127 if line.startswith("$$$$"):
128 yield lines
129 lines = []
130
131 def _write_part_sdf_file( accumulated_lines ):
132 part_dir = subdir_generator_function()
133 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
134 part_file = open(part_path, 'w')
135 part_file.writelines( accumulated_lines )
136 part_file.close()
137
138 try:
139 sdf_records = _read_sdf_records( input_files[0] )
140 sdf_lines_accumulated = []
141 for counter, sdf_record in enumerate( sdf_records, start = 1):
142 sdf_lines_accumulated.extend( sdf_record )
143 if counter % chunk_size == 0:
144 _write_part_sdf_file( sdf_lines_accumulated )
145 sdf_lines_accumulated = []
146 if sdf_lines_accumulated:
147 _write_part_sdf_file( sdf_lines_accumulated )
148 except Exception, e:
149 log.error('Unable to split files: %s' % str(e))
150 raise
151 split = classmethod(split)
152
153
154 class MOL2( GenericMolFile ):
155 file_ext = "mol2"
156 def sniff( self, filename ):
157 if count_special_lines("@\<TRIPOS\>MOLECULE", filename) > 0:
158 return True
159 else:
160 return False
161
162 def set_meta( self, dataset, **kwd ):
163 """
164 Set the number of lines of data in dataset.
165 """
166 dataset.metadata.number_of_molecules = count_special_lines("@<TRIPOS>MOLECULE", dataset.file_name)#self.count_data_lines(dataset)
167
168 def split( cls, input_datasets, subdir_generator_function, split_params):
169 """
170 Split the input files by molecule records.
171 """
172 if split_params is None:
173 return None
174
175 if len(input_datasets) > 1:
176 raise Exception("MOL2-file splitting does not support multiple files")
177 input_files = [ds.file_name for ds in input_datasets]
178
179 chunk_size = None
180 if split_params['split_mode'] == 'number_of_parts':
181 raise Exception('Split mode "%s" is currently not implemented for MOL2-files.' % split_params['split_mode'])
182 elif split_params['split_mode'] == 'to_size':
183 chunk_size = int(split_params['split_size'])
184 else:
185 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
186
187 def _read_mol2_records( filename ):
188 lines = []
189 start = True
190 with open(filename) as handle:
191 for line in handle:
192 if line.startswith("@<TRIPOS>MOLECULE"):
193 if start:
194 start = False
195 else:
196 yield lines
197 lines = []
198 lines.append( line )
199
200 def _write_part_mol2_file( accumulated_lines ):
201 part_dir = subdir_generator_function()
202 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
203 part_file = open(part_path, 'w')
204 part_file.writelines( accumulated_lines )
205 part_file.close()
206
207 try:
208 mol2_records = _read_mol2_records( input_files[0] )
209 mol2_lines_accumulated = []
210 for counter, mol2_record in enumerate( mol2_records, start = 1):
211 mol2_lines_accumulated.extend( mol2_record )
212 if counter % chunk_size == 0:
213 _write_part_mol2_file( mol2_lines_accumulated )
214 mol2_lines_accumulated = []
215 if mol2_lines_accumulated:
216 _write_part_mol2_file( mol2_lines_accumulated )
217 except Exception, e:
218 log.error('Unable to split files: %s' % str(e))
219 raise
220 split = classmethod(split)
221
222
223
224 class FPS( GenericMolFile ):
225 """
226 chemfp fingerprint file: http://code.google.com/p/chem-fingerprints/wiki/FPS
227 """
228 file_ext = "fps"
229 def sniff( self, filename ):
230 header = get_headers( filename, sep='\t', count=1 )
231 if header[0][0].strip() == '#FPS1':
232 return True
233 else:
234 return False
235
236 def set_meta( self, dataset, **kwd ):
237 """
238 Set the number of lines of data in dataset.
239 """
240 dataset.metadata.number_of_molecules = count_special_lines('^#', dataset.file_name, invert = True)
241
242
243 def split( cls, input_datasets, subdir_generator_function, split_params):
244 """
245 Split the input files by fingerprint records.
246 """
247 if split_params is None:
248 return None
249
250 if len(input_datasets) > 1:
251 raise Exception("FPS-file splitting does not support multiple files")
252 input_files = [ds.file_name for ds in input_datasets]
253
254 chunk_size = None
255 if split_params['split_mode'] == 'number_of_parts':
256 raise Exception('Split mode "%s" is currently not implemented for MOL2-files.' % split_params['split_mode'])
257 elif split_params['split_mode'] == 'to_size':
258 chunk_size = int(split_params['split_size'])
259 else:
260 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
261
262
263 def _write_part_fingerprint_file( accumulated_lines ):
264 part_dir = subdir_generator_function()
265 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
266 part_file = open(part_path, 'w')
267 part_file.writelines( accumulated_lines )
268 part_file.close()
269
270 try:
271 header_lines = []
272 lines_accumulated = []
273 fingerprint_counter = 0
274 for line in open( input_files[0] ):
275 if not line.strip():
276 continue
277 if line.startswith('#'):
278 header_lines.append( line )
279 else:
280 fingerprint_counter += 1
281 lines_accumulated.append( line )
282 if fingerprint_counter != 0 and fingerprint_counter % chunk_size == 0:
283 _write_part_fingerprint_file( header_lines + lines_accumulated )
284 lines_accumulated = []
285 if lines_accumulated:
286 _write_part_fingerprint_file( header_lines + lines_accumulated )
287 except Exception, e:
288 log.error('Unable to split files: %s' % str(e))
289 raise
290 split = classmethod(split)
291
292
293 def merge(split_files, output_file):
294 """
295 Merging fps files requires merging the header manually.
296 We take the header from the first file.
297 """
298 if len(split_files) == 1:
299 #For one file only, use base class method (move/copy)
300 return data.Text.merge(split_files, output_file)
301 if not split_files:
302 raise ValueError("No fps files given, %r, to merge into %s" \
303 % (split_files, output_file))
304 out = open(output_file, "w")
305 first = True
306 for filename in split_files:
307 with open(filename) as handle:
308 for line in handle:
309 if line.startswith('#'):
310 if first:
311 out.write(line)
312 else:
313 # line is no header and not a comment, we assume the first header is written to out and we set 'first' to False
314 first = False
315 out.write(line)
316 out.close()
317 merge = staticmethod(merge)
318
319
320
321 class OBFS( Binary ):
322 """OpenBabel Fastsearch format (fs)."""
323 file_ext = 'fs'
324 composite_type ='basic'
325 allow_datatype_change = False
326
327 MetadataElement( name="base_name", default='OpenBabel Fastsearch Index',
328 readonly=True, visible=True, optional=True,)
329
330 def __init__(self,**kwd):
331 """
332 A Fastsearch Index consists of a binary file with the fingerprints
333 and a pointer the actual molecule file.
334 """
335 Binary.__init__(self, **kwd)
336 self.add_composite_file('molecule.fs', is_binary = True,
337 description = 'OpenBabel Fastsearch Index' )
338 self.add_composite_file('molecule.sdf', optional=True,
339 is_binary = False, description = 'Molecule File' )
340 self.add_composite_file('molecule.smi', optional=True,
341 is_binary = False, description = 'Molecule File' )
342 self.add_composite_file('molecule.inchi', optional=True,
343 is_binary = False, description = 'Molecule File' )
344 self.add_composite_file('molecule.mol2', optional=True,
345 is_binary = False, description = 'Molecule File' )
346 self.add_composite_file('molecule.cml', optional=True,
347 is_binary = False, description = 'Molecule File' )
348
349 def set_peek( self, dataset, is_multi_byte=False ):
350 """Set the peek and blurb text."""
351 if not dataset.dataset.purged:
352 dataset.peek = "OpenBabel Fastsearch Index"
353 dataset.blurb = "OpenBabel Fastsearch Index"
354 else:
355 dataset.peek = "file does not exist"
356 dataset.blurb = "file purged from disk"
357
358 def display_peek( self, dataset ):
359 """Create HTML content, used for displaying peek."""
360 try:
361 return dataset.peek
362 except:
363 return "OpenBabel Fastsearch Index"
364
365 def display_data(self, trans, data, preview=False, filename=None,
366 to_ext=None, size=None, offset=None, **kwd):
367 """Apparently an old display method, but still gets called.
368
369 This allows us to format the data shown in the central pane via the "eye" icon.
370 """
371 return "This is a OpenBabel Fastsearch format. You can speed up your similarity and substructure search with it."
372
373 def get_mime(self):
374 """Returns the mime type of the datatype (pretend it is text for peek)"""
375 return 'text/plain'
376
377 def merge(split_files, output_file, extra_merge_args):
378 """Merging Fastsearch indices is not supported."""
379 raise NotImplementedError("Merging Fastsearch indices is not supported.")
380
381 def split( cls, input_datasets, subdir_generator_function, split_params):
382 """Splitting Fastsearch indices is not supported."""
383 if split_params is None:
384 return None
385 raise NotImplementedError("Splitting Fastsearch indices is not possible.")
386
387
388
389 class DRF( GenericMolFile ):
390 file_ext = "drf"
391
392 def set_meta( self, dataset, **kwd ):
393 """
394 Set the number of lines of data in dataset.
395 """
396 dataset.metadata.number_of_molecules = count_special_lines('\"ligand id\"', dataset.file_name, invert = True)
397
398
399 class PHAR( GenericMolFile ):
400 """
401 Pharmacophore database format from silicos-it.
402 """
403 file_ext = "phar"
404 def set_peek( self, dataset, is_multi_byte=False ):
405 if not dataset.dataset.purged:
406 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
407 dataset.blurb = "pharmacophore"
408 else:
409 dataset.peek = 'file does not exist'
410 dataset.blurb = 'file purged from disk'
411
412
413 class PDB( GenericMolFile ):
414 """
415 Protein Databank format.
416 http://www.wwpdb.org/documentation/format33/v3.3.html
417 """
418 file_ext = "pdb"
419 def sniff( self, filename ):
420 headers = get_headers( filename, sep=' ', count=300 )
421 h = t = c = s = k = e = False
422 for line in headers:
423 section_name = line[0].strip()
424 if section_name == 'HEADER':
425 h = True
426 elif section_name == 'TITLE':
427 t = True
428 elif section_name == 'COMPND':
429 c = True
430 elif section_name == 'SOURCE':
431 s = True
432 elif section_name == 'KEYWDS':
433 k = True
434 elif section_name == 'EXPDTA':
435 e = True
436
437 if h*t*c*s*k*e == True:
438 return True
439 else:
440 return False
441
442 def set_peek( self, dataset, is_multi_byte=False ):
443 if not dataset.dataset.purged:
444 atom_numbers = count_special_lines("^ATOM", dataset.file_name)
445 hetatm_numbers = count_special_lines("^HETATM", dataset.file_name)
446 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
447 dataset.blurb = "%s atoms and %s HET-atoms" % (atom_numbers, hetatm_numbers)
448 else:
449 dataset.peek = 'file does not exist'
450 dataset.blurb = 'file purged from disk'
451
452
453 class grd( data.Text ):
454 file_ext = "grd"
455 def set_peek( self, dataset, is_multi_byte=False ):
456 if not dataset.dataset.purged:
457 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
458 dataset.blurb = "grids for docking"
459 else:
460 dataset.peek = 'file does not exist'
461 dataset.blurb = 'file purged from disk'
462
463
464 class grdtgz( Binary ):
465 file_ext = "grd.tgz"
466 def set_peek( self, dataset, is_multi_byte=False ):
467 if not dataset.dataset.purged:
468 dataset.peek = 'binary data'
469 dataset.blurb = "compressed grids for docking"
470 else:
471 dataset.peek = 'file does not exist'
472 dataset.blurb = 'file purged from disk'
473
474
475 class InChI( Tabular ):
476 file_ext = "inchi"
477 column_names = [ 'InChI' ]
478 MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
479 MetadataElement( name="column_types", default=['str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
480 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
481
482 def set_meta( self, dataset, **kwd ):
483 """
484 Set the number of lines of data in dataset.
485 """
486 dataset.metadata.number_of_molecules = self.count_data_lines(dataset)
487
488 def set_peek( self, dataset, is_multi_byte=False ):
489 if not dataset.dataset.purged:
490 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
491 if (dataset.metadata.number_of_molecules == 1):
492 dataset.blurb = "1 molecule"
493 else:
494 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
495 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
496 else:
497 dataset.peek = 'file does not exist'
498 dataset.blurb = 'file purged from disk'
499
500 def sniff( self, filename ):
501 """
502 InChI files starts with 'InChI='
503 """
504 inchi_lines = get_headers( filename, sep=' ', count=10 )
505 for inchi in inchi_lines:
506 if not inchi[0].startswith('InChI='):
507 return False
508 return True
509
510
511 class SMILES( Tabular ):
512 file_ext = "smi"
513 column_names = [ 'SMILES', 'TITLE' ]
514 MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
515 MetadataElement( name="column_types", default=['str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
516 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
517
518 def set_meta( self, dataset, **kwd ):
519 """
520 Set the number of lines of data in dataset.
521 """
522 dataset.metadata.number_of_molecules = self.count_data_lines(dataset)
523
524 def set_peek( self, dataset, is_multi_byte=False ):
525 if not dataset.dataset.purged:
526 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
527 if (dataset.metadata.number_of_molecules == 1):
528 dataset.blurb = "1 molecule"
529 else:
530 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
531 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
532 else:
533 dataset.peek = 'file does not exist'
534 dataset.blurb = 'file purged from disk'
535
536
537 '''
538 def sniff( self, filename ):
539 """
540 Its hard or impossible to sniff a SMILES File. We can
541 try to import the first SMILES and check if it is a molecule, but
542 currently its not possible to use external libraries from the toolshed
543 in datatype definition files. TODO
544 """
545 self.molecule_number = count_lines( filename, non_empty = True )
546 word_count = count_lines( filename )
547
548 if self.molecule_number != word_count:
549 return False
550
551 if self.molecule_number > 0:
552 # test first 3 SMILES
553 smiles_lines = get_headers( filename, sep='\t', count=3 )
554 for smiles_line in smiles_lines:
555 if len(smiles_line) > 2:
556 return False
557 smiles = smiles_line[0]
558 try:
559 # if we have atoms, we have a molecule
560 if not len( pybel.readstring('smi', smiles).atoms ) > 0:
561 return False
562 except:
563 # if convert fails its not a smiles string
564 return False
565 return True
566 else:
567 return False
568 '''
569
570
571
572 class CML( GenericXml ):
573 """
574 Chemical Markup Language
575 http://cml.sourceforge.net/
576 """
577 file_ext = "cml"
578 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
579
580
581 def set_meta( self, dataset, **kwd ):
582 """
583 Set the number of lines of data in dataset.
584 """
585 dataset.metadata.number_of_molecules = count_special_lines( '^\s*<molecule', dataset.file_name )
586
587 def set_peek( self, dataset, is_multi_byte=False ):
588 if not dataset.dataset.purged:
589 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
590 if (dataset.metadata.number_of_molecules == 1):
591 dataset.blurb = "1 molecule"
592 else:
593 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
594 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
595 else:
596 dataset.peek = 'file does not exist'
597 dataset.blurb = 'file purged from disk'
598
599 def sniff( self, filename ):
600 """
601 Try to guess if the file is a CML file.
602 TODO: add true positive test, need to submit a CML example
603
604 >>> fname = get_test_fname( 'interval.interval' )
605 >>> CML().sniff( fname )
606 False
607 """
608 handle = open(filename)
609 line = handle.readline()
610 if line.strip() != '<?xml version="1.0"?>':
611 handle.close()
612 return False
613 line = handle.readline()
614 if line.strip().find('http://www.xml-cml.org/schema') == -1:
615 handle.close()
616 return False
617 handle.close()
618 return True
619
620
621 def split( cls, input_datasets, subdir_generator_function, split_params):
622 """
623 Split the input files by molecule records.
624 """
625 if split_params is None:
626 return None
627
628 if len(input_datasets) > 1:
629 raise Exception("CML-file splitting does not support multiple files")
630 input_files = [ds.file_name for ds in input_datasets]
631
632 chunk_size = None
633 if split_params['split_mode'] == 'number_of_parts':
634 raise Exception('Split mode "%s" is currently not implemented for CML-files.' % split_params['split_mode'])
635 elif split_params['split_mode'] == 'to_size':
636 chunk_size = int(split_params['split_size'])
637 else:
638 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
639
640 def _read_cml_records( filename ):
641 lines = []
642 with open(filename) as handle:
643 for line in handle:
644 if line.lstrip().startswith('<?xml version="1.0"?>') or \
645 line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema') or \
646 line.lstrip().startswith('</cml>'):
647 continue
648 lines.append( line )
649 if line.lstrip().startswith('</molecule>'):
650 yield lines
651 lines = []
652
653 header_lines = ['<?xml version="1.0"?>\n', '<cml xmlns="http://www.xml-cml.org/schema">\n']
654 footer_line = ['</cml>\n']
655 def _write_part_cml_file( accumulated_lines ):
656 part_dir = subdir_generator_function()
657 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
658 part_file = open(part_path, 'w')
659 part_file.writelines( header_lines )
660 part_file.writelines( accumulated_lines )
661 part_file.writelines( footer_line )
662 part_file.close()
663
664 try:
665 cml_records = _read_cml_records( input_files[0] )
666 cml_lines_accumulated = []
667 for counter, cml_record in enumerate( cml_records, start = 1):
668 cml_lines_accumulated.extend( cml_record )
669 if counter % chunk_size == 0:
670 _write_part_cml_file( cml_lines_accumulated )
671 cml_lines_accumulated = []
672 if cml_lines_accumulated:
673 _write_part_cml_file( cml_lines_accumulated )
674 except Exception, e:
675 log.error('Unable to split files: %s' % str(e))
676 raise
677 split = classmethod(split)
678
679
680 def merge(split_files, output_file):
681 """
682 Merging CML files.
683 """
684 if len(split_files) == 1:
685 #For one file only, use base class method (move/copy)
686 return Text.merge(split_files, output_file)
687 if not split_files:
688 raise ValueError("Given no CML files, %r, to merge into %s" \
689 % (split_files, output_file))
690 with open(output_file, "w") as out:
691 for filename in split_files:
692 with open( filename ) as handle:
693 header = handle.readline()
694 if not header:
695 raise ValueError("CML file %s was empty" % f)
696 if not header.lstrip().startswith('<?xml version="1.0"?>'):
697 out.write(header)
698 raise ValueError("%s is not a valid XML file!" % f)
699 line = handle.readline()
700 header += line
701 if not line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema'):
702 out.write(header)
703 raise ValueError("%s is not a CML file!" % f)
704 molecule_found = False
705 for line in handle.readlines():
706 # we found two required header lines, the next line should start with <molecule >
707 if line.lstrip().startswith('</cml>'):
708 continue
709 if line.lstrip().startswith('<molecule'):
710 molecule_found = True
711 if molecule_found:
712 out.write( line )
713 out.write("</cml>\n")
714 merge = staticmethod(merge)
715
716
717 if __name__ == '__main__':
718 """
719 TODO: We need to figure out, how to put example files under /lib/galaxy/datatypes/test/ from a toolshed, so that doctest can work properly.
720 """
721 inchi = get_test_fname('drugbank_drugs.inchi')
722 smiles = get_test_fname('drugbank_drugs.smi')
723 sdf = get_test_fname('drugbank_drugs.sdf')
724 fps = get_test_fname('50_chemfp_fingerprints_FPS1.fps')
725 pdb = get_test_fname('2zbz.pdb')
726 cml = get_test_fname('/home/bag/Downloads/approved.cml')
727
728 print 'CML test'
729 print CML().sniff(cml), 'cml'
730 print CML().sniff(inchi)
731 print CML().sniff(pdb)
732 CML().split()
733 """
734 print 'SMILES test'
735 print SMILES().sniff(smiles), 'smi'
736 print SMILES().sniff(inchi)
737 print SMILES().sniff(pdb)
738 """
739 print 'InChI test'
740 print InChI().sniff(smiles)
741 print InChI().sniff(sdf)
742 print InChI().sniff(inchi), 'inchi'
743
744 print 'FPS test'
745 print FPS().sniff(smiles)
746 print FPS().sniff(sdf)
747 f = FPS()
748 print f.sniff(fps)
749
750 print 'SDF test'
751 print SDF().sniff(smiles)
752 print SDF().sniff(sdf), 'sdf'
753 print SDF().sniff(fps)
754
755 print 'PDB test'
756 print PDB().sniff(smiles)
757 print PDB().sniff(sdf)
758 print PDB().sniff(fps)
759 print PDB().sniff(pdb), 'pdb'