annotate molecules.py @ 0:85eca06eefc6 draft default tip

Uploaded
author bgruening
date Thu, 15 Aug 2013 03:19:26 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
1 # -*- coding: utf-8 -*-
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
2
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
3 from galaxy.datatypes import data
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
4 import logging
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
5 from galaxy.datatypes.sniff import get_headers, get_test_fname
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
6 from galaxy.datatypes.data import get_file_peek
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
7 from galaxy.datatypes.tabular import Tabular
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
8 from galaxy.datatypes.binary import Binary
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
9 from galaxy.datatypes.xml import GenericXml
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
10 import subprocess
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
11 import os
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
12 #import pybel
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
13 #import openbabel
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
14 #openbabel.obErrorLog.StopLogging()
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
15
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
16 from galaxy.datatypes.metadata import MetadataElement
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
17 from galaxy.datatypes import metadata
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
18
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
19 log = logging.getLogger(__name__)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
20
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def count_special_lines( word, filename, invert = False ):
    """
    Count the lines of 'filename' that match the grep pattern 'word'.

    grep is used to speed up the searching and counting.
    With invert=True the lines that do NOT match are counted (grep -v).
    The number of hits is returned; 0 is returned if grep could not be
    started or did not print a count (e.g. the file is not readable).
    """
    cmd = ["grep", "-c"]
    if invert:
        cmd.append('-v')
    # '-e' marks the next argument as the pattern and '--' ends option
    # parsing, so neither a pattern nor a file name starting with '-'
    # can be mistaken for a grep option.
    cmd.extend(['-e', word, '--', filename])
    try:
        grep = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        # grep -c prints a single number; take the first token of stdout
        return int(grep.communicate()[0].split()[0])
    except (OSError, ValueError, IndexError):
        # best effort: grep missing, nothing printed or unparsable output
        return 0
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
37
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def count_lines( filename, non_empty = False):
    """
    Count the number of lines of the 'filename' file.

    With non_empty=True only lines that contain something other than
    whitespace are counted (grep), otherwise every line is counted (wc -l).
    0 is returned if the external tool could not be started or did not
    print a count (e.g. the file is not readable).
    """
    if non_empty:
        # -c count, -v invert, -e pattern: count lines that are NOT blank
        cmd = ['grep', '-cve', r'^\s*$', '--', filename]
    else:
        cmd = ['wc', '-l', '--', filename]
    try:
        counter = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        # both tools print the count as the first token of their output
        return int(counter.communicate()[0].split()[0])
    except (OSError, ValueError, IndexError):
        # best effort: tool missing, nothing printed or unparsable output
        return 0
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
51
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
52
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class GenericMolFile( data.Text ):
    """
    abstract class for most of the molecule files

    Provides the 'number_of_molecules' metadata element and a peek/blurb
    that reports it; subclasses fill the element in their set_meta().
    """
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek (file preview) and the blurb (molecule count) of 'dataset'.

        'is_multi_byte' is passed through to get_file_peek().
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # the peek is computed once; the file does not need to be read twice
        dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        if dataset.metadata.number_of_molecules == 1:
            dataset.blurb = "1 molecule"
        else:
            dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules

    def get_mime(self):
        """Returns the mime type of the datatype."""
        return 'text/plain'
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
73
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class MOL( GenericMolFile ):
    """
    Molecule file holding a single molecule, recognised by its 'M  END' line.
    """
    file_ext = "mol"

    def sniff( self, filename ):
        """
        Return True if the file contains exactly one line starting with
        'M', optional whitespace and 'END'.
        """
        # raw string: '\s' has to reach grep unchanged
        return count_special_lines(r"^M\s*END", filename) == 1

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules; in the case of MOL it is always one.
        """
        dataset.metadata.number_of_molecules = 1
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
87
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
88
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class SDF( GenericMolFile ):
    """
    SD-file: a sequence of molecule records, each terminated by a line
    starting with '$$$$'.
    """
    file_ext = "sdf"

    def sniff( self, filename ):
        """
        Return True if the file contains at least one '$$$$' terminator line.
        """
        return count_special_lines(r"^\$\$\$\$", filename) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = count_special_lines(r"^\$\$\$\$", dataset.file_name)

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by molecule records.

        Only a single input dataset and the 'to_size' split mode are
        supported; every part receives split_params['split_size'] records
        (the last part may hold fewer). Each part is written to a fresh
        directory obtained from subdir_generator_function(), under the
        base name of the input file. Returns None.
        Raises Exception for multiple inputs, an unsupported split mode
        or a split size below 1; I/O errors are logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("SD-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for SD-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        if chunk_size < 1:
            # would otherwise only surface as a ZeroDivisionError below
            raise Exception('Split size must be at least 1, got %s' % chunk_size)

        def _read_sdf_records( filename ):
            # yield one list of lines per record, terminator line included
            lines = []
            with open(filename) as handle:
                for line in handle:
                    lines.append( line )
                    if line.startswith("$$$$"):
                        yield lines
                        lines = []
            # keep a final record that lacks its '$$$$' terminator,
            # but ignore purely blank trailing lines
            if any( line.strip() for line in lines ):
                yield lines

        def _write_part_sdf_file( accumulated_lines ):
            # every part goes into its own directory under the input's name
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( accumulated_lines )

        try:
            sdf_records = _read_sdf_records( input_files[0] )
            sdf_lines_accumulated = []
            for counter, sdf_record in enumerate( sdf_records, start = 1):
                sdf_lines_accumulated.extend( sdf_record )
                if counter % chunk_size == 0:
                    _write_part_sdf_file( sdf_lines_accumulated )
                    sdf_lines_accumulated = []
            # left-over records that did not fill a whole chunk
            if sdf_lines_accumulated:
                _write_part_sdf_file( sdf_lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
152
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
153
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class MOL2( GenericMolFile ):
    """
    MOL2 file: every molecule record starts with a '@<TRIPOS>MOLECULE' line.
    """
    file_ext = "mol2"

    def sniff( self, filename ):
        """
        Return True if the file contains at least one '@<TRIPOS>MOLECULE' tag.
        """
        # '<' and '>' must not be backslash-escaped: for grep '\<' and '\>'
        # are word-boundary operators, so the escaped pattern cannot match
        # the literal tag.
        return count_special_lines("@<TRIPOS>MOLECULE", filename) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = count_special_lines("@<TRIPOS>MOLECULE", dataset.file_name)

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by molecule records.

        Only a single input dataset and the 'to_size' split mode are
        supported; every part receives split_params['split_size'] records
        (the last part may hold fewer). Each part is written to a fresh
        directory obtained from subdir_generator_function(), under the
        base name of the input file. Returns None.
        Raises Exception for multiple inputs, an unsupported split mode
        or a split size below 1; I/O errors are logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("MOL2-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for MOL2-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        if chunk_size < 1:
            # would otherwise only surface as a ZeroDivisionError below
            raise Exception('Split size must be at least 1, got %s' % chunk_size)

        def _read_mol2_records( filename ):
            # yield one list of lines per molecule; lines found before the
            # first tag stay attached to the first record
            lines = []
            start = True
            with open(filename) as handle:
                for line in handle:
                    if line.startswith("@<TRIPOS>MOLECULE"):
                        if start:
                            start = False
                        else:
                            yield lines
                            lines = []
                    lines.append( line )
            # the last molecule has no following tag that would flush it
            if not start and lines:
                yield lines

        def _write_part_mol2_file( accumulated_lines ):
            # every part goes into its own directory under the input's name
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( accumulated_lines )

        try:
            mol2_records = _read_mol2_records( input_files[0] )
            mol2_lines_accumulated = []
            for counter, mol2_record in enumerate( mol2_records, start = 1):
                mol2_lines_accumulated.extend( mol2_record )
                if counter % chunk_size == 0:
                    _write_part_mol2_file( mol2_lines_accumulated )
                    mol2_lines_accumulated = []
            # left-over records that did not fill a whole chunk
            if mol2_lines_accumulated:
                _write_part_mol2_file( mol2_lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
221
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
222
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
223
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class FPS( GenericMolFile ):
    """
    chemfp fingerprint file: http://code.google.com/p/chem-fingerprints/wiki/FPS

    Lines starting with '#' are header lines; every other non-blank line
    is treated as one fingerprint record.
    """
    file_ext = "fps"

    def sniff( self, filename ):
        """
        Return True if the first tab-separated field of the first line is '#FPS1'.
        """
        header = get_headers( filename, sep='\t', count=1 )
        try:
            return header[0][0].strip() == '#FPS1'
        except IndexError:
            # no header line available (presumably an empty file)
            return False

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of fingerprints in dataset: every line that does
        not start with '#' is counted.
        """
        dataset.metadata.number_of_molecules = count_special_lines('^#', dataset.file_name, invert = True)

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by fingerprint records.

        Only a single input dataset and the 'to_size' split mode are
        supported; every part receives the header lines read so far plus
        split_params['split_size'] fingerprints (the last part may hold
        fewer). Each part is written to a fresh directory obtained from
        subdir_generator_function(), under the base name of the input
        file. Returns None.
        Raises Exception for multiple inputs, an unsupported split mode
        or a split size below 1; I/O errors are logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("FPS-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for FPS-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        if chunk_size < 1:
            # would otherwise only surface as a ZeroDivisionError below
            raise Exception('Split size must be at least 1, got %s' % chunk_size)

        def _write_part_fingerprint_file( accumulated_lines ):
            # every part goes into its own directory under the input's name
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( accumulated_lines )

        try:
            header_lines = []
            lines_accumulated = []
            fingerprint_counter = 0
            with open( input_files[0] ) as handle:
                for line in handle:
                    if not line.strip():
                        continue
                    if line.startswith('#'):
                        header_lines.append( line )
                        continue
                    fingerprint_counter += 1
                    lines_accumulated.append( line )
                    # flush only right after a fingerprint was added, so a
                    # later '#' line cannot trigger a header-only part
                    if fingerprint_counter % chunk_size == 0:
                        _write_part_fingerprint_file( header_lines + lines_accumulated )
                        lines_accumulated = []
            # left-over fingerprints that did not fill a whole chunk
            if lines_accumulated:
                _write_part_fingerprint_file( header_lines + lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)

    def merge(split_files, output_file):
        """
        Merging fps files requires merging the header manually.
        We take the header from the first file.

        A single file is handed to the base class merge (move/copy);
        an empty 'split_files' raises ValueError.
        """
        if not split_files:
            raise ValueError("No fps files given, %r, to merge into %s" \
                             % (split_files, output_file))
        if len(split_files) == 1:
            #For one file only, use base class method (move/copy)
            return data.Text.merge(split_files, output_file)
        with open(output_file, "w") as out:
            first = True
            for filename in split_files:
                with open(filename) as handle:
                    for line in handle:
                        if line.startswith('#'):
                            if first:
                                out.write(line)
                        else:
                            # line is no header and not a comment, we assume the first header is written to out and we set 'first' to False
                            first = False
                            out.write(line)
                # only the first file contributes header lines, even if
                # it held no fingerprint at all
                first = False
    merge = staticmethod(merge)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
318
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
319
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
320
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
321 class OBFS( Binary ):
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
322 """OpenBabel Fastsearch format (fs)."""
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
323 file_ext = 'fs'
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
324 composite_type ='basic'
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
325 allow_datatype_change = False
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
326
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
327 MetadataElement( name="base_name", default='OpenBabel Fastsearch Index',
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
328 readonly=True, visible=True, optional=True,)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
329
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def __init__(self,**kwd):
    """
    Register the files that make up a Fastsearch Index dataset: the binary
    index with the fingerprints plus the molecule file it points to.
    """
    Binary.__init__(self, **kwd)
    # the index itself is the only file registered without optional=True
    self.add_composite_file('molecule.fs', is_binary = True,
                            description = 'OpenBabel Fastsearch Index' )
    # the indexed molecule file may come in any one of these formats
    for molecule_file in ('molecule.sdf', 'molecule.smi', 'molecule.inchi',
                          'molecule.mol2', 'molecule.cml'):
        self.add_composite_file(molecule_file, optional=True,
                                is_binary = False, description = 'Molecule File' )
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
348
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def set_peek( self, dataset, is_multi_byte=False ):
    """Fill in the fixed peek and blurb texts of a Fastsearch index."""
    if dataset.dataset.purged:
        # nothing left on disk to describe
        dataset.peek = "file does not exist"
        dataset.blurb = "file purged from disk"
        return
    label = "OpenBabel Fastsearch Index"
    dataset.peek = label
    dataset.blurb = label
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
357
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def display_peek( self, dataset ):
    """
    Create HTML content, used for displaying peek.

    Returns the stored peek of 'dataset'; if reading it fails, the generic
    'OpenBabel Fastsearch Index' label is returned instead.
    """
    try:
        return dataset.peek
    except Exception:
        # best-effort fallback; unlike a bare except this does not swallow
        # KeyboardInterrupt or SystemExit
        return "OpenBabel Fastsearch Index"
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
364
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def display_data(self, trans, data, preview=False, filename=None,
                 to_ext=None, size=None, offset=None, **kwd):
    """
    Return a fixed explanatory message instead of the file content.

    Per the original author's note this is apparently an old display
    method that still gets called; it determines what is shown in the
    central pane via the "eye" icon. None of the arguments are used.
    """
    message = "This is a OpenBabel Fastsearch format. You can speed up your similarity and substructure search with it."
    return message
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
372
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def get_mime(self):
    """Report 'text/plain' as the mime type (pretend it is text for peek)."""
    mime_type = 'text/plain'
    return mime_type
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
376
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def merge(split_files, output_file, extra_merge_args):
    """
    Merging Fastsearch indices is not supported.

    Always raises NotImplementedError; the arguments are ignored.
    """
    raise NotImplementedError("Merging Fastsearch indices is not supported.")
# The function takes no ``self``: without this wrapper a call with the three
# declared arguments could never reach the NotImplementedError above.
merge = staticmethod(merge)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
380
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
def split( cls, input_datasets, subdir_generator_function, split_params):
    """
    Splitting Fastsearch indices is not supported.

    Returns None when ``split_params`` is None (no split requested);
    otherwise raises NotImplementedError. The other arguments are ignored.
    """
    if split_params is None:
        return None
    raise NotImplementedError("Splitting Fastsearch indices is not possible.")
# The first parameter is ``cls``; bind it explicitly so the function really
# behaves as a class method.
split = classmethod(split)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
386
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
387
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
388
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class DRF( GenericMolFile ):
    """Molecule datatype registered under the "drf" file extension."""
    file_ext = "drf"

    def set_meta( self, dataset, **kwd ):
        """
        Store the molecule count in the dataset metadata.

        The count is whatever count_special_lines() returns for the word
        '"ligand id"' with invert=True.
        NOTE(review): presumably that is the number of lines NOT matching
        the word -- confirm against count_special_lines().
        """
        molecule_count = count_special_lines( '\"ligand id\"', dataset.file_name, invert = True )
        dataset.metadata.number_of_molecules = molecule_count
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
397
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
398
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class PHAR( GenericMolFile ):
    """
    Pharmacophore database format from silicos-it.
    """
    file_ext = "phar"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek (file preview) and the blurb "pharmacophore".

        For a purged dataset only placeholder texts are stored.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = "pharmacophore"
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
411
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
412
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class PDB( GenericMolFile ):
    """
    Protein Databank format.
    http://www.wwpdb.org/documentation/format33/v3.3.html
    """
    file_ext = "pdb"

    def sniff( self, filename ):
        """
        Return True when all expected PDB record names are present.

        The first field of every line returned by
        get_headers( filename, sep=' ', count=300 ) is inspected; the file is
        accepted only if HEADER, TITLE, COMPND, SOURCE, KEYWDS and EXPDTA
        were all seen.
        """
        required = set( [ 'HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA' ] )
        seen = set()
        for line in get_headers( filename, sep=' ', count=300 ):
            section_name = line[0].strip()
            if section_name in required:
                seen.add( section_name )
        # A set comparison replaces six boolean flags multiplied together.
        return seen == required

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek (file preview) and a blurb with the atom counts.

        The counts come from count_special_lines() with the patterns
        "^ATOM" and "^HETATM". For a purged dataset only placeholder texts
        are stored.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        atom_numbers = count_special_lines( "^ATOM", dataset.file_name )
        hetatm_numbers = count_special_lines( "^HETATM", dataset.file_name )
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = "%s atoms and %s HET-atoms" % (atom_numbers, hetatm_numbers)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
451
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
452
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class grd( data.Text ):
    """Text datatype for the "grd" extension (blurb: grids for docking)."""
    file_ext = "grd"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek (file preview) and the blurb "grids for docking".

        For a purged dataset only placeholder texts are stored.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = "grids for docking"
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
462
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
463
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class grdtgz( Binary ):
    """Binary datatype for the "grd.tgz" extension (compressed docking grids)."""
    file_ext = "grd.tgz"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set a fixed peek ('binary data') and blurb; the file is not read.

        For a purged dataset only placeholder texts are stored.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = 'binary data'
        dataset.blurb = "compressed grids for docking"
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
473
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
474
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class InChI( Tabular ):
    """Tabular datatype for InChI files; each data line is counted as one molecule."""
    file_ext = "inchi"
    column_names = [ 'InChI' ]
    # NOTE(review): a single column name and column type are declared, yet
    # the column count defaults to 2 -- confirm whether this is intended.
    MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Store the number of data lines as the number of molecules.
        """
        dataset.metadata.number_of_molecules = self.count_data_lines(dataset)

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek (file preview) and a blurb with the molecule count.

        For a purged dataset only placeholder texts are stored.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is computed once; the original read the file a second
        # time only to store the very same value again.
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        if dataset.metadata.number_of_molecules == 1:
            dataset.blurb = "1 molecule"
        else:
            dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules

    def sniff( self, filename ):
        """
        InChI files start with 'InChI='.

        Every line returned by get_headers( filename, sep=' ', count=10 )
        must begin with that prefix. A file that yields no lines at all is
        rejected, so that empty files are not detected as InChI.
        """
        inchi_lines = get_headers( filename, sep=' ', count=10 )
        if not inchi_lines:
            return False
        for inchi in inchi_lines:
            if not inchi[0].startswith('InChI='):
                return False
        return True
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
509
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
510
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class SMILES( Tabular ):
    """Tabular datatype for SMILES files (columns: SMILES, TITLE)."""
    file_ext = "smi"
    column_names = [ 'SMILES', 'TITLE' ]
    MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Store the number of data lines as the number of molecules.
        """
        dataset.metadata.number_of_molecules = self.count_data_lines(dataset)

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek (file preview) and a blurb with the molecule count.

        For a purged dataset only placeholder texts are stored.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is computed once; the original read the file a second
        # time only to store the very same value again.
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        if dataset.metadata.number_of_molecules == 1:
            dataset.blurb = "1 molecule"
        else:
            dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules

    # No sniff() is defined for SMILES. It is hard or impossible to sniff a
    # SMILES file: one could parse the first few SMILES (e.g. with pybel) and
    # check that each yields a molecule with atoms, but external libraries
    # from the toolshed currently cannot be used in datatype definition
    # files. TODO
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
569
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
570
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
571
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
class CML( GenericXml ):
    """
    Chemical Markup Language
    http://cml.sourceforge.net/
    """
    file_ext = "cml"
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Store the number of molecules in the dataset metadata.

        The value is whatever count_special_lines() returns for the
        pattern describing an opening <molecule tag at the start of a line.
        """
        dataset.metadata.number_of_molecules = count_special_lines( r'^\s*<molecule', dataset.file_name )

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek (file preview) and a blurb with the molecule count.

        For a purged dataset only placeholder texts are stored.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is computed once; the original read the file a second
        # time only to store the very same value again.
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        if dataset.metadata.number_of_molecules == 1:
            dataset.blurb = "1 molecule"
        else:
            dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules

    def sniff( self, filename ):
        """
        Try to guess if the file is a CML file.
        TODO: add true positive test, need to submit a CML example

        The first line must be exactly the XML declaration and the second
        line must mention the CML schema URL.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> CML().sniff( fname )
        False
        """
        # ``with`` guarantees the handle is closed even if reading fails.
        with open( filename ) as handle:
            if handle.readline().strip() != '<?xml version="1.0"?>':
                return False
            return 'http://www.xml-cml.org/schema' in handle.readline()

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by molecule records.

        Only a single input dataset and the split mode 'to_size' are
        supported: every part receives up to ``split_params['split_size']``
        molecule records wrapped in a fresh CML header and footer, and is
        written into a directory obtained from
        ``subdir_generator_function()``. Returns None.

        Raises Exception for multiple inputs or an unsupported split mode;
        any error while splitting is logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("CML-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        split_mode = split_params['split_mode']
        if split_mode == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for CML-files.' % split_mode)
        elif split_mode == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_mode)

        # Lines that belong to the CML envelope, not to a molecule record.
        envelope_prefixes = ( '<?xml version="1.0"?>',
                              '<cml xmlns="http://www.xml-cml.org/schema',
                              '</cml>' )

        def _read_cml_records( filename ):
            """Yield the lines of one molecule record at a time."""
            lines = []
            with open(filename) as handle:
                for line in handle:
                    stripped = line.lstrip()
                    if stripped.startswith( envelope_prefixes ):
                        continue
                    lines.append( line )
                    if stripped.startswith('</molecule>'):
                        yield lines
                        lines = []

        header_lines = ['<?xml version="1.0"?>\n', '<cml xmlns="http://www.xml-cml.org/schema">\n']
        footer_line = ['</cml>\n']

        def _write_part_cml_file( accumulated_lines ):
            """Write one part file: header, accumulated records, footer."""
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( header_lines )
                part_file.writelines( accumulated_lines )
                part_file.writelines( footer_line )

        try:
            cml_records = _read_cml_records( input_files[0] )
            cml_lines_accumulated = []
            for counter, cml_record in enumerate( cml_records, start = 1):
                cml_lines_accumulated.extend( cml_record )
                if counter % chunk_size == 0:
                    _write_part_cml_file( cml_lines_accumulated )
                    cml_lines_accumulated = []
            # Flush the last, possibly smaller, chunk.
            if cml_lines_accumulated:
                _write_part_cml_file( cml_lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)

    def merge(split_files, output_file):
        """
        Merging CML files.

        A single input file is handed to ``data.Text.merge``. Otherwise the
        two header lines of the first file are written once, followed by
        everything from the first <molecule line onwards of each file
        (closing </cml> lines dropped) and a single closing </cml>.

        Raises ValueError when no files are given, when a file is empty, or
        when a file does not start with the expected XML / CML header lines.
        """
        if len(split_files) == 1:
            #For one file only, use base class method (move/copy)
            return data.Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("Given no CML files, %r, to merge into %s" \
                             % (split_files, output_file))
        with open(output_file, "w") as out:
            for index, filename in enumerate( split_files ):
                with open( filename ) as handle:
                    header = handle.readline()
                    if not header:
                        raise ValueError("CML file %s was empty" % filename)
                    if not header.lstrip().startswith('<?xml version="1.0"?>'):
                        # presumably written so the offending header can be
                        # inspected in the output
                        out.write(header)
                        raise ValueError("%s is not a valid XML file!" % filename)
                    line = handle.readline()
                    header += line
                    if not line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema'):
                        out.write(header)
                        raise ValueError("%s is not a CML file!" % filename)
                    if index == 0:
                        # The merged file needs the XML declaration and the
                        # opening <cml> tag exactly once.
                        out.write(header)
                    molecule_found = False
                    for line in handle:
                        # we found two required header lines, the next line should start with <molecule >
                        if line.lstrip().startswith('</cml>'):
                            continue
                        if line.lstrip().startswith('<molecule'):
                            molecule_found = True
                        if molecule_found:
                            out.write( line )
            out.write("</cml>\n")
    merge = staticmethod(merge)
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
715
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
716
85eca06eefc6 Uploaded
bgruening
parents:
diff changeset
if __name__ == '__main__':
    # Ad-hoc smoke test: run sniff() of several classes from this module
    # against a handful of sample files and print each result so it can be
    # checked by eye. The expected positive match of each group is tagged
    # with the format name after the result.
    #
    # TODO: We need to figure out how to put example files under
    # /lib/galaxy/datatypes/test/ from a toolshed, so that doctest can work
    # properly.
    #
    # Every print() below receives exactly one argument, so the emitted text
    # is the same whether print is a statement (Python 2) or a function
    # (Python 3).
    inchi = get_test_fname('drugbank_drugs.inchi')
    smiles = get_test_fname('drugbank_drugs.smi')
    sdf = get_test_fname('drugbank_drugs.sdf')
    fps = get_test_fname('50_chemfp_fingerprints_FPS1.fps')
    pdb = get_test_fname('2zbz.pdb')
    # NOTE(review): this is a developer-local absolute path and will not
    # exist on other machines -- replace with a bundled sample file.
    cml = get_test_fname('/home/bag/Downloads/approved.cml')

    print('CML test')
    print('%s %s' % (CML().sniff(cml), 'cml'))
    print(CML().sniff(inchi))
    print(CML().sniff(pdb))
    # NOTE(review): split() is invoked without arguments; its signature is
    # not visible from here -- confirm this call is valid or remove it.
    CML().split()

    # SMILES checks were disabled in the original (wrapped in a string
    # literal); kept here as comments so they can be re-enabled easily.
    # print('SMILES test')
    # print('%s %s' % (SMILES().sniff(smiles), 'smi'))
    # print(SMILES().sniff(inchi))
    # print(SMILES().sniff(pdb))

    print('InChI test')
    print(InChI().sniff(smiles))
    print(InChI().sniff(sdf))
    print('%s %s' % (InChI().sniff(inchi), 'inchi'))

    print('FPS test')
    print(FPS().sniff(smiles))
    print(FPS().sniff(sdf))
    f = FPS()
    print(f.sniff(fps))

    print('SDF test')
    print(SDF().sniff(smiles))
    print('%s %s' % (SDF().sniff(sdf), 'sdf'))
    print(SDF().sniff(fps))

    print('PDB test')
    print(PDB().sniff(smiles))
    print(PDB().sniff(sdf))
    print(PDB().sniff(fps))
    print('%s %s' % (PDB().sniff(pdb), 'pdb'))