comparison data_manager/data_manager_rsync.py @ 1:8ff92bd7e2a3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_rsync_g2 commit 8652f36a3a3838dca989426961561e81432acf4f
author iuc
date Tue, 04 Apr 2017 18:13:26 -0400
parents 0a3a6f862104
children e0329ab30f6d
0:0a3a6f862104 1:8ff92bd7e2a3
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 #Dan Blankenberg 2 # Dan Blankenberg
3 3 from __future__ import print_function
4 import sys 4
5 import datetime
6 import logging
7 import optparse
5 import os 8 import os
9 import shutil
10 import subprocess
6 import tempfile 11 import tempfile
7 import shutil 12 from json import (
8 import optparse 13 dumps,
9 import urllib2 14 loads
10 import subprocess 15 )
11 import datetime
12 from os.path import basename 16 from os.path import basename
13 from json import loads, dumps
14 from xml.etree.ElementTree import tostring 17 from xml.etree.ElementTree import tostring
15 18 try:
16 import logging 19 # For Python 3.0 and later
20 from urllib.request import urlopen
21 except ImportError:
22 # Fall back to Python 2 imports
23 from urllib2 import urlopen
24
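With the compatibility shim above, urlopen(url).read() returns bytes on Python 3 but str on Python 2, which matters for the text comparisons and .replace() calls later in this file. A minimal sketch of a normalizing wrapper (read_url_text is a hypothetical helper, not part of this commit):

    def read_url_text(url):
        # Normalize urlopen() output to text on both Python 2 and 3;
        # assumes the remote config files are UTF-8 encoded.
        data = urlopen(url).read()
        return data.decode('utf-8') if isinstance(data, bytes) else data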
17 _log_name = __name__ 25 _log_name = __name__
18 if _log_name == '__builtin__': 26 if _log_name == '__builtin__':
19 _log_name = 'toolshed.installed.g2.rsync.data.manager' 27 _log_name = 'toolshed.installed.g2.rsync.data.manager'
20 log = logging.getLogger( _log_name ) 28 log = logging.getLogger( _log_name )
21 29
25 LOCATION_DIR = "location" 33 LOCATION_DIR = "location"
26 INDEX_DIR = "indexes" 34 INDEX_DIR = "indexes"
27 35
28 # Pull the Tool Data Table files from github 36 # Pull the Tool Data Table files from github
29 # FIXME: These files should be accessible from the rsync server directly. 37 # FIXME: These files should be accessible from the rsync server directly.
30 TOOL_DATA_TABLE_CONF_XML_URLS = { 'main':"https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml", 38 TOOL_DATA_TABLE_CONF_XML_URLS = {'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml",
31 'test':"https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" } 39 'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" }
32 40
33 # Replace data table source entries with local temporary location 41 # Replace data table source entries with local temporary location
34 GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/" 42 GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/"
35 TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % ( GALAXY_DATA_CANONICAL_PATH ) 43 TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % ( GALAXY_DATA_CANONICAL_PATH )
36 TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/' 44 TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/'
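To make the path rewrite above concrete: load_data_tables_from_url() swaps the canonical location prefix for a local temporary directory inside the downloaded XML. A hypothetical example (the temp dir and .loc file name are illustrative):

    source = '<file path="/galaxy/data/location/all_fasta.loc" />'
    tmp_loc_dir = '/tmp/rsync_g2_abc123/location'  # illustrative temp dir
    rewritten = source.replace(TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE,
                               TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % (tmp_loc_dir))
    # rewritten == '<file path="/tmp/rsync_g2_abc123/location/all_fasta.loc" />'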
48 # TODO: Make additional handler actions available for tables that can't fit into the basic 56 # TODO: Make additional handler actions available for tables that can't fit into the basic
49 # "take the value of path" as a dir and copy contents. 57 # "take the value of path" as a dir and copy contents.
50 # e.g. mafs. Although this maf table is goofy and doesn't have path defined in <table> def, 58 # e.g. mafs. Although this maf table is goofy and doesn't have path defined in <table> def,
51 # it does exist in the .loc. 59 # it does exist in the .loc.
52 60
61
53 # --- These methods are called by/within the Galaxy Application 62 # --- These methods are called by/within the Galaxy Application
54
55 def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ): 63 def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ):
56 # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy 64 # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy
57 param_dict = dict( **param_dict ) 65 param_dict = dict( **param_dict )
58 param_dict['data_table_entries'] = param_dict.get( 'data_table_entries', [] ) 66 param_dict['data_table_entries'] = param_dict.get( 'data_table_entries', [] )
59 if not isinstance( param_dict['data_table_entries'], list ): 67 if not isinstance( param_dict['data_table_entries'], list ):
65 tool_shed_repository = None 73 tool_shed_repository = None
66 tdtm = None 74 tdtm = None
67 data_manager = app.data_managers.get_manager( tool.data_manager_id, None ) 75 data_manager = app.data_managers.get_manager( tool.data_manager_id, None )
68 data_table_entries = get_data_table_entries( param_dict ) 76 data_table_entries = get_data_table_entries( param_dict )
69 data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' ) 77 data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' )
70 for data_table_name, entries in data_table_entries.iteritems(): 78 for data_table_name, entries in data_table_entries.items():
71 #get data table managed by this data Manager 79 # get data table managed by this data Manager
72 has_data_table = app.tool_data_tables.get_tables().get( data_table_name ) 80 has_data_table = app.tool_data_tables.get_tables().get( data_table_name )
73 if has_data_table: 81 if has_data_table:
74 has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) ) 82 has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) )
75 if not has_data_table: 83 if not has_data_table:
76 if tdtm is None: 84 if tdtm is None:
77 from tool_shed.tools import data_table_manager 85 from tool_shed.tools import data_table_manager
78 tdtm = data_table_manager.ToolDataTableManager( app ) 86 tdtm = data_table_manager.ToolDataTableManager( app )
79 target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository ) 87 target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository )
80 #Dynamically add this data table 88 # Dynamically add this data table
81 log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name ) 89 log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name )
82 data_table = data_tables[data_table_name] 90 data_table = data_tables[data_table_name]
83 repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None ) 91 repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None )
84 if repo_info is not None: 92 if repo_info is not None:
85 repo_info = tostring( repo_info ) 93 repo_info = tostring( repo_info )
87 tmp_file.write( get_new_xml_definition( app, data_table, data_manager, repo_info, target_dir ) ) 95 tmp_file.write( get_new_xml_definition( app, data_table, data_manager, repo_info, target_dir ) )
88 tmp_file.flush() 96 tmp_file.flush()
89 app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True ) 97 app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True )
90 tmp_file.close() 98 tmp_file.close()
91 99
100
92 def galaxy_code_get_available_data_tables( trans ): 101 def galaxy_code_get_available_data_tables( trans ):
93 #list of data tables 102 # list of data tables
94 found_tables = get_available_tables( trans ) 103 found_tables = get_available_tables( trans )
95 rval = map( lambda x: ( ( x, x, DEFAULT_SELECTED ) ), found_tables ) 104 rval = [ ( x, x, DEFAULT_SELECTED ) for x in found_tables]
96 return rval 105 return rval
106
97 107
98 def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ): 108 def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ):
99 #available entries, optionally filtered by dbkey and table names 109 # available entries, optionally filtered by dbkey and table names
100 if dbkey in [ None, '', '?' ]: 110 if dbkey in [ None, '', '?' ]:
101 dbkey = None 111 dbkey = None
102 if data_table_names in [ None, '', '?' ]: 112 if data_table_names in [ None, '', '?' ]:
103 data_table_names = None 113 data_table_names = None
104 found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names ) 114 found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names )
105 dbkey_text = '(%s) ' % ( dbkey ) if dbkey else '' 115 dbkey_text = '(%s) ' % ( dbkey ) if dbkey else ''
106 rval = map( lambda x: ( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ), found_tables.items() ) 116 rval = [( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ) for x in found_tables.items()]
107 return rval 117 return rval
108 118
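Note that the str.encode('base64') codec used just above exists only on Python 2, so this function still runs only on Python 2 despite the new urllib import shim. A Python 3-safe sketch using the stdlib base64 module (encode_entry is a hypothetical helper, not in this commit):

    import base64

    def encode_entry(name, entry):
        # Mirror of dumps(...).encode('base64').rstrip() that works on
        # both Python 2 and 3.
        payload = dumps(dict(name=name, entry=entry)).encode('utf-8')
        return base64.b64encode(payload).decode('ascii')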
109 # --- End Galaxy called Methods --- 119 # --- End Galaxy called Methods ---
110 120
111 121
114 # probably because it doesn't recognize the rsync scheme 124 # probably because it doesn't recognize the rsync scheme
115 base = base.rstrip( '/' ) 125 base = base.rstrip( '/' )
116 url = url.lstrip( '/' ) 126 url = url.lstrip( '/' )
117 return "%s/%s" % ( base, url ) 127 return "%s/%s" % ( base, url )
118 128
129
119 def rsync_list_dir( server, dir=None, skip_names=[] ): 130 def rsync_list_dir( server, dir=None, skip_names=[] ):
120 #drwxr-xr-x 50 2014/05/16 20:58:11 . 131 # drwxr-xr-x 50 2014/05/16 20:58:11 .
121 if dir: 132 if dir:
122 dir = rsync_urljoin( server, dir ) 133 dir = rsync_urljoin( server, dir )
123 else: 134 else:
124 dir = server 135 dir = server
125 rsync_response = tempfile.NamedTemporaryFile() 136 rsync_response = tempfile.NamedTemporaryFile()
151 size = line.strip() 162 size = line.strip()
152 rval[ name ] = dict( name=name, permissions=perms, bytes=size, date=date, time=time ) 163 rval[ name ] = dict( name=name, permissions=perms, bytes=size, date=date, time=time )
153 rsync_response.close() 164 rsync_response.close()
154 return rval 165 return rval
155 166
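For orientation, rsync_list_dir() parses an rsync directory listing (see the sample line in the comment above) into a dict keyed by entry name; the values below are illustrative:

    # { '.': { 'name': '.', 'permissions': 'drwxr-xr-x', 'bytes': '50',
    #          'date': '2014/05/16', 'time': '20:58:11' } }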
167
156 def rsync_sync_to_dir( source, target ): 168 def rsync_sync_to_dir( source, target ):
157 rsync_response = tempfile.NamedTemporaryFile() 169 rsync_response = tempfile.NamedTemporaryFile()
158 rsync_stderr = tempfile.NamedTemporaryFile() 170 rsync_stderr = tempfile.NamedTemporaryFile()
159 rsync_cmd = [ RSYNC_CMD, '-avzP', source, target ] 171 rsync_cmd = [ RSYNC_CMD, '-avzP', source, target ]
160 return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr ) 172 return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
174 186
175 def data_table_needs_refresh( cached_data_table, url ): 187 def data_table_needs_refresh( cached_data_table, url ):
176 if cached_data_table is None: 188 if cached_data_table is None:
177 return True, {} 189 return True, {}
178 if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME: 190 if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME:
179 data_table_text = urllib2.urlopen( url ).read() 191 data_table_text = urlopen( url ).read()
180 if cached_data_table.get( 'data_table_text', None ) != data_table_text: 192 if cached_data_table.get( 'data_table_text', None ) != data_table_text:
181 return True, {'data_table_text':data_table_text} 193 return True, {'data_table_text': data_table_text}
182 loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR ) 194 loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
183 if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs: 195 if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs:
184 return True, {'loc_file_attrs':loc_file_attrs} 196 return True, {'loc_file_attrs': loc_file_attrs}
185 return False, {} 197 return False, {}
198
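In other words, a refresh is triggered when there is no cache yet, or when the cache is older than CACHE_TIME and either the remote XML or the rsync location listing has changed. The attribs dict carries whichever freshly fetched piece first differed, so load_data_tables_from_url() does not have to fetch it a second time. A hypothetical call:

    # refresh, attribs = data_table_needs_refresh( cached, TOOL_DATA_TABLE_CONF_XML_URLS['main'] )
    # if refresh is True, attribs may contain 'data_table_text' or 'loc_file_attrs'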
186 199
187 def load_data_tables_from_url( url=None, site='main', data_table_class=None ): 200 def load_data_tables_from_url( url=None, site='main', data_table_class=None ):
188 if not url: 201 if not url:
189 url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None ) 202 url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None )
190 assert url, ValueError( 'You must provide either a URL or a site=name.' ) 203 assert url, ValueError( 'You must provide either a URL or a site=name.' )
191 204
192 cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None ) 205 cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None )
193 refresh, attribs = data_table_needs_refresh( cached_data_table, url ) 206 refresh, attribs = data_table_needs_refresh( cached_data_table, url )
194 if refresh: 207 if refresh:
195 data_table_text = attribs.get( 'data_table_text' ) or urllib2.urlopen( url ).read() 208 data_table_text = attribs.get( 'data_table_text' ) or urlopen( url ).read()
196 loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR ) 209 loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
197 210
198 tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' ) 211 tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' )
199 tmp_loc_dir = os.path.join( tmp_dir, 'location' ) 212 tmp_loc_dir = os.path.join( tmp_dir, 'location' )
200 os.mkdir( tmp_loc_dir ) 213 os.mkdir( tmp_loc_dir )
201 rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) ) 214 rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) )
202 215
203
204 new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) ) 216 new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) )
205 data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rsync_data_manager_data_table_conf_' ) 217 data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rsync_data_manager_data_table_conf_' )
206 data_table_fh.write( new_data_table_text ) 218 data_table_fh.write( new_data_table_text )
207 data_table_fh.flush() 219 data_table_fh.flush()
208 tmp_data_dir = os.path.join( tmp_dir, 'tool-data' ) 220 tmp_data_dir = os.path.join( tmp_dir, 'tool-data' )
209 os.mkdir( tmp_data_dir ) 221 os.mkdir( tmp_data_dir )
210 data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name ) 222 data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name )
211 for name, data_table in data_tables.data_tables.items(): 223 for name, data_table in list(data_tables.data_tables.items()):
212 if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ): 224 if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ):
213 log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name ) 225 log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name )
214 del data_tables.data_tables[name] 226 del data_tables.data_tables[name]
215 cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() } 227 cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() }
216 TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table 228 TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table
217 #delete the files 229 # delete the files
218 data_table_fh.close() 230 data_table_fh.close()
219 cleanup_before_exit( tmp_dir ) 231 cleanup_before_exit( tmp_dir )
220 return cached_data_table 232 return cached_data_table
233
221 234
222 def data_table_has_path_column( data_table ): 235 def data_table_has_path_column( data_table ):
223 col_names = data_table.get_column_name_list() 236 col_names = data_table.get_column_name_list()
224 for name in PATH_COLUMN_NAMES: 237 for name in PATH_COLUMN_NAMES:
225 if name in col_names: 238 if name in col_names:
226 return True 239 return True
227 return False 240 return False
228 241
242
229 def get_available_tables( trans ): 243 def get_available_tables( trans ):
230 #list of data tables 244 # list of data tables
231 data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ ) 245 data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
232 return data_tables.get( 'data_tables' ).get_tables().keys() 246 return list(data_tables.get( 'data_tables' ).get_tables().keys())
247
233 248
234 def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ): 249 def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ):
235 sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' } 250 sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' }
236 sub_dict.update( data_manager.get_tool_shed_repository_info_dict() ) 251 sub_dict.update( data_manager.get_tool_shed_repository_info_dict() )
237 if data_table.comment_char: 252 if data_table.comment_char:
240 if name is not None: 255 if name is not None:
241 sub_dict['columns'] = "%s\n%s" % ( sub_dict['columns'], '<column name="%s" index="%s" />' % ( name, i ) ) 256 sub_dict['columns'] = "%s\n%s" % ( sub_dict['columns'], '<column name="%s" index="%s" />' % ( name, i ) )
242 location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path 257 location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path
243 for filename in data_table.filenames.keys(): 258 for filename in data_table.filenames.keys():
244 sub_dict['file_path'] = basename( filename ) 259 sub_dict['file_path'] = basename( filename )
245 sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] ) #os.path.abspath? 260 sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] ) # os.path.abspath?
246 if not os.path.exists( sub_dict['file_path'] ): 261 if not os.path.exists( sub_dict['file_path'] ):
247 # Create empty file 262 # Create empty file
248 open( sub_dict['file_path'], 'wb+' ).close() 263 open( sub_dict['file_path'], 'wb+' ).close()
249 break 264 break
250 sub_dict[ 'repo_info' ] = repo_info or '' 265 sub_dict[ 'repo_info' ] = repo_info or ''
254 <file path="%(file_path)s" /> 269 <file path="%(file_path)s" />
255 %(repo_info)s 270 %(repo_info)s
256 </table></tables> 271 </table></tables>
257 """ % sub_dict 272 """ % sub_dict
258 273
274
259 def get_available_tables_for_dbkey( trans, dbkey, data_table_names ): 275 def get_available_tables_for_dbkey( trans, dbkey, data_table_names ):
260 my_data_tables = trans.app.tool_data_tables.get_tables()
261 data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ ) 276 data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
262 rval = {} 277 rval = {}
263 for name, data_table in data_tables.get( 'data_tables' ).get_tables().iteritems(): 278 for name, data_table in data_tables.get( 'data_tables' ).get_tables().items():
264 if ( not data_table_names or name in data_table_names ): #name in my_data_tables.keys() and 279 if ( not data_table_names or name in data_table_names ):
265 #TODO: check that columns are similar 280 # TODO: check that columns are similar
266 if not dbkey: 281 if not dbkey:
267 entry_getter = data_table.get_named_fields_list() 282 entry_getter = data_table.get_named_fields_list()
268 else: 283 else:
269 entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] ) 284 entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] )
270 for entry in entry_getter: 285 for entry in entry_getter:
271 name = "%s: %s" % ( data_table.name, dumps( entry ) ) 286 name = "%s: %s" % ( data_table.name, dumps( entry ) )
272 rval[name] = entry 287 rval[name] = entry
273 return rval 288 return rval
289
274 290
275 def split_path_all( path ): 291 def split_path_all( path ):
276 rval = [] 292 rval = []
277 path = path.rstrip( '/' ) 293 path = path.rstrip( '/' )
278 while True: 294 while True:
284 rval.append( head ) 300 rval.append( head )
285 break 301 break
286 else: 302 else:
287 break 303 break
288 rval.reverse() 304 rval.reverse()
289 return rval 305 return rval
306
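A quick worked example of the helper above (the middle of its loop is elided in this comparison, so the behavior is inferred from the visible fragments):

    # split_path_all( 'hg19/bowtie2_index' ) == [ 'hg19', 'bowtie2_index' ]
    # split_path_all( 'hg19/' )              == [ 'hg19' ]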
290 307
291 def get_data_for_path( path, data_root_dir ): 308 def get_data_for_path( path, data_root_dir ):
292 # We list dir with a /, but copy data without 309 # We list dir with a /, but copy data without
293 # listing with / gives a . entry when it's a dir 310 # listing with / gives a . entry when it's a dir
294 # cloning without the / will copy that whole directory into the target, 311 # cloning without the / will copy that whole directory into the target,
295 # instead of just that target's contents 312 # instead of just that target's contents
296 if path.startswith( GALAXY_DATA_CANONICAL_PATH ): 313 if path.startswith( GALAXY_DATA_CANONICAL_PATH ):
297 path = path[ len( GALAXY_DATA_CANONICAL_PATH ):] 314 path = path[ len( GALAXY_DATA_CANONICAL_PATH ):]
298 make_path = path 315 make_path = path
299 rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), path ) 316 rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), path )
300 if rsync_source.endswith( '/' ): 317 if rsync_source.endswith( '/' ):
301 rsync_source = rsync_source[:-1] 318 rsync_source = rsync_source[:-1]
302 try: 319 try:
303 dir_list = rsync_list_dir( rsync_source + "/" ) 320 dir_list = rsync_list_dir( rsync_source + "/" )
304 except Exception, e: 321 except Exception:
305 dir_list = None 322 dir_list = None
306 while not dir_list or '.' not in dir_list: 323 while not dir_list or '.' not in dir_list:
307 head, tail = os.path.split( make_path ) 324 head, tail = os.path.split( make_path )
308 if not head: 325 if not head:
309 head = tail 326 head = tail
310 make_path = head 327 make_path = head
311 rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head ) #if we error here, likely due to a connection issue 328 rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head ) # if we error here, likely due to a connection issue
312 if rsync_source.endswith( '/' ): 329 if rsync_source.endswith( '/' ):
313 rsync_source = rsync_source[:-1] 330 rsync_source = rsync_source[:-1]
314 dir_list = rsync_list_dir( rsync_source + "/" ) 331 dir_list = rsync_list_dir( rsync_source + "/" )
315 split_path = split_path_all( make_path ) 332 split_path = split_path_all( make_path )
316 target_path = data_root_dir 333 target_path = data_root_dir
319 if not os.path.exists( target_path ): 336 if not os.path.exists( target_path ):
320 os.mkdir( target_path ) 337 os.mkdir( target_path )
321 rsync_sync_to_dir( rsync_source, target_path ) 338 rsync_sync_to_dir( rsync_source, target_path )
322 return path 339 return path
323 340
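The loop above walks up the requested path until the rsync server reports an existing directory (a '.' entry in the listing), then mirrors that directory into data_root_dir and returns the path with the canonical /galaxy/data/ prefix stripped. A hypothetical call (paths are illustrative):

    # get_data_for_path( '/galaxy/data/hg19/sam_index/hg19.fa', '/tmp/target' )
    # -> 'hg19/sam_index/hg19.fa', with the matching index data mirrored under /tmp/target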
341
324 def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ): 342 def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ):
325 path_cols = [] 343 path_cols = []
326 for key, value in data_table_entry.iteritems(): 344 for key, value in data_table_entry.items():
327 if key in PATH_COLUMN_NAMES: 345 if key in PATH_COLUMN_NAMES:
328 path_cols.append( ( key, value ) ) 346 path_cols.append( ( key, value ) )
329 found_data = False
330 if path_cols: 347 if path_cols:
331 for col_name, value in path_cols: 348 for col_name, value in path_cols:
332 #GALAXY_DATA_CANONICAL_PATH
333 if value.startswith( GALAXY_DATA_CANONICAL_PATH ): 349 if value.startswith( GALAXY_DATA_CANONICAL_PATH ):
334 data_table_entry[col_name] = get_data_for_path( value, data_root_dir ) 350 data_table_entry[col_name] = get_data_for_path( value, data_root_dir )
335 found_data = True
336 else: 351 else:
337 print 'unable to determine location of rsync data for', data_table_name, data_table_entry 352 print('unable to determine location of rsync data for', data_table_name, data_table_entry)
338 return data_table_entry 353 return data_table_entry
339 354
355
340 def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ): 356 def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ):
341 for data_table_name, entries in data_table_entries.iteritems(): 357 for data_table_name, entries in data_table_entries.items():
342 for entry in entries: 358 for entry in entries:
343 entry = get_data_and_munge_path( data_table_name, entry, data_root_dir ) 359 entry = get_data_and_munge_path( data_table_name, entry, data_root_dir )
344 _add_data_table_entry( data_manager_dict, data_table_name, entry ) 360 _add_data_table_entry( data_manager_dict, data_table_name, entry )
345 return data_manager_dict 361 return data_manager_dict
362
346 363
347 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): 364 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
348 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) 365 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
349 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] ) 366 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] )
350 data_manager_dict['data_tables'][data_table_name].append( data_table_entry ) 367 data_manager_dict['data_tables'][data_table_name].append( data_table_entry )
351 return data_manager_dict 368 return data_manager_dict
352 369
370
353 def cleanup_before_exit( tmp_dir ): 371 def cleanup_before_exit( tmp_dir ):
354 if tmp_dir and os.path.exists( tmp_dir ): 372 if tmp_dir and os.path.exists( tmp_dir ):
355 shutil.rmtree( tmp_dir ) 373 shutil.rmtree( tmp_dir )
356 374
375
357 def get_data_table_entries( params ): 376 def get_data_table_entries( params ):
358 rval = {} 377 rval = {}
359 data_table_entries = params.get( 'data_table_entries', None ) 378 data_table_entries = params.get( 'data_table_entries', None )
360 if data_table_entries : 379 if data_table_entries:
361 for entry_text in data_table_entries.split( ',' ): 380 for entry_text in data_table_entries.split( ',' ):
362 entry_text = entry_text.strip().decode( 'base64' ) 381 entry_text = entry_text.strip().decode( 'base64' )
363 entry_dict = loads( entry_text ) 382 entry_dict = loads( entry_text )
364 data_table_name = entry_dict['name'] 383 data_table_name = entry_dict['name']
365 data_table_entry = entry_dict['entry'] 384 data_table_entry = entry_dict['entry']
366 rval[ data_table_name ] = rval.get( data_table_name, [] ) 385 rval[ data_table_name ] = rval.get( data_table_name, [] )
367 rval[ data_table_name ].append( data_table_entry ) 386 rval[ data_table_name ].append( data_table_entry )
368 return rval 387 return rval
369 388
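As with the encoding side noted earlier, str.decode('base64') is a Python 2-only codec. A Python 3-safe counterpart (decode_entry is a hypothetical helper, not in this commit):

    import base64

    def decode_entry(entry_text):
        # Equivalent of entry_text.strip().decode('base64') followed by loads().
        return loads(base64.b64decode(entry_text.strip()).decode('utf-8'))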
389
370 def main(): 390 def main():
371 #Parse Command Line
372 parser = optparse.OptionParser() 391 parser = optparse.OptionParser()
373 (options, args) = parser.parse_args() 392 (options, args) = parser.parse_args()
374 393
375 filename = args[0] 394 filename = args[0]
376 395
377 params = loads( open( filename ).read() ) 396 params = loads( open( filename ).read() )
378 target_directory = params[ 'output_data' ][0]['extra_files_path'] 397 target_directory = params[ 'output_data' ][0]['extra_files_path']
379 os.mkdir( target_directory ) 398 os.mkdir( target_directory )
380 data_manager_dict = {} 399 data_manager_dict = {}
381 400
382 data_table_entries = get_data_table_entries( params['param_dict'] ) 401 data_table_entries = get_data_table_entries( params['param_dict'] )
383 402
384 # Populate the data Tables 403 # Populate the data Tables
385 data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory ) 404 data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory )
386 405
387 #save info to json file 406 # save info to json file
388 open( filename, 'wb' ).write( dumps( data_manager_dict ) ) 407 open( filename, 'wb' ).write( dumps( data_manager_dict ) )
389 408
390 if __name__ == "__main__": main() 409
410 if __name__ == "__main__":
411 main()
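For reference, the JSON file that main() reads and then rewrites has roughly this shape (field values are illustrative; only the keys used above are shown):

    # {
    #   "output_data": [ { "extra_files_path": "/path/to/extra_files" } ],
    #   "param_dict": { "data_table_entries": "<base64 JSON>,<base64 JSON>" }
    # }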