devteam / data_manager_rsync_g2: comparison of data_manager/data_manager_rsync.py @ 2:e0329ab30f6d (draft, default, tip)
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_rsync_g2 commit 57f71aa633a43ab02bbf05acd0c6d7f406e01f1e"
| author | iuc |
|---|---|
| date | Thu, 28 Nov 2019 15:47:47 -0500 |
| parents | 8ff92bd7e2a3 |
| children | |
| 1:8ff92bd7e2a3 (parent) | 2:e0329ab30f6d (this revision) |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # Dan Blankenberg | 2 # Dan Blankenberg |
| 3 from __future__ import print_function | 3 from __future__ import print_function |
| 4 | 4 |
| 5 import base64 | |
| 5 import datetime | 6 import datetime |
| 6 import logging | 7 import logging |
| 7 import optparse | 8 import optparse |
| 8 import os | 9 import os |
| 9 import shutil | 10 import shutil |
| 23 from urllib2 import urlopen | 24 from urllib2 import urlopen |
| 24 | 25 |
| 25 _log_name = __name__ | 26 _log_name = __name__ |
| 26 if _log_name == '__builtin__': | 27 if _log_name == '__builtin__': |
| 27 _log_name = 'toolshed.installed.g2.rsync.data.manager' | 28 _log_name = 'toolshed.installed.g2.rsync.data.manager' |
| 28 log = logging.getLogger( _log_name ) | 29 log = logging.getLogger(_log_name) |
| 29 | 30 |
| 30 # Get the Data from the Galaxy Project rsync server | 31 # Get the Data from the Galaxy Project rsync server |
| 31 RSYNC_CMD = 'rsync' | 32 RSYNC_CMD = 'rsync' |
| 32 RSYNC_SERVER = "rsync://datacache.g2.bx.psu.edu/" | 33 RSYNC_SERVER = "rsync://datacache.g2.bx.psu.edu/" |
| 33 LOCATION_DIR = "location" | 34 LOCATION_DIR = "location" |
| 34 INDEX_DIR = "indexes" | 35 INDEX_DIR = "indexes" |
| 35 | 36 |
| 36 # Pull the Tool Data Table files from github | 37 # Pull the Tool Data Table files from github |
| 37 # FIXME: These files should be accessible from the rsync server directly. | 38 # FIXME: These files should be accessible from the rsync server directly. |
| 38 TOOL_DATA_TABLE_CONF_XML_URLS = {'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml", | 39 TOOL_DATA_TABLE_CONF_XML_URLS = {'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/env/main/files/galaxy/config/tool_data_table_conf.xml", |
| 39 'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" } | 40 'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/env/test/files/galaxy/config/tool_data_table_conf.xml"} |
| 40 | 41 |
| 41 # Replace data table source entries with local temporary location | 42 # Replace data table source entries with local temporary location |
| 42 GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/" | 43 GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/" |
| 43 TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % ( GALAXY_DATA_CANONICAL_PATH ) | 44 TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % (GALAXY_DATA_CANONICAL_PATH) |
| 44 TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/' | 45 TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/' |
| 45 | 46 |
| 46 # Some basic Caching, so we don't have to reload and download everything every time | 47 # Some basic Caching, so we don't have to reload and download everything every time |
| 47 CACHE_TIME = datetime.timedelta( minutes=10 ) | 48 CACHE_TIME = datetime.timedelta(minutes=10) |
| 48 TOOL_DATA_TABLES_LOADED_BY_URL = {} | 49 TOOL_DATA_TABLES_LOADED_BY_URL = {} |
| 49 | 50 |
| 50 # Entries will not be selected by default | 51 # Entries will not be selected by default |
| 51 DEFAULT_SELECTED = False | 52 DEFAULT_SELECTED = False |
| 52 | 53 |
| 58 # e.g. mafs. Although this maf table is goofy and doesn't have path defined in <table> def, | 59 # e.g. mafs. Although this maf table is goofy and doesn't have path defined in <table> def, |
| 59 # but it does exist in the .loc. | 60 # but it does exist in the .loc. |
| 60 | 61 |
| 61 | 62 |
| 62 # --- These methods are called by/within the Galaxy Application | 63 # --- These methods are called by/within the Galaxy Application |
| 63 def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ): | 64 def exec_before_job(app, inp_data, out_data, param_dict, tool=None, **kwd): |
| 64 # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy | 65 # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy |
| 65 param_dict = dict( **param_dict ) | 66 param_dict = dict(**param_dict) |
| 66 param_dict['data_table_entries'] = param_dict.get( 'data_table_entries', [] ) | 67 param_dict['data_table_entries'] = param_dict.get('data_table_entries', []) |
| 67 if not isinstance( param_dict['data_table_entries'], list ): | 68 if not isinstance(param_dict['data_table_entries'], list): |
| 68 param_dict['data_table_entries'] = [param_dict['data_table_entries']] | 69 param_dict['data_table_entries'] = [param_dict['data_table_entries']] |
| 69 param_dict['data_table_entries'] = ",".join( param_dict['data_table_entries'] ) | 70 param_dict['data_table_entries'] = ",".join(param_dict['data_table_entries']) |
| 70 if tool: | 71 if tool: |
| 71 tool_shed_repository = tool.tool_shed_repository | 72 tool_shed_repository = tool.tool_shed_repository |
| 72 else: | 73 else: |
| 73 tool_shed_repository = None | 74 tool_shed_repository = None |
| 74 tdtm = None | 75 tdtm = None |
| 75 data_manager = app.data_managers.get_manager( tool.data_manager_id, None ) | 76 data_manager = app.data_managers.get_manager(tool.data_manager_id, None) |
| 76 data_table_entries = get_data_table_entries( param_dict ) | 77 data_table_entries = get_data_table_entries(param_dict) |
| 77 data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' ) | 78 data_tables = load_data_tables_from_url(data_table_class=app.tool_data_tables.__class__).get('data_tables') |
| 78 for data_table_name, entries in data_table_entries.items(): | 79 for data_table_name, entries in data_table_entries.items(): |
| 79 # get data table managed by this data Manager | 80 # get data table managed by this data Manager |
| 80 has_data_table = app.tool_data_tables.get_tables().get( data_table_name ) | 81 has_data_table = app.tool_data_tables.get_tables().get(str(data_table_name)) |
| 81 if has_data_table: | 82 if has_data_table: |
| 82 has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) ) | 83 has_data_table = bool(has_data_table.get_filename_for_source(data_manager, None)) |
| 83 if not has_data_table: | 84 if not has_data_table: |
| 84 if tdtm is None: | 85 if tdtm is None: |
| 85 from tool_shed.tools import data_table_manager | 86 from tool_shed.tools import data_table_manager |
| 86 tdtm = data_table_manager.ToolDataTableManager( app ) | 87 tdtm = data_table_manager.ToolDataTableManager(app) |
| 87 target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository ) | 88 target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir(tool_shed_repository) |
| 88 # Dynamically add this data table | 89 # Dynamically add this data table |
| 89 log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name ) | 90 log.debug("Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name) |
| 90 data_table = data_tables[data_table_name] | 91 data_table = data_tables[data_table_name] |
| 91 repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None ) | 92 repo_info = tdtm.generate_repository_info_elem_from_repository(tool_shed_repository, parent_elem=None) |
| 92 if repo_info is not None: | 93 if repo_info is not None: |
| 93 repo_info = tostring( repo_info ) | 94 repo_info = tostring(repo_info) |
| 94 tmp_file = tempfile.NamedTemporaryFile() | 95 tmp_file = tempfile.NamedTemporaryFile(mode="w") |
| 95 tmp_file.write( get_new_xml_definition( app, data_table, data_manager, repo_info, target_dir ) ) | 96 tmp_file.write(get_new_xml_definition(app, data_table, data_manager, repo_info, target_dir)) |
| 96 tmp_file.flush() | 97 tmp_file.flush() |
| 97 app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True ) | 98 app.tool_data_tables.add_new_entries_from_config_file(tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True) |
| 98 tmp_file.close() | 99 tmp_file.close() |
| 99 | 100 |
| 100 | 101 |
| 101 def galaxy_code_get_available_data_tables( trans ): | 102 def galaxy_code_get_available_data_tables(trans): |
| 102 # list of data tables | 103 # list of data tables |
| 103 found_tables = get_available_tables( trans ) | 104 found_tables = get_available_tables(trans) |
| 104 rval = [ ( x, x, DEFAULT_SELECTED ) for x in found_tables] | 105 rval = [(x, x, DEFAULT_SELECTED) for x in found_tables] |
| 105 return rval | 106 return rval |
| 106 | 107 |
| 107 | 108 |
| 108 def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ): | 109 def galaxy_code_get_available_data_tables_entries(trans, dbkey, data_table_names): |
| 109 # available entries, optionally filtered by dbkey and table names | 110 # available entries, optionally filtered by dbkey and table names |
| 110 if dbkey in [ None, '', '?' ]: | 111 if dbkey in [None, '', '?']: |
| 111 dbkey = None | 112 dbkey = None |
| 112 if data_table_names in [ None, '', '?' ]: | 113 if data_table_names in [None, '', '?']: |
| 113 data_table_names = None | 114 data_table_names = None |
| 114 found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names ) | 115 found_tables = get_available_tables_for_dbkey(trans, dbkey, data_table_names) |
| 115 dbkey_text = '(%s) ' % ( dbkey ) if dbkey else '' | 116 dbkey_text = '(%s) ' % (dbkey) if dbkey else '' |
| 116 rval = [( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ) for x in found_tables.items()] | 117 rval = [("%s%s" % (dbkey_text, x[0]), base64.b64encode(dumps(dict(name=x[0].split(': ')[0], entry=x[1]), sort_keys=True).rstrip().encode('utf-8')).decode('utf-8'), DEFAULT_SELECTED) for x in found_tables.items()] |
| 117 return rval | 118 return rval |
| 118 | 119 |
| 119 # --- End Galaxy called Methods --- | 120 # --- End Galaxy called Methods --- |
| 120 | 121 |
| 121 | 122 |
| 122 def rsync_urljoin( base, url ): | 123 def rsync_urljoin(base, url): |
| 123 # urlparse.urljoin doesn't work correctly for our use-case | 124 # urlparse.urljoin doesn't work correctly for our use-case |
| 124 # probably because it doesn't recognize the rsync scheme | 125 # probably because it doesn't recognize the rsync scheme |
| 125 base = base.rstrip( '/' ) | 126 base = base.rstrip('/') |
| 126 url = url.lstrip( '/' ) | 127 url = url.lstrip('/') |
| 127 return "%s/%s" % ( base, url ) | 128 return "%s/%s" % (base, url) |
| 128 | 129 |
| 129 | 130 |
| 130 def rsync_list_dir( server, dir=None, skip_names=[] ): | 131 def rsync_list_dir(server, dir=None, skip_names=[]): |
| 131 # drwxr-xr-x 50 2014/05/16 20:58:11 . | 132 # drwxr-xr-x 50 2014/05/16 20:58:11 . |
| 132 if dir: | 133 if dir: |
| 133 dir = rsync_urljoin( server, dir ) | 134 dir = rsync_urljoin(server, dir) |
| 134 else: | 135 else: |
| 135 dir = server | 136 dir = server |
| 136 rsync_response = tempfile.NamedTemporaryFile() | 137 rsync_response = tempfile.NamedTemporaryFile() |
| 137 rsync_stderr = tempfile.NamedTemporaryFile() | 138 rsync_stderr = tempfile.NamedTemporaryFile() |
| 138 rsync_cmd = [ RSYNC_CMD, '--list-only', dir ] | 139 rsync_cmd = [RSYNC_CMD, '--list-only', dir] |
| 139 return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr ) | 140 return_code = subprocess.call(rsync_cmd, stdout=rsync_response, stderr=rsync_stderr) |
| 140 rsync_response.flush() | 141 rsync_response.flush() |
| 141 rsync_response.seek(0) | 142 rsync_response.seek(0) |
| 142 rsync_stderr.flush() | 143 rsync_stderr.flush() |
| 143 rsync_stderr.seek(0) | 144 rsync_stderr.seek(0) |
| 144 if return_code: | 145 if return_code: |
| 145 msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() ) | 146 msg = "stdout:\n%s\nstderr:\n%s" % (rsync_response.read(), rsync_stderr.read()) |
| 146 rsync_response.close() | 147 rsync_response.close() |
| 147 rsync_stderr.close() | 148 rsync_stderr.close() |
| 148 raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) ) | 149 raise Exception('Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % (rsync_cmd, return_code, msg)) |
| 149 rsync_stderr.close() | 150 rsync_stderr.close() |
| 150 rval = {} | 151 rval = {} |
| 151 for line in rsync_response: | 152 for line in rsync_response: |
| 152 perms, line = line.split( None, 1 ) | 153 perms, line = line.split(None, 1) |
| 153 line = line.strip() | 154 line = line.strip() |
| 154 size, line = line.split( None, 1 ) | 155 size, line = line.split(None, 1) |
| 155 line = line.strip() | 156 line = line.strip() |
| 156 date, line = line.split( None, 1 ) | 157 date, line = line.split(None, 1) |
| 157 line = line.strip() | 158 line = line.strip() |
| 158 time, line = line.split( None, 1 ) | 159 time, line = line.split(None, 1) |
| 159 name = line.strip() | 160 name = line.strip() |
| 160 if name in skip_names: | 161 if name in skip_names: |
| 161 continue | 162 continue |
| 162 size = line.strip() | 163 size = line.strip() |
| 163 rval[ name ] = dict( name=name, permissions=perms, bytes=size, date=date, time=time ) | 164 rval[name] = dict(name=name, permissions=perms, bytes=size, date=date, time=time) |
| 164 rsync_response.close() | 165 rsync_response.close() |
| 165 return rval | 166 return rval |
| 166 | 167 |
| 167 | 168 |
| 168 def rsync_sync_to_dir( source, target ): | 169 def rsync_sync_to_dir(source, target): |
| 169 rsync_response = tempfile.NamedTemporaryFile() | 170 rsync_response = tempfile.NamedTemporaryFile() |
| 170 rsync_stderr = tempfile.NamedTemporaryFile() | 171 rsync_stderr = tempfile.NamedTemporaryFile() |
| 171 rsync_cmd = [ RSYNC_CMD, '-avzP', source, target ] | 172 rsync_cmd = [RSYNC_CMD, '-avzP', source, target] |
| 172 return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr ) | 173 return_code = subprocess.call(rsync_cmd, stdout=rsync_response, stderr=rsync_stderr) |
| 173 rsync_response.flush() | 174 rsync_response.flush() |
| 174 rsync_response.seek(0) | 175 rsync_response.seek(0) |
| 175 rsync_stderr.flush() | 176 rsync_stderr.flush() |
| 176 rsync_stderr.seek(0) | 177 rsync_stderr.seek(0) |
| 177 if return_code: | 178 if return_code: |
| 178 msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() ) | 179 msg = "stdout:\n%s\nstderr:\n%s" % (rsync_response.read(), rsync_stderr.read()) |
| 179 rsync_response.close() | 180 rsync_response.close() |
| 180 rsync_stderr.close() | 181 rsync_stderr.close() |
| 181 raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) ) | 182 raise Exception('Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % (rsync_cmd, return_code, msg)) |
| 182 rsync_response.close() | 183 rsync_response.close() |
| 183 rsync_stderr.close() | 184 rsync_stderr.close() |
| 184 return return_code | 185 return return_code |
| 185 | 186 |
| 186 | 187 |
| 187 def data_table_needs_refresh( cached_data_table, url ): | 188 def data_table_needs_refresh(cached_data_table, url): |
| 188 if cached_data_table is None: | 189 if cached_data_table is None: |
| 189 return True, {} | 190 return True, {} |
| 190 if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME: | 191 if datetime.datetime.now() - cached_data_table.get('time_loaded') > CACHE_TIME: |
| 191 data_table_text = urlopen( url ).read() | 192 data_table_text = urlopen(url).read().decode('utf-8') |
| 192 if cached_data_table.get( 'data_table_text', None ) != data_table_text: | 193 if cached_data_table.get('data_table_text', None) != data_table_text: |
| 193 return True, {'data_table_text': data_table_text} | 194 return True, {'data_table_text': data_table_text} |
| 194 loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR ) | 195 loc_file_attrs = rsync_list_dir(RSYNC_SERVER, LOCATION_DIR) |
| 195 if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs: | 196 if cached_data_table.get('loc_file_attrs', None) != loc_file_attrs: |
| 196 return True, {'loc_file_attrs': loc_file_attrs} | 197 return True, {'loc_file_attrs': loc_file_attrs} |
| 197 return False, {} | 198 return False, {} |
| 198 | 199 |
| 199 | 200 |
| 200 def load_data_tables_from_url( url=None, site='main', data_table_class=None ): | 201 def load_data_tables_from_url(url=None, site='main', data_table_class=None): |
| 201 if not url: | 202 if not url: |
| 202 url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None ) | 203 url = TOOL_DATA_TABLE_CONF_XML_URLS.get(site, None) |
| 203 assert url, ValueError( 'You must provide either a URL or a site=name.' ) | 204 assert url, ValueError('You must provide either a URL or a site=name.') |
| 204 | 205 |
| 205 cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None ) | 206 cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get(url, None) |
| 206 refresh, attribs = data_table_needs_refresh( cached_data_table, url ) | 207 refresh, attribs = data_table_needs_refresh(cached_data_table, url) |
| 207 if refresh: | 208 if refresh: |
| 208 data_table_text = attribs.get( 'data_table_text' ) or urlopen( url ).read() | 209 data_table_text = attribs.get('data_table_text') or urlopen(url).read().decode('utf-8') |
| 209 loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR ) | 210 loc_file_attrs = attribs.get('loc_file_attrs') or rsync_list_dir(RSYNC_SERVER, LOCATION_DIR) |
| 210 | 211 |
| 211 tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' ) | 212 tmp_dir = tempfile.mkdtemp(prefix='rsync_g2_') |
| 212 tmp_loc_dir = os.path.join( tmp_dir, 'location' ) | 213 tmp_loc_dir = os.path.join(tmp_dir, 'location') |
| 213 os.mkdir( tmp_loc_dir ) | 214 os.mkdir(tmp_loc_dir) |
| 214 rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) ) | 215 rsync_sync_to_dir(rsync_urljoin(RSYNC_SERVER, LOCATION_DIR), os.path.abspath(tmp_loc_dir)) |
| 215 | 216 |
| 216 new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) ) | 217 new_data_table_text = data_table_text.replace(TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % (tmp_loc_dir)) |
| 217 data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rysnc_data_manager_data_table_conf_' ) | 218 data_table_fh = tempfile.NamedTemporaryFile(dir=tmp_dir, prefix='rysnc_data_manager_data_table_conf_', mode="w") |
| 218 data_table_fh.write( new_data_table_text ) | 219 data_table_fh.write(new_data_table_text) |
| 219 data_table_fh.flush() | 220 data_table_fh.flush() |
| 220 tmp_data_dir = os.path.join( tmp_dir, 'tool-data' ) | 221 tmp_data_dir = os.path.join(tmp_dir, 'tool-data') |
| 221 os.mkdir( tmp_data_dir ) | 222 os.mkdir(tmp_data_dir) |
| 222 data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name ) | 223 data_tables = data_table_class(tmp_data_dir, config_filename=data_table_fh.name) |
| 223 for name, data_table in list(data_tables.data_tables.items()): | 224 for name, data_table in list(data_tables.data_tables.items()): |
| 224 if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ): | 225 if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column(data_table): |
| 225 log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name ) | 226 log.debug('Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name) |
| 226 del data_tables.data_tables[name] | 227 del data_tables.data_tables[name] |
| 227 cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() } | 228 cached_data_table = {'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now()} |
| 228 TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table | 229 TOOL_DATA_TABLES_LOADED_BY_URL[url] = cached_data_table |
| 229 # delete the files | 230 # delete the files |
| 230 data_table_fh.close() | 231 data_table_fh.close() |
| 231 cleanup_before_exit( tmp_dir ) | 232 cleanup_before_exit(tmp_dir) |
| 232 return cached_data_table | 233 return cached_data_table |
| 233 | 234 |
| 234 | 235 |
| 235 def data_table_has_path_column( data_table ): | 236 def data_table_has_path_column(data_table): |
| 236 col_names = data_table.get_column_name_list() | 237 col_names = data_table.get_column_name_list() |
| 237 for name in PATH_COLUMN_NAMES: | 238 for name in PATH_COLUMN_NAMES: |
| 238 if name in col_names: | 239 if name in col_names: |
| 239 return True | 240 return True |
| 240 return False | 241 return False |
| 241 | 242 |
| 242 | 243 |
| 243 def get_available_tables( trans ): | 244 def get_available_tables(trans): |
| 244 # list of data tables | 245 # list of data tables |
| 245 data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ ) | 246 data_tables = load_data_tables_from_url(data_table_class=trans.app.tool_data_tables.__class__) |
| 246 return list(data_tables.get( 'data_tables' ).get_tables().keys()) | 247 return data_tables.get('data_tables').get_tables().keys() |
| 247 | 248 |
| 248 | 249 |
| 249 def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ): | 250 def get_new_xml_definition(app, data_table, data_manager, repo_info=None, location_file_dir=None): |
| 250 sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' } | 251 sub_dict = {'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': ''} |
| 251 sub_dict.update( data_manager.get_tool_shed_repository_info_dict() ) | 252 sub_dict.update(data_manager.get_tool_shed_repository_info_dict()) |
| 252 if data_table.comment_char: | 253 if data_table.comment_char: |
| 253 sub_dict['comment_char'] = 'comment_char="%s"' % ( data_table.comment_char ) | 254 sub_dict['comment_char'] = 'comment_char="%s"' % (data_table.comment_char) |
| 254 for i, name in enumerate( data_table.get_column_name_list() ): | 255 for i, name in enumerate(data_table.get_column_name_list()): |
| 255 if name is not None: | 256 if name is not None: |
| 256 sub_dict['columns'] = "%s\n%s" % ( sub_dict['columns'], '<column name="%s" index="%s" />' % ( name, i ) ) | 257 sub_dict['columns'] = "%s\n%s" % (sub_dict['columns'], '<column name="%s" index="%s" />' % (name, i)) |
| 257 location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path | 258 location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path |
| 258 for filename in data_table.filenames.keys(): | 259 for filename in data_table.filenames.keys(): |
| 259 sub_dict['file_path'] = basename( filename ) | 260 sub_dict['file_path'] = basename(filename) |
| 260 sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] ) # os.path.abspath? | 261 sub_dict['file_path'] = os.path.join(location_file_dir, sub_dict['file_path']) # os.path.abspath? |
| 261 if not os.path.exists( sub_dict['file_path'] ): | 262 if not os.path.exists(sub_dict['file_path']): |
| 262 # Create empty file | 263 # Create empty file |
| 263 open( sub_dict['file_path'], 'wb+' ).close() | 264 open(sub_dict['file_path'], 'wb+').close() |
| 264 break | 265 break |
| 265 sub_dict[ 'repo_info' ] = repo_info or '' | 266 sub_dict['repo_info'] = repo_info or '' |
| 266 return """ | 267 return """ |
| 267 <tables><table name="%(table_name)s" %(comment_char)s> | 268 <tables><table name="%(table_name)s" %(comment_char)s> |
| 268 %(columns)s | 269 %(columns)s |
| 269 <file path="%(file_path)s" /> | 270 <file path="%(file_path)s" /> |
| 270 %(repo_info)s | 271 %(repo_info)s |
| 271 </table></tables> | 272 </table></tables> |
| 272 """ % sub_dict | 273 """ % sub_dict |
| 273 | 274 |
| 274 | 275 |
| 275 def get_available_tables_for_dbkey( trans, dbkey, data_table_names ): | 276 def get_available_tables_for_dbkey(trans, dbkey, data_table_names): |
| 276 data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ ) | 277 data_tables = load_data_tables_from_url(data_table_class=trans.app.tool_data_tables.__class__) |
| 277 rval = {} | 278 rval = {} |
| 278 for name, data_table in data_tables.get( 'data_tables' ).get_tables().items(): | 279 for name, data_table in data_tables.get('data_tables').get_tables().items(): |
| 279 if ( not data_table_names or name in data_table_names ): | 280 if (not data_table_names or name in data_table_names): |
| 280 # TODO: check that columns are similar | 281 # TODO: check that columns are similar |
| 281 if not dbkey: | 282 if not dbkey: |
| 282 entry_getter = data_table.get_named_fields_list() | 283 entry_getter = data_table.get_named_fields_list() |
| 283 else: | 284 else: |
| 284 entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] ) | 285 entry_getter = data_table.get_entries('dbkey', dbkey, None, default=[]) |
| 285 for entry in entry_getter: | 286 for entry in entry_getter: |
| 286 name = "%s: %s" % ( data_table.name, dumps( entry ) ) | 287 name = "%s: %s" % (data_table.name, dumps(entry)) |
| 287 rval[name] = entry | 288 rval[name] = entry |
| 288 return rval | 289 return rval |
| 289 | 290 |
| 290 | 291 |
| 291 def split_path_all( path ): | 292 def split_path_all(path): |
| 292 rval = [] | 293 rval = [] |
| 293 path = path.rstrip( '/' ) | 294 path = path.rstrip('/') |
| 294 while True: | 295 while True: |
| 295 head, tail = os.path.split( path ) | 296 head, tail = os.path.split(path) |
| 296 if tail: | 297 if tail: |
| 297 rval.append( tail ) | 298 rval.append(tail) |
| 298 path = head | 299 path = head |
| 299 elif head: | 300 elif head: |
| 300 rval.append( head ) | 301 rval.append(head) |
| 301 break | 302 break |
| 302 else: | 303 else: |
| 303 break | 304 break |
| 304 rval.reverse() | 305 rval.reverse() |
| 305 return rval | 306 return rval |
| 306 | 307 |
| 307 | 308 |
| 308 def get_data_for_path( path, data_root_dir ): | 309 def get_data_for_path(path, data_root_dir): |
| 309 # We list dir with a /, but copy data without | 310 # We list dir with a /, but copy data without |
| 310 # listing with / gives a . entry when it's a dir | 311 # listing with / gives a . entry when it's a dir |
| 311 # cloning without the / will copy that whole directory into the target, | 312 # cloning without the / will copy that whole directory into the target, |
| 312 # instead of just that target's contents | 313 # instead of just that target's contents |
| 313 if path.startswith( GALAXY_DATA_CANONICAL_PATH ): | 314 if path.startswith(GALAXY_DATA_CANONICAL_PATH): |
| 314 path = path[ len( GALAXY_DATA_CANONICAL_PATH ):] | 315 path = path[len(GALAXY_DATA_CANONICAL_PATH):] |
| 315 make_path = path | 316 make_path = path |
| 316 rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), path ) | 317 rsync_source = rsync_urljoin(rsync_urljoin(RSYNC_SERVER, INDEX_DIR), path) |
| 317 if rsync_source.endswith( '/' ): | 318 if rsync_source.endswith('/'): |
| 318 rsync_source = rsync_source[:-1] | 319 rsync_source = rsync_source[:-1] |
| 319 try: | 320 try: |
| 320 dir_list = rsync_list_dir( rsync_source + "/" ) | 321 dir_list = rsync_list_dir(rsync_source + "/") |
| 321 except Exception: | 322 except Exception: |
| 322 dir_list = None | 323 dir_list = None |
| 323 while not dir_list or '.' not in dir_list: | 324 while not dir_list or '.' not in dir_list: |
| 324 head, tail = os.path.split( make_path ) | 325 head, tail = os.path.split(make_path) |
| 325 if not head: | 326 if not head: |
| 326 head = tail | 327 head = tail |
| 327 make_path = head | 328 make_path = head |
| 328 rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head ) # if we error here, likely due to a connection issue | 329 rsync_source = rsync_urljoin(rsync_urljoin(RSYNC_SERVER, INDEX_DIR), head) # if we error here, likely due to a connection issue |
| 329 if rsync_source.endswith( '/' ): | 330 if rsync_source.endswith('/'): |
| 330 rsync_source = rsync_source[:-1] | 331 rsync_source = rsync_source[:-1] |
| 331 dir_list = rsync_list_dir( rsync_source + "/" ) | 332 dir_list = rsync_list_dir(rsync_source + "/") |
| 332 split_path = split_path_all( make_path ) | 333 split_path = split_path_all(make_path) |
| 333 target_path = data_root_dir | 334 target_path = data_root_dir |
| 334 for p in split_path[:-1]: | 335 for p in split_path[:-1]: |
| 335 target_path = os.path.join( target_path, p ) | 336 target_path = os.path.join(target_path, p) |
| 336 if not os.path.exists( target_path ): | 337 if not os.path.exists(target_path): |
| 337 os.mkdir( target_path ) | 338 os.mkdir(target_path) |
| 338 rsync_sync_to_dir( rsync_source, target_path ) | 339 rsync_sync_to_dir(rsync_source, target_path) |
| 339 return path | 340 return path |
| 340 | 341 |
| 341 | 342 |
| 342 def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ): | 343 def get_data_and_munge_path(data_table_name, data_table_entry, data_root_dir): |
| 343 path_cols = [] | 344 path_cols = [] |
| 344 for key, value in data_table_entry.items(): | 345 for key, value in data_table_entry.items(): |
| 345 if key in PATH_COLUMN_NAMES: | 346 if key in PATH_COLUMN_NAMES: |
| 346 path_cols.append( ( key, value ) ) | 347 path_cols.append((key, value)) |
| 347 if path_cols: | 348 if path_cols: |
| 348 for col_name, value in path_cols: | 349 for col_name, value in path_cols: |
| 349 if value.startswith( GALAXY_DATA_CANONICAL_PATH ): | 350 if value.startswith(GALAXY_DATA_CANONICAL_PATH): |
| 350 data_table_entry[col_name] = get_data_for_path( value, data_root_dir ) | 351 data_table_entry[col_name] = get_data_for_path(value, data_root_dir) |
| 351 else: | 352 else: |
| 352 print('unable to determine location of rsync data for', data_table_name, data_table_entry) | 353 print('unable to determine location of rsync data for', data_table_name, data_table_entry) |
| 353 return data_table_entry | 354 return data_table_entry |
| 354 | 355 |
| 355 | 356 |
| 356 def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ): | 357 def fulfill_data_table_entries(data_table_entries, data_manager_dict, data_root_dir): |
| 357 for data_table_name, entries in data_table_entries.items(): | 358 for data_table_name, entries in data_table_entries.items(): |
| 358 for entry in entries: | 359 for entry in entries: |
| 359 entry = get_data_and_munge_path( data_table_name, entry, data_root_dir ) | 360 entry = get_data_and_munge_path(data_table_name, entry, data_root_dir) |
| 360 _add_data_table_entry( data_manager_dict, data_table_name, entry ) | 361 _add_data_table_entry(data_manager_dict, data_table_name, entry) |
| 361 return data_manager_dict | 362 return data_manager_dict |
| 362 | 363 |
| 363 | 364 |
| 364 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): | 365 def _add_data_table_entry(data_manager_dict, data_table_name, data_table_entry): |
| 365 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | 366 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) |
| 366 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] ) | 367 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, []) |
| 367 data_manager_dict['data_tables'][data_table_name].append( data_table_entry ) | 368 data_manager_dict['data_tables'][data_table_name].append(data_table_entry) |
| 368 return data_manager_dict | 369 return data_manager_dict |
| 369 | 370 |
| 370 | 371 |
| 371 def cleanup_before_exit( tmp_dir ): | 372 def cleanup_before_exit(tmp_dir): |
| 372 if tmp_dir and os.path.exists( tmp_dir ): | 373 if tmp_dir and os.path.exists(tmp_dir): |
| 373 shutil.rmtree( tmp_dir ) | 374 shutil.rmtree(tmp_dir) |
| 374 | 375 |
| 375 | 376 |
| 376 def get_data_table_entries( params ): | 377 def get_data_table_entries(params): |
| 377 rval = {} | 378 rval = {} |
| 378 data_table_entries = params.get( 'data_table_entries', None ) | 379 data_table_entries = params.get('data_table_entries', None) |
| 379 if data_table_entries: | 380 if data_table_entries: |
| 380 for entry_text in data_table_entries.split( ',' ): | 381 for entry_text in data_table_entries.split(','): |
| 381 entry_text = entry_text.strip().decode( 'base64' ) | 382 entry_text = base64.b64decode(entry_text.strip().encode('utf-8')) |
| 382 entry_dict = loads( entry_text ) | 383 entry_dict = loads(entry_text) |
| 383 data_table_name = entry_dict['name'] | 384 data_table_name = entry_dict['name'] |
| 384 data_table_entry = entry_dict['entry'] | 385 data_table_entry = entry_dict['entry'] |
| 385 rval[ data_table_name ] = rval.get( data_table_name, [] ) | 386 rval[data_table_name] = rval.get(data_table_name, []) |
| 386 rval[ data_table_name ].append( data_table_entry ) | 387 rval[data_table_name].append(data_table_entry) |
| 387 return rval | 388 return rval |
| 388 | 389 |
| 389 | 390 |
| 390 def main(): | 391 def main(): |
| 391 parser = optparse.OptionParser() | 392 parser = optparse.OptionParser() |
| 392 (options, args) = parser.parse_args() | 393 (options, args) = parser.parse_args() |
| 393 | 394 |
| 394 filename = args[0] | 395 filename = args[0] |
| 395 | 396 |
| 396 params = loads( open( filename ).read() ) | 397 params = loads(open(filename).read()) |
| 397 target_directory = params[ 'output_data' ][0]['extra_files_path'] | 398 target_directory = params['output_data'][0]['extra_files_path'] |
| 398 os.mkdir( target_directory ) | 399 os.mkdir(target_directory) |
| 399 data_manager_dict = {} | 400 data_manager_dict = {} |
| 400 | 401 |
| 401 data_table_entries = get_data_table_entries( params['param_dict'] ) | 402 data_table_entries = get_data_table_entries(params['param_dict']) |
| 402 | 403 |
| 403 # Populate the data Tables | 404 # Populate the data Tables |
| 404 data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory ) | 405 data_manager_dict = fulfill_data_table_entries(data_table_entries, data_manager_dict, target_directory) |
| 405 | 406 |
| 406 # save info to json file | 407 # save info to json file |
| 407 open( filename, 'wb' ).write( dumps( data_manager_dict ) ) | 408 open(filename, 'w').write(dumps(data_manager_dict, sort_keys=True)) |
| 408 | 409 |
| 409 | 410 |
| 410 if __name__ == "__main__": | 411 if __name__ == "__main__": |
| 411 main() | 412 main() |
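
Taken together, the hunks above amount to a Python 2-to-3 compatibility pass over the data manager: the Python 2-only `str.encode('base64')` / `str.decode('base64')` codecs are replaced with explicit `base64.b64encode`/`b64decode` calls, bytes returned by `urlopen(...).read()` are decoded to text, temporary files that receive `str` data are opened with `mode="w"`, and JSON is dumped with `sort_keys=True` for stable output. A minimal sketch of those patterns, separate from the repository code and using a hypothetical `all_fasta` entry, might look like this:

```python
import base64
import json
import tempfile

# Hypothetical data table entry, shaped like the dicts this data manager
# serializes (name + entry); the values here are illustrative only.
entry = {"name": "all_fasta", "entry": {"dbkey": "hg19", "path": "/data/hg19.fa"}}

# Python 3 replacement for the Python 2 idiom dumps(...).encode('base64'):
# JSON text -> UTF-8 bytes -> base64 bytes -> ASCII text.
encoded = base64.b64encode(
    json.dumps(entry, sort_keys=True).encode("utf-8")
).decode("utf-8")

# And the inverse of the Python 2 str.decode('base64') on the consuming side.
decoded = json.loads(base64.b64decode(encoded.encode("utf-8")))
assert decoded == entry

# NamedTemporaryFile defaults to binary mode; writing str now requires
# an explicit text mode, which is why this revision adds mode="w".
with tempfile.NamedTemporaryFile(mode="w", suffix=".xml") as handle:
    handle.write("<tables/>")
    handle.flush()
```

The same bytes-versus-text reasoning explains the added `.decode('utf-8')` after `urlopen(url).read()`: under Python 3 the response body is `bytes`, while the downstream string comparisons and `str.replace` calls expect text.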
