#!/usr/bin/python

"""
****************************** versioned_data_cache_clear.py ******************************
 Call this script directly to clear out all but the latest cached folder versions,
 both in the Galaxy "Versioned Data" data library and in the server's data stores.

 Suggestion: run this as the galaxy user or a less privileged one; whichever user
 runs it does, however, need read access to the versioneddata_api_key file.

"""
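# Example invocation (illustrative; the user and path depend on your Galaxy install):
#
#   sudo -u galaxy python versioned_data_cache_clear.py
#
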
import os

import vdb_common
import vdb_retrieval

# Note that globals from vdb_retrieval can be referenced by prefixing them with vdb_retrieval.
# Note that this script uses the admin_api connection established in vdb_retrieval.py.

retrieval_obj = vdb_retrieval.VDBRetrieval()
retrieval_obj.set_admin_api()
retrieval_obj.user_api = retrieval_obj.admin_api
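# user_api normally points at the calling user's connection; aliasing it to
# admin_api here (an assumption about vdb_retrieval's design) lets this
# maintenance script reuse the retrieval methods with admin privileges.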
retrieval_obj.set_datastores()

# Library dataset ids that, if found in a workflow data input folder's key name,
# mark that workflow cache folder as one to keep; all other workflow cache folders are removed.
workflow_keepers = []
library_folder_deletes = []
library_dataset_deletes = []

# Cycle through the data stores, listing the subfolders under each one, sorted.
# Permanently delete all but the latest subfolder.
for data_store in retrieval_obj.data_stores:
	spec_file_id = data_store['id']
	# STEP 1: Determine the data store's type and location.
	data_store_spec = retrieval_obj.admin_api.libraries.show_folder(retrieval_obj.library_id, spec_file_id)
	data_store_type = retrieval_obj.test_data_store_type(data_store_spec['name'])

	if data_store_type not in ('folder', 'biomaj'):  # Folders are static - they don't do caching.

		base_folder_id = data_store_spec['folder_id']
		ds_obj = retrieval_obj.get_data_store_gateway(data_store_type, spec_file_id)

		print

		# Cycle through the library tree; we have to look at the whole thing,
		# since there is no /[string]/* wildcard search:
		folders = retrieval_obj.get_library_folders(ds_obj.library_label_path)
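		# Assumption: get_library_folders() returns the data store's own folder
		# first, followed by its version cache folders sorted oldest to newest;
		# the positional tests below rely on that ordering.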
		for ptr, folder in enumerate(folders):

			# Ignore the first folder, which represents the data store itself:
			if ptr == 0:
				print 'Data Store :: ' + folder['name']

			# Keep the most recent cache item:
			elif ptr == len(folders) - 1:
				print 'Cached Version :: ' + folder['name']
				# Record dataset ids, not dicts, so the id membership tests below work.
				workflow_keepers.extend([item['id'] for item in folder['files']])

			# Drop version caches that are further in the past:
			else:
				print 'Clearing version cache: ' + folder['name']
				library_folder_deletes.append(folder)
				library_dataset_deletes.extend(folder['files'])


		# Now clean up the cached version folders in the server's data store too.
		print "Server data store location: " + ds_obj.data_store_path

		items = os.listdir(ds_obj.data_store_path)
		items = sorted(items, key=lambda el: vdb_common.natural_sort_key(el), reverse=True)
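		# vdb_common.natural_sort_key is assumed to split numeric runs out of each
		# name so that e.g. 'v10' sorts after 'v9'; with reverse=True the newest
		# version folder comes first.  A minimal sketch of such a key:
		#
		#   import re
		#   def natural_sort_key(s):
		#       return [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)]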
		count = 0
		for name in items:

			# Process only directories, skipping the "master" folder and any
			# symlinked folder (e.g. a "current" pointer):
			version_folder = os.path.join(ds_obj.data_store_path, name)
			if not name == 'master' \
				and os.path.isdir(version_folder) \
				and not os.path.islink(version_folder):

				count += 1
				if count == 1:
					print "Keeping cache: " + name
				else:
					print "Dropping cache: " + name
					for root2, dirs2, files2 in os.walk(version_folder):
						for version_file in files2:
							full_path = os.path.join(root2, version_file)
							print "Removing " + full_path
							os.remove(full_path)
						# Not expecting any subfolders here.

					os.rmdir(version_folder)
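					# (shutil.rmtree(version_folder) would remove the whole tree in
					# one call; the explicit walk above keeps a per-file log.)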


# Permanently delete the specific data library datasets:
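# (In bioblend, delete_library_dataset(library_id, dataset_id, purged=True)
# permanently purges the dataset rather than just marking it deleted.)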
for item in library_dataset_deletes:
	retrieval_obj.admin_api.libraries.delete_library_dataset(retrieval_obj.library_id, item['id'], purged=True)


# Newer Bioblend releases provide an API method for deleting Galaxy library folders.
# The old Galaxy route also works: an HTTP DELETE request to {{url}}/api/folders/{{encoded_folder_id}}?key={{key}}
if 'folders' in dir(retrieval_obj.admin_api):
	for folder in library_folder_deletes:
		retrieval_obj.admin_api.folders.delete(folder['id'])


print 'Workflow input datasets kept: ' + str(workflow_keepers)

workflow_cache_folders = retrieval_obj.get_library_folders('/' + vdb_retrieval.VDB_WORKFLOW_CACHE_FOLDER_NAME + '/')

for folder in workflow_cache_folders:
	dataset_ids = folder['name'].split('_')  # Input dataset ids separated by underscores.
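	# e.g. a folder named "2f94e8ae9edff68f_5969b1f7201f12ae" (hypothetical ids)
	# was produced from those two input datasets.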
	count = 0
	for dataset_id in dataset_ids:
		if dataset_id in workflow_keepers:
			count += 1

	# If every input dataset of this workflow cache still exists in the library cache, keep it.
	if count == len(dataset_ids):
		continue

	# Otherwise at least one input dataset has been dropped above, so this
	# workflow cache is stale: drop its datasets too.
	print "Dropping workflow cache: " + folder['name']
	for dataset_id in [item['id'] for item in folder['files']]:
		print dataset_id
		retrieval_obj.admin_api.libraries.delete_library_dataset(retrieval_obj.library_id, dataset_id, purged=True)

	# Now delete the workflow cache folder itself.
	if 'folders' in dir(retrieval_obj.admin_api):
		retrieval_obj.admin_api.folders.delete(folder['id'])