versioned_data repository: comparison of versioned_data_cache_clear.py @ 1:5c5027485f7d (draft)
Commit message: "Uploaded correct file"
| author | damion |
|---|---|
| date | Sun, 09 Aug 2015 16:07:50 -0400 |
| parents | |
| children | |
Compared revisions:

| 0:d31a1bd74e63 | 1:5c5027485f7d |
|---|---|
```python
#!/usr/bin/python

"""
****************************** versioned_data_cache_clear.py ******************************
Call this script directly to clear out all but the latest Galaxy "Versioned Data"
data library and server data store cached folder versions.

Suggest running this as the Galaxy user or a less privileged one, but the
versioneddata_api_key file does need to be readable by that user.
"""
import os

import vdb_common
import vdb_retrieval

# Globals from vdb_retrieval can be referenced by prefixing with vdb_retrieval.XYZ.
# This script uses the admin_api established in vdb_retrieval.py.

retrieval_obj = vdb_retrieval.VDBRetrieval()
retrieval_obj.set_admin_api()
retrieval_obj.user_api = retrieval_obj.admin_api
retrieval_obj.set_datastores()

# Versioned Data library dataset ids for the newest cached version of each
# data store.  A workflow data input folder is kept only if its name mentions
# just these ids; otherwise the folder is removed.
workflow_keepers = []
library_folder_deletes = []
library_dataset_deletes = []
```
Each data store is visited in turn: all but the most recent cached version are flagged for removal from the data library, and older version folders are deleted from the server-side data store path.

```python
# Cycle through data stores, listing the subfolders under each, sorted;
# permanently delete all but the latest subfolder.
for data_store in retrieval_obj.data_stores:
    spec_file_id = data_store['id']

    # STEP 1: Determine data store type and location.
    data_store_spec = retrieval_obj.admin_api.libraries.show_folder(
        retrieval_obj.library_id, spec_file_id)
    data_store_type = retrieval_obj.test_data_store_type(data_store_spec['name'])

    # 'folder' and 'biomaj' data stores are static - they don't do caching.
    if data_store_type not in ('folder', 'biomaj'):

        base_folder_id = data_store_spec['folder_id']
        ds_obj = retrieval_obj.get_data_store_gateway(data_store_type, spec_file_id)

        print

        # Cycle through the library tree; the whole thing has to be examined,
        # since there is no /[string]/* wildcard search:
        folders = retrieval_obj.get_library_folders(ds_obj.library_label_path)
        for ptr, folder in enumerate(folders):

            # Ignore the folder that represents the data store itself:
            if ptr == 0:
                print 'Data store ::' + folder['name']

            # Keep the most recent cache item:
            elif ptr == len(folders) - 1:
                print 'Cached version ::' + folder['name']
                workflow_keepers.extend(
                    [file_item['id'] for file_item in folder['files']])

            # Drop version caches that are further in the past:
            else:
                print 'Clearing version cache:' + folder['name']
                library_folder_deletes.append(folder)
                library_dataset_deletes.extend(folder['files'])

        # Now auto-clean the server-side version folders too:
        print "Server location: " + ds_obj.data_store_path

        items = os.listdir(ds_obj.data_store_path)
        items = sorted(items, key=vdb_common.natural_sort_key, reverse=True)
        count = 0
        for name in items:

            # Consider each directory that isn't the master folder or the
            # symlinked "current" one:
            version_folder = os.path.join(ds_obj.data_store_path, name)
            if name != 'master' \
                    and os.path.isdir(version_folder) \
                    and not os.path.islink(version_folder):

                count += 1
                if count == 1:
                    print "Keeping cache:" + name
                else:
                    print "Dropping cache:" + name
                    for root2, dirs2, files2 in os.walk(version_folder):
                        for version_file in files2:
                            full_path = os.path.join(root2, version_file)
                            print "Removing " + full_path
                            os.remove(full_path)
                        # Not expecting any subfolders here.
                    os.rmdir(version_folder)
```
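Which on-disk cache folder survives depends on the sort above. The implementation of `vdb_common.natural_sort_key` is not shown in this changeset; a minimal sketch of such a key function, assuming the usual digit-chunking approach, might look like:

```python
import re

def natural_sort_key(text):
    # Split e.g. "v1.10" into ['v', 1, '.', 10, ''] so that numeric chunks
    # compare as numbers rather than strings ('10' sorts after '9').
    return [int(chunk) if chunk.isdigit() else chunk.lower()
            for chunk in re.split(r'(\d+)', text)]

# A reverse natural sort puts the newest version folder first:
# sorted(['v1.9', 'v1.10'], key=natural_sort_key, reverse=True)
# yields ['v1.10', 'v1.9'].
```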
The flagged library datasets and folders are then permanently deleted:

```python
# Permanently delete the flagged data library datasets:
for item in library_dataset_deletes:
    retrieval_obj.admin_api.libraries.delete_library_dataset(
        retrieval_obj.library_id, item['id'], purged=True)

# Newer bioblend versions provide a client for deleting Galaxy library
# folders.  The old Galaxy way is still possible: an HTTP DELETE request to
# {{url}}/api/folders/{{encoded_folder_id}}?key={{key}}
if 'folders' in dir(retrieval_obj.admin_api):
    for folder in library_folder_deletes:
        retrieval_obj.admin_api.folders.delete(folder['id'])
```
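Where the `folders` client is unavailable, the old-style request mentioned in the comment above can be issued directly. A rough sketch, assuming the `requests` library; `galaxy_url`, `api_key`, and the function name are illustrative:

```python
import requests

def delete_folder_http(galaxy_url, api_key, encoded_folder_id):
    # Old-style Galaxy API call:
    # DELETE {{url}}/api/folders/{{encoded_folder_id}}?key={{key}}
    response = requests.delete(
        '%s/api/folders/%s' % (galaxy_url.rstrip('/'), encoded_folder_id),
        params={'key': api_key})
    response.raise_for_status()
    return response.json()
```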
Finally, any workflow cache whose input datasets are no longer all present in the library cache is dropped:

```python
print workflow_keepers

workflow_cache_folders = retrieval_obj.get_library_folders(
    '/' + vdb_retrieval.VDB_WORKFLOW_CACHE_FOLDER_NAME + '/')

for folder in workflow_cache_folders:
    dataset_ids = folder['name'].split('_')  # Input dataset ids, separated by underscores.
    count = 0
    for id in dataset_ids:
        if id in workflow_keepers:
            count += 1

    # If every input dataset in the workflow cache still exists in the
    # library cache, keep it:
    if count == len(dataset_ids):
        continue

    # Otherwise there are one or more cached datasets to drop.
    print "Dropping workflow cache: " + folder['name']
    for id in [item['id'] for item in folder['files']]:
        print id
        retrieval_obj.admin_api.libraries.delete_library_dataset(
            retrieval_obj.library_id, id, purged=True)

    # Now delete the workflow folder itself.
    if 'folders' in dir(retrieval_obj.admin_api):
        retrieval_obj.admin_api.folders.delete(folder['id'])
```
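The keep-or-drop test on workflow cache folder names can be read as a small pure function. A sketch of the same logic; the function name is illustrative:

```python
def keep_workflow_cache(folder_name, keeper_ids):
    # A workflow cache folder is named for its input dataset ids, joined by
    # underscores; keep it only if every input id is still in the library cache.
    return all(dataset_id in keeper_ids
               for dataset_id in folder_name.split('_'))

# keep_workflow_cache('abc123_def456', ['abc123', 'def456'])  -> True
# keep_workflow_cache('abc123_old999', ['abc123', 'def456'])  -> False
```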
