# HG changeset patch
# User fabio
# Date 1497386380 14400
# Node ID d65de900967e70fb2ba2c34f4c8359952cb6e555
# Parent  228038cd0683e93501367d1cdc4d7ecf484eea74
Uploaded 20170613

diff -r 228038cd0683 -r d65de900967e ._gdcwebapp.xml
Binary file ._gdcwebapp.xml has changed
diff -r 228038cd0683 -r d65de900967e ._json_collect_data_source.py
Binary file ._json_collect_data_source.py has changed
diff -r 228038cd0683 -r d65de900967e json_collect_data_source.py
--- a/json_collect_data_source.py	Wed Jun 07 18:02:01 2017 -0400
+++ b/json_collect_data_source.py	Tue Jun 13 16:39:40 2017 -0400
@@ -6,6 +6,7 @@
 import os
 from operator import itemgetter
 import tarfile
+import zipfile
 
 __version__ = "1.0.0"
 CHUNK_SIZE = 2**20 #1mb
@@ -13,6 +14,7 @@
 
 
 def splitext(path):
+    # extract the folder path and extension of a file from its path
     for ext in ['.tar.gz', '.tar.bz2']:
         if path.endswith(ext):
             path, ext = path[:-len(ext)], path[-len(ext):]
@@ -57,16 +59,10 @@
     query_stream.close()
     output_stream.close()
 
-def store_file_from_archive( file_object, target_output_filename, isString=False ):
-    """ Store file after extracting from archive and organize them as a collection using the structure
-        (collection-name)_(file-name).ext as file name
-    """
+def store_file_from_tarfile( file_object, target_output_filename, isString=False ):
+    # store the file_object (from tarfile) on the filesystem
     output_stream = open( target_output_filename, 'wb' )
-    #chunk_write( file_object.read(), output_stream )
-    if not isString:
-        output_stream.write(file_object.read())
-    else:
-        output_stream.write(file_object)
+    output_stream.write(file_object.read())
     output_stream.close()
 
 
@@ -105,23 +101,48 @@
     return "%s\n" % json.dumps( meta_dict )
 
 
-def walk_on_archive(target_output_filename, check_ext, archive_name, appdata_path, db_key="?"):
+def walk_on_archive(target_output_filename, check_ext, archive_library, archive_name, appdata_path, db_key="?"):
+    # fix archive name using valid chars only
+    archive_name = ''.join(e for e in archive_name if e in VALID_CHARS)
     archive_name = archive_name.replace("_", "-").replace(".", "-")
-    with tarfile.open( target_output_filename, check_ext ) as tf:
-        for entry in tf:
-            if entry.isfile():
-                fileobj = tf.extractfile( entry )
-                # reserve the underscore for the collection searator
-                filename = os.path.basename( entry.name ).replace("_", "-")
-                extension = splitext( filename )[1]
-                # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)_(?P<dbkey>[^_]+)
-                if (len(extension) > 0):
-                    filename = (filename[0:len(filename)-(len(extension)+1)]).replace(".", "-") + "." + extension + "_" + extension
-                else:
-                    extension = "auto"
-                filename_with_collection_prefix = archive_name + "_" + filename + "_" + db_key
-                target_entry_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
-                store_file_from_archive( fileobj, target_entry_output_filename )
+    if archive_library == "zipfile":
+        # iterate over entries inside the archive [zip]
+        with zipfile.ZipFile( target_output_filename, check_ext ) as zf:
+            for entry in zf.namelist():
+                # if entry is file
+                if entry.startswith("%s/" % entry.rstrip("/")) is False:
+                    # retrieve file name
+                    # the underscore character is reserved
+                    filename = os.path.basename( entry.split("/")[-1] ).replace("_", "-")
+                    # retrieve file extension
+                    extension = splitext( filename )[1]
+                    # if no extension use 'auto'
+                    if (len(extension) == 0):
+                        extension = "auto"
+                    # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)_(?P<dbkey>[^_]+)
+                    filename_with_collection_prefix = archive_name + "_" + filename + "_" + extension + "_" + db_key
+                    # store current entry on filesystem
+                    with open( os.path.join(appdata_path, filename_with_collection_prefix), 'wb' ) as output_stream:
+                        output_stream.write( zf.read( entry ) )
+    elif archive_library == "tarfile":
+        # iterate over entries inside the archive [gz, bz2, tar]
+        with tarfile.open( target_output_filename, check_ext ) as tf:
+            for entry in tf:
+                if entry.isfile():
+                    fileobj = tf.extractfile( entry )
+                    # retrieve file name
+                    # the underscore character is reserved
+                    filename = os.path.basename( (entry.name).split("/")[-1] ).replace("_", "-")
+                    # retrieve file extension
+                    extension = splitext( filename )[1]
+                    # if no extension use 'auto'
+                    if (len(extension) == 0):
+                        extension = "auto"
+                    # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)_(?P<dbkey>[^_]+)
+                    filename_with_collection_prefix = archive_name + "_" + filename + "_" + extension + "_" + db_key
+                    target_entry_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
+                    # store current entry on filesystem
+                    store_file_from_tarfile( fileobj, target_entry_output_filename )
     return True
 
 
@@ -137,13 +157,35 @@
     url = query_item.get( 'url' )
     filename = query_item.get( 'name' )
 
+    # the organize parameter is considered for archives only
+    organize = query_item.get( 'organize', None )
+    if organize is None:
+        organize = False
+    else:
+        if organize.lower() == "true":
+            organize = True
+        elif organize.lower() == "false":
+            organize = False
+        else:
+            # if the organize parameter is malformed -> set organize to False
+            organize = False
+
+    # check the file extension
+    # if the file is an archive -> do not write metadata and extract its files
     check_ext = ""
+    archive_library = None
     if ( url.endswith( "gz" ) ):
         check_ext = "r:gz"
+        archive_library = "tarfile"
     elif ( url.endswith( "bz2" ) ):
         check_ext = "r:bz2"
+        archive_library = "tarfile"
     elif ( url.endswith( "tar" ) ):
         check_ext = "r:"
+        archive_library = "tarfile"
+    elif ( url.endswith( "zip" ) ):
+        check_ext = "r"
+        archive_library = "zipfile"
     isArchive = bool( check_ext and check_ext.strip() )
 
     extra_data = query_item.get( 'extra_data', None )
@@ -151,30 +193,35 @@
         filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
         name = construct_multi_filename( hda_id, filename, extension )
         target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) )
-        if isArchive is False:
+        if (isArchive is False) or ((isArchive is True) and (organize is False)):
             metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
                                            target_output_filename,
                                            ds_type='new_primary_dataset',
                                            primary=primary) )
     else:
         target_output_filename = output_filename
-        if isArchive is False:
+        if (isArchive is False) or ((isArchive is True) and (organize is False)):
            metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
                                            target_output_filename,
                                            ds_type='dataset',
                                            primary=primary) )
 
-    if isArchive is False:
+    if (isArchive is False) or ((isArchive is True) and (organize is False)):
         download_from_query( query_item, target_output_filename )
     else:
+        # if the current entry is an archive, download it inside the appdata folder
         target_output_path = os.path.join(appdata_path, filename)
         download_from_query( query_item, target_output_path )
     if extra_data:
+        # just download the extra data
        extra_files_path = ''.join( [ target_output_filename, 'files' ] )
         download_extra_data( extra_data, extra_files_path )
 
-    """ the following code handles archives and decompress them in a collection """
-    if ( isArchive ):
+    # if the current file is an archive and the user wants to organize its content
+    # -> decompress the archive and populate the collection (which has to be defined in the tool XML schema)
+    if isArchive and organize:
+        # set the same db_key for each file inside the archive,
+        # using the db_key associated with the archive (if it exists)
         db_key = "?"
         archive_metadata = query_item.get( 'metadata', None )
         if archive_metadata is not None:
@@ -182,7 +229,11 @@
             db_key = archive_metadata.get( 'db_key' )
         except:
             pass
-        walk_on_archive(target_output_path, check_ext, filename, appdata_path, db_key)
+        archive_name = query_item.get( 'name', None )
+        if archive_name is None:
+            archive_name = filename
+        # iterate over the archive content
+        walk_on_archive(target_output_path, check_ext, archive_library, archive_name, appdata_path, db_key)
 
     return True
 
@@ -214,7 +265,7 @@
 
     # read tool job configuration file and parse parameters we need
     json_params = json.loads( open( options.json_param_file, 'r' ).read() )
-    print("json_params: "+str(json_params))
+    #print("json_params: "+str(json_params))
 
     dataset_url, output_filename, \
         extra_files_path, file_name, \
@@ -250,9 +301,10 @@
     ------
     [ {"url":"http://url_of_file",
-       "name":"encode WigData",
-       "extension":"wig",
-       "metadata":{"db_key":"hg19"},
+       "name":"My Archive",
+       "extension":"tar.gz",
+       "organize":"true",
+       "metadata":{"db_key":"hg38"},
        "extra_data":[ {"url":"http://url_of_ext_file",
                        "path":"rel/path/to/ext_file"}
                     ]
@@ -261,12 +313,13 @@
     """
     # Parse the command line options
-    usage = "Usage: json_data_source_mod.py max_size --json_param_file filename [options]"
+    usage = "Usage: json_collect_data_source.py max_size --json_param_file filename [options]"
     parser = optparse.OptionParser(usage = usage)
     parser.add_option("-j", "--json_param_file", type="string",
                     action="store", dest="json_param_file", help="json schema return data")
     parser.add_option("-p", "--path", type="string",
                     action="store", dest="path", help="new file path")
+    # set appdata: temporary directory in which the archives will be decompressed
     parser.add_option("-a", "--appdata", type="string",
                     action="store", dest="appdata", help="appdata folder name")
     parser.add_option("-v", "--version", action="store_true", dest="version",