fabio/gdcwebapp: diff json_data_source_mod.py @ 12:80593f75d74a draft
description: Uploaded
author:      fabio
date:        Tue, 30 May 2017 12:26:32 -0400
parents:     c0be9583df97
children:    babc444d4bd0
--- a/json_data_source_mod.py	Thu May 25 17:58:23 2017 -0400
+++ b/json_data_source_mod.py	Tue May 30 12:26:32 2017 -0400
@@ -57,12 +57,16 @@
 
     query_stream.close()
     output_stream.close()
 
 
-def store_file_from_archive( file_object, target_output_filename ):
+def store_file_from_archive( file_object, target_output_filename, isString=False ):
     """ Store file after extracting from archive and organize them as a collection using the structure (collection-name)_(file-name).ext as file name """
     output_stream = open( target_output_filename, 'wb' )
-    chunk_write( file_object.read(), output_stream )
+    #chunk_write( file_object.read(), output_stream )
+    if not isString:
+        output_stream.write(file_object.read())
+    else:
+        output_stream.write(file_object)
     output_stream.close()
 
 
@@ -85,20 +89,6 @@
     output_stream.close()
 
 
-def metadata_to_json_for_archive_entry( dataset_id, extension, metaname, filename, ds_type='dataset', primary=False ):
-    """ Return line separated JSON """
-    meta_dict = dict( type = ds_type,
-                      ext = extension,
-                      filename = filename,
-                      name = metaname,
-                      metadata = {} )
-    if primary:
-        meta_dict[ 'base_dataset_id' ] = dataset_id
-    else:
-        meta_dict[ 'dataset_id' ] = dataset_id
-    return "%s\n" % json.dumps( meta_dict )
-
-
 def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
     """ Return line separated JSON """
     meta_dict = dict( type = ds_type,
@@ -115,7 +105,27 @@
     return "%s\n" % json.dumps( meta_dict )
 
 
-def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path):
+def walk_on_archive(target_output_filename, check_ext, archive_name, appdata_path):
+    archive_name = archive_name.replace("_", "-").replace(".", "-")
+    with tarfile.open( target_output_filename, check_ext ) as tf:
+        for entry in tf:
+            if entry.isfile():
+                fileobj = tf.extractfile( entry )
+                # reserve the underscore for the collection searator
+                filename = os.path.basename( entry.name ).replace("_", "-")
+                extension = splitext( filename )[1]
+                # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)
+                if (len(extension) > 0):
+                    filename = (filename[0:len(filename)-(len(extension)+1)]).replace(".", "-") + "." + extension
+                else:
+                    extension = "auto"
+                filename_with_collection_prefix = archive_name + "_" + filename
+                target_entry_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
+                store_file_from_archive( fileobj, target_entry_output_filename )
+    return True
+
+
+def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path, options, args):
     """ Main work function that operates on the JSON representation of one
     dataset and its metadata. Returns True.
     """
@@ -124,68 +134,48 @@
         ext, out_data_name, \
         hda_id, dataset_id = set_up_config_values(json_params)
     extension = query_item.get( 'extension' )
-    filename = query_item.get( 'url' )
+    #filename = query_item.get( 'url' )
+    filename = query_item.get( 'name' )
+
+    check_ext = ""
+    if ( filename.endswith( "gz" ) ):
+        check_ext = "r:gz"
+    elif ( filename.endswith( "bz2" ) ):
+        check_ext = "r:bz2"
+    elif ( filename.endswith( "tar" ) ):
+        check_ext = "r:"
+    isArchive = bool( check_ext and check_ext.strip() )
+
     extra_data = query_item.get( 'extra_data', None )
     if primary:
         filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
         name = construct_multi_filename( hda_id, filename, extension )
         target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) )
-        metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
-                                                         target_output_filename,
-                                                         ds_type='new_primary_dataset',
-                                                         primary=primary) )
+        if isArchive is False:
+            metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
+                                                             target_output_filename,
+                                                             ds_type='new_primary_dataset',
+                                                             primary=primary) )
     else:
         target_output_filename = output_filename
-        metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
-                                                         target_output_filename,
-                                                         ds_type='dataset',
-                                                         primary=primary) )
+        if isArchive is False:
+            metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
+                                                             target_output_filename,
+                                                             ds_type='dataset',
+                                                             primary=primary) )
+
     download_from_query( query_item, target_output_filename )
     if extra_data:
         extra_files_path = ''.join( [ target_output_filename, 'files' ] )
         download_extra_data( extra_data, extra_files_path )
 
     """ the following code handles archives and decompress them in a collection """
-    check_ext = ""
-    if ( fname.endswith( "gz" ) ):
-        check_ext = "r:gz"
-    elif ( fname.endswith( "bz2" ) ):
-        check_ext = "r:bz2"
-    elif ( fname.endswith( "tar" ) ):
-        check_ext = "r:"
-    if ( bool( check_ext and check_ext.strip() ) ):
-        with tarfile.open( target_output_filename, check_ext ) as tf:
-            for entry in tf:
-                fileobj = tf.extractfile( entry )
-                if entry.isfile():
-
-                    #dataset_url, output_filename, \
-                    #    extra_files_path, file_name, \
-                    #    ext, out_data_name, \
-                    #    hda_id, dataset_id = set_up_config_values(json_params)
-
-                    filename = os.path.basename( entry.name )
-                    extension = splitext( filename )
-                    extra_data = None
-                    #target_output_filename = output_filename
-                    # (?P<archive_name>.*)_(?P<file_name>.*)\..*
-                    filename_with_collection_prefix = query_item.get( 'name' ) + "_" + filename
-                    target_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
-
-                    #metadata_parameter_file.write( metadata_to_json_for_archive_entry( dataset_id, extension,
-                    #                                                                   filename, target_output_filename,
-                    #                                                                   ds_type='dataset',
-                    #                                                                   primary=primary) )
-
-                    store_file_from_archive( fileobj, target_output_filename )
-
+    if ( isArchive ):
+        walk_on_archive(target_output_filename, check_ext, query_item.get( 'name' ), appdata_path)
+
     return True
 
 
-def set_up_config_values():
-    extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
-        itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
-
 def set_up_config_values(json_params):
     """ Parse json_params file and return a tuple of necessary configuration
     values.
@@ -202,7 +192,7 @@
             hda_id, dataset_id)
 
 
-def download_from_json_data( options, args ):
+def download_from_json_data( options, args, json_params=None, json_dataset_url=None ):
     """ Parse the returned JSON data and download files. Write metadata
     to flat JSON file.
     """
@@ -212,7 +202,9 @@
         os.makedirs(appdata_path)
 
     # read tool job configuration file and parse parameters we need
-    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+    if json_params is None:
+        json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+
     dataset_url, output_filename, \
         extra_files_path, file_name, \
         ext, out_data_name, \
@@ -222,7 +214,10 @@
 
     # get JSON response from data source
    # TODO: make sure response is not enormous
-    query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    if json_dataset_url is None:
+        query_params = json.loads(urllib.urlopen( dataset_url ).read())
+    else:
+        query_params = json.loads(urllib.urlopen( json_dataset_url ).read())
     # download and write files
     primary = False
     #  query_item, hda_id, output_base_path, dataset_id
@@ -231,11 +226,11 @@
             # TODO: do something with the nested list as a collection
             for query_subitem in query_item:
                 primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
-                                                            metadata_parameter_file, primary, appdata_path)
+                                                            metadata_parameter_file, primary, appdata_path, options, args)
 
         elif isinstance( query_item, dict ):
             primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
-                                                        metadata_parameter_file, primary, appdata_path)
+                                                        metadata_parameter_file, primary, appdata_path, options, args)
     metadata_parameter_file.close()
 
 def __main__():
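
The archive detection added to download_files_and_write_metadata is driven purely by the dataset name suffix. A condensed, standalone restatement of that mapping (the helper name tar_read_mode is illustrative, not part of the tool):

```python
def tar_read_mode(name):
    """Mirror the check_ext selection in the diff: map a dataset name to a
    tarfile read mode, or "" when it should not be treated as an archive."""
    if name.endswith("gz"):
        return "r:gz"
    if name.endswith("bz2"):
        return "r:bz2"
    if name.endswith("tar"):
        return "r:"
    return ""


mode = tar_read_mode("tcga-data.tar.gz")  # -> "r:gz"
is_archive = bool(mode)                   # archive datasets skip the usual metadata_to_json line
```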
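walk_on_archive extracts every regular file of the tar archive into appdata_path and renames it so that the collection name and the member name can later be split on a single underscore (the pattern `(?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)` noted in the code). A minimal standalone sketch of that naming convention, using illustrative helper names (collection_member_name, extract_archive_as_collection) rather than the tool's exact code:

```python
import os
import tarfile
from os.path import splitext


def collection_member_name(archive_name, member_name):
    """Build a (collection-name)_(file-name).ext identifier for one entry.

    The underscore is reserved as the separator between the collection
    (archive) name and the member file name, so underscores and extra dots
    inside either part are rewritten as dashes first.
    """
    collection = archive_name.replace("_", "-").replace(".", "-")
    filename = os.path.basename(member_name).replace("_", "-")
    base, extension = splitext(filename)  # extension keeps its leading dot, e.g. ".txt"
    if extension:
        filename = base.replace(".", "-") + extension
    return collection + "_" + filename


def extract_archive_as_collection(archive_path, read_mode, out_dir):
    """Extract every regular file of a tar archive into out_dir, renaming
    each entry with collection_member_name()."""
    archive_name = os.path.basename(archive_path)
    with tarfile.open(archive_path, read_mode) as tf:  # read_mode: "r:gz", "r:bz2" or "r:"
        for entry in tf:
            if entry.isfile():
                fileobj = tf.extractfile(entry)
                target = os.path.join(out_dir, collection_member_name(archive_name, entry.name))
                with open(target, "wb") as out:
                    out.write(fileobj.read())
```

For example, a member sub/a_b.c.txt of my_archive.tar.gz comes out as my-archive-tar-gz_a-b-c.txt, so the only surviving underscore is the reserved separator.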
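The new json_params and json_dataset_url keyword arguments let a caller that already holds the parsed job parameters, or an explicit data-source URL, invoke download_from_json_data directly, while the command-line path keeps its old behaviour. (The same revision also threads options and args through download_files_and_write_metadata, although none of the hunks shown use them yet.) A standalone sketch of the fallback pattern, with illustrative helper names (load_params and load_query are not functions of the module; the module does this inline and, like the rest of the file, relies on Python 2's urllib.urlopen):

```python
import json
import urllib


def load_params(options, json_params=None):
    """Use a pre-parsed dict when given, otherwise fall back to reading
    options.json_param_file as before."""
    if json_params is None:
        json_params = json.loads(open(options.json_param_file, 'r').read())
    return json_params


def load_query(dataset_url, json_dataset_url=None):
    """Query an explicitly supplied URL when given, otherwise the
    dataset_url taken from the parameter file."""
    url = dataset_url if json_dataset_url is None else json_dataset_url
    return json.loads(urllib.urlopen(url).read())
```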