comparison json_data_source_mod.py @ 12:80593f75d74a draft

Uploaded
author fabio
date Tue, 30 May 2017 12:26:32 -0400
parents c0be9583df97
children babc444d4bd0
11:9d24947d4335 12:80593f75d74a
55 output_stream = open( target_output_filename, 'wb' ) 55 output_stream = open( target_output_filename, 'wb' )
56 chunk_write( query_stream, output_stream ) 56 chunk_write( query_stream, output_stream )
57 query_stream.close() 57 query_stream.close()
58 output_stream.close() 58 output_stream.close()
59 59
60 def store_file_from_archive( file_object, target_output_filename ): 60 def store_file_from_archive( file_object, target_output_filename, isString=False ):
61 """ Store file after extracting from archive and organize them as a collection using the structure 61 """ Store file after extracting from archive and organize them as a collection using the structure
62 (collection-name)_(file-name).ext as file name 62 (collection-name)_(file-name).ext as file name
63 """ 63 """
64 output_stream = open( target_output_filename, 'wb' ) 64 output_stream = open( target_output_filename, 'wb' )
65 chunk_write( file_object.read(), output_stream ) 65 #chunk_write( file_object.read(), output_stream )
66 if not isString:
67 output_stream.write(file_object.read())
68 else:
69 output_stream.write(file_object)
66 output_stream.close() 70 output_stream.close()
67 71
68 72
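The revised store_file_from_archive accepts either a file-like object or contents that are already in memory, selected by the new isString flag. A minimal standalone sketch of the two call styles, using io.BytesIO and made-up output file names:

import io

def store_file_from_archive(file_object, target_output_filename, isString=False):
    # sketch of the revised helper: isString=True means the payload is the raw
    # contents themselves rather than a file-like object to read from
    output_stream = open(target_output_filename, 'wb')
    if not isString:
        output_stream.write(file_object.read())
    else:
        output_stream.write(file_object)
    output_stream.close()

# a file-like object, e.g. what tarfile.extractfile() returns
store_file_from_archive(io.BytesIO(b"read-1\t12\n"), "mycollection_reads.tabular")
# contents already read into memory
store_file_from_archive(b"read-1\t12\n", "mycollection_reads.tabular", isString=True)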
69 def download_extra_data( query_ext_data, base_path ): 73 def download_extra_data( query_ext_data, base_path ):
70 """ Download any extra data defined in the JSON. 74 """ Download any extra data defined in the JSON.
81 os.makedirs( os.path.normpath( '/'.join( [ base_path, os.path.dirname( ext_path ) ] ) ) ) 85 os.makedirs( os.path.normpath( '/'.join( [ base_path, os.path.dirname( ext_path ) ] ) ) )
82 output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_path ] ) ), 'wb' ) 86 output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_path ] ) ), 'wb' )
83 chunk_write( query_stream, output_stream ) 87 chunk_write( query_stream, output_stream )
84 query_stream.close() 88 query_stream.close()
85 output_stream.close() 89 output_stream.close()
86
87
88 def metadata_to_json_for_archive_entry( dataset_id, extension, metaname, filename, ds_type='dataset', primary=False ):
89 """ Return line separated JSON """
90 meta_dict = dict( type = ds_type,
91 ext = extension,
92 filename = filename,
93 name = metaname,
94 metadata = {} )
95 if primary:
96 meta_dict[ 'base_dataset_id' ] = dataset_id
97 else:
98 meta_dict[ 'dataset_id' ] = dataset_id
99 return "%s\n" % json.dumps( meta_dict )
100 90
101 91
102 def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False): 92 def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
103 """ Return line separated JSON """ 93 """ Return line separated JSON """
104 meta_dict = dict( type = ds_type, 94 meta_dict = dict( type = ds_type,
113 else: 103 else:
114 meta_dict[ 'dataset_id' ] = dataset_id 104 meta_dict[ 'dataset_id' ] = dataset_id
115 return "%s\n" % json.dumps( meta_dict ) 105 return "%s\n" % json.dumps( meta_dict )
116 106
117 107
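Both metadata helpers append one JSON object per line to the tool-provided metadata file. A sketch of what a single record could look like, modeled on the fields assembled above; every value here is illustrative only:

import json

meta_dict = dict(type='new_primary_dataset',      # or 'dataset' for the non-primary case
                 ext='tabular',
                 filename='/path/to/output/file',  # hypothetical target path
                 name='example element',
                 metadata={})
meta_dict['base_dataset_id'] = 42                  # primary=True branch; otherwise 'dataset_id'
print("%s\n" % json.dumps(meta_dict))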
118 def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path): 108 def walk_on_archive(target_output_filename, check_ext, archive_name, appdata_path):
109 archive_name = archive_name.replace("_", "-").replace(".", "-")
110 with tarfile.open( target_output_filename, check_ext ) as tf:
111 for entry in tf:
112 if entry.isfile():
113 fileobj = tf.extractfile( entry )
114 # reserve the underscore for the collection separator
115 filename = os.path.basename( entry.name ).replace("_", "-")
116 extension = splitext( filename )[1]
117 # pattern: (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)
118 if (len(extension) > 0):
119 filename = (filename[0:len(filename)-(len(extension)+1)]).replace(".", "-") + "." + extension
120 else:
121 extension = "auto"
122 filename_with_collection_prefix = archive_name + "_" + filename
123 target_entry_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
124 store_file_from_archive( fileobj, target_entry_output_filename )
125 return True
126
127
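walk_on_archive reserves the underscore as the separator between the collection name and the element name, so underscores and stray dots inside either part are rewritten as dashes. A standalone sketch of that naming scheme only, using the standard os.path.splitext for simplicity:

import os

def to_collection_element_name(archive_name, entry_name):
    # underscores are reserved for the collection separator, so replace them
    # (and dots in the archive name) with dashes
    archive_name = archive_name.replace("_", "-").replace(".", "-")
    filename = os.path.basename(entry_name).replace("_", "-")
    base, extension = os.path.splitext(filename)
    if extension:
        # dots left inside the base name would break later parsing of the pattern
        # (?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+), so dash them as well
        filename = base.replace(".", "-") + extension
    return archive_name + "_" + filename

# e.g. "sample_data.tar.gz" and member "reads/set_1.fastq"
print(to_collection_element_name("sample_data.tar.gz", "reads/set_1.fastq"))
# -> sample-data-tar-gz_set-1.fastq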
128 def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary, appdata_path, options, args):
119 """ Main work function that operates on the JSON representation of 129 """ Main work function that operates on the JSON representation of
120 one dataset and its metadata. Returns True. 130 one dataset and its metadata. Returns True.
121 """ 131 """
122 dataset_url, output_filename, \ 132 dataset_url, output_filename, \
123 extra_files_path, file_name, \ 133 extra_files_path, file_name, \
124 ext, out_data_name, \ 134 ext, out_data_name, \
125 hda_id, dataset_id = set_up_config_values(json_params) 135 hda_id, dataset_id = set_up_config_values(json_params)
126 extension = query_item.get( 'extension' ) 136 extension = query_item.get( 'extension' )
127 filename = query_item.get( 'url' ) 137 #filename = query_item.get( 'url' )
138 filename = query_item.get( 'name' )
139
140 check_ext = ""
141 if ( filename.endswith( "gz" ) ):
142 check_ext = "r:gz"
143 elif ( filename.endswith( "bz2" ) ):
144 check_ext = "r:bz2"
145 elif ( filename.endswith( "tar" ) ):
146 check_ext = "r:"
147 isArchive = bool( check_ext and check_ext.strip() )
148
128 extra_data = query_item.get( 'extra_data', None ) 149 extra_data = query_item.get( 'extra_data', None )
129 if primary: 150 if primary:
130 filename = ''.join( c in VALID_CHARS and c or '-' for c in filename ) 151 filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
131 name = construct_multi_filename( hda_id, filename, extension ) 152 name = construct_multi_filename( hda_id, filename, extension )
132 target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) ) 153 target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) )
133 metadata_parameter_file.write( metadata_to_json( dataset_id, query_item, 154 if isArchive is False:
134 target_output_filename, 155 metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
135 ds_type='new_primary_dataset', 156 target_output_filename,
136 primary=primary) ) 157 ds_type='new_primary_dataset',
158 primary=primary) )
137 else: 159 else:
138 target_output_filename = output_filename 160 target_output_filename = output_filename
139 metadata_parameter_file.write( metadata_to_json( dataset_id, query_item, 161 if isArchive is False:
140 target_output_filename, 162 metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
141 ds_type='dataset', 163 target_output_filename,
142 primary=primary) ) 164 ds_type='dataset',
165 primary=primary) )
166
143 download_from_query( query_item, target_output_filename ) 167 download_from_query( query_item, target_output_filename )
144 if extra_data: 168 if extra_data:
145 extra_files_path = ''.join( [ target_output_filename, 'files' ] ) 169 extra_files_path = ''.join( [ target_output_filename, 'files' ] )
146 download_extra_data( extra_data, extra_files_path ) 170 download_extra_data( extra_data, extra_files_path )
147 171
148 """ the following code handles archives and decompress them in a collection """ 172 """ the following code handles archives and decompress them in a collection """
149 check_ext = "" 173 if ( isArchive ):
150 if ( fname.endswith( "gz" ) ): 174 walk_on_archive(target_output_filename, check_ext, query_item.get( 'name' ), appdata_path)
151 check_ext = "r:gz" 175
152 elif ( fname.endswith( "bz2" ) ):
153 check_ext = "r:bz2"
154 elif ( fname.endswith( "tar" ) ):
155 check_ext = "r:"
156 if ( bool( check_ext and check_ext.strip() ) ):
157 with tarfile.open( target_output_filename, check_ext ) as tf:
158 for entry in tf:
159 fileobj = tf.extractfile( entry )
160 if entry.isfile():
161
162 #dataset_url, output_filename, \
163 # extra_files_path, file_name, \
164 # ext, out_data_name, \
165 # hda_id, dataset_id = set_up_config_values(json_params)
166
167 filename = os.path.basename( entry.name )
168 extension = splitext( filename )
169 extra_data = None
170 #target_output_filename = output_filename
171 # (?P<archive_name>.*)_(?P<file_name>.*)\..*
172 filename_with_collection_prefix = query_item.get( 'name' ) + "_" + filename
173 target_output_filename = os.path.join(appdata_path, filename_with_collection_prefix)
174
175 #metadata_parameter_file.write( metadata_to_json_for_archive_entry( dataset_id, extension,
176 # filename, target_output_filename,
177 # ds_type='dataset',
178 # primary=primary) )
179
180 store_file_from_archive( fileobj, target_output_filename )
181
182 return True 176 return True
183 177
184
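Earlier in download_files_and_write_metadata the archive type is inferred from the file name suffix and mapped to a tarfile read mode. A standalone sketch of that mapping, with made-up file names; an empty mode string means the item is not treated as an archive:

def archive_read_mode(filename):
    # suffix -> mode string accepted by tarfile.open(); "" means not an archive
    if filename.endswith("gz"):
        return "r:gz"
    elif filename.endswith("bz2"):
        return "r:bz2"
    elif filename.endswith("tar"):
        return "r:"
    return ""

print(archive_read_mode("sample_data.tar.gz"))   # -> r:gz
print(archive_read_mode("results.tabular"))      # -> empty, handled as a plain dataset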
185 def set_up_config_values():
186 extra_files_path, file_name, ext, out_data_name, hda_id, dataset_id = \
187 itemgetter('extra_files_path', 'file_name', 'ext', 'out_data_name', 'hda_id', 'dataset_id')(output_data[0])
188 178
189 def set_up_config_values(json_params): 179 def set_up_config_values(json_params):
190 """ Parse json_params file and return a tuple of necessary configuration 180 """ Parse json_params file and return a tuple of necessary configuration
191 values. 181 values.
192 """ 182 """
200 extra_files_path, file_name, 190 extra_files_path, file_name,
201 ext, out_data_name, 191 ext, out_data_name,
202 hda_id, dataset_id) 192 hda_id, dataset_id)
203 193
204 194
205 def download_from_json_data( options, args ): 195 def download_from_json_data( options, args, json_params=None, json_dataset_url=None ):
206 """ Parse the returned JSON data and download files. Write metadata 196 """ Parse the returned JSON data and download files. Write metadata
207 to flat JSON file. 197 to flat JSON file.
208 """ 198 """
209 output_base_path = options.path 199 output_base_path = options.path
210 appdata_path = options.appdata 200 appdata_path = options.appdata
211 if not os.path.exists(appdata_path): 201 if not os.path.exists(appdata_path):
212 os.makedirs(appdata_path) 202 os.makedirs(appdata_path)
213 203
214 # read tool job configuration file and parse parameters we need 204 # read tool job configuration file and parse parameters we need
215 json_params = json.loads( open( options.json_param_file, 'r' ).read() ) 205 if json_params is None:
206 json_params = json.loads( open( options.json_param_file, 'r' ).read() )
207
216 dataset_url, output_filename, \ 208 dataset_url, output_filename, \
217 extra_files_path, file_name, \ 209 extra_files_path, file_name, \
218 ext, out_data_name, \ 210 ext, out_data_name, \
219 hda_id, dataset_id = set_up_config_values(json_params) 211 hda_id, dataset_id = set_up_config_values(json_params)
220 # line separated JSON file to contain all dataset metadata 212 # line separated JSON file to contain all dataset metadata
221 metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' ) 213 metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
222 214
223 # get JSON response from data source 215 # get JSON response from data source
224 # TODO: make sure response is not enormous 216 # TODO: make sure response is not enormous
225 query_params = json.loads(urllib.urlopen( dataset_url ).read()) 217 if json_dataset_url is None:
218 query_params = json.loads(urllib.urlopen( dataset_url ).read())
219 else:
220 query_params = json.loads(urllib.urlopen( json_dataset_url ).read())
226 # download and write files 221 # download and write files
227 primary = False 222 primary = False
228 # query_item, hda_id, output_base_path, dataset_id 223 # query_item, hda_id, output_base_path, dataset_id
229 for query_item in query_params: 224 for query_item in query_params:
230 if isinstance( query_item, list ): 225 if isinstance( query_item, list ):
231 # TODO: do something with the nested list as a collection 226 # TODO: do something with the nested list as a collection
232 for query_subitem in query_item: 227 for query_subitem in query_item:
233 primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path, 228 primary = download_files_and_write_metadata(query_subitem, json_params, output_base_path,
234 metadata_parameter_file, primary, appdata_path) 229 metadata_parameter_file, primary, appdata_path, options, args)
235 230
236 elif isinstance( query_item, dict ): 231 elif isinstance( query_item, dict ):
237 primary = download_files_and_write_metadata(query_item, json_params, output_base_path, 232 primary = download_files_and_write_metadata(query_item, json_params, output_base_path,
238 metadata_parameter_file, primary, appdata_path) 233 metadata_parameter_file, primary, appdata_path, options, args)
239 metadata_parameter_file.close() 234 metadata_parameter_file.close()
240 235
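The new optional json_params and json_dataset_url arguments let a caller hand download_from_json_data an already-parsed parameter block and an explicit data-source URL instead of re-reading options.json_param_file. A small sketch of that fall-back pattern, with hypothetical names:

import json

def load_params(json_param_file, json_params=None):
    # use the pre-parsed parameters when supplied, otherwise read the job's param file
    if json_params is None:
        with open(json_param_file, 'r') as fh:
            json_params = json.loads(fh.read())
    return json_params

# pre-parsed parameters win, so no file is read here
print(load_params(None, json_params={'job_config': {}}))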
241 def __main__(): 236 def __main__():
242 """ Read the JSON return from a data source. Parse each line and request 237 """ Read the JSON return from a data source. Parse each line and request
243 the data, download to "newfilepath", and write metadata. 238 the data, download to "newfilepath", and write metadata.