Mercurial > repos > iuc > data_manager_gemini_database_downloader

--- a/data_manager/data_manager_gemini_download.py	Sun Nov 22 12:49:35 2020 +0000
+++ b/data_manager/data_manager_gemini_download.py	Sat Dec 03 10:37:24 2022 +0000
@@ -1,4 +1,6 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
+
+# IMPORTANT: This script still runs under Python 2!

 import datetime
 import json
@@ -14,6 +16,11 @@
         yaml.dump(config, fo, allow_unicode=False, default_flow_style=False)


+def load_gemini_config(config_file):
+    with open(config_file) as fi:
+        return yaml.load(fi)  # NOTE(review): yaml.load without an explicit Loader is unsafe on untrusted input; config_file is assumed trusted (written by this script / gemini) - confirm
+
+
 def main():
     today = datetime.date.today()
     with open(sys.argv[1]) as fh:
@@ -21,36 +28,7 @@
     target_directory = params['output_data'][0]['extra_files_path']
     os.mkdir(target_directory)

-    # Generate a minimal configuration file for GEMINI update
-    # to instruct the tool to download the annotation data into a
-    # subfolder of the target directory.
-    config_file = os.path.join(target_directory, 'gemini-config.yaml')
-    anno_dir = os.path.join(target_directory, 'gemini/data')
-    gemini_bootstrap_config = {'annotation_dir': anno_dir}
-    write_gemini_config(gemini_bootstrap_config, config_file)
-
-    # Now gemini update can be called to download the data.
-    # The GEMINI_CONFIG environment variable lets the tool discover
-    # the configuration file we prepared for it.
-    # Note that the tool will rewrite the file turning it into a
-    # complete gemini configuration file.
-    gemini_env = os.environ.copy()
-    gemini_env['GEMINI_CONFIG'] = target_directory
-    cmd = "gemini update --dataonly %s %s" % (
-        params['param_dict']['gerp_bp'],
-        params['param_dict']['cadd']
-    )
-    subprocess.check_call(cmd, shell=True, env=gemini_env)
-
-    # GEMINI tool wrappers that need access to the annotation files
-    # are supposed to symlink them into a gemini/data subfolder of
-    # the job working directory. To have GEMINI discover them there,
-    # we need to set this location as the 'annotation_dir' in the
-    # configuration file.
-    with open(config_file) as fi:
-        config = yaml.load(fi)
-    config['annotation_dir'] = 'gemini/data'
-    write_gemini_config(config, config_file)
+    # Prepare the metadata for the new data table record

     # The name of the database should reflect whether it was built with or
     # without the optional GERP-bp data, the CADD scores, or both.
@@ -65,7 +43,6 @@
     else:
         anno_desc = ''

-    # Finally, we prepare the metadata for the new data table record ...
     data_manager_dict = {
         'data_tables': {
             'gemini_versioned_databases': [
@@ -83,10 +60,49 @@
         }
     }

-    # ... and save it to the json results file
+    # Save the data table metadata to the json results file
     with open(sys.argv[1], 'w') as fh:
         json.dump(data_manager_dict, fh, sort_keys=True)

+    # Generate a minimal configuration file for GEMINI update
+    # to instruct the tool to download the annotation data into a
+    # subfolder of the target directory.
+    config_file = os.path.join(target_directory, 'gemini-config.yaml')
+    anno_dir = os.path.join(target_directory, 'gemini/data')
+    gemini_bootstrap_config = {'annotation_dir': anno_dir}
+    write_gemini_config(gemini_bootstrap_config, config_file)
+
+    # Verify that we can read the config_file just created, as we need to do so
+    # after the data download has finished, and it is very annoying to have
+    # this fail after dozens of GB of data have been downloaded.
+    config = load_gemini_config(config_file)
+
+    # Now gemini update can be called to download the data.
+    # The GEMINI_CONFIG environment variable lets the tool discover
+    # the configuration file we prepared for it.
+    # Note that the tool will rewrite the file turning it into a
+    # complete gemini configuration file.
+    gemini_env = os.environ.copy()
+    gemini_env['GEMINI_CONFIG'] = target_directory
+    cmd = ['gemini', 'update', '--dataonly']
+    if params['param_dict']['gerp_bp']:
+        cmd += ['--extra', 'gerp_bp']
+    if params['param_dict']['cadd']:
+        cmd += ['--extra', 'cadd_score']
+
+    if not params['param_dict']['test_data_manager']:
+        # This is not a test => Going to embark on a massive download now
+        subprocess.check_call(cmd, env=gemini_env)
+
+    # GEMINI tool wrappers that need access to the annotation files
+    # are supposed to symlink them into a gemini/data subfolder of
+    # the job working directory. To have GEMINI discover them there,
+    # we need to set this location as the 'annotation_dir' in the
+    # configuration file.
+    config = load_gemini_config(config_file)
+    config['annotation_dir'] = 'gemini/data'
+    write_gemini_config(config, config_file)
+

 if __name__ == "__main__":
     main()
--- a/data_manager/data_manager_gemini_download.xml	Sun Nov 22 12:49:35 2020 +0000
+++ b/data_manager/data_manager_gemini_download.xml	Sat Dec 03 10:37:24 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="data_manager_gemini_download" name="GEMINI Download" version="@VERSION@" tool_type="manage_data">
+<tool id="data_manager_gemini_download" name="GEMINI Download" version="@VERSION@+galaxy1" tool_type="manage_data" profile="18.09">
     <description>the annotation files required by the GEMINI suite of tools</description>
     <macros>
         <token name="@VERSION@">0.20.1</token>
@@ -11,16 +11,32 @@
         python '$__tool_directory__/data_manager_gemini_download.py' '$out_file'
     </command>
     <inputs>
-        <param name="cadd" type="boolean" truevalue="--extra cadd_score" falsevalue="" checked="True"
+        <param name="cadd" type="boolean" truevalue="cadd" falsevalue="" checked="True"
             label="Download CADD scores for GEMINI database annotation" help="(--extra cadd_score)"/>
-        <param name="gerp_bp" type="boolean" truevalue="--extra gerp_bp" falsevalue="" checked="True"
+        <param name="gerp_bp" type="boolean" truevalue="gerp_bp" falsevalue="" checked="True"
             label="Download GERP for GEMINI database annotation" help="(--extra gerp_bp)"/>
         <param name="gemini_db_version" type="hidden" value="@DB_VERSION@"/>
+        <param name="test_data_manager" type="hidden" value=""/>
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json" label="${tool.name}"/>
     </outputs>
     <tests>
+        <test expect_num_outputs="1">
+            <param name="test_data_manager" value="--test" />
+            <param name="cadd" value="false" />
+            <param name="gerp_bp" value="false" />
+            <output name="out_file" file="test.json" compare="re_match" />
+        </test>
+        <!-- Test 2: default options (GERP and CADD downloads enabled) -->
+        <test expect_num_outputs="1">
+            <param name="test_data_manager" value="--test" />
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="&quot;name&quot;: &quot;GEMINI annotations w/ GERP &amp; CADD" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help>
 This tool downloads the GEMINI annotation files and makes them available to
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gemini_versioned_databases.loc	Sat Dec 03 10:37:24 2022 +0000
@@ -0,0 +1,3 @@
+## GEMINI versioned databases
+#DownloadDate	dbkey	DBversion	Description
+#2018-07-08	hg19	181	GEMINI annotations (2018-07-08 snapshot)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.json	Sat Dec 03 10:37:24 2022 +0000
@@ -0,0 +1,1 @@
+\{"data_tables": \{"gemini_versioned_databases": \[\{"dbkey": "hg19", "name": "GEMINI annotations \(.+ snapshot\)", "path": "./.+", "value": ".+", "version": "200"\}\]\}\}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Sat Dec 03 10:37:24 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <table name="gemini_versioned_databases" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, version, name, path</columns>
+        <file path="${__HERE__}/test-data/gemini_versioned_databases.loc" />
+    </table>
+</tables>
+