Mercurial > repos > iuc > ega_download_client

--- a/pyega3.xml	Fri Oct 30 22:18:41 2020 +0000
+++ b/pyega3.xml	Tue Apr 12 11:36:51 2022 +0000
@@ -1,9 +1,9 @@
-<tool id="pyega3" name="EGA Download Client" version="@VERSION@+galaxy0" profile="19.09" >
+<tool id="pyega3" name="EGA Download Client" version="@TOOL_VERSION@+galaxy0" profile="21.01" >
     <macros>
-        <token name="@VERSION@">3.4.0</token>
+        <token name="@TOOL_VERSION@">4.0.0</token> <!-- single source for the version: reused in the tool version string, the pyega3 package requirement and the version banner asserted in the tests -->
     </macros>
     <requirements>
-        <requirement type="package" version="@VERSION@">pyega3</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">pyega3</requirement>
     </requirements>
         <command detect_errors="exit_code"><![CDATA[
 #set $username = $__user__.extra_preferences.get('ega_account|username', "")
@@ -18,9 +18,18 @@
 #elif $action.action_type == "list_dataset_files"
     pyega3 -cf '$credentials'
       files '$action.dataset_id'
+    &&
+
+    ## start the tabular output with a tab-separated header row (File ID, Status, Bytes, Check sum, File name)
+    echo -e 'File ID\tStatus\tBytes\tCheck sum\tFile name' > '$dataset_file_list' &&
+
+    ## keep only the log lines mentioning an EGAF ID, strip the leading [timestamp] prefix and turn whitespace runs into tabs
+    grep EGAF pyega3_output.log | sed -e 's/^\[.*\]\s\+//g' | sed 's/\s\+/\t/g' >> '$dataset_file_list'
+
 #elif $action.action_type == "download_file"
-    pyega3 -cf '$credentials'
+    pyega3 -c \${PYEGA_CONNECTIONS:-30} -cf '$credentials'  ## the -c value is expanded by the shell at run time: PYEGA_CONNECTIONS if set and non-empty, otherwise 30
       fetch '$action.file_id'
+      --max-retries 10
       #if $action.range.reference_name
         --reference-name '$action.range.reference_name'
         #if $action.range.start
@@ -30,7 +39,30 @@
           --end $action.range.end
         #end if
       #end if
-      --saveto '$downloaded_file'
+    && mv '${action.file_id}' downloads  ## presumably pyega3 saves into a directory named after the file ID (confirm); it is renamed so the downloads/* output glob finds the file
+    && rm -f downloads/*.md5  ## checksum validation already performed by pyEGA, clean up downloads folder
+
+#elif $action.action_type == "download_files"
+    #set file_ids=[x.rstrip('\r\n').split('\t')[int(str($action.id_column))-1] for x in open(str($id_table)).readlines() if x.count('\t') >= int(str($action.id_column))-1 and x.rstrip('\r\n').split('\t')[int(str($action.id_column))-1].startswith('EGAF')]
+    mkdir downloads  ## file_ids (set above) holds the EGAF-prefixed values of the chosen ID column; line endings are stripped and rows too short to contain that column are skipped
+    #for f in $file_ids
+      &&
+      pyega3 -c \${PYEGA_CONNECTIONS:-30} -cf '$credentials'
+        fetch '$f'
+          --max-retries 10
+          #if $action.range.reference_name
+            --reference-name '$action.range.reference_name'
+            #if $action.range.start
+              --start $action.range.start
+            #end if
+            #if $action.range.end
+              --end $action.range.end
+            #end if
+          #end if
+          --output-dir downloads
+    #end for
+    && rm -f downloads/**/*.md5  ## checksum validation already performed by pyEGA, clean up downloads folder
+
 #end if
     ]]></command>
     <configfiles>
@@ -53,6 +85,7 @@
                 <option value="list_datasets"> List my authorized datasets </option>
-                <option value="list_dataset_files"> List files in a datasets </option>
+                <option value="list_dataset_files"> List files in a dataset </option>
                 <option value="download_file"> Download a file </option>
+                <option value="download_files"> Download multiple files (based on a file with IDs) </option> <!-- IDs are read from one column of a tabular dataset; see the when block with the same value -->
             </param>
             <when value="list_dataset_files">
                 <param name="dataset_id" type="text" optional="false" label="EGA Dataset Accession ID" help="Identifier starting with 'EGAD'. For example: EGAD00001003338">
@@ -65,40 +98,59 @@
                      <validator type="regex" message="EGA Accession ID must be a string of numbers prefixed by 'EGAD' (datasets) or 'EGAF' (files)">EGAF[0-9]+</validator>
                 </param>
                 <section name="range" title="Request a specific Genomic range?" expanded="false">
-                <param argument="--reference-name" type="text" optional="true" label="Reference Sequence Name" help="For example 'chr1', '1', or 'chrX'. If unspecified, all data is returned." />
-                <param argument="--start" type="integer" optional="true" min="0" label="Start Position" help="0-based, inclusive. Only used if a reference sequence name was specified"/>
-                <param argument="--end" type="integer" optional="true" min="0" label="End Position" help="0-based, exclusive. Only used if a reference sequence name was specified"/>
+                    <param argument="--reference-name" type="text" optional="true" label="Reference Sequence Name" help="For example 'chr1', '1', or 'chrX'. If unspecified, all data is returned." />
+                    <param argument="--start" type="integer" optional="true" min="0" label="Start Position" help="0-based, inclusive. Only used if a reference sequence name was specified"/>
+                    <param argument="--end" type="integer" optional="true" min="0" label="End Position" help="0-based, exclusive. Only used if a reference sequence name was specified"/>
+                </section>
+            </when>
+            <when value="download_files"> <!-- batch mode: file IDs come from the selected column of a tabular dataset, and one optional genomic range is applied to every file -->
+                <param name="id_table" type="data" format="tabular" label="Table with IDs to download" help="A tabular file where one column contains the set of file IDs. This will output a collection. Please select files that are all the same format (e.g. all BAM or all VCF)."/>
+                <param name="id_column" type="data_column" data_ref="id_table" label="Column containing the file IDs" help="File Identifiers starting with 'EGAF'. For example: EGAF00001753735" />
+                <section name="range" title="Request a specific Genomic range? (will be applied to ALL requested files)" expanded="false">
+                    <param argument="--reference-name" type="text" optional="true" label="Reference Sequence Name" help="For example 'chr1', '1', or 'chrX'. If unspecified, all data is returned." />
+                    <param argument="--start" type="integer" optional="true" min="0" label="Start Position" help="0-based, inclusive. Only used if a reference sequence name was specified"/>
+                    <param argument="--end" type="integer" optional="true" min="0" label="End Position" help="0-based, exclusive. Only used if a reference sequence name was specified"/>
                 </section>
             </when>
         </conditional>
+        <param name="output_log" type="boolean" checked="false" label="Output the log file?"/> <!-- when checked, pyega3_output.log is returned as the extra "logfile" output -->
     </inputs>
     <outputs>
         <data name="authorized_datasets" format="txt" from_work_dir="pyega3_output.log" label="${tool.name}: authorized datasets">
             <filter> action['action_type'] == 'list_datasets' </filter>
         </data>
-        <data name="dataset_file_list" format="txt" from_work_dir="pyega3_output.log" label="${tool.name}: dataset file list">
+        <data name="dataset_file_list" format="tabular" label="${tool.name}: ${action.dataset_id} - file list"> <!-- written by the command itself (echo header, then grep/sed rows) instead of being copied from pyega3_output.log -->
             <filter> action['action_type'] == 'list_dataset_files' </filter>
         </data>
-        <data name="downloaded_file" auto_format="true" label="${tool.name}: ${action.file_id} ${action.range.reference_name} ${action.range.start} ${action.range.end}">
+        <data name="downloaded_file" auto_format="true" from_work_dir="downloads/*" label="${tool.name}: ${action.file_id} ${action.range.reference_name} ${action.range.start} ${action.range.end}"> <!-- picked up from the downloads directory that the download_file command creates with mv -->
             <filter> action['action_type'] == 'download_file' </filter>
         </data>
+        <data name="logfile" format="txt" from_work_dir="pyega3_output.log" label="${tool.name}: log"> <!-- optional: only created when the output_log boolean is checked -->
+            <filter> output_log </filter>
+        </data>
+        <collection name="downloaded_file_collection" type="list" label="${tool.name} on ${on_string}: Downloaded datasets">
+            <filter> action['action_type'] == 'download_files' </filter>
+            <discover_datasets pattern="__designation_and_ext__" recurse="true" directory="downloads" /> <!-- one list element per file found (recursively) under downloads; name and extension are taken from the file name by the named pattern -->
+        </collection>
     </outputs>
     <tests>
         <test expect_num_outputs="1"><!-- list datasets with default credentials -->
             <param name="action_type" value="list_datasets"/>
             <output name="authorized_datasets" ftype="txt">
                 <assert_contents>
-                    <has_text text="pyEGA3 - EGA python client version @VERSION@"/>
+                    <has_text text="pyEGA3 - EGA python client version @TOOL_VERSION@"/>
                     <has_text text="EGAD00001003338"/>
                 </assert_contents>
             </output>
         </test>
-        <test expect_num_outputs="1"><!-- list dataset files with default credentials -->
+        <test expect_num_outputs="2"><!-- list dataset files with default credentials, and request a log output file (hence two outputs: the tabular file list and the log) -->
             <param name="action_type" value="list_dataset_files"/>
             <param name="dataset_id" value="EGAD00001003338"/>
-            <output name="dataset_file_list" ftype="txt">
+            <param name="output_log" value="true"/>
+            <output name="dataset_file_list" file="filelist_EGAD00001003338.tabular"/> <!-- cleaned-up tabular list, compared against the expected test-data file -->
+            <output name="logfile" ftype="txt"> <!-- raw pyega3 log: the assertions below still expect the bracketed prefix before the header line -->
                 <assert_contents>
-                    <has_text text="pyEGA3 - EGA python client version @VERSION@"/>
+                    <has_text text="pyEGA3 - EGA python client version @TOOL_VERSION@"/>
                     <has_line_matching expression="^\[.*\]\s+File ID\s+Status\s+Bytes\s+Check sum\s+File name$"/>
                     <has_text text="EGAF00001753734"/>
                 </assert_contents>
@@ -109,7 +161,7 @@
             <param name="file_id" value="EGAF00001775036"/>
             <output name="downloaded_file" md5="3b89b96387db5199fef6ba613f70e27c"/>
         </test>
-         <test expect_num_outputs="1"> <!-- download a single file, with genomic range specified -->
+        <test expect_num_outputs="1"> <!-- download a single file, restricted to a genomic range (reference_name and the following range params) -->
             <param name="action_type" value="download_file"/>
             <param name="file_id" value="EGAF00001753756"/>
             <param name="reference_name" value="1"/>
@@ -117,6 +169,27 @@
             <param name="end" value="10000"/>
             <output name="downloaded_file" ftype="bam" md5="e576a38748feec45aa45191f6e902ce2"/>
         </test>
+        <test expect_num_outputs="1"> <!-- download multiple files: the IDs are read from column 1 of filelist.tabular -->
+            <param name="action_type" value="download_files"/>
+            <param name="id_table" value="filelist.tabular"/>
+            <param name="id_column" value="1"/>
+            <output_collection name="downloaded_file_collection" type="list" count="2"> <!-- md5s equal the checksums listed for the .bam.bai files in filelist.tabular; the element names lack the final .bai, presumably because the discovery pattern treats it as the extension (confirm) -->
+                <element name="ENCFF000VWO.bam" md5="b8ae14d5d1f717ab17d45e8fc36946a0" />
+                <element name="ENCFF284YOU.bam" md5="3b89b96387db5199fef6ba613f70e27c" />
+            </output_collection>
+        </test>
+        <test expect_num_outputs="1"> <!-- download multiple files, in combination with a genomic range that is applied to every file listed in filelist2.tabular -->
+            <param name="action_type" value="download_files"/>
+            <param name="id_table" value="filelist2.tabular"/>
+            <param name="id_column" value="1"/>
+            <param name="reference_name" value="1"/>
+            <param name="start" value="100"/>
+            <param name="end" value="10000"/>
+            <output_collection name="downloaded_file_collection" count="2"> <!-- expected element names end in _genomic_range_1_100_10000, mirroring the reference name, start and end requested above -->
+                <element name="NA19239_genomic_range_1_100_10000" md5="bcdcf18846233cbe5cc8afd95168552c" />
+                <element name="NA19240_genomic_range_1_100_10000" md5="e576a38748feec45aa45191f6e902ce2" />
+            </output_collection>
+        </test>
     </tests>
     <help><![CDATA[
 The pyEGA3 download client is a python-based tool for viewing and downloading files from authorized EGA datasets.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filelist.tabular	Tue Apr 12 11:36:51 2022 +0000
@@ -0,0 +1,3 @@
+File ID	Status	Bytes	Check sum	File name
+EGAF00001775034	1	5991400	b8ae14d5d1f717ab17d45e8fc36946a0	ENCFF000VWO.bam.bai
+EGAF00001775036	1	4804928	3b89b96387db5199fef6ba613f70e27c	ENCFF284YOU.bam.bai
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filelist2.tabular	Tue Apr 12 11:36:51 2022 +0000
@@ -0,0 +1,3 @@
+File ID	Status	Bytes	Check sum	File name
+EGAF00001753756	1	140445765831	2413ce93a4b2b50fa0c2ff5bdf97695f	NA19240.bam
+EGAF00001753754	1	136016115737	59fbc3828fb878d8e637557ce707d445	NA19239.bam
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filelist_EGAD00001003338.tabular	Tue Apr 12 11:36:51 2022 +0000
@@ -0,0 +1,49 @@
+File ID	Status	Bytes	Check sum	File name
+EGAF00005000662	1	25444204	274de4071bca5354ff16a1de0116c455	NA19238.chr22.vcf.gz
+EGAF00005000663	1	18596	02fdb6fc68b854f98fef710ff4dee0c1	NA19238.chr22.vcf.gz.tbi
+EGAF00005000664	1	26957200	62b16cc9ce6ceb3ef97b98c99aa6fec5	NA19238.chr22.bcf
+EGAF00005000665	1	14509	7cf0f467fd44dd783ff05cb4662642b6	NA19238.chr22.bcf.csi
+EGAF00005001623	1	214453766	ad7d6e0c05edafd7faed7601f7f3eaba	ALL_chr22_20130502_2504Individuals.vcf.gz
+EGAF00005001624	1	36094	4202e9a481aa8103b471531a96665047	ALL_chr22_20130502_2504Individuals.vcf.gz.tbi
+EGAF00005001625	1	186424665	c65ca1a4abd55351598ccbc65ebfa9a6	ALL_chr22_20130502_2504Individuals.bcf
+EGAF00005001626	1	27620	09e3b4724404fc7bb5f9948f80016757	ALL_chr22_20130502_2504Individuals.bcf.csi
+EGAF00005007180	1	1837578063	74d3b803823d3f8b73bd592941f23726	HG01775.GRCh38DH.exome.cram
+EGAF00005007181	1	2938941932	910141b9f4ccbfbf57813dee1a7a3f1d	NA18534.GRCh38DH.exome.cram
+EGAF00005007323	1	5719142	388fb466c983d4bec2082941647409f3	ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz
+EGAF00005007324	1	8074	fa37e14805cce3221f6f9d3a4cd749a4	ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz.tbi
+EGAF00005007325	1	5527171	395c0d3d454d7c7d61c4f771fbab02fc	ALL.chrY.phase3_integrated_v2a.20130502.genotypes.bcf
+EGAF00005007326	1	6251	ae2d2097a8744877d9d20907200cbdcf	ALL.chrY.phase3_integrated_v2a.20130502.genotypes.bcf.csi
+EGAF00005007327	1	850737	f3dee64b466efe334b2cac77f5c2f710	HG01775.chrY.vcf.gz
+EGAF00005007328	1	4981	d0e71e5dd7f5279e113c4f0dfd37fc23	HG01775.chrY.vcf.gz.tbi
+EGAF00005007329	1	876313	aaca702e347ae6caa734d44527a49212	HG01775.chrY.bcf
+EGAF00005007330	1	4722	110b493c17210ff3484ed2561a2fe21f	HG01775.chrY.bcf.csi
+EGAF00005007331	1	137465	fcf1cc38cd404ea1cdba3975d26f4a8b	HG01775.GRCh38DH.exome.cram.crai
+EGAF00005007332	1	229305	56e8de04466aba23ab5acbaf1c087045	NA18534.GRCh38DH.exome.cram.crai
+EGAF00001753734	1	45030910198	040ef7533533a3db67a35b9f454b9269	NA12878.cram
+EGAF00001753735	1	1575103	41fd8741e91924eae19c6baa7893eeb8	NA12878.crai
+EGAF00001753736	1	38215425935	bbc03793c9534a22f77e751d2723cb10	NA12891.cram
+EGAF00001753737	1	1310034	0ab7a2d110740561871ccdca7f15f13b	NA12891.crai
+EGAF00001753738	1	38370156211	a7503d228d0851b999b826b736b8dd32	NA12892.cram
+EGAF00001753739	1	1331384	bb569235226b5b9f0578d34d1b52482e	NA12892.crai
+EGAF00001753740	1	34823972801	492780f603da2f5f3306c41011e0acd2	NA19238.cram
+EGAF00001753741	1	1195785	3b862e018b0b85db7954cbed2e17b6ba	NA19238.crai
+EGAF00001753742	1	44113571936	d963539652de2ea20005d98e934d59c2	NA19239.cram
+EGAF00001753743	1	1514700	be2024ccbf5b3bd9132f6d270a37118c	NA19239.crai
+EGAF00001753744	1	48309446909	728bea9317cbab1c98429e43e48f9a83	NA19240.cram
+EGAF00001753745	1	1622405	18e0e7070b6cf4d042c7f9bee15d56bd	NA19240.crai
+EGAF00001753746	1	143427187111	11395de33f28ed867170d2dc723cc700	NA12878.bam
+EGAF00001753747	1	8949984	a23a84c89d338796f78e68804c8d2c6c	NA12878.bam.bai
+EGAF00001753748	1	4317237247	71a78dfb5258939abab2257a2abd1126	NA12891.bam
+EGAF00001753749	1	9212704	e04dbb7ccbc24ccd853d89b8b066166c	NA12891.bai
+EGAF00001753750	1	66145394874	201bded705401615fe5e90988d509656	NA12892.bam
+EGAF00001753751	1	9204720	c1eadd98469fcd3ced4c51a84b3ce307	NA12892.bai
+EGAF00001753752	1	229774247950	0751106bbe1c4c83ec934a5972a4efdf	NA19238.bam
+EGAF00001753753	1	9379032	028ab5c73fea03c349e0d73943913141	NA19238.bai
+EGAF00001753754	1	136016115737	59fbc3828fb878d8e637557ce707d445	NA19239.bam
+EGAF00001753755	1	9005792	767fc92be753de8cf570690bd7fbe629	NA19239.bai
+EGAF00001753756	1	140445765831	2413ce93a4b2b50fa0c2ff5bdf97695f	NA19240.bam
+EGAF00001753757	1	9018288	351130149989cca43fe8c7382e9d326a	NA19240.bai
+EGAF00001770106	1	462139278	ce073afcbc07afa343f2d4e4d07efeda	ENCFF000VWO.bam
+EGAF00001770107	1	3551031027	dfef3f355230915418a78da460665d56	ENCFF284YOU.bam
+EGAF00001775034	1	5991400	b8ae14d5d1f717ab17d45e8fc36946a0	ENCFF000VWO.bam.bai
+EGAF00001775036	1	4804928	3b89b96387db5199fef6ba613f70e27c	ENCFF284YOU.bam.bai