Mercurial > repos > ieguinoa > ena_upload
changeset 2:2f7a70c0d3ab draft default tip
Uploaded
author | ieguinoa |
---|---|
date | Mon, 21 Feb 2022 14:22:53 +0000 |
parents | 9681a9180730 |
children | |
files | .shed.yml dump_yaml.py ena_upload.xml extract_tables.py samples_macros.xml |
diffstat | 5 files changed, 112 insertions(+), 66 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Mon Feb 21 14:22:53 2022 +0000 @@ -0,0 +1,12 @@ +categories: + - Data Export +description: | + Submits experimental data and respective metadata to the European Nucleotide Archive (ENA). +long_description: | + The program submits experimental data and respective metadata to the European Nucleotide Archive (ENA). + The metadata should be provided in separate tables corresponding to ENA objects STUDY, SAMPLE, EXPERIMENT and RUN +name: ena_upload +owner: iuc +remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload +homepage_url: https://github.com/usegalaxy-eu/ena-upload-cli +type: unrestricted
# dump_yaml.py -- new file added by this changeset
# (/dev/null -> b/dump_yaml.py, Mon Feb 21 14:22:53 2022 +0000).
"""Dump the four ENA metadata tables as one YAML document on stdout.

Invoked by the tool wrapper as::

    python dump_yaml.py STUDIES SAMPLES EXPERIMENTS RUNS >> output

Each argument is the path of a tab-separated table whose first line is the
header.  The YAML document is framed by two ``YAML -------------`` marker
lines so it can be located inside the (appended-to) output file.
"""
import sys

# Marker line printed before and after the YAML document.
YAML_DELIMITER = 'YAML -------------'

# Top-level YAML keys, in the order the table paths are given on the
# command line (studies, samples, experiments, runs).
TABLE_KEYS = ('ENA_study', 'ENA_sample', 'ENA_experiment', 'ENA_run')


def fetch_table_data(table_path):
    """Load a tab-separated table into a dict of row dicts.

    The first line of the file is the header.  Every following non-empty
    line becomes one entry ``{row_index: {column_name: cell, ...}}`` where
    ``row_index`` counts data rows from 0.

    Rows shorter than the header are padded with empty strings instead of
    raising ``IndexError``; cells beyond the last header column are ignored.
    Completely empty lines (e.g. a trailing blank line) are skipped.

    :param table_path: path of the tab-separated file to read
    :returns: dict mapping the 0-based row index to a dict of that row
    """
    data_dict = {}
    # Explicit encoding: the locale default may be ASCII inside containers,
    # which would fail on any non-ASCII metadata value.
    with open(table_path, encoding='utf-8') as table_to_load:
        headers = table_to_load.readline().rstrip('\n').split('\t')
        row_id = 0
        # Iterate lazily instead of materialising the file with readlines().
        for line in table_to_load:
            line = line.rstrip('\n')
            if not line:
                continue
            cells = line.split('\t')
            # Pad short rows so that every header gets a value.
            cells += [''] * (len(headers) - len(cells))
            data_dict[row_id] = dict(zip(headers, cells))
            row_id += 1
    return data_dict


def main(argv=None):
    """Read the four tables named in *argv* and print them as YAML.

    :param argv: list of the four table paths (studies, samples,
        experiments, runs); defaults to ``sys.argv[1:]``
    """
    # Imported here so that fetch_table_data() stays importable (and
    # testable) in environments where PyYAML is not installed.
    import yaml

    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < len(TABLE_KEYS):
        sys.exit('usage: dump_yaml.py STUDIES SAMPLES EXPERIMENTS RUNS')

    all_data_dict = {
        key: fetch_table_data(path) for key, path in zip(TABLE_KEYS, args)
    }
    print(YAML_DELIMITER)
    print(yaml.dump(all_data_dict))
    print(YAML_DELIMITER)


if __name__ == '__main__':
    main()
--- a/ena_upload.xml Wed Feb 02 17:30:12 2022 +0000 +++ b/ena_upload.xml Mon Feb 21 14:22:53 2022 +0000 @@ -26,6 +26,10 @@ #if $action_options.input_format_conditional.input_format == "build_tables": python '$__tool_directory__/extract_tables.py' --action $action_options.action --out_dir ./submission_files --studies $studies_json; #end if + +credentials_path='test_fake_path'; +echo "username: test_user" > \$credentials_path; +echo "password: test_password" >> \$credentials_path; #if $action_options.input_format_conditional.input_format == "build_tables": @@ -65,13 +69,6 @@ fi; #end if -#if $action_options.test_submit == "True" or $action_options.test_submit_parameters.dry_run == "true": - credentials_path='test_fake_path'; - #if $action_options.test_submit_parameters.dry_run == "true" and $action_options.input_format_conditional.input_format == "excel_tables": - echo "username: test_user" > \$credentials_path; - echo "password: test_password" >> \$credentials_path; - #end if -#end if ## create the list of files to upload and make the symlinks #import re @@ -173,8 +170,6 @@ #end if #end if - -#if $action_options.test_submit_parameters.dry_run == "false" or $action_options.input_format_conditional.input_format == "excel_tables": #if $action_options.action == "add": ena-upload-cli --tool 'ena-upload-cli v@VERSION@ @ Galaxy' @@ -215,6 +210,8 @@ >> '$output'; #end if + + #if $action_options.action == "modify": ena-upload-cli --tool 'ena-upload-cli v@VERSION@ @ Galaxy' @@ -251,27 +248,28 @@ #end if >> '$output'; #end if - #if $action_options.test_submit_parameters.dry_run == "false": - echo -e 'center_name\t$action_options.center' >> '$output'; - echo -e 'action_option\t$action_options.action' >> '$output'; - #if $action_options.input_format_conditional.input_format == "build_tables": - cp $studies_table_path_updated $studies_table_out 2>/dev/null; - cp $samples_table_path_updated $samples_table_out 2>/dev/null; - cp $experiments_table_path_updated $experiments_table_out 
2>/dev/null; - cp $runs_table_path_updated $runs_table_out 2>/dev/null; - #end if - #else - ## for the excel input case, copy the upload-cli generated tables to the output files - ## this applies for both draft and real submissions - cp './ENA_template_experiment_updated.tsv' $experiments_table_out; - cp './ENA_template_sample_updated.tsv' $samples_table_out; - cp './ENA_template_study_updated.tsv' $studies_table_out; - cp './ENA_template_run_updated.tsv' $runs_table_out; - #end if -#else: - exit 0; + +#if $action_options.test_submit_parameters.dry_run == "false": + echo -e 'center_name\t$action_options.center' >> '$output'; + echo -e 'action_option\t$action_options.action' >> '$output'; +#end if + +## copy updated files +#if $action_options.input_format_conditional.input_format == "excel_tables": + ## for the excel input case, copy the upload-cli generated tables to the output files + ## this applies for both draft and real submissions + cp './ENA_template_experiment_updated.tsv' $experiments_table_out; + cp './ENA_template_sample_updated.tsv' $samples_table_out; + cp './ENA_template_study_updated.tsv' $studies_table_out; + cp './ENA_template_run_updated.tsv' $runs_table_out; +#else + cp $studies_table_path_updated $studies_table_out 2>/dev/null; + cp $samples_table_path_updated $samples_table_out 2>/dev/null; + cp $experiments_table_path_updated $experiments_table_out 2>/dev/null; + cp $runs_table_path_updated $runs_table_out 2>/dev/null; #end if +python '$__tool_directory__/dump_yaml.py' $studies_table_out $samples_table_out $experiments_table_out $runs_table_out >> $output; ]]></command> <configfiles> <configfile name="credentials"><![CDATA[ @@ -453,6 +451,15 @@ <has_line_matching expression="r_20201007_026\te_20201007_026\tC026_exp5_clean.fastq.gz\tFASTQ(.*)"/> </assert_contents> </output> + <output name="output"> + <assert_contents> + <has_line_matching expression="YAML -------------"/> + <has_line_matching expression="ENA_experiment:"/> + <has_line_matching 
expression="ENA_sample:"/> + <has_line_matching expression="ENA_study:"/> + <has_line_matching expression="ENA_run:"/> + </assert_contents> + </output> </test> <!--Test 3: excel input of NON-VIRAL samples--> <test> @@ -549,7 +556,7 @@ </section> <conditional name="input_format_conditional"> <param name="input_format" value="build_tables"/> - <param name="add_extension" value="true"/> + <param name="add_extension" value="false"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="False"/> <repeat name="rep_study"> @@ -561,7 +568,7 @@ <param name="sample_title" value="Test Sample title"/> <param name="sample_description" value="Test Sample description"/> <param name="scientific_name" value="Test Sample scientific name"/> - <param name="tax_id" value="Test Sample tax_id"/> + <param name="tax_id" value="2697049"/> <repeat name="rep_experiment"> <param name="experiment_title" value="Test experiment title"/> <param name="experiment_design" value="Test experiment design description"/> @@ -574,7 +581,7 @@ <param name="platform" value="ILLUMINA"/> <param name="instrument_model" value="Illumina HiSeq 4000"/> <repeat name="rep_runs"> - <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/> + <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger.gz"/> </repeat> </repeat> </repeat> @@ -632,7 +639,7 @@ <param name="sample_title" value="Test Sample title"/> <param name="sample_description" value="Test Sample description"/> <param name="scientific_name" value="Test Sample scientific name"/> - <param name="tax_id" value="Test Sample tax_id"/> + <param name="tax_id" value="2697049"/> <repeat name="rep_experiment"> <param name="experiment_title" value="Test experiment title"/> <param name="experiment_design" value="Test experiment design description"/> @@ -660,16 +667,16 @@ </test> <!--Test 7: with submit_test to skip credentials checksRUN failing build tables from user input fields NON-VIRAL 
samples also tests compression of uncompressed inputs and adding the .gz suffix --> - <test expect_failure="true"> + <test expect_failure="false"> <conditional name="action_options"> <param name="action" value="add"/> <section name="test_submit_parameters"> <param name="submit_dev" value="true" /> - <param name="dry_run" value="false" /> + <param name="dry_run" value="true" /> </section> <param name="test_submit" value="True"/> <conditional name="input_format_conditional"> - <param name="add_extension" value="true"/> + <param name="add_extension" value="false"/> <param name="input_format" value="build_tables"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="false"/> @@ -682,7 +689,7 @@ <param name="sample_title" value="Test Sample title"/> <param name="sample_description" value="Test Sample description"/> <param name="scientific_name" value="Test Sample scientific name"/> - <param name="tax_id" value="Test Sample tax_id"/> + <param name="tax_id" value="2697049"/> <repeat name="rep_experiment"> <param name="experiment_title" value="Test experiment title"/> <param name="experiment_design" value="Test experiment design description"/> @@ -696,7 +703,7 @@ <param name="instrument_model" value="Illumina HiSeq 4000"/> <repeat name="rep_runs"> <param name="run_base_name" value="run_from_hospital_X"/> - <param name="upload_files" value="sample.fq,sample.fq" ftype="fastqsanger"/> + <param name="upload_files" value="1.fastqsanger.gz,2.fastqsanger.gz" ftype="fastqsanger.gz"/> </repeat> </repeat> </repeat> @@ -707,25 +714,22 @@ <param name="center" value="Some research center"/> <assert_command> <has_text_matching expression="ena-upload-cli"/> - <has_text_matching expression="--data 'sample.fq.fastq.gz' 'sample.fq.fastq.gz'"/> + <has_text_matching expression="--data '1.fastqsanger.gz' '2.fastqsanger.gz'"/> <has_text_matching expression="--action 'add' --center 'Some research center'"/> <not_has_text text="modify" /> </assert_command> - 
<assert_stderr> - <has_text_matching expression="Oops, the file test_fake_path does not exist"/> - </assert_stderr> </test> <!--Test 8: viral submission - User input metadata - Add extension = False--> - <test expect_failure="true"> + <test expect_failure="false"> <conditional name="action_options"> <param name="action" value="add"/> <section name="test_submit_parameters"> <param name="submit_dev" value="false" /> - <param name="dry_run" value="false" /> + <param name="dry_run" value="true" /> </section> <param name="test_submit" value="True"/> <conditional name="input_format_conditional"> - <param name="add_extension" value="False"/> + <param name="add_extension" value="false"/> <param name="input_format" value="build_tables"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="true"/> @@ -738,7 +742,7 @@ <param name="sample_title" value="Test Sample title"/> <param name="sample_description" value="Test Sample description"/> <param name="scientific_name" value="Test Sample scientific name"/> - <param name="tax_id" value="Test Sample tax_id"/> + <param name="tax_id" value="2697049"/> <param name="collection_date" value="2020"/> <param name="geo_location_country" value="Belgium"/> <param name="host_common_name" value="Human"/> @@ -777,21 +781,18 @@ <has_text_matching expression="--action 'add' --center 'Some research center'"/> <has_text_matching expression="--checklist ERC000033"/> </assert_command> - <assert_stderr> - <has_text_matching expression="Oops, the file test_fake_path does not exist"/> - </assert_stderr> </test> <!--Test 9: modify option and auto compression - viral submission - User input metadata--> - <test expect_failure="true"> + <test expect_failure="false"> <conditional name="action_options"> <param name="action" value="modify"/> <section name="test_submit_parameters"> <param name="submit_dev" value="false" /> - <param name="dry_run" value="false" /> + <param name="dry_run" value="true" /> </section> <param 
name="test_submit" value="True"/> <conditional name="input_format_conditional"> - <param name="add_extension" value="False"/> + <param name="add_extension" value="fasle"/> <param name="input_format" value="build_tables"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="True"/> @@ -804,7 +805,7 @@ <param name="sample_title" value="Test Sample title"/> <param name="sample_description" value="Test Sample description"/> <param name="scientific_name" value="Test Sample scientific name"/> - <param name="tax_id" value="Test Sample tax_id"/> + <param name="tax_id" value="2697049"/> <param name="collection_date" value="2020"/> <param name="geo_location_country" value="Belgium"/> <param name="host_common_name" value="Human"/> @@ -828,7 +829,7 @@ <param name="instrument_model" value="Illumina HiSeq 4000"/> <repeat name="rep_runs"> <param name="run_base_name" value="run_from_hospital_X"/> - <param name="upload_files" value="sample.fq" ftype="fastqsanger"/> + <param name="upload_files" value="1.fastqsanger.gz" ftype="fastqsanger.gz"/> </repeat> </repeat> </repeat> @@ -839,21 +840,17 @@ <param name="center" value="Some research center"/> <assert_command> <has_text_matching expression="ena-upload-cli"/> - <has_text_matching expression="--data 'sample.fq.gz'"/> + <has_text_matching expression="--data '1.fastqsanger.gz'"/> <has_text_matching expression="--action 'modify' --center 'Some research center'"/> <has_text_matching expression="--checklist ERC000033"/> <not_has_text text="add" /> </assert_command> - <assert_stderr> - <has_text_matching expression="Oops, the file test_fake_path does not exist"/> - </assert_stderr> </test> </tests> <help><![CDATA[ This is a wrapper for the ENA upload tool in https://github.com/usegalaxy-eu/ena-upload-cli The input metadata can be submitted following the tabular format of the templates in https://github.com/usegalaxy-eu/ena-upload-cli/tree/master/example_tables - It is also possible to submit an excel file by 
following the template in https://drive.google.com/file/d/1ncC22--tW2v-EI-te_r86sAZujIPAjlX/view?usp=sharing - For viral submissions a larger set of metadata is required, you can find the template in https://drive.google.com/file/d/1Gx78GKh58PmRjdmJ05DBbpObAL-3oUFX/view?usp=sharing + It is also possible to submit an excel file by following the template in https://github.com/ELIXIR-Belgium/ENA-metadata-templates ]]></help> <citations> </citations>
--- a/extract_tables.py Wed Feb 02 17:30:12 2022 +0000 +++ b/extract_tables.py Mon Feb 21 14:22:53 2022 +0000 @@ -44,10 +44,10 @@ 'ENA_submission_data'])) if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', - 'taxon_id', 'sample_description', 'collection_date', - 'geographic_location', 'host_common_name', 'host_subject_id', - 'host_health_state', 'host_sex', 'host_scientific_name', - 'collector_name', 'collecting_institution', 'isolate', + 'taxon_id', 'sample_description', 'collection date', + 'geographic location (country and/or sea)', 'host common name', 'host subject id', + 'host health state', 'host sex', 'host scientific name', + 'collector name', 'collecting institution', 'isolate', 'submission_date']) + '\n') else: samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
--- a/samples_macros.xml Wed Feb 02 17:30:12 2022 +0000 +++ b/samples_macros.xml Mon Feb 21 14:22:53 2022 +0000 @@ -67,14 +67,14 @@ <xml name="table_inputs_macro"> <conditional name="input_format_conditional"> <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?"> - <option value="excel_tables" selected="True">User generated metadata tables based on Excel templates</option> + <option value="excel_tables" selected="True">User generated metadata tables based on Excel template</option> <option value="build_tables" selected="False">Interactive generation of the study structure (recommended for small studies)</option> <option value="user_generated_tables" selected="False">User generated tabular files (studies - samples - experiments - runs) </option> </param> <when value="excel_tables"> <!--<param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />--> <expand macro="checklist_input_macro"/> - <param name="xlsx_file" type="data" format="xlsx" label="Select Excel (xlsx) file based on templates" /> + <param name="xlsx_file" type="data" format="xlsx" label="Select Excel (xlsx) file based on template" /> <expand macro="run_inputs_macro" /> </when> <when value="user_generated_tables"> @@ -207,7 +207,7 @@ <param name="sample_title" type="text" label="Sample title"/> <param name="sample_description" type="text" help="e.g: liver cells" label="Describe the type of sample"/> <param name="scientific_name" type="text" label="Enter the species of the sample" help="e.g Severe acute respiratory syndrome coronavirus 2"/> - <param name="tax_id" type="text" label="Enter the taxonomic ID corresponding to the sample species" /> + <param 
name="tax_id" type="integer" value="0" label="Enter the taxonomic ID corresponding to the sample species" /> <repeat name="rep_experiment" title="Sequencing experiments performed with this sample" min="1" > <param name="experiment_title" type="text" label="Specify an experiment title" /> <param name="experiment_design" type="text" label="Describe the experiment design" />