Mercurial > repos > iuc > ena_upload

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/check_remote.py	Tue Oct 19 15:57:14 2021 +0000
@@ -0,0 +1,23 @@
+import json
+
+import requests
+
+URL = "https://www.ebi.ac.uk/ena/portal/api/search"
+
+
+def check_remote_entry(entry_type, query_dict, out_format='json'):
+    '''
+    Checks if an entry with that alias exists in the ENA repos
+    entry_type = [study | sample | experiment | run]
+    Returns the parsed JSON response, or [] when the response body is empty
+    '''
+    if entry_type not in ('study', 'sample', 'experiment', 'run'):
+        raise ValueError('Unsupported ENA entry type: %r' % (entry_type,))
+    query_str = ' AND '.join('%s=%s' % pair for pair in query_dict.items())
+    params_dict = {'query': query_str, 'result': 'read_' + entry_type,
+                   'fields': entry_type + '_alias', 'format': out_format}
+    # bounded wait: an unresponsive server must not hang the job forever
+    response = requests.post(URL, data=params_dict, timeout=60)
+    if response.content != b'':
+        return json.loads(response.content)
+    return []
--- a/ena_upload.xml	Wed Aug 18 19:42:49 2021 +0000
+++ b/ena_upload.xml	Tue Oct 19 15:57:14 2021 +0000
@@ -1,6 +1,6 @@
-<tool id="ena_upload" name="ENA Upload tool" version="0.3.3" profile="20.01" license="MIT">
+<tool id="ena_upload" name="ENA Upload tool" version="@VERSION@" profile="20.01" license="MIT">
     <macros>
-        <token name="@VERSION@">0.3.1</token>
+        <token name="@VERSION@">0.4.1</token>
         <import>samples_macros.xml</import>
     </macros>
     <requirements>
@@ -33,6 +33,9 @@
     #if $action_options.input_format_conditional.viral_submission == "true":
         --vir
     #end if
+    #if $action_options.test_submit_parameters.submit_dev == "true":
+        --dev
+    #end if
     --action '$action_options.action' --form '$action_options.input_format_conditional.xlsx_file' --out_dir ./submission_files --verbose > '$output';
 #end if

@@ -84,7 +87,26 @@
           #for $run in $experiment.rep_runs:
             #for $file in $run.upload_files:
                 #set $safename_reads_file = re.sub('[^\w\-_\.]', '_', $file.element_identifier)
-                ln -s '$file' $safename_reads_file &&
+                #if $action_options.input_format_conditional.add_extension == "true":
+                    #set $extension = '.fastq'
+                #else
+                    #set $extension = ''
+                #end if
+                #if $file.is_of_type('fastq', 'fastqsanger'):
+                    ## uncompressed input: gzip writes straight to the upload name, so no symlink is needed
+                    #set $safename_reads_file = $safename_reads_file + $extension + '.gz'
+                    gzip -c '$file' > $safename_reads_file &&
+                #else:
+                    #if $action_options.input_format_conditional.add_extension == "true":
+                        #if $file.is_of_type('fastq.gz', 'fastqsanger.gz'):
+                            #set $compression = '.gz'
+                        #elif $file.is_of_type('fastqsanger.bz2', 'fastq.bz2'):
+                            #set $compression = '.bz2'
+                        #end if
+                        #set $safename_reads_file = $safename_reads_file + $extension + $compression
+                    #end if
+                    ln -s '$file' $safename_reads_file &&
+                #end if
                 $files_to_upload.append(str($safename_reads_file))
             #end for
           #end for
@@ -95,31 +117,61 @@
     #if $action_options.input_format_conditional.run_input_format_conditional.run_input_format == 'paired_list':
         #for $pair in $action_options.input_format_conditional.run_input_format_conditional.paired_end_collection:
             #set $safename_reads_file = re.sub('[^\w\-_\.]', '_', $pair.name)
-            #if $pair.forward.is_of_type('fastq.gz', 'fastqsanger.gz'):
-                #set $safename_fwd_reads_file = $safename_reads_file + '_1.fastq.gz'
-            #elif $pair.forward.is_of_type('fastqsanger.bz2', 'fastq.bz2'):
-                #set $safename_fwd_reads_file = $safename_reads_file + '_1.fastq.bz2'
-            #else:
-                #set $safename_fwd_reads_file = $safename_reads_file + '_1.fastq'
-            #end if
-            #if $pair.reverse.is_of_type('fastq.gz', 'fastqsanger.gz'):
-                #set $safename_rev_reads_file = $safename_reads_file + '_2.fastq.gz'
-            #elif $pair.reverse.is_of_type('fastqsanger.bz2', 'fastq.bz2'):
-                #set $safename_rev_reads_file = $safename_reads_file + '_2.fastq.bz2'
-            #else:
-                #set $safename_rev_reads_file = $safename_reads_file + '_2.fastq'
+            ## The name is derived from the pair name, which has no extension, so always append .fastq + compression suffix
+            #if $pair.forward.is_of_type('fastq', 'fastqsanger'):
+                ## uncompressed forward mate: gzip it straight to the upload name, no symlink needed
+                ## always add the compression suffix (.gz)
+                #set $safename_fwd_reads_file = $safename_reads_file + '_1' + '.fastq' + '.gz'
+                gzip -c '$pair.forward' > $safename_fwd_reads_file &&
+            #else
+                #if $pair.forward.is_of_type('fastq.gz', 'fastqsanger.gz'):
+                    #set $compression = '.gz'
+                #elif $pair.forward.is_of_type('fastqsanger.bz2', 'fastq.bz2'):
+                    #set $compression = '.bz2'
+                #end if
+                #set $safename_fwd_reads_file = $safename_reads_file + '_1' + '.fastq' + $compression
+                ln -s '$pair.forward' $safename_fwd_reads_file &&
+            #end if

-            ln -s '$pair.forward' $safename_fwd_reads_file &&
+            #if $pair.reverse.is_of_type('fastq', 'fastqsanger'):
+                ## uncompressed reverse mate: gzip it straight to the upload name (_2), no symlink needed
+                #set $safename_rev_reads_file = $safename_reads_file + '_2' + '.fastq' + '.gz'
+                gzip -c '$pair.reverse' > $safename_rev_reads_file &&
+            #else
+                #if $pair.reverse.is_of_type('fastqsanger.bz2', 'fastq.bz2'):
+                    #set $compression = '.bz2'
+                #elif $pair.reverse.is_of_type('fastqsanger.gz', 'fastq.gz'):
+                    #set $compression = '.gz'
+                #end if
+                #set $safename_rev_reads_file = $safename_reads_file + '_2' + '.fastq' + $compression
+                ln -s '$pair.reverse' $safename_rev_reads_file &&
+            #end if
             $files_to_upload.append(str($safename_fwd_reads_file))
-            ln -s '$pair.reverse' $safename_rev_reads_file &&
             $files_to_upload.append(str($safename_rev_reads_file))
         #end for
     #end if
     #if $action_options.input_format_conditional.run_input_format_conditional.run_input_format == 'multiple_selection_list':
         #for $file in $action_options.input_format_conditional.run_input_format_conditional.data:
             #set $safename_reads_file = re.sub('[^\w\-_\.]', '_', $file.element_identifier)
-            ln -s '$file' $safename_reads_file &&
+            #if $file.is_of_type('fastq', 'fastqsanger'):
+                ## uncompressed input: always gzip it and add the .gz suffix
+                #if $action_options.input_format_conditional.run_input_format_conditional.add_extension == "true":
+                    #set $safename_reads_file = $safename_reads_file + '.fastq.gz'
+                #else
+                    #set $safename_reads_file = $safename_reads_file + '.gz'
+                #end if
+                gzip -c '$file' > $safename_reads_file &&
+            #else
+                #if $action_options.input_format_conditional.run_input_format_conditional.add_extension == "true":
+                    #if $file.is_of_type('fastq.gz', 'fastqsanger.gz'):
+                        #set $extension = '.fastq.gz'
+                    #elif $file.is_of_type('fastqsanger.bz2', 'fastq.bz2'):
+                        #set $extension = '.fastq.bz2'
+                    #end if
+                    #set $safename_reads_file = $safename_reads_file + $extension
+                #end if
+                ln -s '$file' $safename_reads_file &&
+            #end if
             $files_to_upload.append(str($safename_reads_file))
         #end for
     #end if
@@ -127,6 +179,7 @@


 #if $action_options.test_submit_parameters.dry_run == "false":
+#if $action_options.action == "add":
 ena-upload-cli
     --tool 'ena-upload-cli v@VERSION@ @ Galaxy'
     --action '$action_options.action'
@@ -136,22 +189,23 @@
     #for $dataset in $files_to_upload:
         '$dataset'
     #end for
+--action add
 --experiment '$experiments_table_path'
 --study '$studies_table_path'
 --run '$runs_table_path'
 --sample '$samples_table_path'
 #if $action_options.input_format_conditional.input_format == "user_generated_tables":
     #if "$action_options.input_format_conditional.viral_submission" == "true":
-        --vir
+        --checklist ERC000033
     #end if
 #else:
     #if $action_options.input_format_conditional.input_format == "build_tables":
         #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true":
-          --vir
+          --checklist ERC000033
         #end if
     #else:
         #if $action_options.input_format_conditional.viral_submission == "true":
-          --vir
+          --checklist ERC000033
         #end if
     #end if
 #end if
@@ -160,6 +214,40 @@
     -d
 #end if
     >> '$output';
+#end if
+
+#if $action_options.action == "modify":
+    ena-upload-cli
+    --tool 'ena-upload-cli v@VERSION@ @ Galaxy'
+    --action '$action_options.action'
+    --center '$action_options.center'
+    --secret \${credentials_path}
+    --data
+    #for $dataset in $files_to_upload:
+        '$dataset'
+    #end for
+--experiment '$experiments_table_path'
+--study '$studies_table_path'
+--run '$runs_table_path'
+--sample '$samples_table_path'
+#if $action_options.input_format_conditional.input_format == "user_generated_tables":
+    #if "$action_options.input_format_conditional.viral_submission" == "true":
+        --checklist ERC000033
+    #end if
+#else:
+    #if $action_options.input_format_conditional.input_format == "build_tables":
+        #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true":
+          --checklist ERC000033
+        #end if
+    #else:
+        #if $action_options.input_format_conditional.viral_submission == "true":
+          --checklist ERC000033
+        #end if
+    #end if
+#end if
+#if $action_options.test_submit_parameters.submit_dev == "true" then '-d' else ''#
+    >> '$output';
+#end if
     echo -e 'center_name\t$action_options.center' >> '$output';
     echo -e 'action_option\t$action_options.action' >> '$output';
     #if $action_options.input_format_conditional.input_format != "user_generated_tables":
@@ -232,6 +320,7 @@
             </when>
             <when value="modify">
                 <expand macro="test_submit_section"/>
+                <param name="test_submit" type="hidden" value="False" />
                 <expand macro="table_inputs_macro" />
             </when>
         </conditional>
@@ -252,7 +341,7 @@
         </data>
     </outputs>
     <tests>
-        <!--Test excel input of VIRAL samples -->
+        <!--Test 1:  excel input of VIRAL samples -->
         <test>
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -265,6 +354,7 @@
                     <param name="viral_submission" value="True"/>
                     <param name="xlsx_file" value="metadata_test_viral.xlsx"/>
                     <conditional name="run_input_format_conditional">
+                        <param name="add_extension" value="true"/>
                         <param name="run_input_format" value="multiple_selection_list"/>
                         <param name="data" value="sample.fq"/>
                     </conditional>
@@ -276,7 +366,7 @@
                     <has_n_lines n="5"/>
                     <has_n_columns n="17"/>
                     <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_alias\tsample_alias\tdesign_description\tlibrary_name\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tinsert_size\tlibrary_construction_protocol\tplatform\tinstrument_model\tsubmission_date" />
-                    <has_line_matching expression="e_(.*)_026\tadd\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/>
+                    <has_line_matching expression="e_(.*)_026\tmodify\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/>
                 </assert_contents>
             </output>
             <output name="studies_table_out">
@@ -284,14 +374,13 @@
                     <has_n_lines n="2"/>
                     <has_n_columns n="8"/>
                     <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_type\tstudy_abstract\tpubmed_id\tsubmission_date"/>
-                    <has_line_matching expression="SARS-CoV-2_genomes_01\tadd\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/>
+                    <has_line_matching expression="SARS-CoV-2_genomes_01\tmodify\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/>
                 </assert_contents>
             </output>
             <output name="samples_table_out">
                 <assert_contents>
                     <has_n_lines n="5"/>
                     <has_n_columns n="18"/>
-                    <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date"/>
                 </assert_contents>
             </output>
             <output name="runs_table_out">
@@ -299,11 +388,11 @@
                     <has_n_lines n="5"/>
                     <has_n_columns n="8"/>
                     <has_line_matching expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/>
-                    <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/>
+                    <has_line_matching expression="r_(.*)_026\tmodify\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\t\tsubmission_date_ENA"/>
                 </assert_contents>
             </output>
         </test>
-        <!--Test excel input of VIRAL samples with extended columns-->
+        <!--Test 2: excel input of VIRAL samples with extended columns-->
         <test>
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -316,6 +405,7 @@
                     <param name="viral_submission" value="True"/>
                     <param name="xlsx_file" value="metadata_test_viral_optional_columns.xlsx"/>
                     <conditional name="run_input_format_conditional">
+                        <param name="add_extension" value="true"/>
                         <param name="run_input_format" value="multiple_selection_list"/>
                         <param name="data" value="sample.fq"/>
                     </conditional>
@@ -327,7 +417,7 @@
                     <has_n_lines n="5"/>
                     <has_n_columns n="17"/>
                     <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_alias\tsample_alias\tdesign_description\tlibrary_name\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tinsert_size\tlibrary_construction_protocol\tplatform\tinstrument_model\tsubmission_date" />
-                    <has_line_matching expression="e_(.*)_026\tadd\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/>
+                    <has_line_matching expression="e_(.*)_026\tmodify\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/>
                 </assert_contents>
             </output>
             <output name="studies_table_out">
@@ -335,14 +425,13 @@
                     <has_n_lines n="2"/>
                     <has_n_columns n="8"/>
                     <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_type\tstudy_abstract\tpubmed_id\tsubmission_date"/>
-                    <has_line_matching expression="SARS-CoV-2_genomes_01\tadd\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/>
+                    <has_line_matching expression="SARS-CoV-2_genomes_01\tmodify\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/>
                 </assert_contents>
             </output>
             <output name="samples_table_out">
                 <assert_contents>
                     <has_n_lines n="5"/>
                     <has_n_columns n="42"/>
-                    <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date\tgeographic_location_latitude\tgeographic_location_longitude\tsample_capture_status\thost_disease_outcome\thost_age\tvirus_identifier\treceipt_date\tdefinition_for_seropositive_sample\tserotype\thost_habitat\tisolation_source_host_associated\thost_behaviour\tisolation_source_non_host_associated\tsubject_exposure\tsubject_exposure_duration\ttype_exposure\tpersonal_protective_equipment\thospitalisation\tillness_duration\tillness_symptoms\tsample_storage_conditions\tstrain\thost_description\tgravidity"/>
                 </assert_contents>
             </output>
             <output name="runs_table_out">
@@ -350,11 +439,11 @@
                     <has_n_lines n="5"/>
                     <has_n_columns n="8"/>
                     <has_line_matching expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/>
-                    <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/>
+                    <has_line_matching expression="r_(.*)_026\tmodify\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\t\tsubmission_date_ENA"/>
                 </assert_contents>
             </output>
         </test>
-        <!--Test excel input of NON-VIRAL samples-->
+        <!--Test 3: excel input of NON-VIRAL samples-->
         <test>
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -367,6 +456,7 @@
                     <param name="viral_submission" value="False"/>
                     <param name="xlsx_file" value="metadata_test_nonviral.xlsx"/>
                     <conditional name="run_input_format_conditional">
+                        <param name="add_extension" value="true"/>
                         <param name="run_input_format" value="multiple_selection_list"/>
                         <param name="data" value="sample.fq"/>
                     </conditional>
@@ -399,11 +489,11 @@
                     <has_n_lines n="5"/>
                     <has_n_columns n="8"/>
                     <has_line_matching expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/>
-                    <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/>
+                    <has_line_matching expression="r_(.*)_026\tmodify\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\t\tsubmission_date_ENA"/>
                 </assert_contents>
             </output>
         </test>
-        <!--Test failure on excel input of NON-VIRAL samples with runs PAIRED collection -->
+        <!--Test 4: failure on excel input of NON-VIRAL samples with runs PAIRED collection -->
         <test expect_failure="true">
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -412,6 +502,7 @@
                     <param name="dry_run" value="false" />
                 </section>
                 <conditional name="input_format_conditional">
+                    <param name="add_extension" value="true"/>
                     <param name="input_format" value="excel_tables"/>
                     <param name="viral_submission" value="False"/>
                     <param name="xlsx_file" value="metadata_test_nonviral.xlsx"/>
@@ -437,7 +528,7 @@
                 <has_text_matching expression="--action 'add' --center 'Some research center'"/>
             </assert_command>
         </test>
-        <!--Test build tables from user input fields NON-VIRAL samples-->
+        <!--Test 5: build tables from user input fields NON-VIRAL samples-->
         <test>
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -447,6 +538,7 @@
                 </section>
                 <conditional name="input_format_conditional">
                     <param name="input_format" value="build_tables"/>
+                    <param name="add_extension" value="true"/>
                     <conditional name="conditional_viral_metadata">
                         <param name="viral_sample" value="False"/>
                         <repeat name="rep_study">
@@ -507,7 +599,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Test RUN failing build tables from user input fields NON-VIRAL samples-->
+        <!--Test 6: RUN failing build tables from user input fields NON-VIRAL samples-->
         <test expect_failure="true">
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -517,6 +609,7 @@
                 </section>
                 <conditional name="input_format_conditional">
                     <param name="input_format" value="build_tables"/>
+                    <param name="add_extension" value="true"/>
                     <conditional name="conditional_viral_metadata">
                         <param name="viral_sample" value="false"/>
                         <repeat name="rep_study">
@@ -554,7 +647,8 @@
                 <has_text_matching expression="No ENA credentials defined"/>
             </assert_stdout>
         </test>
-        <!--Test with submit_test to skip credentials checksRUN failing build tables from user input fields NON-VIRAL samples-->
+        <!--Test 7: with submit_test to skip credentials checks. RUN failing build tables from user input fields NON-VIRAL samples
+            also tests compression of uncompressed inputs and adding the .gz suffix -->
         <test expect_failure="true">
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -564,6 +658,7 @@
                 </section>
                 <param name="test_submit" value="True"/>
                 <conditional name="input_format_conditional">
+                    <param name="add_extension" value="true"/>
                     <param name="input_format" value="build_tables"/>
                     <conditional name="conditional_viral_metadata">
                         <param name="viral_sample" value="false"/>
@@ -590,7 +685,7 @@
                                     <param name="instrument_model" value="Illumina HiSeq 4000"/>
                                     <repeat name="rep_runs">
                                         <param name="run_base_name" value="run_from_hospital_X"/>
-                                        <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/>
+                                        <param name="upload_files" value="sample.fq,sample.fq" ftype="fastqsanger"/>
                                     </repeat>
                                 </repeat>
                             </repeat>
@@ -601,14 +696,15 @@
             <param name="center" value="Some research center"/>
             <assert_command>
                 <has_text_matching expression="ena-upload-cli"/>
-                <has_text_matching expression="--data '1.fastqsanger.gz' 'sample.fq'"/>
+                <has_text_matching expression="--data 'sample.fq.fastq.gz' 'sample.fq.fastq.gz'"/>
                 <has_text_matching expression="--action 'add' --center 'Some research center'"/>
+                <not_has_text text="modify" />
             </assert_command>
             <assert_stderr>
                 <has_text_matching expression="Oops, the file test_fake_path does not exist"/>
             </assert_stderr>
         </test>
-        <!--test viral submission - User input metadata-->
+        <!--Test 8: viral submission - User input metadata - Add extension = False-->
         <test expect_failure="true">
             <conditional name="action_options">
                 <param name="action" value="add"/>
@@ -618,6 +714,73 @@
                 </section>
                 <param name="test_submit" value="True"/>
                 <conditional name="input_format_conditional">
+                    <param name="add_extension" value="False"/>
+                    <param name="input_format" value="build_tables"/>
+                    <conditional name="conditional_viral_metadata">
+                        <param name="viral_sample" value="true"/>
+                        <repeat name="rep_study">
+                            <param name="study_title" value="Test study title"/>
+                            <param name="study_abstract" value="Test study abstract"/>
+                            <param name="study_type" value="Epigenetics"/>
+                            <param name="study_pubmed_id" value="Test study pubmedID"/>
+                            <repeat name="rep_sample">
+                                <param name="sample_title" value="Test Sample title"/>
+                                <param name="sample_description" value="Test Sample description"/>
+                                <param name="scientific_name" value="Test Sample scientific name"/>
+                                <param name="tax_id" value="Test Sample tax_id"/>
+                                <param name="collection_date" value="2020"/>
+                                <param name="geo_location_country" value="Belgium"/>
+                                <param name="host_common_name" value="Human"/>
+                                <param name="host_subject_id" value="Patient_001"/>
+                                <param name="host_health_state" value="healthy"/>
+                                <param name="host_sex" value="female"/>
+                                <param name="host_scientific_name" value="homo sapiens"/>
+                                <param name="collector_name" value="John The Collector"/>
+                                <param name="collecting_institution" value="Hospital 01"/>
+                                <param name="isolate" value="sample_001"/>
+                                <repeat name="rep_experiment">
+                                    <param name="experiment_title" value="Test experiment title"/>
+                                    <param name="experiment_design" value="Test experiment design description"/>
+                                    <param name="library_strategy" value="CTS"/>
+                                    <param name="library_source" value="GENOMIC"/>
+                                    <param name="library_selection" value="PCR"/>
+                                    <param name="library_layout" value="SINGLE"/>
+                                    <param name="insert_size" value="150"/>
+                                    <param name="library_construction_protocol" value="Test library construction"/>
+                                    <param name="platform" value="ILLUMINA"/>
+                                    <param name="instrument_model" value="Illumina HiSeq 4000"/>
+                                    <repeat name="rep_runs">
+                                        <param name="run_base_name" value="run_from_hospital_X"/>
+                                        <param name="upload_files" value="1.fastqsanger.gz,2.fastqsanger.gz" ftype="fastqsanger.gz"/>
+                                    </repeat>
+                                </repeat>
+                            </repeat>
+                        </repeat>
+                    </conditional>
+                </conditional>
+            </conditional>
+            <param name="center" value="Some research center"/>
+            <assert_command>
+                <has_text_matching expression="ena-upload-cli"/>
+                <has_text_matching expression="--data '1.fastqsanger.gz' '2.fastqsanger.gz'"/>
+                <has_text_matching expression="--action 'add' --center 'Some research center'"/>
+                <has_text_matching expression="--checklist ERC000033"/>
+            </assert_command>
+            <assert_stderr>
+                <has_text_matching expression="Oops, the file test_fake_path does not exist"/>
+            </assert_stderr>
+        </test>
+        <!--Test 9: modify option and auto compression - viral submission - User input metadata-->
+        <test expect_failure="true">
+            <conditional name="action_options">
+                <param name="action" value="modify"/>
+                <section name="test_submit_parameters">
+                    <param name="submit_dev" value="false" />
+                    <param name="dry_run" value="false" />
+                </section>
+                <param name="test_submit" value="True"/>
+                <conditional name="input_format_conditional">
+                    <param name="add_extension" value="False"/>
                     <param name="input_format" value="build_tables"/>
                     <conditional name="conditional_viral_metadata">
                         <param name="viral_sample" value="True"/>
@@ -654,7 +817,7 @@
                                     <param name="instrument_model" value="Illumina HiSeq 4000"/>
                                     <repeat name="rep_runs">
                                         <param name="run_base_name" value="run_from_hospital_X"/>
-                                        <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/>
+                                        <param name="upload_files" value="sample.fq" ftype="fastqsanger"/>
                                     </repeat>
                                 </repeat>
                             </repeat>
@@ -665,9 +828,10 @@
             <param name="center" value="Some research center"/>
             <assert_command>
                 <has_text_matching expression="ena-upload-cli"/>
-                <has_text_matching expression="--data '1.fastqsanger.gz' 'sample.fq'"/>
-                <has_text_matching expression="--action 'add' --center 'Some research center'"/>
-                <has_text_matching expression="--vir"/>
+                <has_text_matching expression="--data 'sample.fq.gz'"/>
+                <has_text_matching expression="--action 'modify' --center 'Some research center'"/>
+                <has_text_matching expression="--checklist ERC000033"/>
+                <not_has_text text="add" />
             </assert_command>
             <assert_stderr>
                 <has_text_matching expression="Oops, the file test_fake_path does not exist"/>
--- a/process_xlsx.py	Wed Aug 18 19:42:49 2021 +0000
+++ b/process_xlsx.py	Tue Oct 19 15:57:14 2021 +0000
@@ -4,11 +4,24 @@

 import xlrd
 import yaml
+from check_remote import check_remote_entry
 from mappings import optional_samples_cols_mapping

 FILE_FORMAT = 'fastq'


+def identify_action(entry_type, alias):
+    ''' choose the action for this entry: 'modify' if its alias is already in ENA, else 'add' '''
+    # query the remote repository by <entry_type>_alias; a non-empty result is treated as an existing entry
+    alias_field = entry_type + '_alias'
+    remote_hits = check_remote_entry(entry_type, {alias_field: alias})
+    if len(remote_hits) == 0:
+        print(f'No {entry_type} entry found with alias {alias}')
+        return 'add'
+    print(f'Found: {entry_type} entry with alias {alias}')
+    return 'modify'
+
+
 def extract_data(xl_sheet, expected_columns, optional_cols=None):
     """
     1. Check that the columns I expect are present in the sheet
@@ -86,6 +99,7 @@
 parser.add_argument('--out_dir', dest='out_path', required=True)
 parser.add_argument('--action', dest='action', required=True)
 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true')
 parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
 args = parser.parse_args()

@@ -148,10 +162,10 @@
 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
 if args.viral_submission:
     # extend the samples columns with the viral specific data
-    samples_cols = samples_cols + ['geographic_location', 'host_common_name',
-                                   'host_subject_id', 'host_health_state', 'host_sex',
-                                   'host_scientific_name', 'collector_name',
-                                   'collecting_institution', 'isolate']
+    samples_cols = samples_cols + ['geographic location (country and/or sea)', 'host common name',
+                                   'host subject id', 'host health state', 'host sex',
+                                   'host scientific name', 'collector name',
+                                   'collecting institution', 'isolate']
     if len(samples_optional_cols_loaded) > 0:
         for optional_cols_excel in samples_optional_cols_loaded:
             samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
@@ -168,7 +182,7 @@
 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
                             'file_format', 'file_checksum', 'submission_date']) + '\n')
 action = args.action
-
+# entry_action is resolved per entry below: args.action with --dev, otherwise looked up via identify_action
 # WRITE  DICTIONARIES TO TABLE FILES

 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
@@ -178,14 +192,22 @@
 exp_included = []
 for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
-    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('study', study_alias)
+    studies_table.write('\t'.join([study_alias, entry_action, 'ENA_accession', study['title'],
                                    study['study_type'], study['study_abstract'], '',
                                    'ENA_submission_data']) + '\n')  # assuming no pubmed_id
 for sample_alias, sample in samples_dict.items():
     # sample_alias = sample_alias + '_' + timestamp
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('sample', sample_alias)
     samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
-                          sample['sample_description'], action, 'ena_accession',
-                          'tax_id_updated_by_ENA', 'ENA_submission_date']
+                          sample['sample_description'], entry_action, 'ena_accession',
+                          '', 'ENA_submission_date']
     if args.viral_submission:
         # add the values that are unique for the viral samples
         if sample['collector name'] == '':
@@ -230,7 +252,12 @@
         # (not listed in the samples or study dict)
         # process the experiments for this sample
         if exp['sample_alias'] == sample_alias:
-            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
+            # decide add/modify: keep the requested action in --dev mode, otherwise look this alias up remotely
+            if args.dev_submission:
+                entry_action = args.action
+            else:
+                entry_action = identify_action('experiment', exp_alias)
+            experiments_table.write('\t'.join([exp_alias, entry_action, 'accession_ena', exp['title'],
                                                exp['study_alias'], sample_alias,
                                                exp['design_description'], exp['library_name'],
                                                exp['library_strategy'], exp['library_source'],
@@ -250,9 +277,13 @@
                     runs_list = run
                 for run_entry in runs_list:
                     if run_entry['experiment_alias'] == exp_alias:
-                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                        if args.dev_submission:
+                            entry_action = args.action
+                        else:
+                            entry_action = identify_action('run', run_alias)
+                        runs_table.write('\t'.join([run_alias, entry_action, 'ena_run_accession',
                                                     exp_alias, run_entry['file_name'],
-                                                    FILE_FORMAT, 'file_checksum',
+                                                    FILE_FORMAT, '',
                                                     'submission_date_ENA']) + '\n')
                 runs_included.append(run_alias)
--- a/samples_macros.xml	Wed Aug 18 19:42:49 2021 +0000
+++ b/samples_macros.xml	Tue Oct 19 15:57:14 2021 +0000
@@ -12,10 +12,11 @@
                 <option value="paired_list" selected="False">Input from a paired collection</option>
             </param>
             <when value="multiple_selection_list">
-                <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select individual datasets or a dataset collection" help="Names should match the compressed run's files names defined in the metadata"/>
+                <param name="add_extension" type="boolean" checked="False" label="Add .fastq.(gz,.bz2) extension to the Galaxy dataset names to match the ones described in the input tables?"/>
+                <param name="data" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select individual datasets or a dataset collection" help="Names should match the compressed run's files names defined in the metadata"/>
             </when>
             <when value="paired_list">
-                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" label="List of paired-end runs files" help="Names should match the compressed run's files names defined in the metadata" />
+                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" label="List of paired-end runs files" help="Names should match the compressed run's files names defined in the metadata" />
             </when>
         </conditional>
     </xml>
@@ -40,6 +41,7 @@
                 <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Runs metadata file"/>
             </when>
             <when value="build_tables">
+                <param name="add_extension" type="boolean" checked="false" label="Add .fastq.(gz.bz2) extension to the Galaxy dataset names to match the ones described in the input tables?"/>
                 <conditional name="conditional_viral_metadata">
                     <param name="viral_sample" type="boolean" truevalue="true" falsevalue="false" label="Does your submission contains viral samples?" />
                     <when value="true">
@@ -138,8 +140,8 @@
                         </options>
                     </param>
                     <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
-                        <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
-                        <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
+                        <param name="run_base_name" type="text" optional="False" value="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
+                        <param name="upload_files" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
                     </repeat>
                 </repeat>
             </repeat>
@@ -203,8 +205,8 @@
                     </options>
                 </param>
                 <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
-                    <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
-                    <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
+                    <param name="run_base_name" type="text" optional="False" value="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
+                    <param name="upload_files" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
                 </repeat>
             </repeat>
         </repeat>