Mercurial > repos > iuc > ena_upload

--- a/ena_upload.xml	Fri Apr 30 12:09:25 2021 +0000
+++ b/ena_upload.xml	Thu Jul 15 20:12:34 2021 +0000
@@ -1,6 +1,6 @@
-<tool id="ena_upload" name="ENA Upload tool" version="0.3.1" profile="20.01" license="MIT">
+<tool id="ena_upload" name="ENA Upload tool" version="0.3.2" profile="20.01" license="MIT">
     <macros>
-        <token name="@VERSION@">0.2.7</token>
+        <token name="@VERSION@">0.3.1</token>
         <import>samples_macros.xml</import>
     </macros>
     <requirements>
@@ -48,7 +48,6 @@
     ln -s '$action_options.input_format_conditional.samples_users_table' $samples_table_path &&
 #end if

-
 #if $action_options.test_submit_parameters.dry_run == "false" and $action_options.test_submit == "False":
     webin_id=`grep 'username' $credentials`;
     if [ "\$webin_id" = "" ]; then
@@ -292,7 +291,58 @@
                 <assert_contents>
                     <has_n_lines n="5"/>
                     <has_n_columns n="18"/>
-                    <has_line_matching expression="alias\tstatus\taccession\ttitle\tscientific_name\ttaxon_id\tsample_description\tcollection_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tsubmission_date"/>
+                    <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date"/>
+                </assert_contents>
+            </output>
+            <output name="runs_table_out">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                    <has_n_columns n="8"/>
+                    <has_line_matching expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/>
+                    <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!--Test excel input of VIRAL samples with extended columns-->
+        <test>
+            <conditional name="action_options">
+                <param name="action" value="add"/>
+                <section name="test_submit_parameters">
+                    <param name="submit_dev" value="false" />
+                    <param name="dry_run" value="true" />
+                </section>
+                <conditional name="input_format_conditional">
+                    <param name="input_format" value="excel_tables"/>
+                    <param name="viral_submission" value="True"/>
+                    <param name="xlsx_file" value="metadata_test_viral_optional_columns.xlsx"/>
+                    <conditional name="run_input_format_conditional">
+                        <param name="run_input_format" value="multiple_selection_list"/>
+                        <param name="data" value="sample.fq"/>
+                    </conditional>
+                </conditional>
+            </conditional>
+            <param name="center" value="Some research center"/>
+            <output name="experiments_table_out">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                    <has_n_columns n="17"/>
+                    <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_alias\tsample_alias\tdesign_description\tlibrary_name\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tinsert_size\tlibrary_construction_protocol\tplatform\tinstrument_model\tsubmission_date" />
+                    <has_line_matching expression="e_(.*)_026\tadd\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/>
+                </assert_contents>
+            </output>
+            <output name="studies_table_out">
+                <assert_contents>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="8"/>
+                    <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_type\tstudy_abstract\tpubmed_id\tsubmission_date"/>
+                    <has_line_matching expression="SARS-CoV-2_genomes_01\tadd\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/>
+                </assert_contents>
+            </output>
+            <output name="samples_table_out">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                    <has_n_columns n="42"/>
+                    <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date\tgeographic_location_latitude\tgeographic_location_longitude\tsample_capture_status\thost_disease_outcome\thost_age\tvirus_identifier\treceipt_date\tdefinition_for_seropositive_sample\tserotype\thost_habitat\tisolation_source_host_associated\thost_behaviour\tisolation_source_non_host_associated\tsubject_exposure\tsubject_exposure_duration\ttype_exposure\tpersonal_protective_equipment\thospitalisation\tillness_duration\tillness_symptoms\tsample_storage_conditions\tstrain\thost_description\tgravidity"/>
                 </assert_contents>
             </output>
             <output name="runs_table_out">
@@ -341,7 +391,7 @@
                 <assert_contents>
                     <has_n_lines n="5"/>
                     <has_n_columns n="8"/>
-                        <has_line_matching expression="alias\tstatus\taccession\ttitle\tscientific_name\ttaxon_id\tsample_description\tsubmission_date"/>
+                        <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date"/>
                 </assert_contents>
             </output>
             <output name="runs_table_out">
@@ -628,7 +678,7 @@
         This is a wrapper for the ENA upload tool in https://github.com/usegalaxy-eu/ena-upload-cli
         The input metadata can be submitted following the tabular format of the templates in https://github.com/usegalaxy-eu/ena-upload-cli/tree/master/example_tables
         It is also possible to submit an excel file by following the template in https://drive.google.com/file/d/1ncC22--tW2v-EI-te_r86sAZujIPAjlX/view?usp=sharing
-        For viral submissions a larger set of metadata is required, you can find the template in https://drive.google.com/file/d/1U4VdcczsIecIXxseV8svE1zO_CBUadog/view?usp=sharing
+        For viral submissions a larger set of metadata is required; you can find the template in https://drive.google.com/file/d/1Gx78GKh58PmRjdmJ05DBbpObAL-3oUFX/view?usp=sharing
     ]]></help>
     <citations>
     </citations>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mappings.py	Thu Jul 15 20:12:34 2021 +0000
@@ -0,0 +1,29 @@
+
+optional_samples_cols_mapping = {
+    "collection date": "collection_date",
+    "geographic location (latitude)": "geographic_location_latitude",
+    "geographic location (longitude)": "geographic_location_longitude",
+    "geographic location (region)": "geographic_location_region",
+    "sample capture status": "sample_capture_status",
+    "host disease outcome": "host_disease_outcome",
+    "host_age": "host_age",
+    "virus identifier": "virus_identifier",
+    "receipt date": "receipt_date",
+    "definition for seropositive sample": "definition_for_seropositive_sample",
+    "serotype (required for a seropositive sample)": "serotype",
+    "host habitat": "host_habitat",
+    "isolation source host-associated": "isolation_source_host_associated",
+    "host behaviour": "host_behaviour",
+    "isolation source non-host-associated": "isolation_source_non_host_associated",
+    "subject exposure": "subject_exposure",
+    "subject exposure duration": "subject_exposure_duration",
+    "type exposure": "type_exposure",
+    "personal protective equipment": "personal_protective_equipment",
+    "hospitalisation": "hospitalisation",
+    "illness duration": "illness_duration",
+    "illness symptoms": "illness_symptoms",
+    "sample storage conditions": "sample_storage_conditions",
+    "strain": "strain",
+    "host description": "host_description",
+    "gravidity": "gravidity"
+}
--- a/process_xlsx.py	Fri Apr 30 12:09:25 2021 +0000
+++ b/process_xlsx.py	Thu Jul 15 20:12:34 2021 +0000
@@ -4,23 +4,35 @@

 import xlrd
 import yaml
+from mappings import optional_samples_cols_mapping

 FILE_FORMAT = 'fastq'


-def extract_data(xl_sheet, expected_columns):
+def extract_data(xl_sheet, expected_columns, optional_cols=None):
     """
     1. Check that the columns I expect are present in the sheet
     (any order and mixed with others, it's just a verification that
     the user filled the correct template)
     2. Fill a dictionary with the rows data indexed by first column in list"""
     sheet_columns = {}
+    if optional_cols is None:
+        optional_cols = []
+    optional_cols_loaded = []
     for sh_col in range(xl_sheet.ncols):
-        if xl_sheet.cell(0, sh_col).value in expected_columns:
+        if (xl_sheet.cell(0, sh_col).value in expected_columns) \
+           or (xl_sheet.cell(0, sh_col).value in optional_cols):
             if xl_sheet.cell(0, sh_col).value in sheet_columns.keys():
-                sys.exit("Duplicated columns")
+                sys.exit("Duplicated columns found")
             else:
                 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col
+                if xl_sheet.cell(0, sh_col).value in optional_cols:
+                    # store the list of optional cols available
+                    optional_cols_loaded.append(xl_sheet.cell(0, sh_col).value)
+    provided_cols = expected_columns + optional_cols_loaded
+
+    # check that the required columns are all present
+    # TODO: revise this for optional columns
     for col in range(len(expected_columns)):
         assert expected_columns[col] in sheet_columns.keys(), \
             "Expected column %s not found" % expected_columns[col]
@@ -32,9 +44,9 @@
     # skip first 2 rows: column names + comments rows
     for row_id in range(2, xl_sheet.nrows):
         row_dict = {}
-        for col in range(1, len(expected_columns)):
-            sheet_col_index = sheet_columns[expected_columns[col]]
-            row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value
+        for col in range(1, len(provided_cols)):
+            sheet_col_index = sheet_columns[provided_cols[col]]
+            row_dict[provided_cols[col]] = xl_sheet.cell(row_id, sheet_col_index).value
         # should check for duplicate alias/ids?
         if xl_sheet.cell(row_id, index_col).value in data_dict.keys():
             tmp = data_dict[xl_sheet.cell(row_id, index_col).value]
@@ -42,7 +54,7 @@
             data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict)
         else:
             data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
-    return data_dict
+    return data_dict, optional_cols_loaded


 def paste_xls2yaml(xlsx_path):
@@ -86,22 +98,25 @@
     raise ValueError('No entries found in studies sheet')
 studies_dict = {}
 studies_col = ['alias', 'title', 'study_type', 'study_abstract']
-studies_dict = extract_data(xl_sheet, studies_col)
+studies_dict, _ = extract_data(xl_sheet, studies_col)

 # PARSE SAMPLES
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_sample')
 if xl_sheet.nrows < 3:
     raise ValueError('No entries found in samples')
+
+samples_cols_excel = ['alias', 'title', 'scientific_name', 'sample_description']
+# optional_samples_cols_mapping = {}
 if args.viral_submission:
-    samples_cols = ['alias', 'title', 'scientific_name', 'sample_description',
-                    'geographic location (country and/or sea)', 'host common name',
-                    'host health state', 'host sex', 'host scientific name', 'collector name',
-                    'collection date', 'collecting institution', 'isolate']
-else:
-    samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
-samples_dict = extract_data(xl_sheet, samples_cols)
+    # add the columns that are required for viral submissions
+    samples_cols_excel = samples_cols_excel + ['geographic location (country and/or sea)',
+                                               'host common name', 'host health state',
+                                               'host sex', 'host scientific name', 'collector name',
+                                               'collecting institution', 'isolate']

+samples_dict, samples_optional_cols_loaded = extract_data(xl_sheet, samples_cols_excel,
+                                                          optional_samples_cols_mapping.keys())
 # PARSE EXPERIMENTS
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
@@ -112,7 +127,7 @@
                'library_layout', 'insert_size', 'library_construction_protocol',
                'platform', 'instrument_model']

-experiments_dict = extract_data(xl_sheet, exp_columns)
+experiments_dict, _ = extract_data(xl_sheet, exp_columns)

 # PARSE RUNS SHEET
 #################
@@ -120,23 +135,27 @@
 if xl_sheet.nrows < 3:
     raise ValueError('No entries found in runs sheet')
 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format']
-runs_dict = extract_data(xl_sheet, run_cols)
+runs_dict, _ = extract_data(xl_sheet, run_cols)

 # WRITE HEADERS TO TABLES
 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type',
                                'study_abstract', 'pubmed_id', 'submission_date']) + '\n')
 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
+
+samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
+# extend the samples_cols list to add the ones that are filled by the CLI
+samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
 if args.viral_submission:
-    samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
-                                   'taxon_id', 'sample_description', 'collection_date',
-                                   'geographic_location', 'host_common_name', 'host_subject_id',
-                                   'host_health_state', 'host_sex', 'host_scientific_name',
-                                   'collector_name', 'collecting_institution', 'isolate',
-                                   'submission_date']) + '\n')
-else:
-    samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
-                                   'taxon_id', 'sample_description', 'submission_date']) + '\n')
+    # extend the samples columns with the viral specific data
+    samples_cols = samples_cols + ['geographic_location', 'host_common_name',
+                                   'host_subject_id', 'host_health_state', 'host_sex',
+                                   'host_scientific_name', 'collector_name',
+                                   'collecting_institution', 'isolate']
+    if len(samples_optional_cols_loaded) > 0:
+        for optional_cols_excel in samples_optional_cols_loaded:
+            samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
+samples_table.write('\t'.join(samples_cols) + '\n')

 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias',
@@ -164,22 +183,44 @@
                                    'ENA_submission_data']) + '\n')  # assuming no pubmed_id
 for sample_alias, sample in samples_dict.items():
     # sample_alias = sample_alias + '_' + timestamp
+    samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
+                          sample['sample_description'], action, 'ena_accession',
+                          'tax_id_updated_by_ENA', 'ENA_submission_date']
     if args.viral_submission:
+        # add the values that are unique for the viral samples
         if sample['collector name'] == '':
             sample['collector name'] = 'unknown'
-        samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'],
-                                       sample['scientific_name'], 'tax_id_updated_by_ENA',
-                                       sample['sample_description'], sample['collection date'],
-                                       sample['geographic location (country and/or sea)'],
-                                       sample['host common name'], 'host subject id',
-                                       sample['host health state'], sample['host sex'],
-                                       sample['host scientific name'], sample['collector name'],
-                                       sample['collecting institution'], sample['isolate'],
-                                       'ENA_submission_date']) + '\n')
-    else:
-        samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'],
-                                       sample['scientific_name'], 'tax_id_updated_by_ENA',
-                                       sample['sample_description']]) + '\n')
+        samples_row_values = samples_row_values + \
+            [sample['geographic location (country and/or sea)'], sample['host common name'],
+             'host subject id', sample['host health state'], sample['host sex'],
+             sample['host scientific name'], sample['collector name'],
+             sample['collecting institution'], sample['isolate']]
+        # add the (possible) optional columns values
+        if len(samples_optional_cols_loaded) > 0:
+            for optional_col in samples_optional_cols_loaded:
+                # convert values that Excel stored in its date format (a float)
+                if optional_col in ('collection date', 'receipt date'):
+                    # check if excel stored it as date
+                    if isinstance(sample[optional_col], float):
+                        year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
+                            sample[optional_col], xl_workbook.datemode)
+                        month = "{:02d}".format(month)
+                        day = "{:02d}".format(day)
+                        hour = "{:02d}".format(hour)
+                        minute = "{:02d}".format(minute)
+                        second = "{:02d}".format(second)
+                        # format it as 2008-01-23T19:23:10
+                        sample[optional_col] = str(year) + '-' + str(month) + '-' + str(day) + \
+                            'T' + str(hour) + ':' + str(minute) + ':' + str(second)
+                # Excel stores every number as a float, so check whether
+                # the value is actually an integer and, if so, keep it as an int
+                if isinstance(sample[optional_col], float):
+                    if int(sample[optional_col]) == sample[optional_col]:
+                        # it is not really a float but an int
+                        sample[optional_col] = int(sample[optional_col])
+                samples_row_values.append(str(sample[optional_col]))
+    samples_table.write('\t'.join(samples_row_values) + '\n')
+
     for exp_alias, exp in experiments_dict.items():
         # should I check here if any experiment has a study or sample alias that is incorrect?
         # (not listed in the samples or study dict)
--- a/samples_macros.xml	Fri Apr 30 12:09:25 2021 +0000
+++ b/samples_macros.xml	Thu Jul 15 20:12:34 2021 +0000
@@ -138,7 +138,7 @@
                         </options>
                     </param>
                     <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
-                        <param name="run_base_name" type="text" optional="False" default="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
+                        <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
                         <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
                     </repeat>
                 </repeat>
@@ -203,7 +203,7 @@
                     </options>
                 </param>
                 <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
-                    <param name="run_base_name" type="text" optional="False" default="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
+                    <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
                     <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
                 </repeat>
             </repeat>
Binary file test-data/metadata_test_viral_optional_columns.xlsx has changed