ena_upload: process_xlsx.py comparison

comparison process_xlsx.py @ 2:9e2df763086c draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 1eed23745846ce215e9bdc4a4934d6bc8f41b24e"

author	iuc
date	Thu, 15 Jul 2021 20:12:34 +0000
parents	57251c760cab
children	59bb6d34fca6

comparison

equal deleted inserted replaced

-:57251c760cab
+:9e2df763086c
 import pathlib
 import sys
 import xlrd
 import yaml
+from mappings import optional_samples_cols_mapping
 FILE_FORMAT = 'fastq'
def extract_data(xl_sheet, expected_columns, optional_cols=None):
    """Read one worksheet into a dict of rows keyed by the first expected column.

    1. Check that the expected columns are present in the sheet (any order
       and mixed with others; it is only a verification that the user filled
       in the correct template).
    2. Fill a dictionary with the row data, indexed by the value found in the
       first of the expected columns.

    :param xl_sheet: sheet object exposing ``ncols``, ``nrows`` and
        ``cell(row, col).value``; row 0 holds the column names and row 1 is
        skipped as a comments row
    :param expected_columns: required column names; the first one is used as
        the index of the returned dictionary
    :param optional_cols: column names that are loaded only when they are
        present in the sheet (any container supporting ``in``)
    :return: tuple ``(data_dict, optional_cols_loaded)``; ``data_dict`` maps
        each index value to its row dict, or to a flat list of row dicts when
        the same index value appears on several rows;
        ``optional_cols_loaded`` lists the optional columns found, in sheet
        order
    :raises AssertionError: when one of the expected columns is missing
    :raises SystemExit: when a relevant column name appears more than once
    """
    if optional_cols is None:
        optional_cols = []
    # work on a list copy so tuples are accepted and concatenation is safe
    expected_columns = list(expected_columns)

    sheet_columns = {}
    optional_cols_loaded = []
    for sh_col in range(xl_sheet.ncols):
        header = xl_sheet.cell(0, sh_col).value
        is_optional = header in optional_cols
        if header not in expected_columns and not is_optional:
            # column unrelated to the template: ignore it
            continue
        if header in sheet_columns:
            sys.exit("Duplicated columns found")
        sheet_columns[header] = sh_col
        if is_optional:
            # store the list of optional cols available
            optional_cols_loaded.append(header)

    # check that the required columns are all present
    # (raised explicitly: a bare ``assert`` is stripped when running with -O)
    # TODO: revise this for optional columns
    for col_name in expected_columns:
        if col_name not in sheet_columns:
            raise AssertionError("Expected column %s not found" % col_name)

    # fetch rows in a dict
    data_dict = {}
    # the first of the expected columns will be the index
    index_col = sheet_columns[expected_columns[0]]
    value_cols = (expected_columns + optional_cols_loaded)[1:]
    # skip first 2 rows: column names + comments rows
    for row_id in range(2, xl_sheet.nrows):
        row_dict = {name: xl_sheet.cell(row_id, sheet_columns[name]).value
                    for name in value_cols}
        row_key = xl_sheet.cell(row_id, index_col).value
        # should check for duplicate alias/ids?
        if row_key not in data_dict:
            data_dict[row_key] = row_dict
        elif isinstance(data_dict[row_key], list):
            # third and later duplicates extend the existing flat list
            data_dict[row_key].append(row_dict)
        else:
            # second occurrence: turn the single row into a list of rows
            data_dict[row_key] = [data_dict[row_key], row_dict]
    return data_dict, optional_cols_loaded
 def paste_xls2yaml(xlsx_path):
 print('YAML -------------')
 xls = xlrd.open_workbook(xlsx_path)
 xl_sheet = xl_workbook.sheet_by_name('ENA_study')
 if xl_sheet.nrows < 3:
 raise ValueError('No entries found in studies sheet')
 studies_dict = {}
 studies_col = ['alias', 'title', 'study_type', 'study_abstract']
-studies_dict = extract_data(xl_sheet, studies_col)
+studies_dict, _ = extract_data(xl_sheet, studies_col)
 # PARSE SAMPLES
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_sample')
 if xl_sheet.nrows < 3:
 raise ValueError('No entries found in samples')
+samples_cols_excel = ['alias', 'title', 'scientific_name', 'sample_description']
+# optional_samples_cols_mapping = {}
 if args.viral_submission:
-samples_cols = ['alias', 'title', 'scientific_name', 'sample_description',
+# load columns names from the table
-'geographic location (country and/or sea)', 'host common name',
+samples_cols_excel = samples_cols_excel + ['geographic location (country and/or sea)',
-'host health state', 'host sex', 'host scientific name', 'collector name',
+'host common name', 'host health state',
-'collection date', 'collecting institution', 'isolate']
+'host sex', 'host scientific name', 'collector name',
-else:
+'collecting institution', 'isolate']
-samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
-samples_dict = extract_data(xl_sheet, samples_cols)
+samples_dict, samples_optional_cols_loaded = extract_data(xl_sheet, samples_cols_excel,
+optional_samples_cols_mapping.keys())
 # PARSE EXPERIMENTS
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
 if xl_sheet.nrows < 3:
 raise ValueError('No experiments found in experiments sheet')
 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description',
 'library_name', 'library_strategy', 'library_source', 'library_selection',
 'library_layout', 'insert_size', 'library_construction_protocol',
 'platform', 'instrument_model']
-experiments_dict = extract_data(xl_sheet, exp_columns)
+experiments_dict, _ = extract_data(xl_sheet, exp_columns)
 # PARSE RUNS SHEET
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_run')
 if xl_sheet.nrows < 3:
 raise ValueError('No entries found in runs sheet')
 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format']
-runs_dict = extract_data(xl_sheet, run_cols)
+runs_dict, _ = extract_data(xl_sheet, run_cols)
 # WRITE HEADERS TO TABLES
 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type',
 'study_abstract', 'pubmed_id', 'submission_date']) + '\n')
 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
+samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
+# extend the samples_cols list to add the ones that are filled by the CLI
+samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
 if args.viral_submission:
-samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
+# extend the samples columns with the viral specific data
-'taxon_id', 'sample_description', 'collection_date',
+samples_cols = samples_cols + ['geographic_location', 'host_common_name',
-'geographic_location', 'host_common_name', 'host_subject_id',
+'host_subject_id', 'host_health_state', 'host_sex',
-'host_health_state', 'host_sex', 'host_scientific_name',
+'host_scientific_name', 'collector_name',
-'collector_name', 'collecting_institution', 'isolate',
+'collecting_institution', 'isolate']
-'submission_date']) + '\n')
+if len(samples_optional_cols_loaded) > 0:
-else:
+for optional_cols_excel in samples_optional_cols_loaded:
-samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
+samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
-'taxon_id', 'sample_description', 'submission_date']) + '\n')
+samples_table.write('\t'.join(samples_cols) + '\n')
 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias',
 'sample_alias', 'design_description', 'library_name',
 'library_strategy', 'library_source', 'library_selection',
 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
 study['study_type'], study['study_abstract'], '',
 'ENA_submission_data']) + '\n')  # assuming no pubmed_id
 for sample_alias, sample in samples_dict.items():
 # sample_alias = sample_alias + '_' + timestamp
+samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
+sample['sample_description'], action, 'ena_accession',
+'tax_id_updated_by_ENA', 'ENA_submission_date']
 if args.viral_submission:
+# add the values that are unique for the viral samples
 if sample['collector name'] == '':
 sample['collector name'] = 'unknown'
-samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'],
+samples_row_values = samples_row_values + \
-sample['scientific_name'], 'tax_id_updated_by_ENA',
+[sample['geographic location (country and/or sea)'], sample['host common name'],
-sample['sample_description'], sample['collection date'],
+'host subject id', sample['host health state'], sample['host sex'],
-sample['geographic location (country and/or sea)'],
+sample['host scientific name'], sample['collector name'],
-sample['host common name'], 'host subject id',
+sample['collecting institution'], sample['isolate']]
-sample['host health state'], sample['host sex'],
+# add the (possible) optional columns values
-sample['host scientific name'], sample['collector name'],
+if len(samples_optional_cols_loaded) > 0:
-sample['collecting institution'], sample['isolate'],
+for optional_col in samples_optional_cols_loaded:
-'ENA_submission_date']) + '\n')
+# parse values stored as in excel date format (=float)
-else:
+if optional_col in ('collection date', 'receipt date'):
-samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'],
+# check if excel stored it as date
-sample['scientific_name'], 'tax_id_updated_by_ENA',
+if isinstance(sample[optional_col], float):
-sample['sample_description']]) + '\n')
+year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
+sample[optional_col], xl_workbook.datemode)
+month = "{:02d}".format(month)
+day = "{:02d}".format(day)
+hour = "{:02d}".format(hour)
+minute = "{:02d}".format(minute)
+second = "{:02d}".format(second)
+# format it as 2008-01-23T19:23:10
+sample[optional_col] = str(year) + '-' + str(month) + '-' + str(day) + \
+'T' + str(hour) + ':' + str(minute) + ':' + str(second)
+# excel stores everything as float so I need to check if
+# the value was actually an int and keep it as int
+if isinstance(sample[optional_col], float):
+if int(sample[optional_col]) == sample[optional_col]:
+# it is not really a float but an int
+sample[optional_col] = int(sample[optional_col])
+samples_row_values.append(str(sample[optional_col]))
+samples_table.write('\t'.join(samples_row_values) + '\n')
 for exp_alias, exp in experiments_dict.items():
 # should I check here if any experiment has a study or sample alias that is incorrect?
 # (not listed in the samples or study dict)
 # process the experiments for this sample
 if exp['sample_alias'] == sample_alias:

Mercurial > repos > iuc > ena_upload

comparison process_xlsx.py @ 2:9e2df763086c draft