process_xlsx.py @ 2:9e2df763086c (draft)
description: planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 1eed23745846ce215e9bdc4a4934d6bc8f41b24e
author:      iuc
date:        Thu, 15 Jul 2021 20:12:34 +0000
parents:     57251c760cab
children:    59bb6d34fca6
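"""Convert an ENA submission spreadsheet into the TSV tables used by the
ENA upload tool.

Expected workbook layout (inferred from the parsing code below): sheets
named ENA_study, ENA_sample, ENA_experiment and ENA_run, each with the
column names in row 0, comments/suggestions in row 1 and data from row 2
onwards. The script writes studies.tsv, samples.tsv, experiments.tsv and
runs.tsv into the directory given by --out_dir.
"""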
import argparse
import pathlib
import sys

import xlrd
import yaml

from mappings import optional_samples_cols_mapping

FILE_FORMAT = 'fastq'


def extract_data(xl_sheet, expected_columns, optional_cols=None):
    """
    1. Check that the expected columns are present in the sheet (in any
       order, possibly mixed with others): this only verifies that the
       user filled in the correct template.
    2. Fill a dictionary with the row data, indexed by the first column
       in the list.
    """
    sheet_columns = {}
    if optional_cols is None:
        optional_cols = []
    optional_cols_loaded = []
    for sh_col in range(xl_sheet.ncols):
        if (xl_sheet.cell(0, sh_col).value in expected_columns) \
                or (xl_sheet.cell(0, sh_col).value in optional_cols):
            if xl_sheet.cell(0, sh_col).value in sheet_columns.keys():
                sys.exit("Duplicated columns found")
            else:
                sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col
                if xl_sheet.cell(0, sh_col).value in optional_cols:
                    # store the list of optional cols that are available
                    optional_cols_loaded.append(xl_sheet.cell(0, sh_col).value)
    provided_cols = expected_columns + optional_cols_loaded

    # check that the required columns are all present
    # TODO: revise this for optional columns
    for col in expected_columns:
        assert col in sheet_columns.keys(), "Expected column %s not found" % col

    # fetch rows into a dict
    data_dict = {}
    # the first of the expected columns is used as the index
    index_col = sheet_columns[expected_columns[0]]
    # skip the first 2 rows: column names + comments row
    for row_id in range(2, xl_sheet.nrows):
        row_dict = {}
        for col in range(1, len(provided_cols)):
            sheet_col_index = sheet_columns[provided_cols[col]]
            row_dict[provided_cols[col]] = xl_sheet.cell(row_id, sheet_col_index).value
        # TODO: should duplicate aliases/ids be rejected instead?
        # rows that share the same index value are collected into a list
        index_value = xl_sheet.cell(row_id, index_col).value
        if index_value in data_dict:
            if not isinstance(data_dict[index_value], list):
                data_dict[index_value] = [data_dict[index_value]]
            data_dict[index_value].append(row_dict)
        else:
            data_dict[index_value] = row_dict
    return data_dict, optional_cols_loaded
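
# Illustrative sketch of extract_data's return value (made-up values, not
# from any real template). With 'alias' as the first expected column:
#
#   {'run_1': {'experiment_alias': 'exp_1', 'file_name': 'r1.fastq', ...}}
#
# and, when the same alias appears on several rows (e.g. the two files of
# a paired-end run):
#
#   {'run_1': [{'file_name': 'r1_R1.fastq', ...},
#              {'file_name': 'r1_R2.fastq', ...}]}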


def paste_xls2yaml(xlsx_path):
    """Dump the workbook contents (all sheets except controlled_vocabulary)
    to stdout as YAML."""
    print('YAML -------------')
    xls = xlrd.open_workbook(xlsx_path)
    content_dict = {}
    for sheet_name in xls.sheet_names():
        if sheet_name == 'controlled_vocabulary':
            continue
        xls_sheet = xls.sheet_by_name(sheet_name)
        sheet_contents_dict = {}
        colnames = []
        for col in range(xls_sheet.ncols):
            colnames.append(xls_sheet.cell(0, col).value)
        # skip the first 2 rows (column names and suggestions)
        for row_id in range(2, xls_sheet.nrows):
            row_dict = {}
            for col_id in range(0, xls_sheet.ncols):
                row_dict[colnames[col_id]] = xls_sheet.cell(row_id, col_id).value
            # should check for duplicate alias/ids?
            sheet_contents_dict[row_id] = row_dict
        content_dict[sheet_name] = sheet_contents_dict
    yaml.dump(content_dict, sys.stdout)
    print('YAML -------------')


parser = argparse.ArgumentParser()
parser.add_argument('--form', dest='xlsx_path', required=True)
parser.add_argument('--out_dir', dest='out_path', required=True)
parser.add_argument('--action', dest='action', required=True)
parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
args = parser.parse_args()
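
# Example invocation (illustrative only: the flags come from the parser
# above, while the file names and the 'add' action value are placeholders):
#
#   python process_xlsx.py --form metadata.xlsx --out_dir . --action add --vir --verbose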

xl_workbook = xlrd.open_workbook(args.xlsx_path)

# PARSE STUDIES
#################
xl_sheet = xl_workbook.sheet_by_name('ENA_study')
if xl_sheet.nrows < 3:
    raise ValueError('No entries found in studies sheet')
studies_col = ['alias', 'title', 'study_type', 'study_abstract']
studies_dict, _ = extract_data(xl_sheet, studies_col)

# PARSE SAMPLES
#################
xl_sheet = xl_workbook.sheet_by_name('ENA_sample')
if xl_sheet.nrows < 3:
    raise ValueError('No entries found in samples sheet')

samples_cols_excel = ['alias', 'title', 'scientific_name', 'sample_description']
if args.viral_submission:
    # load the viral-specific column names exactly as they appear in the table
    samples_cols_excel = samples_cols_excel + ['geographic location (country and/or sea)',
                                               'host common name', 'host health state',
                                               'host sex', 'host scientific name',
                                               'collector name', 'collecting institution',
                                               'isolate']

samples_dict, samples_optional_cols_loaded = extract_data(xl_sheet, samples_cols_excel,
                                                          optional_samples_cols_mapping.keys())

# PARSE EXPERIMENTS
#################
xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
if xl_sheet.nrows < 3:
    raise ValueError('No entries found in experiments sheet')
exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description',
               'library_name', 'library_strategy', 'library_source', 'library_selection',
               'library_layout', 'insert_size', 'library_construction_protocol',
               'platform', 'instrument_model']
experiments_dict, _ = extract_data(xl_sheet, exp_columns)

# PARSE RUNS SHEET
#################
xl_sheet = xl_workbook.sheet_by_name('ENA_run')
if xl_sheet.nrows < 3:
    raise ValueError('No entries found in runs sheet')
run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format']
runs_dict, _ = extract_data(xl_sheet, run_cols)

# WRITE HEADERS TO TABLES
studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type',
                               'study_abstract', 'pubmed_id', 'submission_date']) + '\n')

samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
# extend samples_cols with the columns that are filled in by the CLI
samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
if args.viral_submission:
    # extend the samples columns with the viral-specific data
    samples_cols = samples_cols + ['geographic_location', 'host_common_name',
                                   'host_subject_id', 'host_health_state', 'host_sex',
                                   'host_scientific_name', 'collector_name',
                                   'collecting_institution', 'isolate']
    if len(samples_optional_cols_loaded) > 0:
        for optional_cols_excel in samples_optional_cols_loaded:
            samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
samples_table.write('\t'.join(samples_cols) + '\n')

experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias',
                                   'sample_alias', 'design_description', 'library_name',
                                   'library_strategy', 'library_source', 'library_selection',
                                   'library_layout', 'insert_size',
                                   'library_construction_protocol', 'platform',
                                   'instrument_model', 'submission_date']) + '\n')

runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
                            'file_format', 'file_checksum', 'submission_date']) + '\n')

action = args.action

# WRITE DICTIONARIES TO TABLE FILES
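# The loops below tie the four sheets together by alias: every study and
# sample row is written; an experiment row is written only when its
# sample_alias matches the current sample; and a run row is written only
# when its experiment_alias matches an experiment that was itself written.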

# TODO: add a timestamp to the alias? It seems ENA requires all entries of a
# webin to have unique IDs.
# dt_obj = datetime.now(tz=None)
# timestamp = dt_obj.strftime("%Y%m%d_%H:%M:%S")
runs_included = []
exp_included = []
for study_alias, study in studies_dict.items():
    # study_alias = study_alias + '_' + timestamp
    # the empty field is pubmed_id, assuming none is given
    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
                                   study['study_type'], study['study_abstract'], '',
                                   'ENA_submission_data']) + '\n')

for sample_alias, sample in samples_dict.items():
    # sample_alias = sample_alias + '_' + timestamp
    samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
                          sample['sample_description'], action, 'ena_accession',
                          'tax_id_updated_by_ENA', 'ENA_submission_date']
    if args.viral_submission:
        # add the values that are unique to the viral samples
        if sample['collector name'] == '':
            sample['collector name'] = 'unknown'
        samples_row_values = samples_row_values + \
            [sample['geographic location (country and/or sea)'],
             sample['host common name'],
             'host subject id',  # literal placeholder; no such column is read from the sheet
             sample['host health state'], sample['host sex'],
             sample['host scientific name'], sample['collector name'],
             sample['collecting institution'], sample['isolate']]
        # add the (possibly present) optional column values
        if len(samples_optional_cols_loaded) > 0:
            for optional_col in samples_optional_cols_loaded:
                # parse values stored in the Excel date format (a float)
                if optional_col in ('collection date', 'receipt date'):
                    # check whether Excel stored it as a date
                    if isinstance(sample[optional_col], float):
                        year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
                            sample[optional_col], xl_workbook.datemode)
                        month = "{:02d}".format(month)
                        day = "{:02d}".format(day)
                        hour = "{:02d}".format(hour)
                        minute = "{:02d}".format(minute)
                        second = "{:02d}".format(second)
                        # format it as 2008-01-23T19:23:10
                        sample[optional_col] = str(year) + '-' + str(month) + '-' + str(day) + \
                            'T' + str(hour) + ':' + str(minute) + ':' + str(second)
                # Excel stores every number as a float, so check whether the
                # value was actually an int and, if so, keep it as an int
                if isinstance(sample[optional_col], float):
                    if int(sample[optional_col]) == sample[optional_col]:
                        # it is not really a float but an int
                        sample[optional_col] = int(sample[optional_col])
                samples_row_values.append(str(sample[optional_col]))
    samples_table.write('\t'.join(samples_row_values) + '\n')

    # process the experiments that belong to this sample
    # TODO: should experiments with a study or sample alias that is not
    # listed in the studies/samples dicts be reported here?
    for exp_alias, exp in experiments_dict.items():
        if exp['sample_alias'] == sample_alias:
            lib_alias = 'library_' + exp_alias + '_' + exp['sample_alias']
            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
                                               exp['study_alias'], sample_alias,
                                               exp['design_description'], lib_alias,
                                               exp['library_strategy'], exp['library_source'],
                                               exp['library_selection'],
                                               exp['library_layout'].lower(),
                                               str(int(exp['insert_size'])),
                                               exp['library_construction_protocol'],
                                               exp['platform'], exp['instrument_model'],
                                               'submission_date_ENA']) + '\n')
            exp_included.append(exp_alias)
            for run_alias, run in runs_dict.items():
                # TODO: check that the experiment's library_layout is 'paired'
                # when multiple entries are associated with the same run alias
                if not isinstance(run, list):
                    runs_list = [run]
                else:
                    runs_list = run
                for run_entry in runs_list:
                    if run_entry['experiment_alias'] == exp_alias:
                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
                                                    exp_alias, run_entry['file_name'],
                                                    FILE_FORMAT, 'file_checksum',
                                                    'submission_date_ENA']) + '\n')
                        runs_included.append(run_alias)

# report runs and experiments that were not associated with any used entry
for run in runs_dict.keys():
    if run not in runs_included:
        print(f'The run {run} is listed in the runs section but not associated '
              'with any used experiment')
for exp in experiments_dict.keys():
    if exp not in exp_included:
        print(f'The experiment {exp} is listed in the experiments section but '
              'not associated with any used sample')

studies_table.close()
samples_table.close()
experiments_table.close()
runs_table.close()

if args.verbose:
    paste_xls2yaml(args.xlsx_path)
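
# Sketch of the resulting studies.tsv (tab-separated; 'study_1', 'add' and
# the title are made-up example values, while 'ENA_accession' and
# 'ENA_submission_data' are the literal placeholders written by the loop
# above for downstream tooling to fill in; pubmed_id is left empty):
#
#   alias    status  accession      title     study_type  study_abstract  pubmed_id  submission_date
#   study_1  add     ENA_accession  My study  ...         ...                        ENA_submission_data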