Mercurial > repos > iuc > ena_upload
diff process_xlsx.py @ 1:57251c760cab draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ffea061c1ad6e7291abfe220230dbdbe8d19a2bd"
author | iuc |
---|---|
date | Fri, 30 Apr 2021 12:09:25 +0000 |
parents | 382518f24d6d |
children | 9e2df763086c |
line wrap: on
line diff
--- a/process_xlsx.py Sat Nov 28 09:45:44 2020 +0000 +++ b/process_xlsx.py Fri Apr 30 12:09:25 2021 +0000 @@ -3,7 +3,7 @@ import sys import xlrd - +import yaml FILE_FORMAT = 'fastq' @@ -36,15 +36,45 @@ sheet_col_index = sheet_columns[expected_columns[col]] row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value # should check for duplicate alias/ids? - data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict + if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): + tmp = data_dict[xl_sheet.cell(row_id, index_col).value] + data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp] + data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) + else: + data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict return data_dict +def paste_xls2yaml(xlsx_path): + print('YAML -------------') + xls = xlrd.open_workbook(xlsx_path) + content_dict = {} + for sheet_name in xls.sheet_names(): + if sheet_name == 'controlled_vocabulary': + continue + xls_sheet = xls.sheet_by_name(sheet_name) + sheet_contents_dict = {} + colnames = [] + for col in range(xls_sheet.ncols): + colnames.append(xls_sheet.cell(0, col).value) + # skip first 2 rows (column names and suggestions) + for row_id in range(2, xls_sheet.nrows): + row_dict = {} + for col_id in range(0, xls_sheet.ncols): + row_dict[colnames[col_id]] = xls_sheet.cell(row_id, col_id).value + # should check for duplicate alias/ids? + sheet_contents_dict[row_id] = row_dict + content_dict[sheet_name] = sheet_contents_dict + yaml.dump(content_dict, sys.stdout) + print('YAML -------------') + + parser = argparse.ArgumentParser() parser.add_argument('--form', dest='xlsx_path', required=True) parser.add_argument('--out_dir', dest='out_path', required=True) parser.add_argument('--action', dest='action', required=True) parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true') +parser.add_argument('--verbose', dest='verbose', required=False, action='store_true') args = parser.parse_args() xl_workbook = xlrd.open_workbook(args.xlsx_path) @@ -77,9 +107,11 @@ xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') if xl_sheet.nrows < 3: raise ValueError('No experiments found in experiments sheet') -exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 'library_name', - 'library_strategy', 'library_source', 'library_selection', 'library_layout', - 'insert_size', 'library_construction_protocol', 'platform', 'instrument_model'] +exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', + 'library_name', 'library_strategy', 'library_source', 'library_selection', + 'library_layout', 'insert_size', 'library_construction_protocol', + 'platform', 'instrument_model'] + experiments_dict = extract_data(xl_sheet, exp_columns) # PARSE RUNS SHEET @@ -123,6 +155,8 @@ # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? # dt_oobj = datetime.now(tz=None) # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") +runs_included = [] +exp_included = [] for study_alias, study in studies_dict.items(): # study_alias = study_alias + '_' + timestamp studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], @@ -162,12 +196,37 @@ exp['library_construction_protocol'], exp['platform'], exp['instrument_model'], 'submission_date_ENA']) + '\n') + exp_included.append(exp_alias) for run_alias, run in runs_dict.items(): - if run['experiment_alias'] == exp_alias: - runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias, - run['file_name'], FILE_FORMAT, 'file_checksum', - 'submission_date_ENA']) + '\n') + # check that the experiments library_layout is set to paired + # when multiple entries are associated with the same run alias + if not isinstance(run, list): + runs_list = [run] + else: + runs_list = run + for run_entry in runs_list: + if run_entry['experiment_alias'] == exp_alias: + runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', + exp_alias, run_entry['file_name'], + FILE_FORMAT, 'file_checksum', + 'submission_date_ENA']) + '\n') + runs_included.append(run_alias) + +# check if any experiment or run was not associated with any sample +for run in runs_dict.keys(): + if run not in runs_included: + print(f'The run {run} is listed in the runs section but not associated with any \ + used experiment') + +for exp in experiments_dict.keys(): + if exp not in exp_included: + print(f'The experiment {exp} is listed in the experiments section but not associated \ + with any used sample') + studies_table.close() samples_table.close() experiments_table.close() runs_table.close() + +if args.verbose: + paste_xls2yaml(args.xlsx_path)