Mercurial > repos > iuc > ena_upload
comparison process_xlsx.py @ 0:382518f24d6d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 57b434bcf493554d060a99b65e66f274d5c00e0a"
| author | iuc |
|---|---|
| date | Sat, 28 Nov 2020 09:45:44 +0000 |
| parents | |
| children | 57251c760cab |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:382518f24d6d |
|---|---|
| 1 import argparse | |
| 2 import pathlib | |
| 3 import sys | |
| 4 | |
| 5 import xlrd | |
| 6 | |
| 7 | |
| 8 FILE_FORMAT = 'fastq' | |
| 9 | |
| 10 | |
| 11 def extract_data(xl_sheet, expected_columns): | |
| 12 """ | |
| 13 1. Check that the columns I expect are present in the sheet | |
| 14 (any order and mixed with others, it's just a verification that | |
| 15 the user filled the correct template) | |
| 16 2. Fill a dictionary with the rows data indexed by first column in list""" | |
| 17 sheet_columns = {} | |
| 18 for sh_col in range(xl_sheet.ncols): | |
| 19 if xl_sheet.cell(0, sh_col).value in expected_columns: | |
| 20 if xl_sheet.cell(0, sh_col).value in sheet_columns.keys(): | |
| 21 sys.exit("Duplicated columns") | |
| 22 else: | |
| 23 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col | |
| 24 for col in range(len(expected_columns)): | |
| 25 assert expected_columns[col] in sheet_columns.keys(), \ | |
| 26 "Expected column %s not found" % expected_columns[col] | |
| 27 | |
| 28 # fetch rows in a dict | |
| 29 data_dict = {} | |
| 30 # the first of the expected columns will be the index | |
| 31 index_col = sheet_columns[expected_columns[0]] | |
| 32 # skip first 2 rows: column names + comments rows | |
| 33 for row_id in range(2, xl_sheet.nrows): | |
| 34 row_dict = {} | |
| 35 for col in range(1, len(expected_columns)): | |
| 36 sheet_col_index = sheet_columns[expected_columns[col]] | |
| 37 row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value | |
| 38 # should check for duplicate alias/ids? | |
| 39 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict | |
| 40 return data_dict | |
| 41 | |
| 42 | |
| 43 parser = argparse.ArgumentParser() | |
| 44 parser.add_argument('--form', dest='xlsx_path', required=True) | |
| 45 parser.add_argument('--out_dir', dest='out_path', required=True) | |
| 46 parser.add_argument('--action', dest='action', required=True) | |
| 47 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true') | |
| 48 args = parser.parse_args() | |
| 49 | |
| 50 xl_workbook = xlrd.open_workbook(args.xlsx_path) | |
| 51 | |
| 52 # PARSE STUDIES | |
| 53 ################# | |
| 54 xl_sheet = xl_workbook.sheet_by_name('ENA_study') | |
| 55 if xl_sheet.nrows < 3: | |
| 56 raise ValueError('No entries found in studies sheet') | |
| 57 studies_dict = {} | |
| 58 studies_col = ['alias', 'title', 'study_type', 'study_abstract'] | |
| 59 studies_dict = extract_data(xl_sheet, studies_col) | |
| 60 | |
| 61 # PARSE SAMPLES | |
| 62 ################# | |
| 63 xl_sheet = xl_workbook.sheet_by_name('ENA_sample') | |
| 64 if xl_sheet.nrows < 3: | |
| 65 raise ValueError('No entries found in samples') | |
| 66 if args.viral_submission: | |
| 67 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description', | |
| 68 'geographic location (country and/or sea)', 'host common name', | |
| 69 'host health state', 'host sex', 'host scientific name', 'collector name', | |
| 70 'collection date', 'collecting institution', 'isolate'] | |
| 71 else: | |
| 72 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] | |
| 73 samples_dict = extract_data(xl_sheet, samples_cols) | |
| 74 | |
| 75 # PARSE EXPERIMENTS | |
| 76 ################# | |
| 77 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') | |
| 78 if xl_sheet.nrows < 3: | |
| 79 raise ValueError('No experiments found in experiments sheet') | |
| 80 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 'library_name', | |
| 81 'library_strategy', 'library_source', 'library_selection', 'library_layout', | |
| 82 'insert_size', 'library_construction_protocol', 'platform', 'instrument_model'] | |
| 83 experiments_dict = extract_data(xl_sheet, exp_columns) | |
| 84 | |
| 85 # PARSE RUNS SHEET | |
| 86 ################# | |
| 87 xl_sheet = xl_workbook.sheet_by_name('ENA_run') | |
| 88 if xl_sheet.nrows < 3: | |
| 89 raise ValueError('No entries found in runs sheet') | |
| 90 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format'] | |
| 91 runs_dict = extract_data(xl_sheet, run_cols) | |
| 92 | |
| 93 # WRITE HEADERS TO TABLES | |
| 94 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') | |
| 95 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', | |
| 96 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') | |
| 97 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') | |
| 98 if args.viral_submission: | |
| 99 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', | |
| 100 'taxon_id', 'sample_description', 'collection_date', | |
| 101 'geographic_location', 'host_common_name', 'host_subject_id', | |
| 102 'host_health_state', 'host_sex', 'host_scientific_name', | |
| 103 'collector_name', 'collecting_institution', 'isolate', | |
| 104 'submission_date']) + '\n') | |
| 105 else: | |
| 106 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', | |
| 107 'taxon_id', 'sample_description', 'submission_date']) + '\n') | |
| 108 | |
| 109 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') | |
| 110 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', | |
| 111 'sample_alias', 'design_description', 'library_name', | |
| 112 'library_strategy', 'library_source', 'library_selection', | |
| 113 'library_layout', 'insert_size', 'library_construction_protocol', | |
| 114 'platform', 'instrument_model', 'submission_date']) + '\n') | |
| 115 | |
| 116 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w') | |
| 117 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name', | |
| 118 'file_format', 'file_checksum', 'submission_date']) + '\n') | |
| 119 action = args.action | |
| 120 | |
| 121 # WRITE DICTIONARIES TO TABLE FILES | |
| 122 | |
| 123 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? | |
| 124 # dt_oobj = datetime.now(tz=None) | |
| 125 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") | |
| 126 for study_alias, study in studies_dict.items(): | |
| 127 # study_alias = study_alias + '_' + timestamp | |
| 128 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], | |
| 129 study['study_type'], study['study_abstract'], '', | |
| 130 'ENA_submission_data']) + '\n') # assuming no pubmed_id | |
| 131 for sample_alias, sample in samples_dict.items(): | |
| 132 # sample_alias = sample_alias + '_' + timestamp | |
| 133 if args.viral_submission: | |
| 134 if sample['collector name'] == '': | |
| 135 sample['collector name'] = 'unknown' | |
| 136 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], | |
| 137 sample['scientific_name'], 'tax_id_updated_by_ENA', | |
| 138 sample['sample_description'], sample['collection date'], | |
| 139 sample['geographic location (country and/or sea)'], | |
| 140 sample['host common name'], 'host subject id', | |
| 141 sample['host health state'], sample['host sex'], | |
| 142 sample['host scientific name'], sample['collector name'], | |
| 143 sample['collecting institution'], sample['isolate'], | |
| 144 'ENA_submission_date']) + '\n') | |
| 145 else: | |
| 146 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], | |
| 147 sample['scientific_name'], 'tax_id_updated_by_ENA', | |
| 148 sample['sample_description']]) + '\n') | |
| 149 for exp_alias, exp in experiments_dict.items(): | |
| 150 # should I check here if any experiment has a study or sample alias that is incorrect? | |
| 151 # (not listed in the samples or study dict) | |
| 152 # process the experiments for this sample | |
| 153 if exp['sample_alias'] == sample_alias: | |
| 154 lib_alias = 'library_' + exp_alias + '_' + exp['sample_alias'] | |
| 155 experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'], | |
| 156 exp['study_alias'], sample_alias, | |
| 157 exp['design_description'], lib_alias, | |
| 158 exp['library_strategy'], exp['library_source'], | |
| 159 exp['library_selection'], | |
| 160 exp['library_layout'].lower(), | |
| 161 str(int(exp['insert_size'])), | |
| 162 exp['library_construction_protocol'], | |
| 163 exp['platform'], exp['instrument_model'], | |
| 164 'submission_date_ENA']) + '\n') | |
| 165 for run_alias, run in runs_dict.items(): | |
| 166 if run['experiment_alias'] == exp_alias: | |
| 167 runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias, | |
| 168 run['file_name'], FILE_FORMAT, 'file_checksum', | |
| 169 'submission_date_ENA']) + '\n') | |
| 170 studies_table.close() | |
| 171 samples_table.close() | |
| 172 experiments_table.close() | |
| 173 runs_table.close() |
