Mercurial > repos > iuc > ena_upload
comparison process_xlsx.py @ 2:9e2df763086c draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 1eed23745846ce215e9bdc4a4934d6bc8f41b24e"
| author | iuc |
|---|---|
| date | Thu, 15 Jul 2021 20:12:34 +0000 |
| parents | 57251c760cab |
| children | 59bb6d34fca6 |
Comparison legend — row types: equal (unchanged), deleted (removed in new revision), inserted (added in new revision), replaced (modified).
| 1:57251c760cab | 2:9e2df763086c |
|---|---|
| 2 import pathlib | 2 import pathlib |
| 3 import sys | 3 import sys |
| 4 | 4 |
| 5 import xlrd | 5 import xlrd |
| 6 import yaml | 6 import yaml |
| 7 from mappings import optional_samples_cols_mapping | |
| 7 | 8 |
| 8 FILE_FORMAT = 'fastq' | 9 FILE_FORMAT = 'fastq' |
| 9 | 10 |
| 10 | 11 |
| 11 def extract_data(xl_sheet, expected_columns): | 12 def extract_data(xl_sheet, expected_columns, optional_cols=None): |
| 12 """ | 13 """ |
| 13 1. Check that the columns I expect are present in the sheet | 14 1. Check that the columns I expect are present in the sheet |
| 14 (any order and mixed with others, it's just a verification that | 15 (any order and mixed with others, it's just a verification that |
| 15 the user filled the correct template) | 16 the user filled the correct template) |
| 16 2. Fill a dictionary with the rows data indexed by first column in list""" | 17 2. Fill a dictionary with the rows data indexed by first column in list""" |
| 17 sheet_columns = {} | 18 sheet_columns = {} |
| 19 if optional_cols is None: | |
| 20 optional_cols = [] | |
| 21 optional_cols_loaded = [] | |
| 18 for sh_col in range(xl_sheet.ncols): | 22 for sh_col in range(xl_sheet.ncols): |
| 19 if xl_sheet.cell(0, sh_col).value in expected_columns: | 23 if (xl_sheet.cell(0, sh_col).value in expected_columns) \ |
| 24 or (xl_sheet.cell(0, sh_col).value in optional_cols): | |
| 20 if xl_sheet.cell(0, sh_col).value in sheet_columns.keys(): | 25 if xl_sheet.cell(0, sh_col).value in sheet_columns.keys(): |
| 21 sys.exit("Duplicated columns") | 26 sys.exit("Duplicated columns found") |
| 22 else: | 27 else: |
| 23 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col | 28 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col |
| 29 if xl_sheet.cell(0, sh_col).value in optional_cols: | |
| 30 # store the list of optional cols available | |
| 31 optional_cols_loaded.append(xl_sheet.cell(0, sh_col).value) | |
| 32 provided_cols = expected_columns + optional_cols_loaded | |
| 33 | |
| 34 # check that the required columns are all present | |
| 35 # TODO: revise this for optional columns | |
| 24 for col in range(len(expected_columns)): | 36 for col in range(len(expected_columns)): |
| 25 assert expected_columns[col] in sheet_columns.keys(), \ | 37 assert expected_columns[col] in sheet_columns.keys(), \ |
| 26 "Expected column %s not found" % expected_columns[col] | 38 "Expected column %s not found" % expected_columns[col] |
| 27 | 39 |
| 28 # fetch rows in a dict | 40 # fetch rows in a dict |
| 30 # the first of the expected columns will be the index | 42 # the first of the expected columns will be the index |
| 31 index_col = sheet_columns[expected_columns[0]] | 43 index_col = sheet_columns[expected_columns[0]] |
| 32 # skip first 2 rows: column names + comments rows | 44 # skip first 2 rows: column names + comments rows |
| 33 for row_id in range(2, xl_sheet.nrows): | 45 for row_id in range(2, xl_sheet.nrows): |
| 34 row_dict = {} | 46 row_dict = {} |
| 35 for col in range(1, len(expected_columns)): | 47 for col in range(1, len(provided_cols)): |
| 36 sheet_col_index = sheet_columns[expected_columns[col]] | 48 sheet_col_index = sheet_columns[provided_cols[col]] |
| 37 row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value | 49 row_dict[provided_cols[col]] = xl_sheet.cell(row_id, sheet_col_index).value |
| 38 # should check for duplicate alias/ids? | 50 # should check for duplicate alias/ids? |
| 39 if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): | 51 if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): |
| 40 tmp = data_dict[xl_sheet.cell(row_id, index_col).value] | 52 tmp = data_dict[xl_sheet.cell(row_id, index_col).value] |
| 41 data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp] | 53 data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp] |
| 42 data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) | 54 data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) |
| 43 else: | 55 else: |
| 44 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict | 56 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict |
| 45 return data_dict | 57 return data_dict, optional_cols_loaded |
| 46 | 58 |
| 47 | 59 |
| 48 def paste_xls2yaml(xlsx_path): | 60 def paste_xls2yaml(xlsx_path): |
| 49 print('YAML -------------') | 61 print('YAML -------------') |
| 50 xls = xlrd.open_workbook(xlsx_path) | 62 xls = xlrd.open_workbook(xlsx_path) |
| 84 xl_sheet = xl_workbook.sheet_by_name('ENA_study') | 96 xl_sheet = xl_workbook.sheet_by_name('ENA_study') |
| 85 if xl_sheet.nrows < 3: | 97 if xl_sheet.nrows < 3: |
| 86 raise ValueError('No entries found in studies sheet') | 98 raise ValueError('No entries found in studies sheet') |
| 87 studies_dict = {} | 99 studies_dict = {} |
| 88 studies_col = ['alias', 'title', 'study_type', 'study_abstract'] | 100 studies_col = ['alias', 'title', 'study_type', 'study_abstract'] |
| 89 studies_dict = extract_data(xl_sheet, studies_col) | 101 studies_dict, _ = extract_data(xl_sheet, studies_col) |
| 90 | 102 |
| 91 # PARSE SAMPLES | 103 # PARSE SAMPLES |
| 92 ################# | 104 ################# |
| 93 xl_sheet = xl_workbook.sheet_by_name('ENA_sample') | 105 xl_sheet = xl_workbook.sheet_by_name('ENA_sample') |
| 94 if xl_sheet.nrows < 3: | 106 if xl_sheet.nrows < 3: |
| 95 raise ValueError('No entries found in samples') | 107 raise ValueError('No entries found in samples') |
| 108 | |
| 109 samples_cols_excel = ['alias', 'title', 'scientific_name', 'sample_description'] | |
| 110 # optional_samples_cols_mapping = {} | |
| 96 if args.viral_submission: | 111 if args.viral_submission: |
| 97 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description', | 112 # load columns names from the table |
| 98 'geographic location (country and/or sea)', 'host common name', | 113 samples_cols_excel = samples_cols_excel + ['geographic location (country and/or sea)', |
| 99 'host health state', 'host sex', 'host scientific name', 'collector name', | 114 'host common name', 'host health state', |
| 100 'collection date', 'collecting institution', 'isolate'] | 115 'host sex', 'host scientific name', 'collector name', |
| 101 else: | 116 'collecting institution', 'isolate'] |
| 102 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] | 117 |
| 103 samples_dict = extract_data(xl_sheet, samples_cols) | 118 samples_dict, samples_optional_cols_loaded = extract_data(xl_sheet, samples_cols_excel, |
| 104 | 119 optional_samples_cols_mapping.keys()) |
| 105 # PARSE EXPERIMENTS | 120 # PARSE EXPERIMENTS |
| 106 ################# | 121 ################# |
| 107 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') | 122 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') |
| 108 if xl_sheet.nrows < 3: | 123 if xl_sheet.nrows < 3: |
| 109 raise ValueError('No experiments found in experiments sheet') | 124 raise ValueError('No experiments found in experiments sheet') |
| 110 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', | 125 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', |
| 111 'library_name', 'library_strategy', 'library_source', 'library_selection', | 126 'library_name', 'library_strategy', 'library_source', 'library_selection', |
| 112 'library_layout', 'insert_size', 'library_construction_protocol', | 127 'library_layout', 'insert_size', 'library_construction_protocol', |
| 113 'platform', 'instrument_model'] | 128 'platform', 'instrument_model'] |
| 114 | 129 |
| 115 experiments_dict = extract_data(xl_sheet, exp_columns) | 130 experiments_dict, _ = extract_data(xl_sheet, exp_columns) |
| 116 | 131 |
| 117 # PARSE RUNS SHEET | 132 # PARSE RUNS SHEET |
| 118 ################# | 133 ################# |
| 119 xl_sheet = xl_workbook.sheet_by_name('ENA_run') | 134 xl_sheet = xl_workbook.sheet_by_name('ENA_run') |
| 120 if xl_sheet.nrows < 3: | 135 if xl_sheet.nrows < 3: |
| 121 raise ValueError('No entries found in runs sheet') | 136 raise ValueError('No entries found in runs sheet') |
| 122 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format'] | 137 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format'] |
| 123 runs_dict = extract_data(xl_sheet, run_cols) | 138 runs_dict, _ = extract_data(xl_sheet, run_cols) |
| 124 | 139 |
| 125 # WRITE HEADERS TO TABLES | 140 # WRITE HEADERS TO TABLES |
| 126 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') | 141 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') |
| 127 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', | 142 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', |
| 128 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') | 143 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') |
| 129 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') | 144 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') |
| 145 | |
| 146 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] | |
| 147 # extend the samples_cols list to add the ones that are filled by the CLI | |
| 148 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date'] | |
| 130 if args.viral_submission: | 149 if args.viral_submission: |
| 131 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', | 150 # extend the samples columns with the viral specific data |
| 132 'taxon_id', 'sample_description', 'collection_date', | 151 samples_cols = samples_cols + ['geographic_location', 'host_common_name', |
| 133 'geographic_location', 'host_common_name', 'host_subject_id', | 152 'host_subject_id', 'host_health_state', 'host_sex', |
| 134 'host_health_state', 'host_sex', 'host_scientific_name', | 153 'host_scientific_name', 'collector_name', |
| 135 'collector_name', 'collecting_institution', 'isolate', | 154 'collecting_institution', 'isolate'] |
| 136 'submission_date']) + '\n') | 155 if len(samples_optional_cols_loaded) > 0: |
| 137 else: | 156 for optional_cols_excel in samples_optional_cols_loaded: |
| 138 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', | 157 samples_cols.append(optional_samples_cols_mapping[optional_cols_excel]) |
| 139 'taxon_id', 'sample_description', 'submission_date']) + '\n') | 158 samples_table.write('\t'.join(samples_cols) + '\n') |
| 140 | 159 |
| 141 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') | 160 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') |
| 142 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', | 161 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', |
| 143 'sample_alias', 'design_description', 'library_name', | 162 'sample_alias', 'design_description', 'library_name', |
| 144 'library_strategy', 'library_source', 'library_selection', | 163 'library_strategy', 'library_source', 'library_selection', |
| 162 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], | 181 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], |
| 163 study['study_type'], study['study_abstract'], '', | 182 study['study_type'], study['study_abstract'], '', |
| 164 'ENA_submission_data']) + '\n') # assuming no pubmed_id | 183 'ENA_submission_data']) + '\n') # assuming no pubmed_id |
| 165 for sample_alias, sample in samples_dict.items(): | 184 for sample_alias, sample in samples_dict.items(): |
| 166 # sample_alias = sample_alias + '_' + timestamp | 185 # sample_alias = sample_alias + '_' + timestamp |
| 186 samples_row_values = [sample_alias, sample['title'], sample['scientific_name'], | |
| 187 sample['sample_description'], action, 'ena_accession', | |
| 188 'tax_id_updated_by_ENA', 'ENA_submission_date'] | |
| 167 if args.viral_submission: | 189 if args.viral_submission: |
| 190 # add the values that are unique for the viral samples | |
| 168 if sample['collector name'] == '': | 191 if sample['collector name'] == '': |
| 169 sample['collector name'] = 'unknown' | 192 sample['collector name'] = 'unknown' |
| 170 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], | 193 samples_row_values = samples_row_values + \ |
| 171 sample['scientific_name'], 'tax_id_updated_by_ENA', | 194 [sample['geographic location (country and/or sea)'], sample['host common name'], |
| 172 sample['sample_description'], sample['collection date'], | 195 'host subject id', sample['host health state'], sample['host sex'], |
| 173 sample['geographic location (country and/or sea)'], | 196 sample['host scientific name'], sample['collector name'], |
| 174 sample['host common name'], 'host subject id', | 197 sample['collecting institution'], sample['isolate']] |
| 175 sample['host health state'], sample['host sex'], | 198 # add the (possible) optional columns values |
| 176 sample['host scientific name'], sample['collector name'], | 199 if len(samples_optional_cols_loaded) > 0: |
| 177 sample['collecting institution'], sample['isolate'], | 200 for optional_col in samples_optional_cols_loaded: |
| 178 'ENA_submission_date']) + '\n') | 201 # parse values stored as in excel date format (=float) |
| 179 else: | 202 if optional_col in ('collection date', 'receipt date'): |
| 180 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], | 203 # check if excel stored it as date |
| 181 sample['scientific_name'], 'tax_id_updated_by_ENA', | 204 if isinstance(sample[optional_col], float): |
| 182 sample['sample_description']]) + '\n') | 205 year, month, day, hour, minute, second = xlrd.xldate_as_tuple( |
| 206 sample[optional_col], xl_workbook.datemode) | |
| 207 month = "{:02d}".format(month) | |
| 208 day = "{:02d}".format(day) | |
| 209 hour = "{:02d}".format(hour) | |
| 210 minute = "{:02d}".format(minute) | |
| 211 second = "{:02d}".format(second) | |
| 212 # format it as 2008-01-23T19:23:10 | |
| 213 sample[optional_col] = str(year) + '-' + str(month) + '-' + str(day) + \ | |
| 214 'T' + str(hour) + ':' + str(minute) + ':' + str(second) | |
| 215 # excel stores everything as float so I need to check if | |
| 216 # the value was actually an int and keep it as int | |
| 217 if isinstance(sample[optional_col], float): | |
| 218 if int(sample[optional_col]) == sample[optional_col]: | |
| 219 # it is not really a float but an int | |
| 220 sample[optional_col] = int(sample[optional_col]) | |
| 221 samples_row_values.append(str(sample[optional_col])) | |
| 222 samples_table.write('\t'.join(samples_row_values) + '\n') | |
| 223 | |
| 183 for exp_alias, exp in experiments_dict.items(): | 224 for exp_alias, exp in experiments_dict.items(): |
| 184 # should I check here if any experiment has a study or sample alias that is incorrect? | 225 # should I check here if any experiment has a study or sample alias that is incorrect? |
| 185 # (not listed in the samples or study dict) | 226 # (not listed in the samples or study dict) |
| 186 # process the experiments for this sample | 227 # process the experiments for this sample |
| 187 if exp['sample_alias'] == sample_alias: | 228 if exp['sample_alias'] == sample_alias: |
