Mercurial > repos > iuc > ena_upload
Comparison view of process_xlsx.py at changeset 2:9e2df763086c (draft)
Commit message: "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 1eed23745846ce215e9bdc4a4934d6bc8f41b24e"
Author: iuc
Date: Thu, 15 Jul 2021 20:12:34 +0000
Parent changeset: 57251c760cab
Child changeset: 59bb6d34fca6
Diff legend: equal / deleted / inserted / replaced
Comparing revisions: 1:57251c760cab (left column) vs 2:9e2df763086c (right column)
2 import pathlib | 2 import pathlib |
3 import sys | 3 import sys |
4 | 4 |
5 import xlrd | 5 import xlrd |
6 import yaml | 6 import yaml |
7 from mappings import optional_samples_cols_mapping | |
7 | 8 |
8 FILE_FORMAT = 'fastq' | 9 FILE_FORMAT = 'fastq' |
9 | 10 |
10 | 11 |
11 def extract_data(xl_sheet, expected_columns): | 12 def extract_data(xl_sheet, expected_columns, optional_cols=None): |
12 """ | 13 """ |
13 1. Check that the columns I expect are present in the sheet | 14 1. Check that the columns I expect are present in the sheet |
14 (any order and mixed with others, it's just a verification that | 15 (any order and mixed with others, it's just a verification that |
15 the user filled the correct template) | 16 the user filled the correct template) |
16 2. Fill a dictionary with the rows data indexed by first column in list""" | 17 2. Fill a dictionary with the rows data indexed by first column in list""" |
17 sheet_columns = {} | 18 sheet_columns = {} |
19 if optional_cols is None: | |
20 optional_cols = [] | |
21 optional_cols_loaded = [] | |
18 for sh_col in range(xl_sheet.ncols): | 22 for sh_col in range(xl_sheet.ncols): |
19 if xl_sheet.cell(0, sh_col).value in expected_columns: | 23 if (xl_sheet.cell(0, sh_col).value in expected_columns) \ |
24 or (xl_sheet.cell(0, sh_col).value in optional_cols): | |
20 if xl_sheet.cell(0, sh_col).value in sheet_columns.keys(): | 25 if xl_sheet.cell(0, sh_col).value in sheet_columns.keys(): |
21 sys.exit("Duplicated columns") | 26 sys.exit("Duplicated columns found") |
22 else: | 27 else: |
23 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col | 28 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col |
29 if xl_sheet.cell(0, sh_col).value in optional_cols: | |
30 # store the list of optional cols available | |
31 optional_cols_loaded.append(xl_sheet.cell(0, sh_col).value) | |
32 provided_cols = expected_columns + optional_cols_loaded | |
33 | |
34 # check that the required columns are all present | |
35 # TODO: revise this for optional columns | |
24 for col in range(len(expected_columns)): | 36 for col in range(len(expected_columns)): |
25 assert expected_columns[col] in sheet_columns.keys(), \ | 37 assert expected_columns[col] in sheet_columns.keys(), \ |
26 "Expected column %s not found" % expected_columns[col] | 38 "Expected column %s not found" % expected_columns[col] |
27 | 39 |
28 # fetch rows in a dict | 40 # fetch rows in a dict |
30 # the first of the expected columns will be the index | 42 # the first of the expected columns will be the index |
31 index_col = sheet_columns[expected_columns[0]] | 43 index_col = sheet_columns[expected_columns[0]] |
32 # skip first 2 rows: column names + comments rows | 44 # skip first 2 rows: column names + comments rows |
33 for row_id in range(2, xl_sheet.nrows): | 45 for row_id in range(2, xl_sheet.nrows): |
34 row_dict = {} | 46 row_dict = {} |
35 for col in range(1, len(expected_columns)): | 47 for col in range(1, len(provided_cols)): |
36 sheet_col_index = sheet_columns[expected_columns[col]] | 48 sheet_col_index = sheet_columns[provided_cols[col]] |
37 row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value | 49 row_dict[provided_cols[col]] = xl_sheet.cell(row_id, sheet_col_index).value |
38 # should check for duplicate alias/ids? | 50 # should check for duplicate alias/ids? |
39 if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): | 51 if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): |
40 tmp = data_dict[xl_sheet.cell(row_id, index_col).value] | 52 tmp = data_dict[xl_sheet.cell(row_id, index_col).value] |
41 data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp] | 53 data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp] |
42 data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) | 54 data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) |
43 else: | 55 else: |
44 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict | 56 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict |
45 return data_dict | 57 return data_dict, optional_cols_loaded |
46 | 58 |
47 | 59 |
48 def paste_xls2yaml(xlsx_path): | 60 def paste_xls2yaml(xlsx_path): |
49 print('YAML -------------') | 61 print('YAML -------------') |
50 xls = xlrd.open_workbook(xlsx_path) | 62 xls = xlrd.open_workbook(xlsx_path) |
84 xl_sheet = xl_workbook.sheet_by_name('ENA_study') | 96 xl_sheet = xl_workbook.sheet_by_name('ENA_study') |
85 if xl_sheet.nrows < 3: | 97 if xl_sheet.nrows < 3: |
86 raise ValueError('No entries found in studies sheet') | 98 raise ValueError('No entries found in studies sheet') |
87 studies_dict = {} | 99 studies_dict = {} |
88 studies_col = ['alias', 'title', 'study_type', 'study_abstract'] | 100 studies_col = ['alias', 'title', 'study_type', 'study_abstract'] |
89 studies_dict = extract_data(xl_sheet, studies_col) | 101 studies_dict, _ = extract_data(xl_sheet, studies_col) |
90 | 102 |
91 # PARSE SAMPLES | 103 # PARSE SAMPLES |
92 ################# | 104 ################# |
93 xl_sheet = xl_workbook.sheet_by_name('ENA_sample') | 105 xl_sheet = xl_workbook.sheet_by_name('ENA_sample') |
94 if xl_sheet.nrows < 3: | 106 if xl_sheet.nrows < 3: |
95 raise ValueError('No entries found in samples') | 107 raise ValueError('No entries found in samples') |
108 | |
109 samples_cols_excel = ['alias', 'title', 'scientific_name', 'sample_description'] | |
110 # optional_samples_cols_mapping = {} | |
96 if args.viral_submission: | 111 if args.viral_submission: |
97 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description', | 112 # load columns names from the table |
98 'geographic location (country and/or sea)', 'host common name', | 113 samples_cols_excel = samples_cols_excel + ['geographic location (country and/or sea)', |
99 'host health state', 'host sex', 'host scientific name', 'collector name', | 114 'host common name', 'host health state', |
100 'collection date', 'collecting institution', 'isolate'] | 115 'host sex', 'host scientific name', 'collector name', |
101 else: | 116 'collecting institution', 'isolate'] |
102 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] | 117 |
103 samples_dict = extract_data(xl_sheet, samples_cols) | 118 samples_dict, samples_optional_cols_loaded = extract_data(xl_sheet, samples_cols_excel, |
104 | 119 optional_samples_cols_mapping.keys()) |
105 # PARSE EXPERIMENTS | 120 # PARSE EXPERIMENTS |
106 ################# | 121 ################# |
107 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') | 122 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') |
108 if xl_sheet.nrows < 3: | 123 if xl_sheet.nrows < 3: |
109 raise ValueError('No experiments found in experiments sheet') | 124 raise ValueError('No experiments found in experiments sheet') |
110 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', | 125 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', |
111 'library_name', 'library_strategy', 'library_source', 'library_selection', | 126 'library_name', 'library_strategy', 'library_source', 'library_selection', |
112 'library_layout', 'insert_size', 'library_construction_protocol', | 127 'library_layout', 'insert_size', 'library_construction_protocol', |
113 'platform', 'instrument_model'] | 128 'platform', 'instrument_model'] |
114 | 129 |
115 experiments_dict = extract_data(xl_sheet, exp_columns) | 130 experiments_dict, _ = extract_data(xl_sheet, exp_columns) |
116 | 131 |
117 # PARSE RUNS SHEET | 132 # PARSE RUNS SHEET |
118 ################# | 133 ################# |
119 xl_sheet = xl_workbook.sheet_by_name('ENA_run') | 134 xl_sheet = xl_workbook.sheet_by_name('ENA_run') |
120 if xl_sheet.nrows < 3: | 135 if xl_sheet.nrows < 3: |
121 raise ValueError('No entries found in runs sheet') | 136 raise ValueError('No entries found in runs sheet') |
122 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format'] | 137 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format'] |
123 runs_dict = extract_data(xl_sheet, run_cols) | 138 runs_dict, _ = extract_data(xl_sheet, run_cols) |
124 | 139 |
125 # WRITE HEADERS TO TABLES | 140 # WRITE HEADERS TO TABLES |
126 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') | 141 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') |
127 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', | 142 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', |
128 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') | 143 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') |
129 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') | 144 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') |
145 | |
146 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] | |
147 # extend the samples_cols list to add the ones that are filled by the CLI | |
148 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date'] | |
130 if args.viral_submission: | 149 if args.viral_submission: |
131 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', | 150 # extend the samples columns with the viral specific data |
132 'taxon_id', 'sample_description', 'collection_date', | 151 samples_cols = samples_cols + ['geographic_location', 'host_common_name', |
133 'geographic_location', 'host_common_name', 'host_subject_id', | 152 'host_subject_id', 'host_health_state', 'host_sex', |
134 'host_health_state', 'host_sex', 'host_scientific_name', | 153 'host_scientific_name', 'collector_name', |
135 'collector_name', 'collecting_institution', 'isolate', | 154 'collecting_institution', 'isolate'] |
136 'submission_date']) + '\n') | 155 if len(samples_optional_cols_loaded) > 0: |
137 else: | 156 for optional_cols_excel in samples_optional_cols_loaded: |
138 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', | 157 samples_cols.append(optional_samples_cols_mapping[optional_cols_excel]) |
139 'taxon_id', 'sample_description', 'submission_date']) + '\n') | 158 samples_table.write('\t'.join(samples_cols) + '\n') |
140 | 159 |
141 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') | 160 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') |
142 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', | 161 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', |
143 'sample_alias', 'design_description', 'library_name', | 162 'sample_alias', 'design_description', 'library_name', |
144 'library_strategy', 'library_source', 'library_selection', | 163 'library_strategy', 'library_source', 'library_selection', |
162 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], | 181 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], |
163 study['study_type'], study['study_abstract'], '', | 182 study['study_type'], study['study_abstract'], '', |
164 'ENA_submission_data']) + '\n') # assuming no pubmed_id | 183 'ENA_submission_data']) + '\n') # assuming no pubmed_id |
165 for sample_alias, sample in samples_dict.items(): | 184 for sample_alias, sample in samples_dict.items(): |
166 # sample_alias = sample_alias + '_' + timestamp | 185 # sample_alias = sample_alias + '_' + timestamp |
186 samples_row_values = [sample_alias, sample['title'], sample['scientific_name'], | |
187 sample['sample_description'], action, 'ena_accession', | |
188 'tax_id_updated_by_ENA', 'ENA_submission_date'] | |
167 if args.viral_submission: | 189 if args.viral_submission: |
190 # add the values that are unique for the viral samples | |
168 if sample['collector name'] == '': | 191 if sample['collector name'] == '': |
169 sample['collector name'] = 'unknown' | 192 sample['collector name'] = 'unknown' |
170 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], | 193 samples_row_values = samples_row_values + \ |
171 sample['scientific_name'], 'tax_id_updated_by_ENA', | 194 [sample['geographic location (country and/or sea)'], sample['host common name'], |
172 sample['sample_description'], sample['collection date'], | 195 'host subject id', sample['host health state'], sample['host sex'], |
173 sample['geographic location (country and/or sea)'], | 196 sample['host scientific name'], sample['collector name'], |
174 sample['host common name'], 'host subject id', | 197 sample['collecting institution'], sample['isolate']] |
175 sample['host health state'], sample['host sex'], | 198 # add the (possible) optional columns values |
176 sample['host scientific name'], sample['collector name'], | 199 if len(samples_optional_cols_loaded) > 0: |
177 sample['collecting institution'], sample['isolate'], | 200 for optional_col in samples_optional_cols_loaded: |
178 'ENA_submission_date']) + '\n') | 201 # parse values stored as in excel date format (=float) |
179 else: | 202 if optional_col in ('collection date', 'receipt date'): |
180 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], | 203 # check if excel stored it as date |
181 sample['scientific_name'], 'tax_id_updated_by_ENA', | 204 if isinstance(sample[optional_col], float): |
182 sample['sample_description']]) + '\n') | 205 year, month, day, hour, minute, second = xlrd.xldate_as_tuple( |
206 sample[optional_col], xl_workbook.datemode) | |
207 month = "{:02d}".format(month) | |
208 day = "{:02d}".format(day) | |
209 hour = "{:02d}".format(hour) | |
210 minute = "{:02d}".format(minute) | |
211 second = "{:02d}".format(second) | |
212 # format it as 2008-01-23T19:23:10 | |
213 sample[optional_col] = str(year) + '-' + str(month) + '-' + str(day) + \ | |
214 'T' + str(hour) + ':' + str(minute) + ':' + str(second) | |
215 # excel stores everything as float so I need to check if | |
216 # the value was actually an int and keep it as int | |
217 if isinstance(sample[optional_col], float): | |
218 if int(sample[optional_col]) == sample[optional_col]: | |
219 # it is not really a float but an int | |
220 sample[optional_col] = int(sample[optional_col]) | |
221 samples_row_values.append(str(sample[optional_col])) | |
222 samples_table.write('\t'.join(samples_row_values) + '\n') | |
223 | |
183 for exp_alias, exp in experiments_dict.items(): | 224 for exp_alias, exp in experiments_dict.items(): |
184 # should I check here if any experiment has a study or sample alias that is incorrect? | 225 # should I check here if any experiment has a study or sample alias that is incorrect? |
185 # (not listed in the samples or study dict) | 226 # (not listed in the samples or study dict) |
186 # process the experiments for this sample | 227 # process the experiments for this sample |
187 if exp['sample_alias'] == sample_alias: | 228 if exp['sample_alias'] == sample_alias: |