Mercurial > repos > iuc > ena_upload
comparison process_xlsx.py @ 4:26ccb678abc8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ba358013c83e7dfffec895946d36585f237e54c5"
author | iuc |
---|---|
date | Tue, 19 Oct 2021 15:57:14 +0000 |
parents | 59bb6d34fca6 |
children | 4aab5ae907b6 |
comparison
equal
deleted
inserted
replaced
3:59bb6d34fca6 | 4:26ccb678abc8 |
---|---|
2 import pathlib | 2 import pathlib |
3 import sys | 3 import sys |
4 | 4 |
5 import xlrd | 5 import xlrd |
6 import yaml | 6 import yaml |
7 from check_remote import check_remote_entry | |
7 from mappings import optional_samples_cols_mapping | 8 from mappings import optional_samples_cols_mapping |
8 | 9 |
9 FILE_FORMAT = 'fastq' | 10 FILE_FORMAT = 'fastq' |
11 | |
12 | |
13 def identify_action(entry_type, alias): | |
14 ''' define action ['add' | 'modify'] that needs to be performed for this entry ''' | |
15 query = {entry_type + '_alias': alias} | |
16 remote_accessions = check_remote_entry(entry_type, query) | |
17 if len(remote_accessions) > 0: | |
18 print(f'Found: {entry_type} entry with alias {alias}') | |
19 return 'modify' | |
20 else: | |
21 print(f'No {entry_type} entry found with alias {alias}') | |
22 return 'add' | |
10 | 23 |
11 | 24 |
12 def extract_data(xl_sheet, expected_columns, optional_cols=None): | 25 def extract_data(xl_sheet, expected_columns, optional_cols=None): |
13 """ | 26 """ |
14 1. Check that the columns I expect are present in the sheet | 27 1. Check that the columns I expect are present in the sheet |
84 parser = argparse.ArgumentParser() | 97 parser = argparse.ArgumentParser() |
85 parser.add_argument('--form', dest='xlsx_path', required=True) | 98 parser.add_argument('--form', dest='xlsx_path', required=True) |
86 parser.add_argument('--out_dir', dest='out_path', required=True) | 99 parser.add_argument('--out_dir', dest='out_path', required=True) |
87 parser.add_argument('--action', dest='action', required=True) | 100 parser.add_argument('--action', dest='action', required=True) |
88 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true') | 101 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true') |
102 parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true') | |
89 parser.add_argument('--verbose', dest='verbose', required=False, action='store_true') | 103 parser.add_argument('--verbose', dest='verbose', required=False, action='store_true') |
90 args = parser.parse_args() | 104 args = parser.parse_args() |
91 | 105 |
92 xl_workbook = xlrd.open_workbook(args.xlsx_path) | 106 xl_workbook = xlrd.open_workbook(args.xlsx_path) |
93 | 107 |
146 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] | 160 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] |
147 # extend the samples_cols list to add the ones that are filled by the CLI | 161 # extend the samples_cols list to add the ones that are filled by the CLI |
148 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date'] | 162 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date'] |
149 if args.viral_submission: | 163 if args.viral_submission: |
150 # extend the samples columns with the viral specific data | 164 # extend the samples columns with the viral specific data |
151 samples_cols = samples_cols + ['geographic_location', 'host_common_name', | 165 samples_cols = samples_cols + ['geographic location (country and/or sea)', 'host common name', |
152 'host_subject_id', 'host_health_state', 'host_sex', | 166 'host subject id', 'host health state', 'host sex', |
153 'host_scientific_name', 'collector_name', | 167 'host scientific name', 'collector name', |
154 'collecting_institution', 'isolate'] | 168 'collecting institution', 'isolate'] |
155 if len(samples_optional_cols_loaded) > 0: | 169 if len(samples_optional_cols_loaded) > 0: |
156 for optional_cols_excel in samples_optional_cols_loaded: | 170 for optional_cols_excel in samples_optional_cols_loaded: |
157 samples_cols.append(optional_samples_cols_mapping[optional_cols_excel]) | 171 samples_cols.append(optional_samples_cols_mapping[optional_cols_excel]) |
158 samples_table.write('\t'.join(samples_cols) + '\n') | 172 samples_table.write('\t'.join(samples_cols) + '\n') |
159 | 173 |
166 | 180 |
167 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w') | 181 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w') |
168 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name', | 182 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name', |
169 'file_format', 'file_checksum', 'submission_date']) + '\n') | 183 'file_format', 'file_checksum', 'submission_date']) + '\n') |
170 action = args.action | 184 action = args.action |
171 | 185 # actionable_items |
172 # WRITE DICTIONARIES TO TABLE FILES | 186 # WRITE DICTIONARIES TO TABLE FILES |
173 | 187 |
174 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? | 188 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? |
175 # dt_oobj = datetime.now(tz=None) | 189 # dt_oobj = datetime.now(tz=None) |
176 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") | 190 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") |
177 runs_included = [] | 191 runs_included = [] |
178 exp_included = [] | 192 exp_included = [] |
179 for study_alias, study in studies_dict.items(): | 193 for study_alias, study in studies_dict.items(): |
180 # study_alias = study_alias + '_' + timestamp | 194 # study_alias = study_alias + '_' + timestamp |
181 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], | 195 if args.dev_submission: |
196 entry_action = args.action | |
197 else: | |
198 entry_action = identify_action('study', study_alias) | |
199 studies_table.write('\t'.join([study_alias, entry_action, 'ENA_accession', study['title'], | |
182 study['study_type'], study['study_abstract'], '', | 200 study['study_type'], study['study_abstract'], '', |
183 'ENA_submission_data']) + '\n') # assuming no pubmed_id | 201 'ENA_submission_data']) + '\n') # assuming no pubmed_id |
184 for sample_alias, sample in samples_dict.items(): | 202 for sample_alias, sample in samples_dict.items(): |
185 # sample_alias = sample_alias + '_' + timestamp | 203 # sample_alias = sample_alias + '_' + timestamp |
204 if args.dev_submission: | |
205 entry_action = args.action | |
206 else: | |
207 entry_action = identify_action('sample', sample_alias) | |
186 samples_row_values = [sample_alias, sample['title'], sample['scientific_name'], | 208 samples_row_values = [sample_alias, sample['title'], sample['scientific_name'], |
187 sample['sample_description'], action, 'ena_accession', | 209 sample['sample_description'], entry_action, 'ena_accession', |
188 'tax_id_updated_by_ENA', 'ENA_submission_date'] | 210 '', 'ENA_submission_date'] |
189 if args.viral_submission: | 211 if args.viral_submission: |
190 # add the values that are unique for the viral samples | 212 # add the values that are unique for the viral samples |
191 if sample['collector name'] == '': | 213 if sample['collector name'] == '': |
192 sample['collector name'] = 'unknown' | 214 sample['collector name'] = 'unknown' |
193 samples_row_values = samples_row_values + \ | 215 samples_row_values = samples_row_values + \ |
228 for exp_alias, exp in experiments_dict.items(): | 250 for exp_alias, exp in experiments_dict.items(): |
229 # should I check here if any experiment has a study or sample alias that is incorrect? | 251 # should I check here if any experiment has a study or sample alias that is incorrect? |
230 # (not listed in the samples or study dict) | 252 # (not listed in the samples or study dict) |
231 # process the experiments for this sample | 253 # process the experiments for this sample |
232 if exp['sample_alias'] == sample_alias: | 254 if exp['sample_alias'] == sample_alias: |
233 experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'], | 255 # check the remote status |
256 if args.dev_submission: | |
257 entry_action = args.action | |
258 else: | |
259 entry_action = identify_action('experiment', exp_alias) | |
260 experiments_table.write('\t'.join([exp_alias, entry_action, 'accession_ena', exp['title'], | |
234 exp['study_alias'], sample_alias, | 261 exp['study_alias'], sample_alias, |
235 exp['design_description'], exp['library_name'], | 262 exp['design_description'], exp['library_name'], |
236 exp['library_strategy'], exp['library_source'], | 263 exp['library_strategy'], exp['library_source'], |
237 exp['library_selection'], | 264 exp['library_selection'], |
238 exp['library_layout'].lower(), | 265 exp['library_layout'].lower(), |
248 runs_list = [run] | 275 runs_list = [run] |
249 else: | 276 else: |
250 runs_list = run | 277 runs_list = run |
251 for run_entry in runs_list: | 278 for run_entry in runs_list: |
252 if run_entry['experiment_alias'] == exp_alias: | 279 if run_entry['experiment_alias'] == exp_alias: |
253 runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', | 280 if args.dev_submission: |
281 entry_action = args.action | |
282 else: | |
283 entry_action = identify_action('run', run_alias) | |
284 runs_table.write('\t'.join([run_alias, entry_action, 'ena_run_accession', | |
254 exp_alias, run_entry['file_name'], | 285 exp_alias, run_entry['file_name'], |
255 FILE_FORMAT, 'file_checksum', | 286 FILE_FORMAT, '', |
256 'submission_date_ENA']) + '\n') | 287 'submission_date_ENA']) + '\n') |
257 runs_included.append(run_alias) | 288 runs_included.append(run_alias) |
258 | 289 |
259 # check if any experiment or run was not associated with any sample | 290 # check if any experiment or run was not associated with any sample |
260 for run in runs_dict.keys(): | 291 for run in runs_dict.keys(): |