process_xlsx.py: comparison of 3:59bb6d34fca6 -> 4:26ccb678abc8 (draft)

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ba358013c83e7dfffec895946d36585f237e54c5"

author    iuc
date      Tue, 19 Oct 2021 15:57:14 +0000
parents   59bb6d34fca6
children  4aab5ae907b6

--- a/process_xlsx.py  3:59bb6d34fca6
+++ b/process_xlsx.py  4:26ccb678abc8
@@ -2,13 +2,26 @@
 import pathlib
 import sys
 
 import xlrd
 import yaml
+from check_remote import check_remote_entry
 from mappings import optional_samples_cols_mapping
 
 FILE_FORMAT = 'fastq'
+
+
+def identify_action(entry_type, alias):
+    ''' define action ['add' | 'modify'] that needs to be perfomed for this entry '''
+    query = {entry_type + '_alias': alias}
+    remote_accessions = check_remote_entry(entry_type, query)
+    if len(remote_accessions) > 0:
+        print(f'Found: {entry_type} entry with alias {alias}')
+        return 'modify'
+    else:
+        print(f'No {entry_type} entry found with alias {alias}')
+        return 'add'
 
 
 def extract_data(xl_sheet, expected_columns, optional_cols=None):
     """
     1. Check that the columns I expect are present in the sheet
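
A minimal, self-contained sketch of the decision the new identify_action() helper makes. check_remote_entry is stubbed out here (the real check_remote module queries ENA); the stub and the aliases are made up for illustration only:

    def check_remote_entry(entry_type, query):
        # stand-in for check_remote.check_remote_entry: pretend one study alias
        # is already registered remotely; the real module performs the ENA lookup
        known = {'study': ['study_0001']}
        alias = query[entry_type + '_alias']
        return [alias] if alias in known.get(entry_type, []) else []


    def identify_action(entry_type, alias):
        '''define action ['add' | 'modify'] that needs to be performed for this entry'''
        query = {entry_type + '_alias': alias}
        remote_accessions = check_remote_entry(entry_type, query)
        if len(remote_accessions) > 0:
            print(f'Found: {entry_type} entry with alias {alias}')
            return 'modify'
        print(f'No {entry_type} entry found with alias {alias}')
        return 'add'


    print(identify_action('study', 'study_0001'))    # alias known remotely  -> 'modify'
    print(identify_action('sample', 'sample_0001'))  # alias not found       -> 'add'
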
@@ -84,10 +97,11 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('--form', dest='xlsx_path', required=True)
 parser.add_argument('--out_dir', dest='out_path', required=True)
 parser.add_argument('--action', dest='action', required=True)
 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true')
 parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
 args = parser.parse_args()
 
 xl_workbook = xlrd.open_workbook(args.xlsx_path)
 
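
The new --dev switch is a plain store_true flag: when it is given, args.dev_submission is True and the later hunks reuse the single --action value for every row instead of querying ENA per entry. A tiny sketch of the parsing behaviour (the argument list handed to parse_args() is a hypothetical example, not from the changeset):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--action', dest='action', required=True)
    parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true')

    args = parser.parse_args(['--action', 'add', '--dev'])
    print(args.dev_submission)  # True  -> entry_action = args.action for every entry
    print(args.action)          # 'add'
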
@@ -146,14 +160,14 @@
 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
 # extend the samples_cols list to add the ones that are filled by the CLI
 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
 if args.viral_submission:
     # extend the samples columns with the viral specific data
-    samples_cols = samples_cols + ['geographic_location', 'host_common_name',
-                                   'host_subject_id', 'host_health_state', 'host_sex',
-                                   'host_scientific_name', 'collector_name',
-                                   'collecting_institution', 'isolate']
+    samples_cols = samples_cols + ['geographic location (country and/or sea)', 'host common name',
+                                   'host subject id', 'host health state', 'host sex',
+                                   'host scientific name', 'collector name',
+                                   'collecting institution', 'isolate']
 if len(samples_optional_cols_loaded) > 0:
     for optional_cols_excel in samples_optional_cols_loaded:
         samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
 samples_table.write('\t'.join(samples_cols) + '\n')
 
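
Put together, this is the samples.tsv header a viral submission now produces, assuming no optional columns were picked up from the spreadsheet; the viral fields switch from the old underscore_separated names to the ENA checklist display names:

    samples_cols = ['alias', 'title', 'scientific_name', 'sample_description',
                    'status', 'accession', 'taxon_id', 'submission_date',
                    'geographic location (country and/or sea)', 'host common name',
                    'host subject id', 'host health state', 'host sex',
                    'host scientific name', 'collector name',
                    'collecting institution', 'isolate']
    print('\t'.join(samples_cols))  # the tab-separated header row written to samples.tsv
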
@@ -166,28 +180,36 @@
 
 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
                             'file_format', 'file_checksum', 'submission_date']) + '\n')
 action = args.action
-
+# actionable_items
 # WRITE DICTIONARIES TO TABLE FILES
 
 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
 # dt_oobj = datetime.now(tz=None)
 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
 runs_included = []
 exp_included = []
 for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
-    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('study', study_alias)
+    studies_table.write('\t'.join([study_alias, entry_action, 'ENA_accession', study['title'],
                                    study['study_type'], study['study_abstract'], '',
                                    'ENA_submission_data']) + '\n')  # assuming no pubmed_id
 for sample_alias, sample in samples_dict.items():
     # sample_alias = sample_alias + '_' + timestamp
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('sample', sample_alias)
     samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
-                          sample['sample_description'], action, 'ena_accession',
-                          'tax_id_updated_by_ENA', 'ENA_submission_date']
+                          sample['sample_description'], entry_action, 'ena_accession',
+                          '', 'ENA_submission_date']
     if args.viral_submission:
         # add the values that are unique for the viral samples
         if sample['collector name'] == '':
             sample['collector name'] = 'unknown'
         samples_row_values = samples_row_values + \
@@ -228,11 +250,16 @@
     for exp_alias, exp in experiments_dict.items():
         # should I check here if any experiment has a study or sample alias that is incorrect?
         # (not listed in the samples or study dict)
         # process the experiments for this sample
         if exp['sample_alias'] == sample_alias:
-            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
+            # check the remote status
+            if args.dev_submission:
+                entry_action = args.action
+            else:
+                entry_action = identify_action('experiment', exp_alias)
+            experiments_table.write('\t'.join([exp_alias, entry_action, 'accession_ena', exp['title'],
                                                exp['study_alias'], sample_alias,
                                                exp['design_description'], exp['library_name'],
                                                exp['library_strategy'], exp['library_source'],
                                                exp['library_selection'],
                                                exp['library_layout'].lower(),
@@ -248,13 +275,17 @@
                     runs_list = [run]
                 else:
                     runs_list = run
                 for run_entry in runs_list:
                     if run_entry['experiment_alias'] == exp_alias:
-                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                        if args.dev_submission:
+                            entry_action = args.action
+                        else:
+                            entry_action = identify_action('run', run_alias)
+                        runs_table.write('\t'.join([run_alias, entry_action, 'ena_run_accession',
                                                     exp_alias, run_entry['file_name'],
-                                                    FILE_FORMAT, 'file_checksum',
+                                                    FILE_FORMAT, '',
                                                     'submission_date_ENA']) + '\n')
                         runs_included.append(run_alias)
 
 # check if any experiment or run was not associated with any sample
 for run in runs_dict.keys():
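
The same four-line decision is inlined at each of the four call sites (studies, samples, experiments, runs). As a summary only, the pattern each site follows can be sketched with a hypothetical helper (the name resolve_action is not part of the changeset, and it relies on the identify_action() added above):

    def resolve_action(args, entry_type, alias):
        # --dev given: trust the action passed on the command line, make no remote calls
        if args.dev_submission:
            return args.action
        # otherwise ask ENA whether an entry with this alias already exists:
        # 'modify' if it does, 'add' if it does not
        return identify_action(entry_type, alias)
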