process_xlsx.py: comparison of 3:59bb6d34fca6 -> 4:26ccb678abc8 (draft)

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ba358013c83e7dfffec895946d36585f237e54c5"

author    iuc
date      Tue, 19 Oct 2021 15:57:14 +0000
parents   59bb6d34fca6
children  4aab5ae907b6

--- a/process_xlsx.py  3:59bb6d34fca6
+++ b/process_xlsx.py  4:26ccb678abc8
@@ -2,13 +2,26 @@
 import pathlib
 import sys
 
 import xlrd
 import yaml
+from check_remote import check_remote_entry
 from mappings import optional_samples_cols_mapping
 
 FILE_FORMAT = 'fastq'
+
+
+def identify_action(entry_type, alias):
+    ''' define action ['add' | 'modify'] that needs to be perfomed for this entry '''
+    query = {entry_type + '_alias': alias}
+    remote_accessions = check_remote_entry(entry_type, query)
+    if len(remote_accessions) > 0:
+        print(f'Found: {entry_type} entry with alias {alias}')
+        return 'modify'
+    else:
+        print(f'No {entry_type} entry found with alias {alias}')
+        return 'add'
 
 
 def extract_data(xl_sheet, expected_columns, optional_cols=None):
     """
     1. Check that the columns I expect are present in the sheet
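
A minimal, self-contained sketch of the decision the new identify_action() helper makes. check_remote_entry is stubbed out here (the real check_remote module queries ENA); the stub and the aliases are made up for illustration only:

    def check_remote_entry(entry_type, query):
        # stand-in for check_remote.check_remote_entry: pretend one study alias
        # is already registered remotely; the real module performs the ENA lookup
        known = {'study': ['study_0001']}
        alias = query[entry_type + '_alias']
        return [alias] if alias in known.get(entry_type, []) else []


    def identify_action(entry_type, alias):
        '''define action ['add' | 'modify'] that needs to be performed for this entry'''
        query = {entry_type + '_alias': alias}
        remote_accessions = check_remote_entry(entry_type, query)
        if len(remote_accessions) > 0:
            print(f'Found: {entry_type} entry with alias {alias}')
            return 'modify'
        print(f'No {entry_type} entry found with alias {alias}')
        return 'add'


    print(identify_action('study', 'study_0001'))    # alias known remotely  -> 'modify'
    print(identify_action('sample', 'sample_0001'))  # alias not found       -> 'add'
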
@@ -84,10 +97,11 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('--form', dest='xlsx_path', required=True)
 parser.add_argument('--out_dir', dest='out_path', required=True)
 parser.add_argument('--action', dest='action', required=True)
 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true')
 parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
 args = parser.parse_args()
 
 xl_workbook = xlrd.open_workbook(args.xlsx_path)
 
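
The new --dev switch is a plain store_true flag: when it is given, args.dev_submission is True and the later hunks reuse the single --action value for every row instead of querying ENA per entry. A tiny sketch of the parsing behaviour (the argument list handed to parse_args() is a hypothetical example, not from the changeset):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--action', dest='action', required=True)
    parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true')

    args = parser.parse_args(['--action', 'add', '--dev'])
    print(args.dev_submission)  # True  -> entry_action = args.action for every entry
    print(args.action)          # 'add'
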
@@ -146,14 +160,14 @@
 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
 # extend the samples_cols list to add the ones that are filled by the CLI
 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
 if args.viral_submission:
     # extend the samples columns with the viral specific data
-    samples_cols = samples_cols + ['geographic_location', 'host_common_name',
-                                   'host_subject_id', 'host_health_state', 'host_sex',
-                                   'host_scientific_name', 'collector_name',
-                                   'collecting_institution', 'isolate']
+    samples_cols = samples_cols + ['geographic location (country and/or sea)', 'host common name',
+                                   'host subject id', 'host health state', 'host sex',
+                                   'host scientific name', 'collector name',
+                                   'collecting institution', 'isolate']
 if len(samples_optional_cols_loaded) > 0:
     for optional_cols_excel in samples_optional_cols_loaded:
         samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
 samples_table.write('\t'.join(samples_cols) + '\n')
 
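
Put together, this is the samples.tsv header a viral submission now produces, assuming no optional columns were picked up from the spreadsheet; the viral fields switch from the old underscore_separated names to the ENA checklist display names:

    samples_cols = ['alias', 'title', 'scientific_name', 'sample_description',
                    'status', 'accession', 'taxon_id', 'submission_date',
                    'geographic location (country and/or sea)', 'host common name',
                    'host subject id', 'host health state', 'host sex',
                    'host scientific name', 'collector name',
                    'collecting institution', 'isolate']
    print('\t'.join(samples_cols))  # the tab-separated header row written to samples.tsv
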
@@ -166,28 +180,36 @@
 
 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
                             'file_format', 'file_checksum', 'submission_date']) + '\n')
 action = args.action
-
+# actionable_items
 # WRITE DICTIONARIES TO TABLE FILES
 
 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
 # dt_oobj = datetime.now(tz=None)
 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
 runs_included = []
 exp_included = []
 for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
-    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('study', study_alias)
+    studies_table.write('\t'.join([study_alias, entry_action, 'ENA_accession', study['title'],
                                    study['study_type'], study['study_abstract'], '',
                                    'ENA_submission_data']) + '\n')  # assuming no pubmed_id
 for sample_alias, sample in samples_dict.items():
     # sample_alias = sample_alias + '_' + timestamp
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('sample', sample_alias)
     samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
-                          sample['sample_description'], action, 'ena_accession',
-                          'tax_id_updated_by_ENA', 'ENA_submission_date']
+                          sample['sample_description'], entry_action, 'ena_accession',
+                          '', 'ENA_submission_date']
     if args.viral_submission:
         # add the values that are unique for the viral samples
         if sample['collector name'] == '':
             sample['collector name'] = 'unknown'
         samples_row_values = samples_row_values + \
@@ -228,11 +250,16 @@
     for exp_alias, exp in experiments_dict.items():
         # should I check here if any experiment has a study or sample alias that is incorrect?
         # (not listed in the samples or study dict)
         # process the experiments for this sample
         if exp['sample_alias'] == sample_alias:
-            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
+            # check the remote status
+            if args.dev_submission:
+                entry_action = args.action
+            else:
+                entry_action = identify_action('experiment', exp_alias)
+            experiments_table.write('\t'.join([exp_alias, entry_action, 'accession_ena', exp['title'],
                                                exp['study_alias'], sample_alias,
                                                exp['design_description'], exp['library_name'],
                                                exp['library_strategy'], exp['library_source'],
                                                exp['library_selection'],
                                                exp['library_layout'].lower(),
@@ -248,13 +275,17 @@
                     runs_list = [run]
                 else:
                     runs_list = run
                 for run_entry in runs_list:
                     if run_entry['experiment_alias'] == exp_alias:
-                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                        if args.dev_submission:
+                            entry_action = args.action
+                        else:
+                            entry_action = identify_action('run', run_alias)
+                        runs_table.write('\t'.join([run_alias, entry_action, 'ena_run_accession',
                                                     exp_alias, run_entry['file_name'],
-                                                    FILE_FORMAT, 'file_checksum',
+                                                    FILE_FORMAT, '',
                                                     'submission_date_ENA']) + '\n')
                         runs_included.append(run_alias)
 
 # check if any experiment or run was not associated with any sample
 for run in runs_dict.keys():
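
The same four-line decision is inlined at each of the four call sites (studies, samples, experiments, runs). As a summary only, the pattern each site follows can be sketched with a hypothetical helper (the name resolve_action is not part of the changeset, and it relies on the identify_action() added above):

    def resolve_action(args, entry_type, alias):
        # --dev given: trust the action passed on the command line, make no remote calls
        if args.dev_submission:
            return args.action
        # otherwise ask ENA whether an entry with this alias already exists:
        # 'modify' if it does, 'add' if it does not
        return identify_action(entry_type, alias)
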