Mercurial > repos > iuc > ena_upload
diff process_xlsx.py @ 4:26ccb678abc8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ba358013c83e7dfffec895946d36585f237e54c5"
author | iuc |
---|---|
date | Tue, 19 Oct 2021 15:57:14 +0000 |
parents | 59bb6d34fca6 |
children | 4aab5ae907b6 |
line wrap: on
line diff
```diff
--- a/process_xlsx.py	Wed Aug 18 19:42:49 2021 +0000
+++ b/process_xlsx.py	Tue Oct 19 15:57:14 2021 +0000
@@ -4,11 +4,24 @@
 
 import xlrd
 import yaml
+from check_remote import check_remote_entry
 from mappings import optional_samples_cols_mapping
 
 FILE_FORMAT = 'fastq'
 
 
+def identify_action(entry_type, alias):
+    ''' define action ['add' | 'modify'] that needs to be perfomed for this entry '''
+    query = {entry_type + '_alias': alias}
+    remote_accessions = check_remote_entry(entry_type, query)
+    if len(remote_accessions) > 0:
+        print(f'Found: {entry_type} entry with alias {alias}')
+        return 'modify'
+    else:
+        print(f'No {entry_type} entry found with alias {alias}')
+        return 'add'
+
+
 def extract_data(xl_sheet, expected_columns, optional_cols=None):
     """
     1. Check that the columns I expect are present in the sheet
@@ -86,6 +99,7 @@
 parser.add_argument('--out_dir', dest='out_path', required=True)
 parser.add_argument('--action', dest='action', required=True)
 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true')
 parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
 args = parser.parse_args()
 
@@ -148,10 +162,10 @@
 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
 if args.viral_submission:
     # extend the samples columns with the viral specific data
-    samples_cols = samples_cols + ['geographic_location', 'host_common_name',
-                                   'host_subject_id', 'host_health_state', 'host_sex',
-                                   'host_scientific_name', 'collector_name',
-                                   'collecting_institution', 'isolate']
+    samples_cols = samples_cols + ['geographic location (country and/or sea)', 'host common name',
+                                   'host subject id', 'host health state', 'host sex',
+                                   'host scientific name', 'collector name',
+                                   'collecting institution', 'isolate']
 if len(samples_optional_cols_loaded) > 0:
     for optional_cols_excel in samples_optional_cols_loaded:
         samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
@@ -168,7 +182,7 @@
 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
                             'file_format', 'file_checksum', 'submission_date']) + '\n')
 action = args.action
-
+# actionable_items
 # WRITE DICTIONARIES TO TABLE FILES
 
 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
@@ -178,14 +192,22 @@
 exp_included = []
 for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
-    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('study', study_alias)
+    studies_table.write('\t'.join([study_alias, entry_action, 'ENA_accession', study['title'],
                                    study['study_type'], study['study_abstract'], '',
                                    'ENA_submission_data']) + '\n')  # assuming no pubmed_id
 for sample_alias, sample in samples_dict.items():
     # sample_alias = sample_alias + '_' + timestamp
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('sample', sample_alias)
     samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
-                          sample['sample_description'], action, 'ena_accession',
-                          'tax_id_updated_by_ENA', 'ENA_submission_date']
+                          sample['sample_description'], entry_action, 'ena_accession',
+                          '', 'ENA_submission_date']
     if args.viral_submission:
         # add the values that are unique for the viral samples
         if sample['collector name'] == '':
@@ -230,7 +252,12 @@
         # (not listed in the samples or study dict)
         # process the experiments for this sample
         if exp['sample_alias'] == sample_alias:
-            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
+            # check the remote status
+            if args.dev_submission:
+                entry_action = args.action
+            else:
+                entry_action = identify_action('experiment', exp_alias)
+            experiments_table.write('\t'.join([exp_alias, entry_action, 'accession_ena', exp['title'],
                                                exp['study_alias'], sample_alias,
                                                exp['design_description'], exp['library_name'],
                                                exp['library_strategy'], exp['library_source'],
@@ -250,9 +277,13 @@
                     runs_list = run
                 for run_entry in runs_list:
                     if run_entry['experiment_alias'] == exp_alias:
-                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                        if args.dev_submission:
+                            entry_action = args.action
+                        else:
+                            entry_action = identify_action('run', run_alias)
+                        runs_table.write('\t'.join([run_alias, entry_action, 'ena_run_accession',
                                                     exp_alias, run_entry['file_name'],
-                                                    FILE_FORMAT, 'file_checksum',
+                                                    FILE_FORMAT, '',
                                                     'submission_date_ENA']) + '\n')
                         runs_included.append(run_alias)
 
```