Mercurial > repos > ieguinoa > ena_webin_cli
diff process_input.py @ 3:7d751b5943b0 draft default tip
Uploaded
author | ieguinoa |
---|---|
date | Tue, 22 Feb 2022 11:03:34 +0000 |
parents | e25357392813 |
children |
line wrap: on
line diff
--- a/process_input.py Fri Feb 04 15:52:45 2022 +0000 +++ b/process_input.py Tue Feb 22 11:03:34 2022 +0000 @@ -1,127 +1,127 @@ import gzip +import json import os import sys import shutil import yaml -from Bio import SeqIO - - -""" -Takes as input: - 1. A receipt obtained from ENA submission tool. - A txt file that includes a YAML section with - - 2. A fasta file with fasta entries ids defined after the files used for the raw submission. - - 3. Path to write generated manifests - 4. Path to write generated fasta files - 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..) -""" - -def get_section_string(f, start_line, end_line): +def get_section_string(f, start_line, end_line, return_string=False): # consume starting lines start_string = iter(f.readline, start_line) start_string = ''.join(line for line in start_string) # read YAML lines yaml_string = iter(f.readline, end_line) - return ''.join(x for x in yaml_string) + if return_string: + return ''.join(x for x in yaml_string) + else: + return [x for x in yaml_string] + +def fill_from_yaml_data(yaml_only_dict, studies_samples_dict): + # fill experiment information (platform) **** + for index,exp in yaml_only_dict['ENA_experiment'].items(): + study_alias = exp['study_alias'] + sample_alias = exp['sample_alias'] + if study_alias in studies_samples_dict.keys(): + if sample_alias in studies_samples_dict[study_alias].keys(): + studies_samples_dict[study_alias][sample_alias]['experiments'].append({'platform': exp['platform']}) + else: + studies_samples_dict[study_alias][sample_alias] = {'experiments': [{'platform': exp['platform']}]} + else: + studies_samples_dict[study_alias] = {sample_alias: {'experiments':[{'platform': exp['platform']}]}} + + +def load_receipt_data(input_file_path): + # should do some health check of the input file? + # load yaml section + loaded_data = {} + yaml_delimiter = 'YAML -------------\n' + with open(input_file_path) as input_file: + yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True)) + fill_from_yaml_data(yaml_only_section, loaded_data) + # read study accessions + study_delimiter = 'Study accession details:\n' + end_line = '\n' + with open(input_file_path) as input_file: + studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line) + # loaded_data['studies'] = {} + for study_line in studies_accession_lines: + if study_line != '\n': + alias, accession, *_ = study_line.split('\t') + try: + loaded_data[alias]['accession'] = accession + except KeyError: + print(f"Experiment {exp} has unknown study or sample") + # loaded_data['studies'][alias]['accession'] = accession + samples_delimiter = 'Sample accession details:\n' + with open(input_file_path) as input_file: + samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line) + ## need to iterate over all studies, because here I don't know which study is the sample from. + # loaded_data['samples'] = {} + for sample_line in samples_accession_lines: + if sample_line != '\n': + alias, accession, *_ = sample_line.split('\t') + for study in loaded_data.keys(): + if alias in loaded_data[study].keys(): + loaded_data[study][alias]['accession'] = accession + break + return loaded_data + + +""" +Takes as input: + 1. A receipt obtained from ENA submission tool: + a txt file that contains sections describing submission details. + 2. A json file with the list of fasta that the user loaded + 3. Path to write generated manifests + 4. Manifest template path: the manifest with the global values set + (e.g COVERAGE, MINGAPLENGHT..) +""" + def main(): - input_file = open(sys.argv[1]) - fasta_in = open(sys.argv[2]) + input_file_path = sys.argv[1] + fasta_names_list_path = sys.argv[2] out_manifest_base = sys.argv[3] - out_fasta_base = sys.argv[4] - manifest_template = sys.argv[5] - yaml_delimiter = 'YAML -------------\n' - yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter)) - # print(yaml_only) - submission_tuples_list = [] - # parse the sequence IDs - for record in SeqIO.parse(fasta_in, "fasta"): - seq_id = record.id - # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? - # ... but for paired it may not be the exact same - # ...initially I should attempt to find a - # .. if this - # in any case, if I cant make the right match then I should do a kind of substring match - - # find the exp_alias associated with the file - exp_alias = None - for index,run in yaml_only['ENA_run'].items(): - if run['file_name'] == seq_id: - ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something - exp_alias = run['experiment_alias'] - break - if not exp_alias: - raise Exception("No run files match for the sequence entry {seq_id}") - # find the sample and study for that experiment - sample_alias = None - study_alias = None - for index,exp in yaml_only['ENA_experiment'].items(): - if exp['alias'] == exp_alias: - sample_alias = exp['sample_alias'] - study_alias = exp['study_alias'] - platform = exp['platform'] - break - if not sample_alias: - raise Exception("No sample associated with experiment {exp_alias}") - if not study_alias: - raise Exception("No study associated with experiment {exp_alias}") - + manifest_template = sys.argv[4] + # load submitted data from receipt file + data_dict = load_receipt_data(input_file_path) + # iterate over the list of fasta files + with open(fasta_names_list_path, 'r') as fasta_files_json_file: + fasta_files_list = json.load(fasta_files_json_file) + with open('submit_list.tab', 'w') as written_manifests_out: + for fasta_file in fasta_files_list: + if fasta_file.endswith('.fasta.gz'): + sample_alias = fasta_file[:-9] + else: + sample_alias = fasta_file[:-6] + print(f'Processing {sample_alias}') + found_metadata = False + for study_alias in data_dict.keys(): + if sample_alias in data_dict[study_alias].keys(): + sample_accession = data_dict[study_alias][sample_alias]['accession'] + study_accession = data_dict[study_alias]['accession'] + ### TODO get a string that concatenates plaform information from multiple exp + platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform'] + manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt') + with open(manifest_path, "w") as output_handle: + # first dump the contents of manifest template + # containing the global vars + with open(manifest_template) as m_template: + output_handle.write(m_template.read()) + output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n") + output_handle.write("PLATFORM\t" + platform + "\n") + output_handle.write("STUDY\t" + study_accession + "\n") + output_handle.write("SAMPLE\t" + sample_accession + "\n") + # files should be available in the corresponding dir and named: + # sample_alias.fasta.gz + output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n") + found_metadata = True + written_manifests_out.write(manifest_path + '\n') + break + if not found_metadata: + print(f'No metadata found for sample {sample_alias}') - # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID) - fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta') - with open(fasta_path, "w") as output_handle: - SeqIO.write([record], output_handle, "fasta") - #gzip the file (required by ENA upload tool) - fasta_path_gz = fasta_path + '.gz' - with open(fasta_path, 'rb') as f_in: - with gzip.open(fasta_path_gz, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - # create the manifest - # add to the manifest the: - # - manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt') - with open(manifest_path, "w") as output_handle: - # first dump the contents of manifest template - # containing the global vars - with open(manifest_template) as m_template: - output_handle.write(m_template.read()) - output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n") - output_handle.write("PLATFORM\t" + platform + "\n") - output_handle.write("STUDY\t" + study_alias + "\n") - output_handle.write("SAMPLE\t" + sample_alias + "\n") - output_handle.write("FASTA\t" + fasta_path_gz + "\n") - - # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence - # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions?? - # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries - # samples_dir[study][sample] = seq_id + '.fasta' - submission_tuples_list.append((manifest_path, fasta_path)) - - with open('submit_list.tab', "w") as output_handle: - for submit_tuple in submission_tuples_list: - output_handle.write('\t'.join(submit_tuple) + '\n') - ## DEBUG CASE - #study details - # start_study = 'Study accession details:\n' - # empty_end = '\n' - # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end) - # if len(study_data.split('\n')) > 2: - # # more than 1 study accession - # raise Exception("Multiple study accessions found") - # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n') - # start_sample = 'Sample accession details:\n' - # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end) - # if len(sample_data.split('\n')) > 2: - # # more than 1 study accession - # raise Exception("Multiple sample accessions found") - # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n') - # platform = 'Ion Torrent' - # out_manifest.write(f"PLATFORM\t{platform}\n") - # out_manifest.close() if __name__ == '__main__': main()