Mercurial > repos > ieguinoa > ena_webin_cli
comparison process_input.py @ 3:7d751b5943b0 draft default tip
Uploaded
| author | ieguinoa |
|---|---|
| date | Tue, 22 Feb 2022 11:03:34 +0000 |
| parents | e25357392813 |
| children |
comparison
equal
deleted
inserted
replaced
| 2:1ecd8ce07db4 | 3:7d751b5943b0 |
|---|---|
| 1 import gzip | 1 import gzip |
| 2 import json | |
| 2 import os | 3 import os |
| 3 import sys | 4 import sys |
| 4 import shutil | 5 import shutil |
| 5 import yaml | 6 import yaml |
| 6 | 7 |
| 7 from Bio import SeqIO | 8 def get_section_string(f, start_line, end_line, return_string=False): |
| 8 | |
| 9 | |
| 10 """ | |
| 11 Takes as input: | |
| 12 1. A receipt obtained from ENA submission tool. | |
| 13 A txt file that includes a YAML section with | |
| 14 | |
| 15 2. A fasta file with fasta entries ids defined after the files used for the raw submission. | |
| 16 | |
| 17 3. Path to write generated manifests | |
| 18 4. Path to write generated fasta files | |
| 19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..) | |
| 20 """ | |
| 21 | |
| 22 def get_section_string(f, start_line, end_line): | |
| 23 # consume starting lines | 9 # consume starting lines |
| 24 start_string = iter(f.readline, start_line) | 10 start_string = iter(f.readline, start_line) |
| 25 start_string = ''.join(line for line in start_string) | 11 start_string = ''.join(line for line in start_string) |
| 26 # read YAML lines | 12 # read YAML lines |
| 27 yaml_string = iter(f.readline, end_line) | 13 yaml_string = iter(f.readline, end_line) |
| 28 return ''.join(x for x in yaml_string) | 14 if return_string: |
| 15 return ''.join(x for x in yaml_string) | |
| 16 else: | |
| 17 return [x for x in yaml_string] | |
| 18 | |
| 19 def fill_from_yaml_data(yaml_only_dict, studies_samples_dict): | |
| 20 # fill experiment information (platform) **** | |
| 21 for index,exp in yaml_only_dict['ENA_experiment'].items(): | |
| 22 study_alias = exp['study_alias'] | |
| 23 sample_alias = exp['sample_alias'] | |
| 24 if study_alias in studies_samples_dict.keys(): | |
| 25 if sample_alias in studies_samples_dict[study_alias].keys(): | |
| 26 studies_samples_dict[study_alias][sample_alias]['experiments'].append({'platform': exp['platform']}) | |
| 27 else: | |
| 28 studies_samples_dict[study_alias][sample_alias] = {'experiments': [{'platform': exp['platform']}]} | |
| 29 else: | |
| 30 studies_samples_dict[study_alias] = {sample_alias: {'experiments':[{'platform': exp['platform']}]}} | |
| 31 | |
| 32 | |
| 33 def load_receipt_data(input_file_path): | |
| 34 # should do some health check of the input file? | |
| 35 # load yaml section | |
| 36 loaded_data = {} | |
| 37 yaml_delimiter = 'YAML -------------\n' | |
| 38 with open(input_file_path) as input_file: | |
| 39 yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True)) | |
| 40 fill_from_yaml_data(yaml_only_section, loaded_data) | |
| 41 # read study accessions | |
| 42 study_delimiter = 'Study accession details:\n' | |
| 43 end_line = '\n' | |
| 44 with open(input_file_path) as input_file: | |
| 45 studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line) | |
| 46 # loaded_data['studies'] = {} | |
| 47 for study_line in studies_accession_lines: | |
| 48 if study_line != '\n': | |
| 49 alias, accession, *_ = study_line.split('\t') | |
| 50 try: | |
| 51 loaded_data[alias]['accession'] = accession | |
| 52 except KeyError: | |
| 53 print(f"Experiment {exp} has unknown study or sample") | |
| 54 # loaded_data['studies'][alias]['accession'] = accession | |
| 55 samples_delimiter = 'Sample accession details:\n' | |
| 56 with open(input_file_path) as input_file: | |
| 57 samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line) | |
| 58 ## need to iterate over all studies, because here I don't know which study is the sample from. | |
| 59 # loaded_data['samples'] = {} | |
| 60 for sample_line in samples_accession_lines: | |
| 61 if sample_line != '\n': | |
| 62 alias, accession, *_ = sample_line.split('\t') | |
| 63 for study in loaded_data.keys(): | |
| 64 if alias in loaded_data[study].keys(): | |
| 65 loaded_data[study][alias]['accession'] = accession | |
| 66 break | |
| 67 return loaded_data | |
| 68 | |
| 69 | |
| 70 """ | |
| 71 Takes as input: | |
| 72 1. A receipt obtained from ENA submission tool: | |
| 73 a txt file that contains sections describing submission details. | |
| 74 2. A json file with the list of fasta that the user loaded | |
| 75 3. Path to write generated manifests | |
| 76 4. Manifest template path: the manifest with the global values set | |
| 77 (e.g COVERAGE, MINGAPLENGHT..) | |
| 78 """ | |
| 79 | |
| 29 | 80 |
| 30 | 81 |
| 31 def main(): | 82 def main(): |
| 32 input_file = open(sys.argv[1]) | 83 input_file_path = sys.argv[1] |
| 33 fasta_in = open(sys.argv[2]) | 84 fasta_names_list_path = sys.argv[2] |
| 34 out_manifest_base = sys.argv[3] | 85 out_manifest_base = sys.argv[3] |
| 35 out_fasta_base = sys.argv[4] | 86 manifest_template = sys.argv[4] |
| 36 manifest_template = sys.argv[5] | 87 # load submitted data from receipt file |
| 37 yaml_delimiter = 'YAML -------------\n' | 88 data_dict = load_receipt_data(input_file_path) |
| 38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter)) | 89 # iterate over the list of fasta files |
| 39 # print(yaml_only) | 90 with open(fasta_names_list_path, 'r') as fasta_files_json_file: |
| 40 submission_tuples_list = [] | 91 fasta_files_list = json.load(fasta_files_json_file) |
| 41 # parse the sequence IDs | 92 with open('submit_list.tab', 'w') as written_manifests_out: |
| 42 for record in SeqIO.parse(fasta_in, "fasta"): | 93 for fasta_file in fasta_files_list: |
| 43 seq_id = record.id | 94 if fasta_file.endswith('.fasta.gz'): |
| 44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? | 95 sample_alias = fasta_file[:-9] |
| 45 # ... but for paired it may not be the exact same | 96 else: |
| 46 # ...initially I should attempt to find a | 97 sample_alias = fasta_file[:-6] |
| 47 # .. if this | 98 print(f'Processing {sample_alias}') |
| 48 # in any case, if I cant make the right match then I should do a kind of substring match | 99 found_metadata = False |
| 100 for study_alias in data_dict.keys(): | |
| 101 if sample_alias in data_dict[study_alias].keys(): | |
| 102 sample_accession = data_dict[study_alias][sample_alias]['accession'] | |
| 103 study_accession = data_dict[study_alias]['accession'] | |
| 104 ### TODO get a string that concatenates plaform information from multiple exp | |
| 105 platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform'] | |
| 106 manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt') | |
| 107 with open(manifest_path, "w") as output_handle: | |
| 108 # first dump the contents of manifest template | |
| 109 # containing the global vars | |
| 110 with open(manifest_template) as m_template: | |
| 111 output_handle.write(m_template.read()) | |
| 112 output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n") | |
| 113 output_handle.write("PLATFORM\t" + platform + "\n") | |
| 114 output_handle.write("STUDY\t" + study_accession + "\n") | |
| 115 output_handle.write("SAMPLE\t" + sample_accession + "\n") | |
| 116 # files should be available in the corresponding dir and named: | |
| 117 # sample_alias.fasta.gz | |
| 118 output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n") | |
| 119 found_metadata = True | |
| 120 written_manifests_out.write(manifest_path + '\n') | |
| 121 break | |
| 122 if not found_metadata: | |
| 123 print(f'No metadata found for sample {sample_alias}') | |
| 49 | 124 |
| 50 # find the exp_alias associated with the file | |
| 51 exp_alias = None | |
| 52 for index,run in yaml_only['ENA_run'].items(): | |
| 53 if run['file_name'] == seq_id: | |
| 54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something | |
| 55 exp_alias = run['experiment_alias'] | |
| 56 break | |
| 57 if not exp_alias: | |
| 58 raise Exception("No run files match for the sequence entry {seq_id}") | |
| 59 # find the sample and study for that experiment | |
| 60 sample_alias = None | |
| 61 study_alias = None | |
| 62 for index,exp in yaml_only['ENA_experiment'].items(): | |
| 63 if exp['alias'] == exp_alias: | |
| 64 sample_alias = exp['sample_alias'] | |
| 65 study_alias = exp['study_alias'] | |
| 66 platform = exp['platform'] | |
| 67 break | |
| 68 if not sample_alias: | |
| 69 raise Exception("No sample associated with experiment {exp_alias}") | |
| 70 if not study_alias: | |
| 71 raise Exception("No study associated with experiment {exp_alias}") | |
| 72 | |
| 73 | |
| 74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID) | |
| 75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta') | |
| 76 with open(fasta_path, "w") as output_handle: | |
| 77 SeqIO.write([record], output_handle, "fasta") | |
| 78 #gzip the file (required by ENA upload tool) | |
| 79 fasta_path_gz = fasta_path + '.gz' | |
| 80 with open(fasta_path, 'rb') as f_in: | |
| 81 with gzip.open(fasta_path_gz, 'wb') as f_out: | |
| 82 shutil.copyfileobj(f_in, f_out) | |
| 83 # create the manifest | |
| 84 # add to the manifest the: | |
| 85 # | |
| 86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt') | |
| 87 with open(manifest_path, "w") as output_handle: | |
| 88 # first dump the contents of manifest template | |
| 89 # containing the global vars | |
| 90 with open(manifest_template) as m_template: | |
| 91 output_handle.write(m_template.read()) | |
| 92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n") | |
| 93 output_handle.write("PLATFORM\t" + platform + "\n") | |
| 94 output_handle.write("STUDY\t" + study_alias + "\n") | |
| 95 output_handle.write("SAMPLE\t" + sample_alias + "\n") | |
| 96 output_handle.write("FASTA\t" + fasta_path_gz + "\n") | |
| 97 | |
| 98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence | |
| 99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions?? | |
| 100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries | |
| 101 # samples_dir[study][sample] = seq_id + '.fasta' | |
| 102 submission_tuples_list.append((manifest_path, fasta_path)) | |
| 103 | |
| 104 with open('submit_list.tab', "w") as output_handle: | |
| 105 for submit_tuple in submission_tuples_list: | |
| 106 output_handle.write('\t'.join(submit_tuple) + '\n') | |
| 107 ## DEBUG CASE | |
| 108 #study details | |
| 109 # start_study = 'Study accession details:\n' | |
| 110 # empty_end = '\n' | |
| 111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end) | |
| 112 # if len(study_data.split('\n')) > 2: | |
| 113 # # more than 1 study accession | |
| 114 # raise Exception("Multiple study accessions found") | |
| 115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n') | |
| 116 # start_sample = 'Sample accession details:\n' | |
| 117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end) | |
| 118 # if len(sample_data.split('\n')) > 2: | |
| 119 # # more than 1 study accession | |
| 120 # raise Exception("Multiple sample accessions found") | |
| 121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n') | |
| 122 # platform = 'Ion Torrent' | |
| 123 # out_manifest.write(f"PLATFORM\t{platform}\n") | |
| 124 # out_manifest.close() | |
| 125 | 125 |
| 126 if __name__ == '__main__': | 126 if __name__ == '__main__': |
| 127 main() | 127 main() |
