Mercurial > repos > ieguinoa > ena_webin_cli
comparison process_input.py @ 3:7d751b5943b0 draft default tip
Uploaded
author | ieguinoa |
---|---|
date | Tue, 22 Feb 2022 11:03:34 +0000 |
parents | e25357392813 |
children |
comparison
equal
deleted
inserted
replaced
2:1ecd8ce07db4 | 3:7d751b5943b0 |
---|---|
1 import gzip | 1 import gzip |
2 import json | |
2 import os | 3 import os |
3 import sys | 4 import sys |
4 import shutil | 5 import shutil |
5 import yaml | 6 import yaml |
6 | 7 |
7 from Bio import SeqIO | 8 def get_section_string(f, start_line, end_line, return_string=False): |
8 | |
9 | |
10 """ | |
11 Takes as input: | |
12 1. A receipt obtained from ENA submission tool. | |
13 A txt file that includes a YAML section with | |
14 | |
15 2. A fasta file with fasta entries ids defined after the files used for the raw submission. | |
16 | |
17 3. Path to write generated manifests | |
18 4. Path to write generated fasta files | |
19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..) | |
20 """ | |
21 | |
22 def get_section_string(f, start_line, end_line): | |
23 # consume starting lines | 9 # consume starting lines |
24 start_string = iter(f.readline, start_line) | 10 start_string = iter(f.readline, start_line) |
25 start_string = ''.join(line for line in start_string) | 11 start_string = ''.join(line for line in start_string) |
26 # read YAML lines | 12 # read YAML lines |
27 yaml_string = iter(f.readline, end_line) | 13 yaml_string = iter(f.readline, end_line) |
28 return ''.join(x for x in yaml_string) | 14 if return_string: |
15 return ''.join(x for x in yaml_string) | |
16 else: | |
17 return [x for x in yaml_string] | |
18 | |
19 def fill_from_yaml_data(yaml_only_dict, studies_samples_dict): | |
20 # fill experiment information (platform) **** | |
21 for index,exp in yaml_only_dict['ENA_experiment'].items(): | |
22 study_alias = exp['study_alias'] | |
23 sample_alias = exp['sample_alias'] | |
24 if study_alias in studies_samples_dict.keys(): | |
25 if sample_alias in studies_samples_dict[study_alias].keys(): | |
26 studies_samples_dict[study_alias][sample_alias]['experiments'].append({'platform': exp['platform']}) | |
27 else: | |
28 studies_samples_dict[study_alias][sample_alias] = {'experiments': [{'platform': exp['platform']}]} | |
29 else: | |
30 studies_samples_dict[study_alias] = {sample_alias: {'experiments':[{'platform': exp['platform']}]}} | |
31 | |
32 | |
33 def load_receipt_data(input_file_path): | |
34 # should do some health check of the input file? | |
35 # load yaml section | |
36 loaded_data = {} | |
37 yaml_delimiter = 'YAML -------------\n' | |
38 with open(input_file_path) as input_file: | |
39 yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True)) | |
40 fill_from_yaml_data(yaml_only_section, loaded_data) | |
41 # read study accessions | |
42 study_delimiter = 'Study accession details:\n' | |
43 end_line = '\n' | |
44 with open(input_file_path) as input_file: | |
45 studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line) | |
46 # loaded_data['studies'] = {} | |
47 for study_line in studies_accession_lines: | |
48 if study_line != '\n': | |
49 alias, accession, *_ = study_line.split('\t') | |
50 try: | |
51 loaded_data[alias]['accession'] = accession | |
52 except KeyError: | |
53 print(f"Experiment {exp} has unknown study or sample") | |
54 # loaded_data['studies'][alias]['accession'] = accession | |
55 samples_delimiter = 'Sample accession details:\n' | |
56 with open(input_file_path) as input_file: | |
57 samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line) | |
58 ## need to iterate over all studies, because here I don't know which study is the sample from. | |
59 # loaded_data['samples'] = {} | |
60 for sample_line in samples_accession_lines: | |
61 if sample_line != '\n': | |
62 alias, accession, *_ = sample_line.split('\t') | |
63 for study in loaded_data.keys(): | |
64 if alias in loaded_data[study].keys(): | |
65 loaded_data[study][alias]['accession'] = accession | |
66 break | |
67 return loaded_data | |
68 | |
69 | |
70 """ | |
71 Takes as input: | |
72 1. A receipt obtained from ENA submission tool: | |
73 a txt file that contains sections describing submission details. | |
74 2. A json file with the list of fasta that the user loaded | |
75 3. Path to write generated manifests | |
76 4. Manifest template path: the manifest with the global values set | |
77 (e.g COVERAGE, MINGAPLENGHT..) | |
78 """ | |
79 | |
29 | 80 |
30 | 81 |
31 def main(): | 82 def main(): |
32 input_file = open(sys.argv[1]) | 83 input_file_path = sys.argv[1] |
33 fasta_in = open(sys.argv[2]) | 84 fasta_names_list_path = sys.argv[2] |
34 out_manifest_base = sys.argv[3] | 85 out_manifest_base = sys.argv[3] |
35 out_fasta_base = sys.argv[4] | 86 manifest_template = sys.argv[4] |
36 manifest_template = sys.argv[5] | 87 # load submitted data from receipt file |
37 yaml_delimiter = 'YAML -------------\n' | 88 data_dict = load_receipt_data(input_file_path) |
38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter)) | 89 # iterate over the list of fasta files |
39 # print(yaml_only) | 90 with open(fasta_names_list_path, 'r') as fasta_files_json_file: |
40 submission_tuples_list = [] | 91 fasta_files_list = json.load(fasta_files_json_file) |
41 # parse the sequence IDs | 92 with open('submit_list.tab', 'w') as written_manifests_out: |
42 for record in SeqIO.parse(fasta_in, "fasta"): | 93 for fasta_file in fasta_files_list: |
43 seq_id = record.id | 94 if fasta_file.endswith('.fasta.gz'): |
44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? | 95 sample_alias = fasta_file[:-9] |
45 # ... but for paired it may not be the exact same | 96 else: |
46 # ...initially I should attempt to find a | 97 sample_alias = fasta_file[:-6] |
47 # .. if this | 98 print(f'Processing {sample_alias}') |
48 # in any case, if I cant make the right match then I should do a kind of substring match | 99 found_metadata = False |
100 for study_alias in data_dict.keys(): | |
101 if sample_alias in data_dict[study_alias].keys(): | |
102 sample_accession = data_dict[study_alias][sample_alias]['accession'] | |
103 study_accession = data_dict[study_alias]['accession'] | |
104 ### TODO get a string that concatenates plaform information from multiple exp | |
105 platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform'] | |
106 manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt') | |
107 with open(manifest_path, "w") as output_handle: | |
108 # first dump the contents of manifest template | |
109 # containing the global vars | |
110 with open(manifest_template) as m_template: | |
111 output_handle.write(m_template.read()) | |
112 output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n") | |
113 output_handle.write("PLATFORM\t" + platform + "\n") | |
114 output_handle.write("STUDY\t" + study_accession + "\n") | |
115 output_handle.write("SAMPLE\t" + sample_accession + "\n") | |
116 # files should be available in the corresponding dir and named: | |
117 # sample_alias.fasta.gz | |
118 output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n") | |
119 found_metadata = True | |
120 written_manifests_out.write(manifest_path + '\n') | |
121 break | |
122 if not found_metadata: | |
123 print(f'No metadata found for sample {sample_alias}') | |
49 | 124 |
50 # find the exp_alias associated with the file | |
51 exp_alias = None | |
52 for index,run in yaml_only['ENA_run'].items(): | |
53 if run['file_name'] == seq_id: | |
54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something | |
55 exp_alias = run['experiment_alias'] | |
56 break | |
57 if not exp_alias: | |
58 raise Exception("No run files match for the sequence entry {seq_id}") | |
59 # find the sample and study for that experiment | |
60 sample_alias = None | |
61 study_alias = None | |
62 for index,exp in yaml_only['ENA_experiment'].items(): | |
63 if exp['alias'] == exp_alias: | |
64 sample_alias = exp['sample_alias'] | |
65 study_alias = exp['study_alias'] | |
66 platform = exp['platform'] | |
67 break | |
68 if not sample_alias: | |
69 raise Exception("No sample associated with experiment {exp_alias}") | |
70 if not study_alias: | |
71 raise Exception("No study associated with experiment {exp_alias}") | |
72 | |
73 | |
74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID) | |
75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta') | |
76 with open(fasta_path, "w") as output_handle: | |
77 SeqIO.write([record], output_handle, "fasta") | |
78 #gzip the file (required by ENA upload tool) | |
79 fasta_path_gz = fasta_path + '.gz' | |
80 with open(fasta_path, 'rb') as f_in: | |
81 with gzip.open(fasta_path_gz, 'wb') as f_out: | |
82 shutil.copyfileobj(f_in, f_out) | |
83 # create the manifest | |
84 # add to the manifest the: | |
85 # | |
86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt') | |
87 with open(manifest_path, "w") as output_handle: | |
88 # first dump the contents of manifest template | |
89 # containing the global vars | |
90 with open(manifest_template) as m_template: | |
91 output_handle.write(m_template.read()) | |
92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n") | |
93 output_handle.write("PLATFORM\t" + platform + "\n") | |
94 output_handle.write("STUDY\t" + study_alias + "\n") | |
95 output_handle.write("SAMPLE\t" + sample_alias + "\n") | |
96 output_handle.write("FASTA\t" + fasta_path_gz + "\n") | |
97 | |
98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence | |
99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions?? | |
100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries | |
101 # samples_dir[study][sample] = seq_id + '.fasta' | |
102 submission_tuples_list.append((manifest_path, fasta_path)) | |
103 | |
104 with open('submit_list.tab', "w") as output_handle: | |
105 for submit_tuple in submission_tuples_list: | |
106 output_handle.write('\t'.join(submit_tuple) + '\n') | |
107 ## DEBUG CASE | |
108 #study details | |
109 # start_study = 'Study accession details:\n' | |
110 # empty_end = '\n' | |
111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end) | |
112 # if len(study_data.split('\n')) > 2: | |
113 # # more than 1 study accession | |
114 # raise Exception("Multiple study accessions found") | |
115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n') | |
116 # start_sample = 'Sample accession details:\n' | |
117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end) | |
118 # if len(sample_data.split('\n')) > 2: | |
119 # # more than 1 study accession | |
120 # raise Exception("Multiple sample accessions found") | |
121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n') | |
122 # platform = 'Ion Torrent' | |
123 # out_manifest.write(f"PLATFORM\t{platform}\n") | |
124 # out_manifest.close() | |
125 | 125 |
126 if __name__ == '__main__': | 126 if __name__ == '__main__': |
127 main() | 127 main() |