comparison process_input.py @ 3:7d751b5943b0 draft default tip

Uploaded
author ieguinoa
date Tue, 22 Feb 2022 11:03:34 +0000
parents e25357392813
children
comparison
equal deleted inserted replaced
2:1ecd8ce07db4 3:7d751b5943b0
1 import gzip 1 import gzip
2 import json
2 import os 3 import os
3 import sys 4 import sys
4 import shutil 5 import shutil
5 import yaml 6 import yaml
6 7
7 from Bio import SeqIO 8 def get_section_string(f, start_line, end_line, return_string=False):
8
9
10 """
11 Takes as input:
12 1. A receipt obtained from ENA submission tool.
13 A txt file that includes a YAML section with
14
15 2. A fasta file with fasta entries ids defined after the files used for the raw submission.
16
17 3. Path to write generated manifests
18 4. Path to write generated fasta files
19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
20 """
21
22 def get_section_string(f, start_line, end_line):
23 # consume starting lines 9 # consume starting lines
24 start_string = iter(f.readline, start_line) 10 start_string = iter(f.readline, start_line)
25 start_string = ''.join(line for line in start_string) 11 start_string = ''.join(line for line in start_string)
26 # read YAML lines 12 # read YAML lines
27 yaml_string = iter(f.readline, end_line) 13 yaml_string = iter(f.readline, end_line)
28 return ''.join(x for x in yaml_string) 14 if return_string:
15 return ''.join(x for x in yaml_string)
16 else:
17 return [x for x in yaml_string]
18
19 def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
20 # fill experiment information (platform) ****
21 for index,exp in yaml_only_dict['ENA_experiment'].items():
22 study_alias = exp['study_alias']
23 sample_alias = exp['sample_alias']
24 if study_alias in studies_samples_dict.keys():
25 if sample_alias in studies_samples_dict[study_alias].keys():
26 studies_samples_dict[study_alias][sample_alias]['experiments'].append({'platform': exp['platform']})
27 else:
28 studies_samples_dict[study_alias][sample_alias] = {'experiments': [{'platform': exp['platform']}]}
29 else:
30 studies_samples_dict[study_alias] = {sample_alias: {'experiments':[{'platform': exp['platform']}]}}
31
32
33 def load_receipt_data(input_file_path):
34 # should do some health check of the input file?
35 # load yaml section
36 loaded_data = {}
37 yaml_delimiter = 'YAML -------------\n'
38 with open(input_file_path) as input_file:
39 yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True))
40 fill_from_yaml_data(yaml_only_section, loaded_data)
41 # read study accessions
42 study_delimiter = 'Study accession details:\n'
43 end_line = '\n'
44 with open(input_file_path) as input_file:
45 studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line)
46 # loaded_data['studies'] = {}
47 for study_line in studies_accession_lines:
48 if study_line != '\n':
49 alias, accession, *_ = study_line.split('\t')
50 try:
51 loaded_data[alias]['accession'] = accession
52 except KeyError:
53 print(f"Experiment {exp} has unknown study or sample")
54 # loaded_data['studies'][alias]['accession'] = accession
55 samples_delimiter = 'Sample accession details:\n'
56 with open(input_file_path) as input_file:
57 samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line)
58 ## need to iterate over all studies, because here I don't know which study is the sample from.
59 # loaded_data['samples'] = {}
60 for sample_line in samples_accession_lines:
61 if sample_line != '\n':
62 alias, accession, *_ = sample_line.split('\t')
63 for study in loaded_data.keys():
64 if alias in loaded_data[study].keys():
65 loaded_data[study][alias]['accession'] = accession
66 break
67 return loaded_data
68
69
70 """
71 Takes as input:
72 1. A receipt obtained from ENA submission tool:
73 a txt file that contains sections describing submission details.
74 2. A json file with the list of fasta that the user loaded
75 3. Path to write generated manifests
76 4. Manifest template path: the manifest with the global values set
77 (e.g COVERAGE, MINGAPLENGHT..)
78 """
79
29 80
30 81
31 def main(): 82 def main():
32 input_file = open(sys.argv[1]) 83 input_file_path = sys.argv[1]
33 fasta_in = open(sys.argv[2]) 84 fasta_names_list_path = sys.argv[2]
34 out_manifest_base = sys.argv[3] 85 out_manifest_base = sys.argv[3]
35 out_fasta_base = sys.argv[4] 86 manifest_template = sys.argv[4]
36 manifest_template = sys.argv[5] 87 # load submitted data from receipt file
37 yaml_delimiter = 'YAML -------------\n' 88 data_dict = load_receipt_data(input_file_path)
38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter)) 89 # iterate over the list of fasta files
39 # print(yaml_only) 90 with open(fasta_names_list_path, 'r') as fasta_files_json_file:
40 submission_tuples_list = [] 91 fasta_files_list = json.load(fasta_files_json_file)
41 # parse the sequence IDs 92 with open('submit_list.tab', 'w') as written_manifests_out:
42 for record in SeqIO.parse(fasta_in, "fasta"): 93 for fasta_file in fasta_files_list:
43 seq_id = record.id 94 if fasta_file.endswith('.fasta.gz'):
44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? 95 sample_alias = fasta_file[:-9]
45 # ... but for paired it may not be the exact same 96 else:
46 # ...initially I should attempt to find a 97 sample_alias = fasta_file[:-6]
47 # .. if this 98 print(f'Processing {sample_alias}')
48 # in any case, if I cant make the right match then I should do a kind of substring match 99 found_metadata = False
100 for study_alias in data_dict.keys():
101 if sample_alias in data_dict[study_alias].keys():
102 sample_accession = data_dict[study_alias][sample_alias]['accession']
103 study_accession = data_dict[study_alias]['accession']
104 ### TODO get a string that concatenates plaform information from multiple exp
105 platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform']
106 manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')
107 with open(manifest_path, "w") as output_handle:
108 # first dump the contents of manifest template
109 # containing the global vars
110 with open(manifest_template) as m_template:
111 output_handle.write(m_template.read())
112 output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
113 output_handle.write("PLATFORM\t" + platform + "\n")
114 output_handle.write("STUDY\t" + study_accession + "\n")
115 output_handle.write("SAMPLE\t" + sample_accession + "\n")
116 # files should be available in the corresponding dir and named:
117 # sample_alias.fasta.gz
118 output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
119 found_metadata = True
120 written_manifests_out.write(manifest_path + '\n')
121 break
122 if not found_metadata:
123 print(f'No metadata found for sample {sample_alias}')
49 124
50 # find the exp_alias associated with the file
51 exp_alias = None
52 for index,run in yaml_only['ENA_run'].items():
53 if run['file_name'] == seq_id:
54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
55 exp_alias = run['experiment_alias']
56 break
57 if not exp_alias:
58 raise Exception("No run files match for the sequence entry {seq_id}")
59 # find the sample and study for that experiment
60 sample_alias = None
61 study_alias = None
62 for index,exp in yaml_only['ENA_experiment'].items():
63 if exp['alias'] == exp_alias:
64 sample_alias = exp['sample_alias']
65 study_alias = exp['study_alias']
66 platform = exp['platform']
67 break
68 if not sample_alias:
69 raise Exception("No sample associated with experiment {exp_alias}")
70 if not study_alias:
71 raise Exception("No study associated with experiment {exp_alias}")
72
73
74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
76 with open(fasta_path, "w") as output_handle:
77 SeqIO.write([record], output_handle, "fasta")
78 #gzip the file (required by ENA upload tool)
79 fasta_path_gz = fasta_path + '.gz'
80 with open(fasta_path, 'rb') as f_in:
81 with gzip.open(fasta_path_gz, 'wb') as f_out:
82 shutil.copyfileobj(f_in, f_out)
83 # create the manifest
84 # add to the manifest the:
85 #
86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
87 with open(manifest_path, "w") as output_handle:
88 # first dump the contents of manifest template
89 # containing the global vars
90 with open(manifest_template) as m_template:
91 output_handle.write(m_template.read())
92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
93 output_handle.write("PLATFORM\t" + platform + "\n")
94 output_handle.write("STUDY\t" + study_alias + "\n")
95 output_handle.write("SAMPLE\t" + sample_alias + "\n")
96 output_handle.write("FASTA\t" + fasta_path_gz + "\n")
97
98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence
99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries
101 # samples_dir[study][sample] = seq_id + '.fasta'
102 submission_tuples_list.append((manifest_path, fasta_path))
103
104 with open('submit_list.tab', "w") as output_handle:
105 for submit_tuple in submission_tuples_list:
106 output_handle.write('\t'.join(submit_tuple) + '\n')
107 ## DEBUG CASE
108 #study details
109 # start_study = 'Study accession details:\n'
110 # empty_end = '\n'
111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
112 # if len(study_data.split('\n')) > 2:
113 # # more than 1 study accession
114 # raise Exception("Multiple study accessions found")
115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
116 # start_sample = 'Sample accession details:\n'
117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
118 # if len(sample_data.split('\n')) > 2:
119 # # more than 1 study accession
120 # raise Exception("Multiple sample accessions found")
121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
122 # platform = 'Ion Torrent'
123 # out_manifest.write(f"PLATFORM\t{platform}\n")
124 # out_manifest.close()
125 125
126 if __name__ == '__main__': 126 if __name__ == '__main__':
127 main() 127 main()