Mercurial > repos > ieguinoa > ena_webin_cli
comparison process_input.py @ 0:e25357392813 draft
Uploaded
author | ieguinoa |
---|---|
date | Tue, 18 May 2021 16:30:52 +0000 |
parents | |
children | 7d751b5943b0 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e25357392813 |
---|---|
1 import gzip | |
2 import os | |
3 import sys | |
4 import shutil | |
5 import yaml | |
6 | |
7 from Bio import SeqIO | |
8 | |
9 | |
10 """ | |
11 Takes as input: | |
12 1. A receipt obtained from ENA submission tool. | |
13 A txt file that includes a YAML section with | |
14 | |
15 2. A fasta file with fasta entries ids defined after the files used for the raw submission. | |
16 | |
17 3. Path to write generated manifests | |
18 4. Path to write generated fasta files | |
19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..) | |
20 """ | |
21 | |
22 def get_section_string(f, start_line, end_line): | |
23 # consume starting lines | |
24 start_string = iter(f.readline, start_line) | |
25 start_string = ''.join(line for line in start_string) | |
26 # read YAML lines | |
27 yaml_string = iter(f.readline, end_line) | |
28 return ''.join(x for x in yaml_string) | |
29 | |
30 | |
31 def main(): | |
32 input_file = open(sys.argv[1]) | |
33 fasta_in = open(sys.argv[2]) | |
34 out_manifest_base = sys.argv[3] | |
35 out_fasta_base = sys.argv[4] | |
36 manifest_template = sys.argv[5] | |
37 yaml_delimiter = 'YAML -------------\n' | |
38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter)) | |
39 # print(yaml_only) | |
40 submission_tuples_list = [] | |
41 # parse the sequence IDs | |
42 for record in SeqIO.parse(fasta_in, "fasta"): | |
43 seq_id = record.id | |
44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? | |
45 # ... but for paired it may not be the exact same | |
46 # ...initially I should attempt to find a | |
47 # .. if this | |
48 # in any case, if I cant make the right match then I should do a kind of substring match | |
49 | |
50 # find the exp_alias associated with the file | |
51 exp_alias = None | |
52 for index,run in yaml_only['ENA_run'].items(): | |
53 if run['file_name'] == seq_id: | |
54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something | |
55 exp_alias = run['experiment_alias'] | |
56 break | |
57 if not exp_alias: | |
58 raise Exception("No run files match for the sequence entry {seq_id}") | |
59 # find the sample and study for that experiment | |
60 sample_alias = None | |
61 study_alias = None | |
62 for index,exp in yaml_only['ENA_experiment'].items(): | |
63 if exp['alias'] == exp_alias: | |
64 sample_alias = exp['sample_alias'] | |
65 study_alias = exp['study_alias'] | |
66 platform = exp['platform'] | |
67 break | |
68 if not sample_alias: | |
69 raise Exception("No sample associated with experiment {exp_alias}") | |
70 if not study_alias: | |
71 raise Exception("No study associated with experiment {exp_alias}") | |
72 | |
73 | |
74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID) | |
75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta') | |
76 with open(fasta_path, "w") as output_handle: | |
77 SeqIO.write([record], output_handle, "fasta") | |
78 #gzip the file (required by ENA upload tool) | |
79 fasta_path_gz = fasta_path + '.gz' | |
80 with open(fasta_path, 'rb') as f_in: | |
81 with gzip.open(fasta_path_gz, 'wb') as f_out: | |
82 shutil.copyfileobj(f_in, f_out) | |
83 # create the manifest | |
84 # add to the manifest the: | |
85 # | |
86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt') | |
87 with open(manifest_path, "w") as output_handle: | |
88 # first dump the contents of manifest template | |
89 # containing the global vars | |
90 with open(manifest_template) as m_template: | |
91 output_handle.write(m_template.read()) | |
92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n") | |
93 output_handle.write("PLATFORM\t" + platform + "\n") | |
94 output_handle.write("STUDY\t" + study_alias + "\n") | |
95 output_handle.write("SAMPLE\t" + sample_alias + "\n") | |
96 output_handle.write("FASTA\t" + fasta_path_gz + "\n") | |
97 | |
98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence | |
99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions?? | |
100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries | |
101 # samples_dir[study][sample] = seq_id + '.fasta' | |
102 submission_tuples_list.append((manifest_path, fasta_path)) | |
103 | |
104 with open('submit_list.tab', "w") as output_handle: | |
105 for submit_tuple in submission_tuples_list: | |
106 output_handle.write('\t'.join(submit_tuple) + '\n') | |
107 ## DEBUG CASE | |
108 #study details | |
109 # start_study = 'Study accession details:\n' | |
110 # empty_end = '\n' | |
111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end) | |
112 # if len(study_data.split('\n')) > 2: | |
113 # # more than 1 study accession | |
114 # raise Exception("Multiple study accessions found") | |
115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n') | |
116 # start_sample = 'Sample accession details:\n' | |
117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end) | |
118 # if len(sample_data.split('\n')) > 2: | |
119 # # more than 1 study accession | |
120 # raise Exception("Multiple sample accessions found") | |
121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n') | |
122 # platform = 'Ion Torrent' | |
123 # out_manifest.write(f"PLATFORM\t{platform}\n") | |
124 # out_manifest.close() | |
125 | |
126 if __name__ == '__main__': | |
127 main() |