comparison process_input.py @ 0:e25357392813 draft

Uploaded
author ieguinoa
date Tue, 18 May 2021 16:30:52 +0000
parents
children 7d751b5943b0
comparison
equal deleted inserted replaced
-1:000000000000 0:e25357392813
1 import gzip
2 import os
3 import sys
4 import shutil
5 import yaml
6
7 from Bio import SeqIO
8
9
10 """
11 Takes as input:
12 1. A receipt obtained from ENA submission tool.
13 A txt file that includes a YAML section with
14
15 2. A fasta file with fasta entries ids defined after the files used for the raw submission.
16
17 3. Path to write generated manifests
18 4. Path to write generated fasta files
19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
20 """
21
22 def get_section_string(f, start_line, end_line):
23 # consume starting lines
24 start_string = iter(f.readline, start_line)
25 start_string = ''.join(line for line in start_string)
26 # read YAML lines
27 yaml_string = iter(f.readline, end_line)
28 return ''.join(x for x in yaml_string)
29
30
31 def main():
32 input_file = open(sys.argv[1])
33 fasta_in = open(sys.argv[2])
34 out_manifest_base = sys.argv[3]
35 out_fasta_base = sys.argv[4]
36 manifest_template = sys.argv[5]
37 yaml_delimiter = 'YAML -------------\n'
38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
39 # print(yaml_only)
40 submission_tuples_list = []
41 # parse the sequence IDs
42 for record in SeqIO.parse(fasta_in, "fasta"):
43 seq_id = record.id
44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name?
45 # ... but for paired it may not be the exact same
46 # ...initially I should attempt to find a
47 # .. if this
48 # in any case, if I cant make the right match then I should do a kind of substring match
49
50 # find the exp_alias associated with the file
51 exp_alias = None
52 for index,run in yaml_only['ENA_run'].items():
53 if run['file_name'] == seq_id:
54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
55 exp_alias = run['experiment_alias']
56 break
57 if not exp_alias:
58 raise Exception("No run files match for the sequence entry {seq_id}")
59 # find the sample and study for that experiment
60 sample_alias = None
61 study_alias = None
62 for index,exp in yaml_only['ENA_experiment'].items():
63 if exp['alias'] == exp_alias:
64 sample_alias = exp['sample_alias']
65 study_alias = exp['study_alias']
66 platform = exp['platform']
67 break
68 if not sample_alias:
69 raise Exception("No sample associated with experiment {exp_alias}")
70 if not study_alias:
71 raise Exception("No study associated with experiment {exp_alias}")
72
73
74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
76 with open(fasta_path, "w") as output_handle:
77 SeqIO.write([record], output_handle, "fasta")
78 #gzip the file (required by ENA upload tool)
79 fasta_path_gz = fasta_path + '.gz'
80 with open(fasta_path, 'rb') as f_in:
81 with gzip.open(fasta_path_gz, 'wb') as f_out:
82 shutil.copyfileobj(f_in, f_out)
83 # create the manifest
84 # add to the manifest the:
85 #
86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
87 with open(manifest_path, "w") as output_handle:
88 # first dump the contents of manifest template
89 # containing the global vars
90 with open(manifest_template) as m_template:
91 output_handle.write(m_template.read())
92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
93 output_handle.write("PLATFORM\t" + platform + "\n")
94 output_handle.write("STUDY\t" + study_alias + "\n")
95 output_handle.write("SAMPLE\t" + sample_alias + "\n")
96 output_handle.write("FASTA\t" + fasta_path_gz + "\n")
97
98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence
99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries
101 # samples_dir[study][sample] = seq_id + '.fasta'
102 submission_tuples_list.append((manifest_path, fasta_path))
103
104 with open('submit_list.tab', "w") as output_handle:
105 for submit_tuple in submission_tuples_list:
106 output_handle.write('\t'.join(submit_tuple) + '\n')
107 ## DEBUG CASE
108 #study details
109 # start_study = 'Study accession details:\n'
110 # empty_end = '\n'
111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
112 # if len(study_data.split('\n')) > 2:
113 # # more than 1 study accession
114 # raise Exception("Multiple study accessions found")
115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
116 # start_sample = 'Sample accession details:\n'
117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
118 # if len(sample_data.split('\n')) > 2:
119 # # more than 1 study accession
120 # raise Exception("Multiple sample accessions found")
121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
122 # platform = 'Ion Torrent'
123 # out_manifest.write(f"PLATFORM\t{platform}\n")
124 # out_manifest.close()
125
126 if __name__ == '__main__':
127 main()