0
|
1 import gzip
|
|
2 import os
|
|
3 import sys
|
|
4 import shutil
|
|
5 import yaml
|
|
6
|
|
7 from Bio import SeqIO
|
|
8
|
|
9
|
|
10 """
|
|
11 Takes as input:
|
|
12 1. A receipt obtained from ENA submission tool.
|
|
13 A txt file that includes a YAML section with
|
|
14
|
|
15 2. A fasta file with fasta entries ids defined after the files used for the raw submission.
|
|
16
|
|
17 3. Path to write generated manifests
|
|
18 4. Path to write generated fasta files
|
|
19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
|
|
20 """
|
|
21
|
|
22 def get_section_string(f, start_line, end_line):
|
|
23 # consume starting lines
|
|
24 start_string = iter(f.readline, start_line)
|
|
25 start_string = ''.join(line for line in start_string)
|
|
26 # read YAML lines
|
|
27 yaml_string = iter(f.readline, end_line)
|
|
28 return ''.join(x for x in yaml_string)
|
|
29
|
|
30
|
|
31 def main():
|
|
32 input_file = open(sys.argv[1])
|
|
33 fasta_in = open(sys.argv[2])
|
|
34 out_manifest_base = sys.argv[3]
|
|
35 out_fasta_base = sys.argv[4]
|
|
36 manifest_template = sys.argv[5]
|
|
37 yaml_delimiter = 'YAML -------------\n'
|
|
38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
|
|
39 # print(yaml_only)
|
|
40 submission_tuples_list = []
|
|
41 # parse the sequence IDs
|
|
42 for record in SeqIO.parse(fasta_in, "fasta"):
|
|
43 seq_id = record.id
|
|
44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name?
|
|
45 # ... but for paired it may not be the exact same
|
|
46 # ...initially I should attempt to find a
|
|
47 # .. if this
|
|
48 # in any case, if I cant make the right match then I should do a kind of substring match
|
|
49
|
|
50 # find the exp_alias associated with the file
|
|
51 exp_alias = None
|
|
52 for index,run in yaml_only['ENA_run'].items():
|
|
53 if run['file_name'] == seq_id:
|
|
54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
|
|
55 exp_alias = run['experiment_alias']
|
|
56 break
|
|
57 if not exp_alias:
|
|
58 raise Exception("No run files match for the sequence entry {seq_id}")
|
|
59 # find the sample and study for that experiment
|
|
60 sample_alias = None
|
|
61 study_alias = None
|
|
62 for index,exp in yaml_only['ENA_experiment'].items():
|
|
63 if exp['alias'] == exp_alias:
|
|
64 sample_alias = exp['sample_alias']
|
|
65 study_alias = exp['study_alias']
|
|
66 platform = exp['platform']
|
|
67 break
|
|
68 if not sample_alias:
|
|
69 raise Exception("No sample associated with experiment {exp_alias}")
|
|
70 if not study_alias:
|
|
71 raise Exception("No study associated with experiment {exp_alias}")
|
|
72
|
|
73
|
|
74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
|
|
75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
|
|
76 with open(fasta_path, "w") as output_handle:
|
|
77 SeqIO.write([record], output_handle, "fasta")
|
|
78 #gzip the file (required by ENA upload tool)
|
|
79 fasta_path_gz = fasta_path + '.gz'
|
|
80 with open(fasta_path, 'rb') as f_in:
|
|
81 with gzip.open(fasta_path_gz, 'wb') as f_out:
|
|
82 shutil.copyfileobj(f_in, f_out)
|
|
83 # create the manifest
|
|
84 # add to the manifest the:
|
|
85 #
|
|
86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
|
|
87 with open(manifest_path, "w") as output_handle:
|
|
88 # first dump the contents of manifest template
|
|
89 # containing the global vars
|
|
90 with open(manifest_template) as m_template:
|
|
91 output_handle.write(m_template.read())
|
|
92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
|
|
93 output_handle.write("PLATFORM\t" + platform + "\n")
|
|
94 output_handle.write("STUDY\t" + study_alias + "\n")
|
|
95 output_handle.write("SAMPLE\t" + sample_alias + "\n")
|
|
96 output_handle.write("FASTA\t" + fasta_path_gz + "\n")
|
|
97
|
|
98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence
|
|
99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
|
|
100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries
|
|
101 # samples_dir[study][sample] = seq_id + '.fasta'
|
|
102 submission_tuples_list.append((manifest_path, fasta_path))
|
|
103
|
|
104 with open('submit_list.tab', "w") as output_handle:
|
|
105 for submit_tuple in submission_tuples_list:
|
|
106 output_handle.write('\t'.join(submit_tuple) + '\n')
|
|
107 ## DEBUG CASE
|
|
108 #study details
|
|
109 # start_study = 'Study accession details:\n'
|
|
110 # empty_end = '\n'
|
|
111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
|
|
112 # if len(study_data.split('\n')) > 2:
|
|
113 # # more than 1 study accession
|
|
114 # raise Exception("Multiple study accessions found")
|
|
115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
|
|
116 # start_sample = 'Sample accession details:\n'
|
|
117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
|
|
118 # if len(sample_data.split('\n')) > 2:
|
|
119 # # more than 1 study accession
|
|
120 # raise Exception("Multiple sample accessions found")
|
|
121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
|
|
122 # platform = 'Ion Torrent'
|
|
123 # out_manifest.write(f"PLATFORM\t{platform}\n")
|
|
124 # out_manifest.close()
|
|
125
|
|
126 if __name__ == '__main__':
|
|
127 main()
|