annotate process_input.py @ 0:e25357392813 draft

Uploaded
author ieguinoa
date Tue, 18 May 2021 16:30:52 +0000
parents
children 7d751b5943b0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
1 import gzip
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
2 import os
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
3 import sys
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
4 import shutil
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
5 import yaml
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
6
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
7 from Bio import SeqIO
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
8
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
9
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
10 """
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
11 Takes as input:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
12 1. A receipt obtained from ENA submission tool.
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
13 A txt file that includes a YAML section with
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
14
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
15 2. A fasta file with fasta entries ids defined after the files used for the raw submission.
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
16
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
17 3. Path to write generated manifests
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
18 4. Path to write generated fasta files
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
20 """
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
21
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
22 def get_section_string(f, start_line, end_line):
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
23 # consume starting lines
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
24 start_string = iter(f.readline, start_line)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
25 start_string = ''.join(line for line in start_string)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
26 # read YAML lines
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
27 yaml_string = iter(f.readline, end_line)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
28 return ''.join(x for x in yaml_string)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
29
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
30
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
31 def main():
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
32 input_file = open(sys.argv[1])
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
33 fasta_in = open(sys.argv[2])
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
34 out_manifest_base = sys.argv[3]
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
35 out_fasta_base = sys.argv[4]
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
36 manifest_template = sys.argv[5]
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
37 yaml_delimiter = 'YAML -------------\n'
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
39 # print(yaml_only)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
40 submission_tuples_list = []
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
41 # parse the sequence IDs
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
42 for record in SeqIO.parse(fasta_in, "fasta"):
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
43 seq_id = record.id
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name?
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
45 # ... but for paired it may not be the exact same
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
46 # ...initially I should attempt to find a
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
47 # .. if this
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
48 # in any case, if I cant make the right match then I should do a kind of substring match
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
49
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
50 # find the exp_alias associated with the file
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
51 exp_alias = None
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
52 for index,run in yaml_only['ENA_run'].items():
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
53 if run['file_name'] == seq_id:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
55 exp_alias = run['experiment_alias']
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
56 break
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
57 if not exp_alias:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
58 raise Exception("No run files match for the sequence entry {seq_id}")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
59 # find the sample and study for that experiment
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
60 sample_alias = None
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
61 study_alias = None
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
62 for index,exp in yaml_only['ENA_experiment'].items():
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
63 if exp['alias'] == exp_alias:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
64 sample_alias = exp['sample_alias']
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
65 study_alias = exp['study_alias']
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
66 platform = exp['platform']
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
67 break
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
68 if not sample_alias:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
69 raise Exception("No sample associated with experiment {exp_alias}")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
70 if not study_alias:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
71 raise Exception("No study associated with experiment {exp_alias}")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
72
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
73
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
76 with open(fasta_path, "w") as output_handle:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
77 SeqIO.write([record], output_handle, "fasta")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
78 #gzip the file (required by ENA upload tool)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
79 fasta_path_gz = fasta_path + '.gz'
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
80 with open(fasta_path, 'rb') as f_in:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
81 with gzip.open(fasta_path_gz, 'wb') as f_out:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
82 shutil.copyfileobj(f_in, f_out)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
83 # create the manifest
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
84 # add to the manifest the:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
85 #
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
87 with open(manifest_path, "w") as output_handle:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
88 # first dump the contents of manifest template
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
89 # containing the global vars
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
90 with open(manifest_template) as m_template:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
91 output_handle.write(m_template.read())
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
93 output_handle.write("PLATFORM\t" + platform + "\n")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
94 output_handle.write("STUDY\t" + study_alias + "\n")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
95 output_handle.write("SAMPLE\t" + sample_alias + "\n")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
96 output_handle.write("FASTA\t" + fasta_path_gz + "\n")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
97
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
101 # samples_dir[study][sample] = seq_id + '.fasta'
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
102 submission_tuples_list.append((manifest_path, fasta_path))
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
103
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
104 with open('submit_list.tab', "w") as output_handle:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
105 for submit_tuple in submission_tuples_list:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
106 output_handle.write('\t'.join(submit_tuple) + '\n')
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
107 ## DEBUG CASE
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
108 #study details
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
109 # start_study = 'Study accession details:\n'
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
110 # empty_end = '\n'
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
112 # if len(study_data.split('\n')) > 2:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
113 # # more than 1 study accession
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
114 # raise Exception("Multiple study accessions found")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
116 # start_sample = 'Sample accession details:\n'
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
118 # if len(sample_data.split('\n')) > 2:
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
119 # # more than 1 study accession
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
120 # raise Exception("Multiple sample accessions found")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
122 # platform = 'Ion Torrent'
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
123 # out_manifest.write(f"PLATFORM\t{platform}\n")
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
124 # out_manifest.close()
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
125
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
126 if __name__ == '__main__':
e25357392813 Uploaded
ieguinoa
parents:
diff changeset
127 main()