0
|
1 import gzip
|
3
|
2 import json
|
0
|
3 import os
|
|
4 import sys
|
|
5 import shutil
|
|
6 import yaml
|
|
7
|
3
|
def get_section_string(f, start_line, end_line, return_string=False):
    """Return the lines of ``f`` located between two delimiter lines.

    Reading starts at the current position of ``f``. Everything up to and
    including the first line equal to ``start_line`` is discarded, then
    lines are collected until a line equal to ``end_line`` is read. Neither
    delimiter is included in the result. Delimiters are compared against
    whole lines as returned by ``f.readline()``, so they must carry their
    trailing newline.

    End of file also terminates both phases: if ``start_line`` is never
    found the result is empty, and if ``end_line`` is never found the
    section runs to the end of the file.

    Args:
        f: An object exposing ``readline()`` (e.g. an open text file).
        start_line: Line that marks the beginning of the section.
        end_line: Line that marks the end of the section.
        return_string: When true, return the section as a single string
            instead of a list of lines.

    Returns:
        The section as a list of lines (each keeping its newline), or as
        one concatenated string when ``return_string`` is true.
    """
    # readline() returns '' only at end of file, so using '' as the iter()
    # sentinel bounds both loops even when a delimiter is missing.
    for line in iter(f.readline, ''):
        if line == start_line:
            break

    section_lines = []
    for line in iter(f.readline, ''):
        if line == end_line:
            break
        section_lines.append(line)

    if return_string:
        return ''.join(section_lines)
    return section_lines
|
|
18
|
|
def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
    """Record the platform of every experiment under its study and sample.

    Each value of ``yaml_only_dict['ENA_experiment']`` must provide the
    keys ``study_alias``, ``sample_alias`` and ``platform``. The function
    updates ``studies_samples_dict`` in place so that
    ``studies_samples_dict[study_alias][sample_alias]['experiments']`` is a
    list holding one ``{'platform': ...}`` dict per experiment, creating
    the study and sample levels when they do not exist yet.

    Args:
        yaml_only_dict: Mapping with an ``'ENA_experiment'`` mapping inside.
        studies_samples_dict: Nested dict to fill; modified in place.

    Returns:
        None.
    """
    for experiment in yaml_only_dict['ENA_experiment'].values():
        study_key = experiment['study_alias']
        sample_key = experiment['sample_alias']
        # Build the record before touching the target dict so a missing
        # 'platform' key leaves studies_samples_dict unchanged.
        record = {'platform': experiment['platform']}
        samples_of_study = studies_samples_dict.setdefault(study_key, {})
        sample_entry = samples_of_study.setdefault(sample_key, {'experiments': []})
        sample_entry['experiments'].append(record)
|
|
31
|
|
32
|
|
def load_receipt_data(input_file_path):
    """Parse a submission receipt into a nested study/sample dictionary.

    The receipt is read three times, once per section:

    1. The block enclosed between two ``YAML -------------`` lines is
       parsed as YAML and its ``ENA_experiment`` entries seed the
       study -> sample -> experiments structure (``fill_from_yaml_data``).
    2. The lines following ``Study accession details:`` up to the next
       blank line add an ``'accession'`` entry to each known study.
    3. The lines following ``Sample accession details:`` up to the next
       blank line add an ``'accession'`` entry to each known sample.

    Accession lines are tab-separated: alias first, accession second; any
    further columns are ignored.

    Note that the study-level ``'accession'`` key lives in the same dict
    as the sample aliases of that study.

    Args:
        input_file_path: Path of the receipt text file.

    Returns:
        A dict shaped like ``{study_alias: {'accession': str,
        sample_alias: {'accession': str, 'experiments': [{'platform':
        ...}]}}}``. An ``'accession'`` entry is present only when the
        receipt lists the corresponding alias.
    """
    # should do some health check of the input file?
    loaded_data = {}

    # load yaml section
    yaml_delimiter = 'YAML -------------\n'
    with open(input_file_path) as input_file:
        yaml_text = get_section_string(
            input_file,
            start_line=yaml_delimiter,
            end_line=yaml_delimiter,
            return_string=True,
        )
    fill_from_yaml_data(yaml.safe_load(yaml_text), loaded_data)

    # Both accession sections are terminated by a blank line.
    end_line = '\n'

    # read study accessions
    study_delimiter = 'Study accession details:\n'
    with open(input_file_path) as input_file:
        studies_accession_lines = get_section_string(
            input_file, start_line=study_delimiter, end_line=end_line)
    for study_line in studies_accession_lines:
        if study_line == '\n':
            continue
        # Drop the newline first so a two-column line does not leave it
        # glued to the accession.
        alias, accession, *_ = study_line.rstrip('\n').split('\t')
        if alias in loaded_data:
            loaded_data[alias]['accession'] = accession
        else:
            # The previous message interpolated an undefined name and
            # raised NameError instead of reporting the problem.
            print(f"Study {alias} is not referenced by any experiment")

    # read sample accessions
    samples_delimiter = 'Sample accession details:\n'
    with open(input_file_path) as input_file:
        samples_accession_lines = get_section_string(
            input_file, start_line=samples_delimiter, end_line=end_line)
    for sample_line in samples_accession_lines:
        if sample_line == '\n':
            continue
        alias, accession, *_ = sample_line.rstrip('\n').split('\t')
        # The receipt line does not say which study the sample belongs
        # to, so look through every study and stop at the first match.
        for study_samples in loaded_data.values():
            if alias in study_samples:
                study_samples[alias]['accession'] = accession
                break
    return loaded_data
|
|
68
|
|
69
|
|
"""
Takes as input:
1. A receipt obtained from the ENA submission tool:
   a txt file that contains sections describing submission details.
2. A JSON file with the list of FASTA files that the user loaded
3. Path to write the generated manifests
4. Manifest template path: the manifest with the global values set
   (e.g. COVERAGE, MINGAPLENGTH, ...)
"""
|
|
79
|
0
|
80
|
|
81
|
|
def main():
    """Write one manifest per listed FASTA file that has receipt metadata.

    Positional command-line arguments:
        1. path of the submission receipt;
        2. path of a JSON file containing the list of FASTA file names;
        3. directory in which the manifests are written;
        4. path of the manifest template, whose content is copied at the
           top of every generated manifest.

    For each FASTA name the sample alias is the name without its
    ``.fasta.gz`` suffix, or without its last six characters otherwise.
    The first study containing that alias provides the study accession,
    sample accession and platform (taken from the sample's first
    experiment). The manifest is written to
    ``<manifest dir>/<sample alias>.manifest.txt`` and its path is
    appended to ``submit_list.tab`` in the current working directory.
    Samples without metadata are reported on stdout and skipped.

    Raises:
        SystemExit: When fewer than four arguments are given.
        KeyError: When a matched sample or its study has no
            ``'accession'`` entry in the receipt data.
    """
    if len(sys.argv) < 5:
        sys.exit(
            f'Usage: {sys.argv[0]} <receipt> <fasta_names.json> '
            '<manifest_dir> <manifest_template>'
        )
    input_file_path, fasta_names_list_path, out_manifest_base, manifest_template = sys.argv[1:5]

    # load submitted data from receipt file
    data_dict = load_receipt_data(input_file_path)

    with open(fasta_names_list_path, 'r') as fasta_files_json_file:
        fasta_files_list = json.load(fasta_files_json_file)

    # The template is the same for every sample: read it once up front
    # instead of re-opening it for each manifest.
    with open(manifest_template) as m_template:
        template_text = m_template.read()

    with open('submit_list.tab', 'w') as written_manifests_out:
        # iterate over the list of fasta files
        for fasta_file in fasta_files_list:
            if fasta_file.endswith('.fasta.gz'):
                sample_alias = fasta_file[:-9]
            else:
                # Assumes a six-character extension such as '.fasta'.
                sample_alias = fasta_file[:-6]
            print(f'Processing {sample_alias}')

            for study in data_dict.values():
                if sample_alias not in study:
                    continue
                sample = study[sample_alias]
                sample_accession = sample['accession']
                study_accession = study['accession']
                ### TODO get a string that concatenates platform information from multiple exp
                platform = sample['experiments'][0]['platform']
                manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')
                with open(manifest_path, 'w') as output_handle:
                    # first dump the contents of manifest template
                    # containing the global vars
                    output_handle.write(template_text)
                    output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
                    output_handle.write("PLATFORM\t" + platform + "\n")
                    output_handle.write("STUDY\t" + study_accession + "\n")
                    output_handle.write("SAMPLE\t" + sample_accession + "\n")
                    # files should be available in the corresponding dir and named:
                    # sample_alias.fasta.gz
                    output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
                written_manifests_out.write(manifest_path + '\n')
                break
            else:
                # The loop ended without a break: no study knows this sample.
                print(f'No metadata found for sample {sample_alias}')
|
0
|
124
|
|
125
|
|
# Generate the manifests only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|