petr-novak/dante_ltr: detect_putative_ltr_wrapper.py @ changeset 12:ff01d4263391 (draft)
commit message: "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
author: petr-novak
date: Thu, 21 Jul 2022 08:23:15 +0000
children: 559940c04c44
comparison: 11:54bd36973253 vs 12:ff01d4263391
#!/usr/bin/env python
"""This wrapper is intended for use on large genomes with large DANTE input, to
minimize memory usage. It splits the input files into pieces and analyzes them
one by one with detect_putative_ltr.R.
If the input does not exceed the memory limit, detect_putative_ltr.R is run
directly.
"""

import argparse
import os
import subprocess
import sys
import tempfile
from itertools import cycle


def get_arguments():
    """
    Get arguments from command line
    :return:
    args
    """
    parser = argparse.ArgumentParser(
        description="""detect_putative_ltr_wrapper.py is a wrapper for
        detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
        action='store'
    )
    parser.add_argument(
        '-s', '--reference_sequence', default=None, required=True,
        help="reference sequence as fasta file", type=str, action='store'
    )
    parser.add_argument(
        '-o', '--output', default=None, required=True, help="output file path and prefix",
        type=str, action='store'
    )
    parser.add_argument(
        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
        action='store'
    )
    parser.add_argument(
        '-M', '--max_missing_domains', default=0, required=False, type=int
    )
    parser.add_argument(
        '-L', '--min_relative_length', default=0.6, required=False, type=float,
        help="Minimum relative length of protein domain to be "
             "considered for retrotransposon detection"
    )
    parser.add_argument(
        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
        help='If the size of the reference sequence is greater than this value, '
             'the reference is analyzed in chunks of approximately this size; '
             'individual sequences longer than the chunk size are not split, '
             'default is %(default)s'
    )
    args = parser.parse_args()
    return args


def read_fasta_sequence_size(fasta_file):
    """Read sizes of sequences into a dictionary keyed by sequence id"""
    fasta_dict = {}
    with open(fasta_file, 'r') as f:
        for line in f:
            if line[0] == '>':
                # keep only the part of the name before the first space
                header = line.strip().split(' ')[0][1:]
                fasta_dict[header] = 0
            else:
                fasta_dict[header] += len(line.strip())
    return fasta_dict
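# For example, a hypothetical FASTA with a record '>chr1 assembly1' holding
# 100 bp of sequence and a record '>chr2' holding 50 bp would yield
# {'chr1': 100, 'chr2': 50}.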


def make_temp_files(number_of_files):
    """
    Make named temporary files; the files will not be deleted upon exit!
    :param number_of_files:
    :return:
    filepaths
    """
    temp_files = []
    for i in range(number_of_files):
        temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
    return temp_files
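# e.g. make_temp_files(2) might return ['/tmp/tmpab12cd34', '/tmp/tmpef56gh78']
# (the actual paths are system-dependent); the caller is responsible for
# removing these files.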


def sum_up_stats_files(files):
    """
    Sum up statistics files
    :return:
    """
    new_statistics = {}
    for file in files:
        with open(file, 'r') as fh:
            for line in fh:
                items = line.strip().split('\t')
                if items[0] == 'Classification':
                    header = items
                    continue
                counts = [int(item) for item in items[1:]]
                if items[0] in new_statistics:
                    new_statistics[items[0]] = [
                        sum(x) for x in zip(new_statistics[items[0]], counts)
                    ]
                else:
                    new_statistics[items[0]] = counts
    # convert to strings; the first line is the header
    statistics_str = []
    for classification, counts in new_statistics.items():
        statistics_str.append(classification + '\t' + '\t'.join([str(x) for x in counts]))
    sorted_stat_with_header = ['\t'.join(header)] + sorted(statistics_str)
    return sorted_stat_with_header
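# For example (hypothetical data): if one statistics file contains the row
# 'Ty1_copia\t10' and another contains 'Ty1_copia\t5' under the same header,
# the merged table contains 'Ty1_copia\t15'; rows are sorted below the header.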


def main():
    """
    Main function
    """
    args = get_arguments()
    # locate the directory of the current script
    tool_path = os.path.dirname(os.path.realpath(__file__))
    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
    total_size = sum(fasta_seq_size.values())
    number_of_sequences = len(fasta_seq_size)
    if total_size > args.max_chunk_size and number_of_sequences > 1:
        # sort sequence ids by sequence size, largest first
        seq_id_size_sorted = [i[0] for i in sorted(
            fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
        )]
        number_of_temp_files = int(total_size / args.max_chunk_size) + 1
        if number_of_temp_files > number_of_sequences:
            number_of_temp_files = number_of_sequences
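        # e.g. a 250 Mb genome with the default 100 Mb chunk size gives
        # int(250e6 / 100e6) + 1 = 3 temporary files, capped at the number of
        # sequences (the figures here are illustrative)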

        temp_files_fasta = make_temp_files(number_of_temp_files)
        file_handles = [open(temp_file, 'w') for temp_file in temp_files_fasta]
        # map each sequence id to a file handle, assigned round-robin
        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
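        # e.g. with ids sorted by size as [a, b, c, d] (hypothetical) and two
        # temp files, a and c go to the first file and b and d to the second,
        # which roughly balances the total bases per chunk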

        # write sequences to temporary files
        with open(args.reference_sequence, 'r') as f:
            for line in f:
                if line[0] == '>':
                    header = line.strip().split(' ')[0][1:]
                seq_id_file_handle_dict[header].write(line)
        # close file handles
        for file_handle in file_handles:
            file_handle.close()

        # split the gff3 file into temporary files -
        # each temporary file will contain the gff lines matching one fasta chunk
        temp_files_gff = make_temp_files(number_of_temp_files)
        file_handles = [open(temp_file, 'w') for temp_file in temp_files_gff]
        # map each sequence id to a file handle, assigned round-robin as above
        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
        # write gff lines to chunks, skipping comment lines
        with open(args.gff3, 'r') as f:
            for line in f:
                if line[0] == '#':
                    continue
                header = line.strip().split('\t')[0]
                seq_id_file_handle_dict[header].write(line)
        # close file handles
        for file_handle in file_handles:
            file_handle.close()

        # run retrotransposon detection on each temporary file
        output_files = make_temp_files(number_of_temp_files)
        for i in range(number_of_temp_files):
            print('Running retrotransposon detection on file ' + str(i))
            subprocess.check_call(
                [f'{tool_path}/detect_putative_ltr.R', '-s', temp_files_fasta[i], '-g',
                 temp_files_gff[i], '-o', output_files[i], '-c', str(args.cpu), '-M',
                 str(args.max_missing_domains), '-L', str(args.min_relative_length)]
            )
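        # detect_putative_ltr.R is expected to write its results next to each
        # temporary prefix, i.e. as output_files[i] + suffix for the suffixes
        # collected in output_file_suffixes below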

        # remove all temporary input files
        for temp_file in temp_files_fasta + temp_files_gff:
            os.remove(temp_file)

        # concatenate output files
        output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
                                '_DLP.fasta', '.gff3', '_statistics.csv']

        for suffix in output_file_suffixes:
            if suffix == '_statistics.csv':
                # sum up lines with the same word in the first column
                stat_files = [output_file + suffix for output_file in output_files]
                new_statistics = sum_up_stats_files(stat_files)
                with open(args.output + suffix, 'w') as f:
                    f.write("\n".join(new_statistics))
                # remove the parsed temporary statistics files
                for file in stat_files:
                    os.remove(file)
            else:
                with open(args.output + suffix, 'w') as f:
                    for i in range(number_of_temp_files):
                        # some files may not exist, so check before reading
                        try:
                            with open(output_files[i] + suffix, 'r') as g:
                                for line in g:
                                    f.write(line)
                            # remove the parsed temporary output file
                            os.remove(output_files[i] + suffix)
                        except FileNotFoundError:
                            pass

        # remove the now-empty temporary prefix files created by make_temp_files
        for output_file in output_files:
            os.remove(output_file)
    else:
        # no need to split sequences into chunks
        subprocess.check_call(
            [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence, '-g',
             args.gff3, '-o', args.output, '-c', str(args.cpu), '-M',
             str(args.max_missing_domains), '-L', str(args.min_relative_length)]
        )


if __name__ == '__main__':
    # Python 3.6 or greater is required (the script uses f-strings)
    if sys.version_info < (3, 6):
        print('Python version must be 3.6 or greater')
        sys.exit(1)
    main()