Mercurial > repos > petr-novak > dante
comparison dante_gff_output_filtering.py @ 0:77d9f2ecb28a draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 03 Jul 2019 02:45:00 -0400 |
parents | |
children | 3151a72a6671 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:77d9f2ecb28a |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import time | |
4 import configuration | |
5 import os | |
6 import textwrap | |
7 import subprocess | |
8 from tempfile import NamedTemporaryFile | |
9 from collections import defaultdict | |
10 | |
11 | |
12 class Range(): | |
13 ''' | |
14 This class is used to check float range in argparse | |
15 ''' | |
16 | |
17 def __init__(self, start, end): | |
18 self.start = start | |
19 self.end = end | |
20 | |
21 def __eq__(self, other): | |
22 return self.start <= other <= self.end | |
23 | |
24 def __str__(self): | |
25 return "float range {}..{}".format(self.start, self.end) | |
26 | |
27 def __repr__(self): | |
28 return "float range {}..{}".format(self.start, self.end) | |
29 | |
30 | |
31 def check_file_start(gff_file): | |
32 count_comment = 0 | |
33 with open(gff_file, "r") as gff_all: | |
34 line = gff_all.readline() | |
35 while line.startswith("#"): | |
36 line = gff_all.readline() | |
37 count_comment += 1 | |
38 return count_comment | |
39 | |
40 | |
41 def write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict, | |
42 dom_dict, version_lines): | |
43 ''' | |
44 Write domains statistics in beginning of filtered GFF | |
45 ''' | |
46 with open(FILT_DOM_GFF, "w") as filt_gff: | |
47 for line in version_lines: | |
48 filt_gff.write(line) | |
49 filt_gff.write("##CLASSIFICATION\tORIGINAL_COUNTS\tFILTERED_COUNTS\n") | |
50 if not orig_class_dict: | |
51 filt_gff.write("##NO DOMAINS CLASSIFICATIONS\n") | |
52 for classification in sorted(orig_class_dict.keys()): | |
53 if classification in filt_class_dict.keys(): | |
54 filt_gff.write("##{}\t{}\t{}\n".format( | |
55 classification, orig_class_dict[ | |
56 classification], filt_class_dict[classification])) | |
57 else: | |
58 filt_gff.write("##{}\t{}\t{}\n".format( | |
59 classification, orig_class_dict[classification], 0)) | |
60 filt_gff.write("##-----------------------------------------------\n" | |
61 "##SEQ\tDOMAIN\tCOUNTS\n") | |
62 if not dom_dict: | |
63 filt_gff.write("##NO DOMAINS\n") | |
64 for seq in sorted(dom_dict.keys()): | |
65 for dom, count in sorted(dom_dict[seq].items()): | |
66 filt_gff.write("##{}\t{}\t{}\n".format(seq, dom, count)) | |
67 filt_gff.write("##-----------------------------------------------\n") | |
68 with open(filt_dom_tmp.name, "r") as filt_tmp: | |
69 for line in filt_tmp: | |
70 filt_gff.write(line) | |
71 | |
72 | |
73 def get_file_start(gff_file): | |
74 count_comment = 0 | |
75 lines = [] | |
76 with open(gff_file, "r") as gff_all: | |
77 line = gff_all.readline() | |
78 while line.startswith("#"): | |
79 lines.append(line) | |
80 line = gff_all.readline() | |
81 count_comment += 1 | |
82 return count_comment, lines | |
83 | |
84 | |
85 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, | |
86 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, | |
87 ELEMENT): | |
88 ''' Filter gff output based on domain and quality of alignment ''' | |
89 [count_comment, version_lines] = get_file_start(DOM_GFF) | |
90 filt_dom_tmp = NamedTemporaryFile(delete=False) | |
91 with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name, | |
92 "w") as gff_filtered: | |
93 for comment_idx in range(count_comment): | |
94 next(gff_all) | |
95 dom_dict = defaultdict(lambda: defaultdict(int)) | |
96 orig_class_dict = defaultdict(int) | |
97 filt_class_dict = defaultdict(int) | |
98 seq_ids_all = [] | |
99 xminimals = [] | |
100 xmaximals = [] | |
101 domains = [] | |
102 xminimals_all = [] | |
103 xmaximals_all = [] | |
104 domains_all = [] | |
105 start = True | |
106 for line in gff_all: | |
107 attributes = line.rstrip().split("\t")[-1] | |
108 classification = attributes.split(";")[1].split("=")[1] | |
109 orig_class_dict[classification] += 1 | |
110 ## ambiguous domains filtered out automatically | |
111 if classification != configuration.AMBIGUOUS_TAG: | |
112 al_identity = float(attributes.split(";")[-5].split("=")[1]) | |
113 al_similarity = float(attributes.split(";")[-4].split("=")[1]) | |
114 al_length = float(attributes.split(";")[-3].split("=")[1]) | |
115 relat_interrupt = float(attributes.split(";")[-2].split("=")[ | |
116 1]) | |
117 db_len_proportion = float(attributes.split(";")[-1].split("=")[ | |
118 1]) | |
119 dom_type = attributes.split(";")[0].split("=")[1] | |
120 seq_id = line.split("\t")[0] | |
121 xminimal = int(line.split("\t")[3]) | |
122 xmaximal = int(line.split("\t")[4]) | |
123 if al_identity >= TH_IDENTITY and al_similarity >= TH_SIMILARITY and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and db_len_proportion <= TH_LEN_RATIO and ( | |
124 dom_type == SELECTED_DOM or | |
125 SELECTED_DOM == "All") and (ELEMENT in classification): | |
126 gff_filtered.writelines(line) | |
127 filt_class_dict[classification] += 1 | |
128 dom_dict[seq_id][dom_type] += 1 | |
129 if start: | |
130 seq_ids_all.append(line.split("\t")[0]) | |
131 start = False | |
132 if seq_id != seq_ids_all[-1]: | |
133 seq_ids_all.append(seq_id) | |
134 xminimals_all.append(xminimals) | |
135 xmaximals_all.append(xmaximals) | |
136 domains_all.append(domains) | |
137 xminimals = [] | |
138 xmaximals = [] | |
139 domains = [] | |
140 xminimals.append(xminimal) | |
141 xmaximals.append(xmaximal) | |
142 domains.append(dom_type) | |
143 path = os.path.dirname(os.path.realpath(__file__)) | |
144 write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict, | |
145 dom_dict, version_lines) | |
146 os.unlink(filt_dom_tmp.name) | |
147 xminimals_all.append(xminimals) | |
148 xmaximals_all.append(xmaximals) | |
149 domains_all.append(domains) | |
150 return xminimals_all, xmaximals_all, domains_all, seq_ids_all | |
151 | |
152 | |
153 def get_domains_protseq(FILT_DOM_GFF, DOMAIN_PROT_SEQ): | |
154 ''' Get the translated protein sequence of original DNA seq for all the filtered domains regions | |
155 The translated sequences are taken from alignment reported by LASTAL (Query_Seq attribute in GFF) | |
156 ''' | |
157 count_comment = check_file_start(FILT_DOM_GFF) | |
158 with open(FILT_DOM_GFF, "r") as filt_gff: | |
159 for comment_idx in range(count_comment): | |
160 next(filt_gff) | |
161 with open(DOMAIN_PROT_SEQ, "w") as dom_prot_file: | |
162 for line in filt_gff: | |
163 attributes = line.rstrip().split("\t")[8] | |
164 positions = attributes.split(";")[3].split("=")[1].split(":")[ | |
165 -1].split("[")[0] | |
166 dom = attributes.split(";")[0].split("=")[1] | |
167 dom_class = attributes.split(";")[1].split("=")[1] | |
168 seq_id = line.rstrip().split("\t")[0] | |
169 prot_seq_align = line.rstrip().split("\t")[8].split(";")[ | |
170 6].split("=")[1] | |
171 prot_seq = prot_seq_align.translate({ord(i): None | |
172 for i in '/\\-'}) | |
173 header_prot_seq = ">{}:{} {} {}".format(seq_id, positions, dom, | |
174 dom_class) | |
175 dom_prot_file.write("{}\n{}\n".format( | |
176 header_prot_seq, textwrap.fill(prot_seq, | |
177 configuration.FASTA_LINE))) | |
178 | |
179 | |
180 def main(args): | |
181 | |
182 t = time.time() | |
183 | |
184 DOM_GFF = args.dom_gff | |
185 DOMAIN_PROT_SEQ = args.domains_prot_seq | |
186 TH_IDENTITY = args.th_identity | |
187 TH_LENGTH = args.th_length | |
188 TH_INTERRUPT = args.interruptions | |
189 TH_SIMILARITY = args.th_similarity | |
190 TH_LEN_RATIO = args.max_len_proportion | |
191 FILT_DOM_GFF = args.domains_filtered | |
192 SELECTED_DOM = args.selected_dom | |
193 OUTPUT_DIR = args.output_dir | |
194 # DELETE : ELEMENT = args.element_type.replace("_pipe_", "|") | |
195 ELEMENT = args.element_type | |
196 | |
197 if DOMAIN_PROT_SEQ is None: | |
198 DOMAIN_PROT_SEQ = configuration.DOM_PROT_SEQ | |
199 if FILT_DOM_GFF is None: | |
200 FILT_DOM_GFF = configuration.FILT_DOM_GFF | |
201 | |
202 if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR): | |
203 os.makedirs(OUTPUT_DIR) | |
204 | |
205 if not os.path.isabs(FILT_DOM_GFF): | |
206 if OUTPUT_DIR is None: | |
207 OUTPUT_DIR = os.path.dirname(os.path.abspath(DOM_GFF)) | |
208 FILT_DOM_GFF = os.path.join(OUTPUT_DIR, os.path.basename(FILT_DOM_GFF)) | |
209 DOMAIN_PROT_SEQ = os.path.join(OUTPUT_DIR, | |
210 os.path.basename(DOMAIN_PROT_SEQ)) | |
211 | |
212 [xminimals_all, xmaximals_all, domains_all, seq_ids_all] = filter_qual_dom( | |
213 DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, TH_LENGTH, | |
214 TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, ELEMENT) | |
215 get_domains_protseq(FILT_DOM_GFF, DOMAIN_PROT_SEQ) | |
216 | |
217 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t)) | |
218 | |
219 | |
220 if __name__ == "__main__": | |
221 import argparse | |
222 from argparse import RawDescriptionHelpFormatter | |
223 | |
224 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, | |
225 argparse.RawDescriptionHelpFormatter): | |
226 pass | |
227 | |
228 parser = argparse.ArgumentParser( | |
229 description= | |
230 '''The script performs DANTE's output filtering for quality and/or extracting specific type of protein domain or mobile elements of origin. For the filtered domains it reports their translated protein sequence of original DNA. | |
231 WHEN NO PARAMETERS GIVEN, IT PERFORMS QUALITY FILTERING USING THE DEFAULT PARAMETRES (optimized for Viridiplantae species) | |
232 | |
233 INPUTS: | |
234 - GFF3 file produced by protein_domains.py OR already filtered GFF3 | |
235 | |
236 FILTERING OPTIONS: | |
237 > QUALITY: - Min relative length of alignemnt to the protein domain from DB (without gaps) | |
238 - Identity | |
239 - Similarity (scoring matrix: BLOSUM82) | |
240 - Interruption in the reading frame (frameshifts + stop codons) per every starting 100 AA | |
241 - Max alignment proportion to the original length of database domain sequence | |
242 > DOMAIN TYPE: choose from choices ('Name' attribute in GFF) | |
243 Records for ambiguous domain type (e.g. INT/RH) are filtered out automatically | |
244 | |
245 > MOBILE ELEMENT TYPE: | |
246 arbitrary substring of the element classification ('Final_Classification' attribute in GFF) | |
247 | |
248 OUTPUTS: | |
249 - filtered GFF3 file | |
250 - fasta file of translated protein sequences (from original DNA) for the aligned domains that match the filtering criteria | |
251 | |
252 DEPENDENCIES: | |
253 - python 3.4 or higher | |
254 > ProfRep modules: | |
255 - configuration.py | |
256 | |
257 EXAMPLE OF USAGE: | |
258 Getting quality filtered integrase(INT) domains of all gypsy transposable elements: | |
259 ./domains_filtering.py -dom_gff PATH_TO_INPUT_GFF -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE --selected_dom INT --element_type Ty3/gypsy | |
260 | |
261 ''', | |
262 epilog="""""", | |
263 formatter_class=CustomFormatter) | |
264 requiredNamed = parser.add_argument_group('required named arguments') | |
265 requiredNamed.add_argument("-dg", | |
266 "--dom_gff", | |
267 type=str, | |
268 required=True, | |
269 help="basic unfiltered gff file of all domains") | |
270 parser.add_argument("-ouf", | |
271 "--domains_filtered", | |
272 type=str, | |
273 help="output filtered domains gff file") | |
274 parser.add_argument("-dps", | |
275 "--domains_prot_seq", | |
276 type=str, | |
277 help="output file containg domains protein sequences") | |
278 parser.add_argument("-thl", | |
279 "--th_length", | |
280 type=float, | |
281 choices=[Range(0.0, 1.0)], | |
282 default=0.8, | |
283 help="proportion of alignment length threshold") | |
284 parser.add_argument("-thi", | |
285 "--th_identity", | |
286 type=float, | |
287 choices=[Range(0.0, 1.0)], | |
288 default=0.35, | |
289 help="proportion of alignment identity threshold") | |
290 parser.add_argument("-ths", | |
291 "--th_similarity", | |
292 type=float, | |
293 choices=[Range(0.0, 1.0)], | |
294 default=0.45, | |
295 help="threshold for alignment proportional similarity") | |
296 parser.add_argument( | |
297 "-ir", | |
298 "--interruptions", | |
299 type=int, | |
300 default=3, | |
301 help= | |
302 "interruptions (frameshifts + stop codons) tolerance threshold per 100 AA") | |
303 parser.add_argument( | |
304 "-mlen", | |
305 "--max_len_proportion", | |
306 type=float, | |
307 default=1.2, | |
308 help= | |
309 "maximal proportion of alignment length to the original length of protein domain from database") | |
310 parser.add_argument( | |
311 "-sd", | |
312 "--selected_dom", | |
313 type=str, | |
314 default="All", | |
315 choices=[ | |
316 "All", "GAG", "INT", "PROT", "RH", "RT", "aRH", "CHDCR", "CHDII", | |
317 "TPase", "YR", "HEL1", "HEL2", "ENDO" | |
318 ], | |
319 help="filter output domains based on the domain type") | |
320 parser.add_argument( | |
321 "-el", | |
322 "--element_type", | |
323 type=str, | |
324 default="", | |
325 help="filter output domains by typing substring from classification") | |
326 parser.add_argument( | |
327 "-dir", | |
328 "--output_dir", | |
329 type=str, | |
330 default=None, | |
331 help="specify if you want to change the output directory") | |
332 args = parser.parse_args() | |
333 main(args) |