comparison dante_gff_output_filtering.py @ 0:77d9f2ecb28a draft

Uploaded
author petr-novak
date Wed, 03 Jul 2019 02:45:00 -0400
parents
children 3151a72a6671
comparison
equal deleted inserted replaced
-1:000000000000 0:77d9f2ecb28a
1 #!/usr/bin/env python3
2
3 import time
4 import configuration
5 import os
6 import textwrap
7 import subprocess
8 from tempfile import NamedTemporaryFile
9 from collections import defaultdict
10
11
class Range:
    """Inclusive numeric interval usable as an argparse ``choices`` entry.

    argparse validates a value by testing equality against each choice,
    so overriding ``__eq__`` turns the membership test into a range
    check: any number within [start, end] compares equal to the Range.
    """

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        # True whenever ``other`` lies inside the closed interval.
        return self.start <= other <= self.end

    def __str__(self):
        return f"float range {self.start}..{self.end}"

    def __repr__(self):
        return f"float range {self.start}..{self.end}"
29
30
def check_file_start(gff_file):
    """Return the number of leading '#' comment lines in *gff_file*."""
    header_len = 0
    with open(gff_file, "r") as handle:
        for record in handle:
            if not record.startswith("#"):
                break
            header_len += 1
    return header_len
39
40
def write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict,
               dom_dict, version_lines):
    '''
    Write domains statistics in beginning of filtered GFF
    '''
    separator = "##-----------------------------------------------\n"
    with open(FILT_DOM_GFF, "w") as filt_gff:
        # Original header lines go first, then the summary tables.
        filt_gff.writelines(version_lines)
        filt_gff.write("##CLASSIFICATION\tORIGINAL_COUNTS\tFILTERED_COUNTS\n")
        if not orig_class_dict:
            filt_gff.write("##NO DOMAINS CLASSIFICATIONS\n")
        for cls in sorted(orig_class_dict):
            # Classifications removed entirely by the filter report 0.
            kept = filt_class_dict[cls] if cls in filt_class_dict else 0
            filt_gff.write("##{}\t{}\t{}\n".format(cls, orig_class_dict[cls],
                                                   kept))
        filt_gff.write(separator + "##SEQ\tDOMAIN\tCOUNTS\n")
        if not dom_dict:
            filt_gff.write("##NO DOMAINS\n")
        for seq in sorted(dom_dict):
            for dom, count in sorted(dom_dict[seq].items()):
                filt_gff.write("##{}\t{}\t{}\n".format(seq, dom, count))
        filt_gff.write(separator)
        # Finally append the filtered records accumulated in the temp file.
        with open(filt_dom_tmp.name, "r") as filt_tmp:
            filt_gff.writelines(filt_tmp)
71
72
def get_file_start(gff_file):
    """Return (count, lines) for the leading '#' comment block of *gff_file*."""
    header_lines = []
    with open(gff_file, "r") as handle:
        for record in handle:
            if not record.startswith("#"):
                break
            header_lines.append(record)
    return len(header_lines), header_lines
83
84
def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY,
                    TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM,
                    ELEMENT):
    ''' Filter gff output based on domain and quality of alignment.

    Records passing all quality thresholds, the selected domain type and
    the ELEMENT classification substring are written (with a statistics
    header) to FILT_DOM_GFF.

    Returns:
        (xminimals_all, xmaximals_all, domains_all, seq_ids_all) --
        start/end coordinates and domain types of ALL non-ambiguous
        records (not only the filtered ones), grouped per sequence ID.
    '''
    count_comment, version_lines = get_file_start(DOM_GFF)
    filt_dom_tmp = NamedTemporaryFile(delete=False)
    with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name,
                                             "w") as gff_filtered:
        # Skip the header block already captured by get_file_start().
        for _ in range(count_comment):
            next(gff_all)
        dom_dict = defaultdict(lambda: defaultdict(int))
        orig_class_dict = defaultdict(int)
        filt_class_dict = defaultdict(int)
        seq_ids_all = []
        xminimals = []
        xmaximals = []
        domains = []
        xminimals_all = []
        xmaximals_all = []
        domains_all = []
        start = True
        for line in gff_all:
            # Split columns/attributes once per record instead of
            # re-splitting the attribute string for every field.
            columns = line.rstrip().split("\t")
            attributes = columns[-1].split(";")
            classification = attributes[1].split("=")[1]
            orig_class_dict[classification] += 1
            # Ambiguous domains (e.g. INT/RH) are filtered out automatically.
            if classification == configuration.AMBIGUOUS_TAG:
                continue
            # Quality attributes are addressed from the tail of the
            # attribute list, matching the original positional parsing.
            al_identity = float(attributes[-5].split("=")[1])
            al_similarity = float(attributes[-4].split("=")[1])
            al_length = float(attributes[-3].split("=")[1])
            relat_interrupt = float(attributes[-2].split("=")[1])
            db_len_proportion = float(attributes[-1].split("=")[1])
            dom_type = attributes[0].split("=")[1]
            seq_id = columns[0]
            xminimal = int(columns[3])
            xmaximal = int(columns[4])
            passes_quality = (al_identity >= TH_IDENTITY and
                              al_similarity >= TH_SIMILARITY and
                              al_length >= TH_LENGTH and
                              relat_interrupt <= TH_INTERRUPT and
                              db_len_proportion <= TH_LEN_RATIO)
            passes_domain = (dom_type == SELECTED_DOM or
                             SELECTED_DOM == "All")
            if passes_quality and passes_domain and ELEMENT in classification:
                # was writelines(line): write() is the correct call for
                # a single string
                gff_filtered.write(line)
                filt_class_dict[classification] += 1
                dom_dict[seq_id][dom_type] += 1
            # Coordinates/domains are collected for EVERY non-ambiguous
            # record, grouped by consecutive sequence ID.
            if start:
                seq_ids_all.append(seq_id)
                start = False
            if seq_id != seq_ids_all[-1]:
                seq_ids_all.append(seq_id)
                xminimals_all.append(xminimals)
                xmaximals_all.append(xmaximals)
                domains_all.append(domains)
                xminimals = []
                xmaximals = []
                domains = []
            xminimals.append(xminimal)
            xmaximals.append(xmaximal)
            domains.append(dom_type)
    write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict,
               dom_dict, version_lines)
    os.unlink(filt_dom_tmp.name)
    xminimals_all.append(xminimals)
    xmaximals_all.append(xmaximals)
    domains_all.append(domains)
    return xminimals_all, xmaximals_all, domains_all, seq_ids_all
151
152
def get_domains_protseq(FILT_DOM_GFF, DOMAIN_PROT_SEQ):
    ''' Get the translated protein sequence of original DNA seq for all the filtered domains regions
    The translated sequences are taken from alignment reported by LASTAL (Query_Seq attribute in GFF)
    '''
    header_len = check_file_start(FILT_DOM_GFF)
    # Alignment gap characters removed from the reported protein sequence.
    gap_chars = {ord(ch): None for ch in '/\\-'}
    with open(FILT_DOM_GFF, "r") as filt_gff:
        for _ in range(header_len):
            next(filt_gff)
        with open(DOMAIN_PROT_SEQ, "w") as dom_prot_file:
            for record in filt_gff:
                fields = record.rstrip().split("\t")
                attributes = fields[8].split(";")
                # e.g. "Best_Hit=...:123-456[...]" -> "123-456"
                positions = attributes[3].split("=")[1].split(":")[
                    -1].split("[")[0]
                dom = attributes[0].split("=")[1]
                dom_class = attributes[1].split("=")[1]
                seq_id = fields[0]
                aligned_prot = attributes[6].split("=")[1]
                prot_seq = aligned_prot.translate(gap_chars)
                header_prot_seq = ">{}:{} {} {}".format(seq_id, positions,
                                                        dom, dom_class)
                dom_prot_file.write("{}\n{}\n".format(
                    header_prot_seq, textwrap.fill(prot_seq,
                                                   configuration.FASTA_LINE)))
178
179
def main(args):
    '''Drive GFF filtering and protein-sequence extraction from CLI args.'''
    started = time.time()

    dom_gff = args.dom_gff
    prot_seq_out = args.domains_prot_seq
    filt_gff_out = args.domains_filtered
    output_dir = args.output_dir

    # Fall back to the configured default output names when not given.
    if prot_seq_out is None:
        prot_seq_out = configuration.DOM_PROT_SEQ
    if filt_gff_out is None:
        filt_gff_out = configuration.FILT_DOM_GFF

    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if not os.path.isabs(filt_gff_out):
        # Relative outputs land next to the input GFF unless -dir is set.
        if output_dir is None:
            output_dir = os.path.dirname(os.path.abspath(dom_gff))
        filt_gff_out = os.path.join(output_dir,
                                    os.path.basename(filt_gff_out))
        prot_seq_out = os.path.join(output_dir,
                                    os.path.basename(prot_seq_out))

    filter_qual_dom(dom_gff, filt_gff_out, args.th_identity,
                    args.th_similarity, args.th_length, args.interruptions,
                    args.max_len_proportion, args.selected_dom,
                    args.element_type)
    get_domains_protseq(filt_gff_out, prot_seq_out)

    print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - started))
218
219
if __name__ == "__main__":
    # Command-line entry point: build the parser and delegate to main().
    import argparse
    # NOTE(review): RawDescriptionHelpFormatter is imported but unused --
    # the class below references it via the argparse namespace instead.
    from argparse import RawDescriptionHelpFormatter

    # Show argument defaults in --help while preserving the hand-formatted
    # description text exactly as written (no automatic re-wrapping).
    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    parser = argparse.ArgumentParser(
        description=
        '''The script performs DANTE's output filtering for quality and/or extracting specific type of protein domain or mobile elements of origin. For the filtered domains it reports their translated protein sequence of original DNA.
WHEN NO PARAMETERS GIVEN, IT PERFORMS QUALITY FILTERING USING THE DEFAULT PARAMETRES (optimized for Viridiplantae species)

INPUTS:
- GFF3 file produced by protein_domains.py OR already filtered GFF3

FILTERING OPTIONS:
> QUALITY: - Min relative length of alignemnt to the protein domain from DB (without gaps)
- Identity
- Similarity (scoring matrix: BLOSUM82)
- Interruption in the reading frame (frameshifts + stop codons) per every starting 100 AA
- Max alignment proportion to the original length of database domain sequence
> DOMAIN TYPE: choose from choices ('Name' attribute in GFF)
Records for ambiguous domain type (e.g. INT/RH) are filtered out automatically

> MOBILE ELEMENT TYPE:
arbitrary substring of the element classification ('Final_Classification' attribute in GFF)

OUTPUTS:
- filtered GFF3 file
- fasta file of translated protein sequences (from original DNA) for the aligned domains that match the filtering criteria

DEPENDENCIES:
- python 3.4 or higher
> ProfRep modules:
- configuration.py

EXAMPLE OF USAGE:
Getting quality filtered integrase(INT) domains of all gypsy transposable elements:
./domains_filtering.py -dom_gff PATH_TO_INPUT_GFF -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE --selected_dom INT --element_type Ty3/gypsy

''',
        epilog="""""",
        formatter_class=CustomFormatter)
    # Only the input GFF is mandatory; all filtering thresholds default to
    # values optimized for Viridiplantae species.
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument("-dg",
                               "--dom_gff",
                               type=str,
                               required=True,
                               help="basic unfiltered gff file of all domains")
    parser.add_argument("-ouf",
                        "--domains_filtered",
                        type=str,
                        help="output filtered domains gff file")
    parser.add_argument("-dps",
                        "--domains_prot_seq",
                        type=str,
                        help="output file containg domains protein sequences")
    # Thresholds in [0, 1] use the Range helper so argparse accepts any
    # float inside the interval as a valid "choice".
    parser.add_argument("-thl",
                        "--th_length",
                        type=float,
                        choices=[Range(0.0, 1.0)],
                        default=0.8,
                        help="proportion of alignment length threshold")
    parser.add_argument("-thi",
                        "--th_identity",
                        type=float,
                        choices=[Range(0.0, 1.0)],
                        default=0.35,
                        help="proportion of alignment identity threshold")
    parser.add_argument("-ths",
                        "--th_similarity",
                        type=float,
                        choices=[Range(0.0, 1.0)],
                        default=0.45,
                        help="threshold for alignment proportional similarity")
    parser.add_argument(
        "-ir",
        "--interruptions",
        type=int,
        default=3,
        help=
        "interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
    parser.add_argument(
        "-mlen",
        "--max_len_proportion",
        type=float,
        default=1.2,
        help=
        "maximal proportion of alignment length to the original length of protein domain from database")
    parser.add_argument(
        "-sd",
        "--selected_dom",
        type=str,
        default="All",
        choices=[
            "All", "GAG", "INT", "PROT", "RH", "RT", "aRH", "CHDCR", "CHDII",
            "TPase", "YR", "HEL1", "HEL2", "ENDO"
        ],
        help="filter output domains based on the domain type")
    parser.add_argument(
        "-el",
        "--element_type",
        type=str,
        default="",
        help="filter output domains by typing substring from classification")
    parser.add_argument(
        "-dir",
        "--output_dir",
        type=str,
        default=None,
        help="specify if you want to change the output directory")
    args = parser.parse_args()
    main(args)
333 main(args)