0
|
1 #!/usr/bin/env python3
|
|
2
|
|
3 import time
|
|
4 import configuration
|
|
5 import os
|
|
6 import textwrap
|
|
7 import subprocess
|
|
8 from tempfile import NamedTemporaryFile
|
|
9 from collections import defaultdict
|
|
10
|
|
11
|
|
class Range:
    """
    Inclusive float range used as an argparse ``choices`` entry.

    argparse validates a value with ``value in choices``, which compares the
    value against each choice using ``==`` — so overriding ``__eq__`` to test
    containment makes a single ``Range(lo, hi)`` accept every float in
    [lo, hi].
    """

    def __init__(self, start, end):
        self.start = start  # inclusive lower bound
        self.end = end      # inclusive upper bound

    def __eq__(self, other):
        # Containment test: True when start <= other <= end.
        try:
            return self.start <= other <= self.end
        except TypeError:
            # Incomparable type (e.g. None, str): not in range rather than
            # crashing inside argparse's membership check.
            return NotImplemented

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    # repr and str are intentionally identical (argparse prints both).
    __repr__ = __str__
|
|
29
|
|
30
|
|
def check_file_start(gff_file):
    """Return the number of leading comment ('#') lines in *gff_file*."""
    n_comments = 0
    with open(gff_file, "r") as gff_all:
        for line in gff_all:
            if not line.startswith("#"):
                break
            n_comments += 1
    return n_comments
|
|
39
|
|
40
|
|
def write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict,
               dom_dict, version_lines):
    '''
    Write domains statistics in beginning of filtered GFF.

    :param filt_dom_tmp: temp file object (only ``.name`` is used) holding the
        already-filtered GFF records to append after the statistics header
    :param FILT_DOM_GFF: path of the final filtered GFF file to create
    :param orig_class_dict: classification -> count before filtering
    :param filt_class_dict: classification -> count after filtering
    :param dom_dict: seq_id -> {domain type -> count} of kept domains
    :param version_lines: leading "#" header lines copied from the input GFF
    '''
    with open(FILT_DOM_GFF, "w") as filt_gff:
        for line in version_lines:
            filt_gff.write(line)
        filt_gff.write("##CLASSIFICATION\tORIGINAL_COUNTS\tFILTERED_COUNTS\n")
        if not orig_class_dict:
            filt_gff.write("##NO DOMAINS CLASSIFICATIONS\n")
        for classification in sorted(orig_class_dict):
            # Classifications removed entirely by filtering report 0;
            # dict.get avoids the duplicated write branches and does not
            # mutate a defaultdict by probing missing keys.
            filt_gff.write("##{}\t{}\t{}\n".format(
                classification, orig_class_dict[classification],
                filt_class_dict.get(classification, 0)))
        filt_gff.write("##-----------------------------------------------\n"
                       "##SEQ\tDOMAIN\tCOUNTS\n")
        if not dom_dict:
            filt_gff.write("##NO DOMAINS\n")
        for seq in sorted(dom_dict):
            for dom, count in sorted(dom_dict[seq].items()):
                filt_gff.write("##{}\t{}\t{}\n".format(seq, dom, count))
        filt_gff.write("##-----------------------------------------------\n")
        # Append the filtered GFF records accumulated in the temp file.
        with open(filt_dom_tmp.name, "r") as filt_tmp:
            for line in filt_tmp:
                filt_gff.write(line)
|
|
71
|
|
72
|
|
def get_file_start(gff_file):
    """Return (count, lines) for the leading comment ('#') lines of *gff_file*."""
    header_lines = []
    with open(gff_file, "r") as gff_all:
        for line in gff_all:
            if not line.startswith("#"):
                break
            header_lines.append(line)
    # The count always equals the number of collected header lines.
    return len(header_lines), header_lines
|
|
83
|
|
84
|
15
|
def parse_gff_line(line):
    '''Parse a single GFF3 record into a dictionary.

    The nine fixed GFF3 columns are keyed by name and the ninth column
    (attributes) is expanded into its own key=value dictionary.
    Note - all values are kept as strings, exactly as they appear in the
    line (including any trailing newline on the last attribute value).
    '''
    # The order of the first 9 columns is fixed by the GFF3 format.
    column_names = ['seqid', 'source', 'type', 'start', 'end',
                    'score', 'strand', 'phase', 'attributes']
    gff_line = dict(zip(column_names, line.split("\t")))
    # Expand "key1=val1;key2=val2;..." into a dict.
    gff_line['attributes'] = {
        key: value
        for key, value in (field.split("=")
                           for field in gff_line['attributes'].split(";"))
    }
    return gff_line
|
|
100
|
0
|
def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY,
                    TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM,
                    ELEMENT):
    ''' Filter gff output based on domain and quality of alignment.

    :param DOM_GFF: input (unfiltered) domains GFF produced by DANTE
    :param FILT_DOM_GFF: output path for the filtered GFF with statistics
    :param TH_IDENTITY: min alignment identity (Identity attribute)
    :param TH_SIMILARITY: min alignment similarity (Similarity attribute)
    :param TH_LENGTH: min relative alignment length (Relat_Length attribute)
    :param TH_INTERRUPT: max interruptions per 100 AA (Relat_Interruptions)
    :param TH_LEN_RATIO: max hit/DB length proportion (Hit_to_DB_Length)
    :param SELECTED_DOM: domain type to keep, "All" keeps every type
    :param ELEMENT: substring that must occur in the classification
    :return: (xminimals_all, xmaximals_all, domains_all, seq_ids_all) —
        per-sequence lists of start coords, end coords and domain types of
        the records that passed filtering.
    '''
    [count_comment, version_lines] = get_file_start(DOM_GFF)
    filt_dom_tmp = NamedTemporaryFile(delete=False)
    with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name,
                                             "w") as gff_filtered:
        # Skip the header comments already captured in version_lines.
        for _ in range(count_comment):
            next(gff_all)
        dom_dict = defaultdict(lambda: defaultdict(int))
        orig_class_dict = defaultdict(int)
        filt_class_dict = defaultdict(int)
        seq_ids_all = []
        xminimals = []
        xmaximals = []
        domains = []
        xminimals_all = []
        xmaximals_all = []
        domains_all = []
        start = True
        for line in gff_all:
            attributes = line.rstrip().split("\t")[-1]
            classification = attributes.split(";")[1].split("=")[1]
            orig_class_dict[classification] += 1
            ## ambiguous domains filtered out automatically
            if classification != configuration.AMBIGUOUS_TAG:
                gff_line = parse_gff_line(line)
                al_identity = float(gff_line['attributes']['Identity'])
                al_similarity = float(gff_line['attributes']['Similarity'])
                al_length = float(gff_line['attributes']['Relat_Length'])
                relat_interrupt = float(
                    gff_line['attributes']['Relat_Interruptions'])
                db_len_proportion = float(
                    gff_line['attributes']['Hit_to_DB_Length'])
                dom_type = gff_line['attributes']['Final_Classification']
                seq_id = gff_line['seqid']
                xminimal = int(gff_line['start'])
                xmaximal = int(gff_line['end'])
                c1 = al_identity >= TH_IDENTITY
                c2 = al_similarity >= TH_SIMILARITY
                if (c1 and c2 and al_length >= TH_LENGTH and
                        relat_interrupt <= TH_INTERRUPT and
                        db_len_proportion <= TH_LEN_RATIO and
                        (dom_type == SELECTED_DOM or SELECTED_DOM == "All") and
                        (ELEMENT in classification)):
                    # write(), not writelines(): the record is one string,
                    # not an iterable of lines.
                    gff_filtered.write(line)
                    filt_class_dict[classification] += 1
                    dom_dict[seq_id][dom_type] += 1
                    if start:
                        seq_ids_all.append(line.split("\t")[0])
                        start = False
                    if seq_id != seq_ids_all[-1]:
                        # New sequence encountered: flush the per-sequence
                        # accumulators and start fresh ones.
                        seq_ids_all.append(seq_id)
                        xminimals_all.append(xminimals)
                        xmaximals_all.append(xmaximals)
                        domains_all.append(domains)
                        xminimals = []
                        xmaximals = []
                        domains = []
                    xminimals.append(xminimal)
                    xmaximals.append(xmaximal)
                    domains.append(dom_type)
    # Outside the `with` block: the temp file is closed/flushed before
    # write_info re-reads it to assemble the final output.
    write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict,
               dom_dict, version_lines)
    os.unlink(filt_dom_tmp.name)
    # Flush the accumulators of the last (or only) sequence.
    xminimals_all.append(xminimals)
    xmaximals_all.append(xmaximals)
    domains_all.append(domains)
    return xminimals_all, xmaximals_all, domains_all, seq_ids_all
|
|
169
|
|
170
|
|
def get_domains_protseq(FILT_DOM_GFF, DOMAIN_PROT_SEQ):
    ''' Get the translated protein sequence of original DNA seq for all the filtered domains regions
    The translated sequences are taken from alignment reported by LASTAL (Query_Seq attribute in GFF)
    '''
    count_comment = check_file_start(FILT_DOM_GFF)
    with open(FILT_DOM_GFF, "r") as filt_gff:
        # Skip the statistics/comment header written by write_info.
        for _ in range(count_comment):
            next(filt_gff)
        with open(DOMAIN_PROT_SEQ, "w") as dom_prot_file:
            for line in filt_gff:
                columns = line.rstrip().split("\t")
                seq_id = columns[0]
                attr_fields = columns[8].split(";")
                # Positions look like "...:<start>-<end>[...]" — keep only
                # the "<start>-<end>" part for the FASTA header.
                positions = attr_fields[3].split("=")[1].split(":")[
                    -1].split("[")[0]
                dom = attr_fields[0].split("=")[1]
                dom_class = attr_fields[1].split("=")[1]
                prot_seq_align = attr_fields[6].split("=")[1]
                # Drop alignment characters: '/', '\' (frameshifts), '-' (gaps).
                prot_seq = prot_seq_align.translate({ord(i): None
                                                     for i in '/\\-'})
                header_prot_seq = ">{}:{} {} {}".format(seq_id, positions, dom,
                                                        dom_class)
                dom_prot_file.write("{}\n{}\n".format(
                    header_prot_seq, textwrap.fill(prot_seq,
                                                   configuration.FASTA_LINE)))
|
|
196
|
|
197
|
|
def main(args):
    # Entry point: unpack CLI options, resolve output paths, run the GFF
    # filtering and protein-sequence extraction, then report elapsed time.

    t = time.time()  # wall-clock start for the ELAPSED_TIME report below

    DOM_GFF = args.dom_gff
    DOMAIN_PROT_SEQ = args.domains_prot_seq
    TH_IDENTITY = args.th_identity
    TH_LENGTH = args.th_length
    TH_INTERRUPT = args.interruptions
    TH_SIMILARITY = args.th_similarity
    TH_LEN_RATIO = args.max_len_proportion
    FILT_DOM_GFF = args.domains_filtered
    SELECTED_DOM = args.selected_dom
    OUTPUT_DIR = args.output_dir
    ELEMENT = args.element_type

    # Fall back to the configured default output file names when the user
    # did not supply them on the command line.
    if DOMAIN_PROT_SEQ is None:
        DOMAIN_PROT_SEQ = configuration.DOM_PROT_SEQ
    if FILT_DOM_GFF is None:
        FILT_DOM_GFF = configuration.FILT_DOM_GFF

    if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # A relative filtered-GFF path is placed in OUTPUT_DIR, which itself
    # defaults to the directory of the input GFF; the protein FASTA follows
    # the same directory.
    if not os.path.isabs(FILT_DOM_GFF):
        if OUTPUT_DIR is None:
            OUTPUT_DIR = os.path.dirname(os.path.abspath(DOM_GFF))
        FILT_DOM_GFF = os.path.join(OUTPUT_DIR, os.path.basename(FILT_DOM_GFF))
        DOMAIN_PROT_SEQ = os.path.join(OUTPUT_DIR,
                                       os.path.basename(DOMAIN_PROT_SEQ))

    # Filter the domains GFF; the returned per-sequence coordinate/domain
    # lists are not used here but are part of filter_qual_dom's contract.
    [xminimals_all, xmaximals_all, domains_all, seq_ids_all] = filter_qual_dom(
        DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, TH_LENGTH,
        TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, ELEMENT)
    get_domains_protseq(FILT_DOM_GFF, DOMAIN_PROT_SEQ)

    print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
|
|
236
|
|
237
|
|
if __name__ == "__main__":
    # CLI wrapper: build the argument parser and hand the parsed options to
    # main().  argparse is imported lazily so importing this module as a
    # library has no CLI cost.
    import argparse
    from argparse import RawDescriptionHelpFormatter

    # Show argument defaults in --help while keeping the multi-line
    # description formatted verbatim.
    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    parser = argparse.ArgumentParser(
        description=
        '''The script performs DANTE's output filtering for quality and/or extracting specific type of protein domain or mobile elements of origin. For the filtered domains it reports their translated protein sequence of original DNA.
    WHEN NO PARAMETERS GIVEN, IT PERFORMS QUALITY FILTERING USING THE DEFAULT PARAMETRES (optimized for Viridiplantae species)

    INPUTS:
        - GFF3 file produced by protein_domains.py OR already filtered GFF3

    FILTERING OPTIONS:
        > QUALITY: - Min relative length of alignemnt to the protein domain from DB (without gaps)
            - Identity
            - Similarity (scoring matrix: BLOSUM82)
            - Interruption in the reading frame (frameshifts + stop codons) per every starting 100 AA
            - Max alignment proportion to the original length of database domain sequence
        > DOMAIN TYPE: choose from choices ('Name' attribute in GFF)
        Records for ambiguous domain type (e.g. INT/RH) are filtered out automatically

        > MOBILE ELEMENT TYPE:
        arbitrary substring of the element classification ('Final_Classification' attribute in GFF)

    OUTPUTS:
        - filtered GFF3 file
        - fasta file of translated protein sequences (from original DNA) for the aligned domains that match the filtering criteria

    DEPENDENCIES:
        - python 3.4 or higher
        > ProfRep modules:
            - configuration.py

    EXAMPLE OF USAGE:
        Getting quality filtered integrase(INT) domains of all gypsy transposable elements:
        ./domains_filtering.py -dom_gff PATH_TO_INPUT_GFF -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE --selected_dom INT --element_type Ty3/gypsy

    ''',
        epilog="""""",
        formatter_class=CustomFormatter)
    # The input GFF is the only mandatory argument.
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument("-dg",
                               "--dom_gff",
                               type=str,
                               required=True,
                               help="basic unfiltered gff file of all domains")
    parser.add_argument("-ouf",
                        "--domains_filtered",
                        type=str,
                        help="output filtered domains gff file")
    parser.add_argument("-dps",
                        "--domains_prot_seq",
                        type=str,
                        help="output file containg domains protein sequences")
    # The three quality thresholds below use a Range instance in `choices`
    # so argparse accepts any float within [0.0, 1.0].
    parser.add_argument("-thl",
                        "--th_length",
                        type=float,
                        choices=[Range(0.0, 1.0)],
                        default=0.8,
                        help="proportion of alignment length threshold")
    parser.add_argument("-thi",
                        "--th_identity",
                        type=float,
                        choices=[Range(0.0, 1.0)],
                        default=0.35,
                        help="proportion of alignment identity threshold")
    parser.add_argument("-ths",
                        "--th_similarity",
                        type=float,
                        choices=[Range(0.0, 1.0)],
                        default=0.45,
                        help="threshold for alignment proportional similarity")
    parser.add_argument(
        "-ir",
        "--interruptions",
        type=int,
        default=3,
        help=
        "interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
    parser.add_argument(
        "-mlen",
        "--max_len_proportion",
        type=float,
        default=1.2,
        help=
        "maximal proportion of alignment length to the original length of protein domain from database")
    parser.add_argument(
        "-sd",
        "--selected_dom",
        type=str,
        default="All",
        choices=[
            "All", "GAG", "INT", "PROT", "RH", "RT", "aRH", "CHDCR", "CHDII",
            "TPase", "YR", "HEL1", "HEL2", "ENDO"
        ],
        help="filter output domains based on the domain type")
    parser.add_argument(
        "-el",
        "--element_type",
        type=str,
        default="",
        help="filter output domains by typing substring from classification")
    parser.add_argument(
        "-dir",
        "--output_dir",
        type=str,
        default=None,
        help="specify if you want to change the output directory")
    args = parser.parse_args()
    main(args)
|