annotate profrep_refining.py @ 4:e27e86406f56 draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 10:23:50 -0400
parents a5f1638b73be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3 import argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4 import numpy as np
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5 import os
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 from tempfile import NamedTemporaryFile
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 from collections import defaultdict
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8 import configuration
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9 import sys
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
10
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
11
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
12 def str2bool(v):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
13 if v.lower() in ('yes', 'true', 't', 'y', '1'):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
14 return True
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
15 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
16 return False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
17 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
18 raise argparse.ArgumentTypeError('Boolean value expected')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
19
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
20
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
21 def check_dom_gff(DOM_GFF):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
22 ''' Check if the GFF file of protein domains was given '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
23 with open(DOM_GFF) as domains_f:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
24 line = domains_f.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
25 while line.startswith("#"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
26 line = domains_f.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
27 if len(line.split("\t")) == 9 and "Final_Classification" in line:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
28 pass
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
29 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
30 raise IOError(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
31 "There was detected an input GFF file that does not contain domains. Please check it and choose the domains GFF file")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
32
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
33
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
34 def create_dom_dict(DOM_GFF):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
35 ''' Create hash table of protein domains for individual sequences '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
36 check_dom_gff(DOM_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
37 dict_domains = defaultdict(list)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
38 count_comment = check_file_start(DOM_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
39 with open(DOM_GFF, "r") as domains:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
40 for comment_idx in range(count_comment):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
41 next(domains)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
42 for line in domains:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
43 seq_id = line.split("\t")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
44 ann_dom_lineage = line.split("\t")[8].split(";")[1].split("=")[-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
45 start_dom = int(line.split("\t")[3])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
46 end_dom = int(line.split("\t")[4])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
47 strand_dom = line.split("\t")[6]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
48 dict_domains[seq_id].append((start_dom, end_dom, ann_dom_lineage,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
49 strand_dom))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
50 for seq_id in dict_domains.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
51 dict_domains[seq_id] = sorted(dict_domains[seq_id], key=lambda x: x[0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
52 return dict_domains
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
53
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
54
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
55 def check_file_start(gff_file):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
56 count_comment = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
57 with open(gff_file, "r") as gff_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
58 line = gff_all.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
59 while line.startswith("#"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
60 line = gff_all.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
61 count_comment += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
62 return count_comment
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
63
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
64
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
65 def interconnect_intervals(OUT_REFINED, gff_removed, GAP_TH, DOM_NUM,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
66 dict_domains, domains_classes):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
67 ''' Second step of refining - INTERVALS INTERCONNECTING:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
68 Gradually checking regions of GFF which are sorted by starting point.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
69 Adding regions to be interconnected if the gap between the new and previous one is lower than threshold and theirclassification is the same as the one set by the first region.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
70 If the conditions are not fullfilled the adding is stopped and this whole expanded region is further evaluated:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
71 1. if domains are not included in joining (A. choosed as parameter or B. based on the classification the element does not belong to mobile elements):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
72 -> the fragments are joined reported as one region in refined gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
73 2. if domains should be included:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
74 -> the fragments are joined if domains within the expanded region meets the criteria:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
75 1. they are at least certain number of domains
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
76 2. they have equal strand orientation
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
77 3. they are classified to the last classification level (checked from the class. table of the database) and this matches the classification of the expanded region
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
78 -> otherwise the region is refragmented to previous parts, which are reported as they were in the original repeats gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
79 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
80 with open(OUT_REFINED, "w") as joined_intervals:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
81 start_line = check_file_start(gff_removed)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
82 with open(gff_removed, "r") as repeats:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
83 for comment_idx in range(start_line):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
84 joined_intervals.write(repeats.readline())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
85 joined = False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
86 initial = repeats.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
87 ## if there are repeats in GFF, initialize
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
88 if initial is not "":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
89 seq_id_ini = initial.rstrip().split("\t")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
90 start_ini = int(initial.rstrip().split("\t")[3])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
91 end_ini = int(initial.rstrip().split("\t")[4])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
92 ann_ini = initial.rstrip().split("\t")[8].split(";")[0].split(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
93 "=")[-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
94 starts_part = [start_ini]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
95 ends_part = [end_ini]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
96 for line in repeats:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
97 seq_id = line.rstrip().split("\t")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
98 start = int(line.rstrip().split("\t")[3])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
99 end = int(line.rstrip().split("\t")[4])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
100 ann = line.rstrip().split("\t")[8].split(";")[0].split(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
101 "=")[-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
102 ## if new sequence is detected the joining process is reset
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
103 if seq_id != seq_id_ini:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
104 ## handle the last expanded region from the previous sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
105 if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
106 # check the domains
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
107 dict_domains = dom_refining(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
108 joined_intervals, dict_domains, seq_id_ini,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
109 start_ini, end_ini, ann_ini, starts_part,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
110 ends_part, DOM_NUM, domains_classes)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
111 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
112 # write the joined region if domains should not be considered
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
113 joined_intervals.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
114 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
115 seq_id_ini, configuration.SOURCE_PROFREP,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
116 configuration.REPEATS_FEATURE, start_ini,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
117 end_ini, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
118 configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
119 configuration.GFF_EMPTY, ann_ini))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
120 # initialize the search for expanded intervals
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
121 starts_part = [start]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
122 ends_part = [end]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
123 joined = False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
124 seq_id_ini = seq_id
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
125 start_ini = start
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
126 end_ini = end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
127 ann_ini = ann
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
128 ## prolonging the potential region to be expanded
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
129 elif start - end_ini <= GAP_TH and ann == ann_ini:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
130 starts_part.append(start)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
131 ends_part.append(end)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
132 end_ini = end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
133 joined = True
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
134 ## end of expanding the interval
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
135 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
136 if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
137 dict_domains = dom_refining(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
138 joined_intervals, dict_domains, seq_id,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
139 start_ini, end_ini, ann_ini, starts_part,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
140 ends_part, DOM_NUM, domains_classes)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
141 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
142 joined_intervals.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
143 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
144 seq_id, configuration.SOURCE_PROFREP,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
145 configuration.REPEATS_FEATURE, start_ini,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
146 end_ini, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
147 configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
148 configuration.GFF_EMPTY, ann_ini))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
149 starts_part = [start]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
150 ends_part = [end]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
151 # after handling the expanded interval, set the "joined" indicator to be false, so the next region expanding and potential joining can begin
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
152 joined = False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
153 start_ini = start
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
154 end_ini = end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
155 ann_ini = ann
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
156 ## handle the last potentially expanded region
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
157 if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
158 # check the domains
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
159 dict_domains = dom_refining(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
160 joined_intervals, dict_domains, seq_id, start_ini,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
161 end_ini, ann_ini, starts_part, ends_part, DOM_NUM,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
162 domains_classes)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
163 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
164 # write the joined region
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
165 joined_intervals.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
166 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
167 seq_id, configuration.SOURCE_PROFREP,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
168 configuration.REPEATS_FEATURE, start_ini, end_ini,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
169 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
170 configuration.GFF_EMPTY, ann_ini))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
171 del (starts_part)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
172 del (ends_part)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
173
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
174
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
175 def cluster_regions_for_quality_check(REPEATS_GFF, BORDERS):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
176 ''' First step of refining - REMOVING LOW CONFIDENCE REPEATS REGIONS
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
177 Create clusters of overlapping regions from REPEATS_GFF.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
178 These clusters subsequently serves for removing nested regions of different classification which have significantly lower quality comparing to the region they are nested in.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
179 Overhang tolerance of couple of bases on the both sides is tolerated when evaluting the nested regions.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
180 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
181 start_line = check_file_start(REPEATS_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
182 cluster = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
183 end_cluster = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
184 seq_id_ini = None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
185 gff_removed_parts = NamedTemporaryFile(delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
186 with open(REPEATS_GFF, "r") as repeats:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
187 for comment_idx in range(start_line):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
188 gff_removed_parts.write(repeats.readline().encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
189 for interval in repeats:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
190 seq_id = interval.split("\t")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
191 start = int(interval.split("\t")[3])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
192 end = int(interval.split("\t")[4])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
193 ann = interval.split("\t")[8].split(";")[0].split("=")[-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
194 pid = int(interval.rstrip().split("\t")[8].split(";")[1].split(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
195 "=")[-1])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
196 if seq_id != seq_id_ini:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
197 if len(cluster) > 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
198 remove_low_qual(cluster, BORDERS, gff_removed_parts)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
199 elif cluster:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
200 gff_removed_parts.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
201 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
202 cluster[0][0], configuration.SOURCE_PROFREP,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
203 configuration.REPEATS_FEATURE, cluster[0][
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
204 1], cluster[0][2], configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
205 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
206 cluster[0][3], cluster[0][4]).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
207 seq_id_ini = seq_id
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
208 cluster = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
209 ## expanding the cluster
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
210 if start <= end_cluster or end_cluster == 0:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
211 cluster.append([seq_id, start, end, ann, pid])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
212 if end > end_cluster:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
213 end_cluster = end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
214 ## if the next region is not overlapping, evaluate the regions in cluster or write down the single region
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
215 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
216 if len(cluster) > 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
217 remove_low_qual(cluster, BORDERS, gff_removed_parts)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
218 elif cluster:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
219 gff_removed_parts.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
220 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
221 cluster[0][0], configuration.SOURCE_PROFREP,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
222 configuration.REPEATS_FEATURE, cluster[0][
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
223 1], cluster[0][2], configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
224 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
225 cluster[0][3], cluster[0][4]).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
226 end_cluster = end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
227 cluster = [[seq_id, start, end, ann, pid]]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
228 if cluster:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
229 remove_low_qual(cluster, BORDERS, gff_removed_parts)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
230 gff_removed_parts.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
231 return gff_removed_parts.name
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
232
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
233
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
234 def remove_low_qual(cluster, BORDERS, gff_removed_parts):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
235 ''' Loop over regions in the cluster which are sorted based on descending quality (PID).
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
236 Regions nested in the currently checked region (with some exceeding borders tolerance) are tested for quality.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
237 If the quality difference between the currently highest quality and the nested region is more than a threshold nested one will be removed.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
238 Already checked region is removed from the list and procedure continues with the next best quality region in the cluster.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
239 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
240 score_sorted = sorted(cluster, key=lambda x: int(x[4]), reverse=True)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
241 for interval in score_sorted:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
242 start_toler = interval[1] - BORDERS
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
243 end_toler = interval[2] + BORDERS
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
244 score_best = interval[4]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
245 ## difference
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
246 for item in score_sorted[:]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
247 if item[1] >= start_toler and item[2] <= end_toler:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
248 if item[4] <= (1 - configuration.QUALITY_DIFF_TO_REMOVE
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
249 ) * score_best: ### 5% difference tolerance
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
250 score_sorted.remove(item)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
251 position_sorted = sorted(score_sorted, key=lambda x: int(x[1]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
252 for interval_refined in position_sorted:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
253 gff_removed_parts.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
254 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
255 interval_refined[0], configuration.SOURCE_PROFREP,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
256 configuration.REPEATS_FEATURE, interval_refined[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
257 1], interval_refined[2], configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
258 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
259 interval_refined[3], interval_refined[4]).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
260
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
261
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
262 def dom_refining(joined_intervals, dict_domains, seq_id, start_ini, end_ini,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
263 ann_ini, starts_part, ends_part, DOM_NUM, domains_classes):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
264 ''' Check the protein domains within the potential interval to be joined:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
265 1. appropriate domain has the same classification as interval of repeats to be joined and is classified to the last level
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
266 2. strands of appropriate domains are uniform
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
267 3. the count of such appropriate domains is more than a threshold
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
268 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
269 count_dom = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
270 index_dom = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
271 strands = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
272 indices_del = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
273 for dom_attributes in dict_domains[seq_id]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
274 ann_dom = dom_attributes[2]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
275 if dom_attributes[0] >= start_ini and dom_attributes[1] <= end_ini:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
276 repeat_class = "|".join(ann_ini.split("|")[2:])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
277 if ann_dom in domains_classes and ann_dom == repeat_class:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
278 strands.append(dom_attributes[3])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
279 count_dom += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
280 index_dom += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
281 ## if criteria for domains are met, write the expanded interval
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
282 if len(set(strands)) <= 1 and count_dom >= DOM_NUM:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
283 joined_intervals.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
284 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
285 seq_id, configuration.SOURCE_PROFREP, configuration.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
286 REPEATS_FEATURE, start_ini, end_ini, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
287 configuration.GFF_EMPTY, configuration.GFF_EMPTY, ann_ini))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
288 ## if criteria are not met, refragment the expanded interval on original regions
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
289 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
290 for part_start, part_end in zip(starts_part, ends_part):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
291 joined_intervals.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
292 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
293 seq_id, configuration.SOURCE_PROFREP,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
294 configuration.REPEATS_FEATURE, part_start, part_end,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
295 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
296 configuration.GFF_EMPTY, ann_ini))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
297 ## delete already checked domains from the dict
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
298 del (dict_domains[seq_id][0:index_dom])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
299 return dict_domains
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
300
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
301
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
302 def get_unique_classes(CLASS_TBL):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
303 ''' Get all the lineages of current domains database classification table to subsequently check the protein domains if they are classified up to the last level.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
304 Only these domains will be considered as valid for interval joining.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
305 If their classification is be finite (based on comparing to this list of unique classes) they will not be counted for minimum number of domains criterion within the segment to be joined
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
306 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
307 unique_classes = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
308 with open(CLASS_TBL, "r") as class_tbl:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
309 for line in class_tbl:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
310 line_class = "|".join(line.rstrip().split("\t")[1:])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
311 if line_class not in unique_classes:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
312 unique_classes.append(line_class)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
313 return unique_classes
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
314
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
315
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
316 def main(args):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
317 # Command line arguments
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
318 REPEATS_GFF = args.repeats_gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
319 DOM_GFF = args.domains_gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
320 GAP_TH = args.gap_threshold
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
321 DOM_NUM = args.dom_number
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
322 OUT_REFINED = args.out_refined
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
323 INCLUDE_DOM = args.include_dom
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
324 CLASS_TBL = args.class_tbl
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
325 BORDERS = args.borders
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
326
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
327 # first step of refining - removing low confidence repeats regions
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
328 gff_removed = cluster_regions_for_quality_check(REPEATS_GFF, BORDERS)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
329
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
330 # second step of refining - interconnecting repeats regions
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
331 if INCLUDE_DOM:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
332 unique_classes = get_unique_classes(CLASS_TBL)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
333 dict_domains = create_dom_dict(DOM_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
334 joined_intervals = interconnect_intervals(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
335 OUT_REFINED, gff_removed, GAP_TH, DOM_NUM, dict_domains,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
336 unique_classes)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
337 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
338 joined_intervals = interconnect_intervals(OUT_REFINED, gff_removed,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
339 GAP_TH, DOM_NUM, None, None)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
340 os.unlink(gff_removed)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
341
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
342
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
343 if __name__ == "__main__":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
344
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
345 # Command line arguments
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
346 parser = argparse.ArgumentParser()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
347 parser.add_argument('-rep_gff',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
348 '--repeats_gff',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
349 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
350 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
351 help='original repeats regions GFF from PROFREP')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
352 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
353 '-dom_gff',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
354 '--domains_gff',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
355 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
356 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
357 'protein domains GFF if you want to support repeats joining by domains information')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
358 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
359 '-gth',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
360 '--gap_threshold',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
361 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
362 default=250,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
363 help='gap tolerance between consecutive repeats to be interconnected')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
364 parser.add_argument('-our',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
365 '--out_refined',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
366 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
367 default="output_refined.gff",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
368 help='query sequence to be processed')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
369 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
370 '-dn',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
371 '--dom_number',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
372 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
373 default=2,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
374 help='min number of domains present to confirm the region joining')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
375 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
376 '-id',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
377 '--include_dom',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
378 type=str2bool,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
379 default=False,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
380 help='include domains information to refine the repeats regions')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
381 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
382 '-ct',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
383 '--class_tbl',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
384 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
385 'classification table of protein domain database to check the level of classification')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
386 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
387 '-br',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
388 '--borders',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
389 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
390 default=10,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
391 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
392 'number of bp tolerated from one or the other side of two overlaping regions when evaluating quality')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
393 args = parser.parse_args()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
394 main(args)