comparison RaGOO/ragoo_utilities/ReadCoverage.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 import bisect
2
3 import numpy as np
4
5
6 class ReadCoverage:
7
8 def __init__(self, in_paf):
9 self.paf = in_paf
10 self.glob_mean = None
11 self.glob_std = None
12 self.coverage_map = dict()
13 self.ctg_lens = dict()
14 self._make_coverage_map()
15 self._get_glob_mean()
16
17 @staticmethod
18 def _tabulate_coverage(cov_list):
19 current_coverage = 0
20 coverage_list = []
21 seen = set()
22
23 for header, pos in cov_list:
24 if header in seen:
25 current_coverage -= 1
26 coverage_list.append((pos, current_coverage))
27 seen.remove(header)
28 else:
29 current_coverage += 1
30 coverage_list.append((pos, current_coverage))
31 seen.add(header)
32 return coverage_list
33
34 @staticmethod
35 def _smooth_supported_breaks(sup_breaks, break_types):
36 """
37 If there are multiple low coverage breaks in close proximity,
38 merge it into one break at the lowest coverage point.
39 """
40 i = 0
41 j = 1
42 while j < len(sup_breaks):
43 if break_types[i] == 'l' and break_types[j] == 'l':
44 if abs(sup_breaks[i][0] - sup_breaks[j][0]) < 100000:
45 # Merge these two break points
46 sup_breaks[i] = min([sup_breaks[i], sup_breaks[j]], key=lambda x: x[1])
47 sup_breaks.pop(j)
48 else:
49 i += 1
50 j += 1
51 else:
52 i += 1
53 j += 1
54
55 return [z[0] for z in sup_breaks]
56
57 def _trim_ends(self, dist=25000):
58 """ Remove the ends of the contigs from the coverage map. """
59 for i in self.coverage_map:
60 # Start with the beginning of the contig
61 start_idx = 0
62 end_idx = 0
63 for j in range(len(self.coverage_map[i])):
64 if self.coverage_map[i][j][0] < dist:
65 start_idx = j
66
67 if self.coverage_map[i][j][0] > self.ctg_lens[i] - dist:
68 end_idx = j-1
69 break
70 self.coverage_map[i] = self.coverage_map[i][start_idx:end_idx]
71
72 # Remove contigs which don't have coverage info.
73 header_keys = list(self.coverage_map.keys())
74 for i in header_keys:
75 if not self.coverage_map[i]:
76 self.coverage_map.pop(i)
77
78 def _make_coverage_map(self):
79 """
80 Populate self.coverage_map. This is a dictionary that associates each contig header with a list of alignment
81 positions and their coverage levels.
82 """
83 # Associate with each contig header, a list of (query header, start), (query header, end)
84 alns_pos = dict()
85 with open(self.paf, 'r') as f:
86 for line in f:
87 L1 = line.rstrip().split('\t')
88
89 # Only consider an alignment if at least 75% of the read aligned.
90 if abs((int(L1[3]) - int(L1[2])))/int(L1[1]) >= 0.75:
91 if L1[5] in alns_pos:
92 alns_pos[L1[5]].append((L1[0], int(L1[7])))
93 alns_pos[L1[5]].append((L1[0], int(L1[8])))
94 else:
95 alns_pos[L1[5]] = [(L1[0], int(L1[7])), (L1[0], int(L1[8]))]
96 self.ctg_lens[L1[5]] = int(L1[6])
97
98 # Sort these coverage positions and get the coverage map
99 for i in alns_pos:
100 self.coverage_map[i] = self._tabulate_coverage(sorted(alns_pos[i], key=lambda x: x[1]))
101
102 self._trim_ends()
103
104 def _get_glob_mean(self):
105 L1 = []
106 for i in self.coverage_map:
107
108 # In the case where we have multiple coverage values for one position, take the last one.
109 last_pos = 0
110 curr_val = 0
111 for j in self.coverage_map[i]:
112 if j[0] == last_pos:
113 curr_val = j[1]
114 else:
115 last_pos = j[0]
116 L1.append(curr_val)
117 cur_val = j[1]
118
119 L1 = np.asarray(L1, dtype=np.int32)
120 self.glob_mean = np.median(L1)
121 self.glob_std = np.sqrt(self.glob_mean)
122
123 def _get_index_range(self, header, start_ind, distance):
124 """
125 Get the list of indices that are contained within a distance around the start index
126 """
127 all_inds = []
128 low_counter = 1
129
130 # Check if the start point is at the end of the contig
131 if start_ind == len(self.coverage_map[header]):
132 start_ind -= 1
133
134 start_pos = self.coverage_map[header][start_ind][0]
135 is_low = False
136
137 # Get all coverage map indices representing regions 50kbp upstream of the start
138 while not is_low:
139 next_ind = start_ind-low_counter
140 if next_ind < 0:
141 is_low = True
142 else:
143 next_pos = self.coverage_map[header][next_ind][0]
144 if start_pos - next_pos > distance:
145 is_low = True
146 else:
147 all_inds.append(next_ind)
148 low_counter += 1
149
150 # Repeat for 50kbp downstream
151 high_counter = 1
152 is_high = False
153 while not is_high:
154 next_ind = start_ind + high_counter
155 if next_ind >= len(self.coverage_map[header]):
156 is_high = True
157 else:
158 next_pos = self.coverage_map[header][next_ind][0]
159 if next_pos - start_pos > distance:
160 is_high = True
161 else:
162 all_inds.append(next_ind)
163 high_counter += 1
164
165 return sorted(all_inds)
166
167 def check_break_cov(self, header, in_breaks, min_cov=None, max_cov=None):
168 """
169 Given a list of potential break points, verify if those break points occur around low or high coverage
170 areas. If so, replace the candidate break point with the low/high coverage break point.
171 :param header: contig header for these breaks
172 :param in_breaks: list of candidate break points
173 :param min_cov: break at coverage levels below this value
174 :param max_cov: break at coverage levels above this value
175 :return: list of real break points, or empty list if not breaking is recommended
176 """
177 # Check that we have coverage info for this contig.
178 if header not in self.coverage_map:
179 return []
180
181 if min_cov is None or max_cov is None:
182 # Automatically calculate min and max coverage
183 min_cov = max(self.glob_mean - (self.glob_std*3), 0)
184 max_cov = self.glob_mean + (self.glob_std*3)
185
186 supported_breaks = []
187 break_types = []
188
189 for i in in_breaks:
190 # Get the coverage for the position closest to this potential break point
191 ins_ind = bisect.bisect_left(self.coverage_map[header], (i, 0))
192 # Get the coverage for positions within 50 kbp of the candidate coverage position
193 # Exclude positions near the ends of the sequence.
194 ind_range = self._get_index_range(header, ins_ind, 50000)
195
196 if len(set(ind_range)) > 1:
197 # Check for low coverage
198 lowest_cov = min(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1])
199 if lowest_cov[1] < min_cov:
200 supported_breaks.append(lowest_cov)
201 break_types.append('l')
202 continue
203
204 # Check for high coverage
205 highest_cov = max(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1])
206 if highest_cov[1] > max_cov:
207 supported_breaks.append(highest_cov)
208 break_types.append('h')
209
210 return self._smooth_supported_breaks(supported_breaks, break_types)
211
212