Mercurial > repos > dereeper > ragoo
comparison RaGOO/ragoo_utilities/ReadCoverage.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:68a9ec9ce51e | 13:b9a3aeb162ab |
---|---|
1 import bisect | |
2 | |
3 import numpy as np | |
4 | |
5 | |
6 class ReadCoverage: | |
7 | |
8 def __init__(self, in_paf): | |
9 self.paf = in_paf | |
10 self.glob_mean = None | |
11 self.glob_std = None | |
12 self.coverage_map = dict() | |
13 self.ctg_lens = dict() | |
14 self._make_coverage_map() | |
15 self._get_glob_mean() | |
16 | |
17 @staticmethod | |
18 def _tabulate_coverage(cov_list): | |
19 current_coverage = 0 | |
20 coverage_list = [] | |
21 seen = set() | |
22 | |
23 for header, pos in cov_list: | |
24 if header in seen: | |
25 current_coverage -= 1 | |
26 coverage_list.append((pos, current_coverage)) | |
27 seen.remove(header) | |
28 else: | |
29 current_coverage += 1 | |
30 coverage_list.append((pos, current_coverage)) | |
31 seen.add(header) | |
32 return coverage_list | |
33 | |
34 @staticmethod | |
35 def _smooth_supported_breaks(sup_breaks, break_types): | |
36 """ | |
37 If there are multiple low coverage breaks in close proximity, | |
38 merge it into one break at the lowest coverage point. | |
39 """ | |
40 i = 0 | |
41 j = 1 | |
42 while j < len(sup_breaks): | |
43 if break_types[i] == 'l' and break_types[j] == 'l': | |
44 if abs(sup_breaks[i][0] - sup_breaks[j][0]) < 100000: | |
45 # Merge these two break points | |
46 sup_breaks[i] = min([sup_breaks[i], sup_breaks[j]], key=lambda x: x[1]) | |
47 sup_breaks.pop(j) | |
48 else: | |
49 i += 1 | |
50 j += 1 | |
51 else: | |
52 i += 1 | |
53 j += 1 | |
54 | |
55 return [z[0] for z in sup_breaks] | |
56 | |
57 def _trim_ends(self, dist=25000): | |
58 """ Remove the ends of the contigs from the coverage map. """ | |
59 for i in self.coverage_map: | |
60 # Start with the beginning of the contig | |
61 start_idx = 0 | |
62 end_idx = 0 | |
63 for j in range(len(self.coverage_map[i])): | |
64 if self.coverage_map[i][j][0] < dist: | |
65 start_idx = j | |
66 | |
67 if self.coverage_map[i][j][0] > self.ctg_lens[i] - dist: | |
68 end_idx = j-1 | |
69 break | |
70 self.coverage_map[i] = self.coverage_map[i][start_idx:end_idx] | |
71 | |
72 # Remove contigs which don't have coverage info. | |
73 header_keys = list(self.coverage_map.keys()) | |
74 for i in header_keys: | |
75 if not self.coverage_map[i]: | |
76 self.coverage_map.pop(i) | |
77 | |
78 def _make_coverage_map(self): | |
79 """ | |
80 Populate self.coverage_map. This is a dictionary that associates each contig header with a list of alignment | |
81 positions and their coverage levels. | |
82 """ | |
83 # Associate with each contig header, a list of (query header, start), (query header, end) | |
84 alns_pos = dict() | |
85 with open(self.paf, 'r') as f: | |
86 for line in f: | |
87 L1 = line.rstrip().split('\t') | |
88 | |
89 # Only consider an alignment if at least 75% of the read aligned. | |
90 if abs((int(L1[3]) - int(L1[2])))/int(L1[1]) >= 0.75: | |
91 if L1[5] in alns_pos: | |
92 alns_pos[L1[5]].append((L1[0], int(L1[7]))) | |
93 alns_pos[L1[5]].append((L1[0], int(L1[8]))) | |
94 else: | |
95 alns_pos[L1[5]] = [(L1[0], int(L1[7])), (L1[0], int(L1[8]))] | |
96 self.ctg_lens[L1[5]] = int(L1[6]) | |
97 | |
98 # Sort these coverage positions and get the coverage map | |
99 for i in alns_pos: | |
100 self.coverage_map[i] = self._tabulate_coverage(sorted(alns_pos[i], key=lambda x: x[1])) | |
101 | |
102 self._trim_ends() | |
103 | |
104 def _get_glob_mean(self): | |
105 L1 = [] | |
106 for i in self.coverage_map: | |
107 | |
108 # In the case where we have multiple coverage values for one position, take the last one. | |
109 last_pos = 0 | |
110 curr_val = 0 | |
111 for j in self.coverage_map[i]: | |
112 if j[0] == last_pos: | |
113 curr_val = j[1] | |
114 else: | |
115 last_pos = j[0] | |
116 L1.append(curr_val) | |
117 cur_val = j[1] | |
118 | |
119 L1 = np.asarray(L1, dtype=np.int32) | |
120 self.glob_mean = np.median(L1) | |
121 self.glob_std = np.sqrt(self.glob_mean) | |
122 | |
123 def _get_index_range(self, header, start_ind, distance): | |
124 """ | |
125 Get the list of indices that are contained within a distance around the start index | |
126 """ | |
127 all_inds = [] | |
128 low_counter = 1 | |
129 | |
130 # Check if the start point is at the end of the contig | |
131 if start_ind == len(self.coverage_map[header]): | |
132 start_ind -= 1 | |
133 | |
134 start_pos = self.coverage_map[header][start_ind][0] | |
135 is_low = False | |
136 | |
137 # Get all coverage map indices representing regions 50kbp upstream of the start | |
138 while not is_low: | |
139 next_ind = start_ind-low_counter | |
140 if next_ind < 0: | |
141 is_low = True | |
142 else: | |
143 next_pos = self.coverage_map[header][next_ind][0] | |
144 if start_pos - next_pos > distance: | |
145 is_low = True | |
146 else: | |
147 all_inds.append(next_ind) | |
148 low_counter += 1 | |
149 | |
150 # Repeat for 50kbp downstream | |
151 high_counter = 1 | |
152 is_high = False | |
153 while not is_high: | |
154 next_ind = start_ind + high_counter | |
155 if next_ind >= len(self.coverage_map[header]): | |
156 is_high = True | |
157 else: | |
158 next_pos = self.coverage_map[header][next_ind][0] | |
159 if next_pos - start_pos > distance: | |
160 is_high = True | |
161 else: | |
162 all_inds.append(next_ind) | |
163 high_counter += 1 | |
164 | |
165 return sorted(all_inds) | |
166 | |
167 def check_break_cov(self, header, in_breaks, min_cov=None, max_cov=None): | |
168 """ | |
169 Given a list of potential break points, verify if those break points occur around low or high coverage | |
170 areas. If so, replace the candidate break point with the low/high coverage break point. | |
171 :param header: contig header for these breaks | |
172 :param in_breaks: list of candidate break points | |
173 :param min_cov: break at coverage levels below this value | |
174 :param max_cov: break at coverage levels above this value | |
175 :return: list of real break points, or empty list if not breaking is recommended | |
176 """ | |
177 # Check that we have coverage info for this contig. | |
178 if header not in self.coverage_map: | |
179 return [] | |
180 | |
181 if min_cov is None or max_cov is None: | |
182 # Automatically calculate min and max coverage | |
183 min_cov = max(self.glob_mean - (self.glob_std*3), 0) | |
184 max_cov = self.glob_mean + (self.glob_std*3) | |
185 | |
186 supported_breaks = [] | |
187 break_types = [] | |
188 | |
189 for i in in_breaks: | |
190 # Get the coverage for the position closest to this potential break point | |
191 ins_ind = bisect.bisect_left(self.coverage_map[header], (i, 0)) | |
192 # Get the coverage for positions within 50 kbp of the candidate coverage position | |
193 # Exclude positions near the ends of the sequence. | |
194 ind_range = self._get_index_range(header, ins_ind, 50000) | |
195 | |
196 if len(set(ind_range)) > 1: | |
197 # Check for low coverage | |
198 lowest_cov = min(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1]) | |
199 if lowest_cov[1] < min_cov: | |
200 supported_breaks.append(lowest_cov) | |
201 break_types.append('l') | |
202 continue | |
203 | |
204 # Check for high coverage | |
205 highest_cov = max(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1]) | |
206 if highest_cov[1] > max_cov: | |
207 supported_breaks.append(highest_cov) | |
208 break_types.append('h') | |
209 | |
210 return self._smooth_supported_breaks(supported_breaks, break_types) | |
211 | |
212 |