Mercurial > repos > dereeper > ragoo
comparison RaGOO/ragoo_utilities/ReadCoverage.py @ 13:b9a3aeb162ab draft default tip
Uploaded
| author | dereeper |
|---|---|
| date | Mon, 26 Jul 2021 18:22:37 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 12:68a9ec9ce51e | 13:b9a3aeb162ab |
|---|---|
| 1 import bisect | |
| 2 | |
| 3 import numpy as np | |
| 4 | |
| 5 | |
| 6 class ReadCoverage: | |
| 7 | |
| 8 def __init__(self, in_paf): | |
| 9 self.paf = in_paf | |
| 10 self.glob_mean = None | |
| 11 self.glob_std = None | |
| 12 self.coverage_map = dict() | |
| 13 self.ctg_lens = dict() | |
| 14 self._make_coverage_map() | |
| 15 self._get_glob_mean() | |
| 16 | |
| 17 @staticmethod | |
| 18 def _tabulate_coverage(cov_list): | |
| 19 current_coverage = 0 | |
| 20 coverage_list = [] | |
| 21 seen = set() | |
| 22 | |
| 23 for header, pos in cov_list: | |
| 24 if header in seen: | |
| 25 current_coverage -= 1 | |
| 26 coverage_list.append((pos, current_coverage)) | |
| 27 seen.remove(header) | |
| 28 else: | |
| 29 current_coverage += 1 | |
| 30 coverage_list.append((pos, current_coverage)) | |
| 31 seen.add(header) | |
| 32 return coverage_list | |
| 33 | |
| 34 @staticmethod | |
| 35 def _smooth_supported_breaks(sup_breaks, break_types): | |
| 36 """ | |
| 37 If there are multiple low coverage breaks in close proximity, | |
| 38 merge it into one break at the lowest coverage point. | |
| 39 """ | |
| 40 i = 0 | |
| 41 j = 1 | |
| 42 while j < len(sup_breaks): | |
| 43 if break_types[i] == 'l' and break_types[j] == 'l': | |
| 44 if abs(sup_breaks[i][0] - sup_breaks[j][0]) < 100000: | |
| 45 # Merge these two break points | |
| 46 sup_breaks[i] = min([sup_breaks[i], sup_breaks[j]], key=lambda x: x[1]) | |
| 47 sup_breaks.pop(j) | |
| 48 else: | |
| 49 i += 1 | |
| 50 j += 1 | |
| 51 else: | |
| 52 i += 1 | |
| 53 j += 1 | |
| 54 | |
| 55 return [z[0] for z in sup_breaks] | |
| 56 | |
| 57 def _trim_ends(self, dist=25000): | |
| 58 """ Remove the ends of the contigs from the coverage map. """ | |
| 59 for i in self.coverage_map: | |
| 60 # Start with the beginning of the contig | |
| 61 start_idx = 0 | |
| 62 end_idx = 0 | |
| 63 for j in range(len(self.coverage_map[i])): | |
| 64 if self.coverage_map[i][j][0] < dist: | |
| 65 start_idx = j | |
| 66 | |
| 67 if self.coverage_map[i][j][0] > self.ctg_lens[i] - dist: | |
| 68 end_idx = j-1 | |
| 69 break | |
| 70 self.coverage_map[i] = self.coverage_map[i][start_idx:end_idx] | |
| 71 | |
| 72 # Remove contigs which don't have coverage info. | |
| 73 header_keys = list(self.coverage_map.keys()) | |
| 74 for i in header_keys: | |
| 75 if not self.coverage_map[i]: | |
| 76 self.coverage_map.pop(i) | |
| 77 | |
| 78 def _make_coverage_map(self): | |
| 79 """ | |
| 80 Populate self.coverage_map. This is a dictionary that associates each contig header with a list of alignment | |
| 81 positions and their coverage levels. | |
| 82 """ | |
| 83 # Associate with each contig header, a list of (query header, start), (query header, end) | |
| 84 alns_pos = dict() | |
| 85 with open(self.paf, 'r') as f: | |
| 86 for line in f: | |
| 87 L1 = line.rstrip().split('\t') | |
| 88 | |
| 89 # Only consider an alignment if at least 75% of the read aligned. | |
| 90 if abs((int(L1[3]) - int(L1[2])))/int(L1[1]) >= 0.75: | |
| 91 if L1[5] in alns_pos: | |
| 92 alns_pos[L1[5]].append((L1[0], int(L1[7]))) | |
| 93 alns_pos[L1[5]].append((L1[0], int(L1[8]))) | |
| 94 else: | |
| 95 alns_pos[L1[5]] = [(L1[0], int(L1[7])), (L1[0], int(L1[8]))] | |
| 96 self.ctg_lens[L1[5]] = int(L1[6]) | |
| 97 | |
| 98 # Sort these coverage positions and get the coverage map | |
| 99 for i in alns_pos: | |
| 100 self.coverage_map[i] = self._tabulate_coverage(sorted(alns_pos[i], key=lambda x: x[1])) | |
| 101 | |
| 102 self._trim_ends() | |
| 103 | |
| 104 def _get_glob_mean(self): | |
| 105 L1 = [] | |
| 106 for i in self.coverage_map: | |
| 107 | |
| 108 # In the case where we have multiple coverage values for one position, take the last one. | |
| 109 last_pos = 0 | |
| 110 curr_val = 0 | |
| 111 for j in self.coverage_map[i]: | |
| 112 if j[0] == last_pos: | |
| 113 curr_val = j[1] | |
| 114 else: | |
| 115 last_pos = j[0] | |
| 116 L1.append(curr_val) | |
| 117 cur_val = j[1] | |
| 118 | |
| 119 L1 = np.asarray(L1, dtype=np.int32) | |
| 120 self.glob_mean = np.median(L1) | |
| 121 self.glob_std = np.sqrt(self.glob_mean) | |
| 122 | |
| 123 def _get_index_range(self, header, start_ind, distance): | |
| 124 """ | |
| 125 Get the list of indices that are contained within a distance around the start index | |
| 126 """ | |
| 127 all_inds = [] | |
| 128 low_counter = 1 | |
| 129 | |
| 130 # Check if the start point is at the end of the contig | |
| 131 if start_ind == len(self.coverage_map[header]): | |
| 132 start_ind -= 1 | |
| 133 | |
| 134 start_pos = self.coverage_map[header][start_ind][0] | |
| 135 is_low = False | |
| 136 | |
| 137 # Get all coverage map indices representing regions 50kbp upstream of the start | |
| 138 while not is_low: | |
| 139 next_ind = start_ind-low_counter | |
| 140 if next_ind < 0: | |
| 141 is_low = True | |
| 142 else: | |
| 143 next_pos = self.coverage_map[header][next_ind][0] | |
| 144 if start_pos - next_pos > distance: | |
| 145 is_low = True | |
| 146 else: | |
| 147 all_inds.append(next_ind) | |
| 148 low_counter += 1 | |
| 149 | |
| 150 # Repeat for 50kbp downstream | |
| 151 high_counter = 1 | |
| 152 is_high = False | |
| 153 while not is_high: | |
| 154 next_ind = start_ind + high_counter | |
| 155 if next_ind >= len(self.coverage_map[header]): | |
| 156 is_high = True | |
| 157 else: | |
| 158 next_pos = self.coverage_map[header][next_ind][0] | |
| 159 if next_pos - start_pos > distance: | |
| 160 is_high = True | |
| 161 else: | |
| 162 all_inds.append(next_ind) | |
| 163 high_counter += 1 | |
| 164 | |
| 165 return sorted(all_inds) | |
| 166 | |
| 167 def check_break_cov(self, header, in_breaks, min_cov=None, max_cov=None): | |
| 168 """ | |
| 169 Given a list of potential break points, verify if those break points occur around low or high coverage | |
| 170 areas. If so, replace the candidate break point with the low/high coverage break point. | |
| 171 :param header: contig header for these breaks | |
| 172 :param in_breaks: list of candidate break points | |
| 173 :param min_cov: break at coverage levels below this value | |
| 174 :param max_cov: break at coverage levels above this value | |
| 175 :return: list of real break points, or empty list if not breaking is recommended | |
| 176 """ | |
| 177 # Check that we have coverage info for this contig. | |
| 178 if header not in self.coverage_map: | |
| 179 return [] | |
| 180 | |
| 181 if min_cov is None or max_cov is None: | |
| 182 # Automatically calculate min and max coverage | |
| 183 min_cov = max(self.glob_mean - (self.glob_std*3), 0) | |
| 184 max_cov = self.glob_mean + (self.glob_std*3) | |
| 185 | |
| 186 supported_breaks = [] | |
| 187 break_types = [] | |
| 188 | |
| 189 for i in in_breaks: | |
| 190 # Get the coverage for the position closest to this potential break point | |
| 191 ins_ind = bisect.bisect_left(self.coverage_map[header], (i, 0)) | |
| 192 # Get the coverage for positions within 50 kbp of the candidate coverage position | |
| 193 # Exclude positions near the ends of the sequence. | |
| 194 ind_range = self._get_index_range(header, ins_ind, 50000) | |
| 195 | |
| 196 if len(set(ind_range)) > 1: | |
| 197 # Check for low coverage | |
| 198 lowest_cov = min(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1]) | |
| 199 if lowest_cov[1] < min_cov: | |
| 200 supported_breaks.append(lowest_cov) | |
| 201 break_types.append('l') | |
| 202 continue | |
| 203 | |
| 204 # Check for high coverage | |
| 205 highest_cov = max(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1]) | |
| 206 if highest_cov[1] > max_cov: | |
| 207 supported_breaks.append(highest_cov) | |
| 208 break_types.append('h') | |
| 209 | |
| 210 return self._smooth_supported_breaks(supported_breaks, break_types) | |
| 211 | |
| 212 |
