Mercurial > repos > dereeper > ragoo
comparison RaGOO/ragoo_utilities/break_chimera.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:68a9ec9ce51e | 13:b9a3aeb162ab |
---|---|
1 from collections import defaultdict | |
2 import copy | |
3 | |
4 from intervaltree import IntervalTree | |
5 | |
6 from ragoo_utilities.ContigAlignment import UniqueContigAlignment | |
7 | |
8 | |
9 def get_ref_parts(alns, l, p, r): | |
10 """ | |
11 | |
12 :param alns: A ContigAlignment object for the contig in question | |
13 :param l: Minimum alignment length to consider | |
14 :param p: Minimum percentage of a reference chromosome that must be covered to be considered significant | |
15 :param r: Minimum number of reference chromosome nucleotides needed to be covered to be considered significant | |
16 :return: A list of reference chromosomes to which the input contig significantly aligns to | |
17 """ | |
18 all_intervals = defaultdict(list) | |
19 | |
20 # Iterate over all alignments | |
21 for i in range(len(alns.ref_headers)): | |
22 if alns.aln_lens[i] < l: | |
23 continue | |
24 | |
25 all_intervals[alns.ref_headers[i]].append((alns.ref_starts[i], alns.ref_ends[i])) | |
26 | |
27 ranges = dict() | |
28 for i in all_intervals.keys(): | |
29 ranges[i] = 0 | |
30 | |
31 # For each chromosome, sort the range and get the union interval length. | |
32 # Algorithm and pseudocode given by Michael Kirsche | |
33 for i in all_intervals.keys(): | |
34 sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0]) | |
35 max_end = -1 | |
36 for j in sorted_intervals: | |
37 start_new_terr = max(j[0], max_end) | |
38 ranges[i] += max(0, j[1] - start_new_terr) | |
39 max_end = max(max_end, j[1]) | |
40 | |
41 total_sum = sum(ranges.values()) | |
42 chimeric_refs = [] | |
43 for i in ranges.keys(): | |
44 if (ranges[i]/total_sum)*100 > float(p) and ranges[i] > r: | |
45 chimeric_refs.append(i) | |
46 | |
47 return chimeric_refs | |
48 | |
49 | |
50 def cluster_contig_alns(contig, alns, chroms, l): | |
51 ctg_alns = alns[contig] | |
52 | |
53 # Filter contig alignments so as to only include alignments to desired chromosomes | |
54 ctg_alns.filter_ref_chroms(chroms) | |
55 | |
56 # Intialize the list of start and end positions w.r.t the query | |
57 query_pos = [] | |
58 | |
59 for i in range(len(ctg_alns.ref_headers)): | |
60 query_pos.append((ctg_alns.query_starts[i], ctg_alns.query_ends[i], i)) | |
61 | |
62 final_order = [i[2] for i in sorted(query_pos)] | |
63 final_refs = [] | |
64 for i in final_order: | |
65 final_refs.append(ctg_alns.ref_headers[i]) | |
66 | |
67 return get_borders(final_refs, ctg_alns, final_order) | |
68 | |
69 | |
70 def get_borders(ordered_refs, alns, order): | |
71 borders = [0] | |
72 current_ref = ordered_refs[0] | |
73 for i in range(1, len(ordered_refs)-1): | |
74 if ordered_refs[i] != current_ref and ordered_refs[i+1] != current_ref: | |
75 current_ref = ordered_refs[i] | |
76 borders.append(alns.query_ends[order[i-1]]) | |
77 | |
78 borders.append(alns.query_lens[0]) | |
79 intervals = [(borders[i], borders[i+1]) for i in range(len(borders)-1)] | |
80 return intervals | |
81 | |
82 | |
83 def avoid_gff_intervals(borders, gff_ins): | |
84 """ | |
85 :param borders: | |
86 :param gff_ins: all features | |
87 :return: | |
88 """ | |
89 | |
90 # Make an interval tree from the intervals of features associated with this contig | |
91 t = IntervalTree() | |
92 for line in gff_ins: | |
93 # If the interval is one bp long, skip | |
94 if line.start == line.end: | |
95 continue | |
96 t[line.start:line.end] = (line.start, line.end) | |
97 | |
98 new_borders = [] | |
99 for i in borders: | |
100 # Returns an empty set if a border does not fall in any interval. | |
101 # Otherwise, returns a list of Interval objects | |
102 interval_query = t[i[0]] | |
103 if interval_query: | |
104 while interval_query: | |
105 # Make new border larger than the max end position of all intervals | |
106 i = (i[0] + 1, i[1]) | |
107 interval_query = t[i[0]] | |
108 | |
109 interval_query = t[i[1]] | |
110 if interval_query: | |
111 while interval_query: | |
112 # Make new border larger than the max end position of all intervals | |
113 i = (i[0], i[1] + 1) | |
114 interval_query = t[i[1]] | |
115 | |
116 new_borders.append(i) | |
117 | |
118 new_borders = new_borders[:-1] + [(new_borders[-1][0], borders[-1][1])] | |
119 return new_borders | |
120 | |
121 | |
122 def update_gff(features, borders, contig): | |
123 # Pop the features to be updated | |
124 contig_feats = features.pop(contig) | |
125 | |
126 # Initialize lists for new broken contig headers | |
127 for i in range(len(borders)): | |
128 features[contig + '_chimera_broken:' + str(borders[i][0]) + '-' + str(borders[i][1])] = [] | |
129 | |
130 # Could just use binary search instead of this tree since intervals are non-overlapping. | |
131 # but input so small both are trivial | |
132 t = IntervalTree() | |
133 for i in borders: | |
134 t[i[0]:i[1]] = i | |
135 | |
136 for i in contig_feats: | |
137 query = t[i.start] | |
138 assert len(query) == 1 | |
139 break_start = list(query)[0].begin | |
140 break_end = list(query)[0].end | |
141 query_border = (break_start, break_end) | |
142 break_number = borders.index(query_border) | |
143 i.seqname = contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1]) | |
144 i.start = i.start - break_start | |
145 i.end = i.end - break_start | |
146 features[contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i) | |
147 | |
148 return features | |
149 | |
150 | |
151 def break_contig(contigs_dict, header, break_points): | |
152 seq = contigs_dict.pop(header) | |
153 test_seq = '' | |
154 for i in range(len(break_points)): | |
155 contigs_dict[header + '_chimera_broken:' + str(break_points[i][0]) + '-' + str(break_points[i][1])] = seq[break_points[i][0]: break_points[i][1]] | |
156 test_seq += seq[break_points[i][0]: break_points[i][1]] | |
157 | |
158 assert test_seq == seq | |
159 return contigs_dict | |
160 | |
161 | |
162 def get_intra_contigs(alns, l, d, c): | |
163 """ | |
164 Flag contigs as being intrachromosomal chimeras | |
165 :param alns: | |
166 :param l: Minimum alignment length to consider | |
167 :param d: Distance between consecutive adjacent alignments with respect to the reference. If larger than this, flag | |
168 :param c: Distance between consecutive adjacent alignments with respect to the query. If larger than this, flag | |
169 :return: dict of contigs and break points. | |
170 """ | |
171 | |
172 # Get only the header to which this contig mostly aligns to and filter out smaller alignments. | |
173 uniq_aln = UniqueContigAlignment(alns) | |
174 best_header = uniq_aln.ref_chrom | |
175 ctg_alns = copy.deepcopy(alns) | |
176 ctg_alns.filter_ref_chroms([best_header]) | |
177 ctg_alns.filter_lengths(l) | |
178 | |
179 # If there are no longer any alignments after length filtering, give up | |
180 if not len(ctg_alns.ref_headers): | |
181 return | |
182 | |
183 # Sort the alignments with respect to the reference start and end positions. | |
184 ctg_alns.sort_by_ref() | |
185 | |
186 # Make a list of distance between alignments | |
187 # first with respect to (wrt) the reference. | |
188 distances_wrt_ref = [] | |
189 for i in range(len(ctg_alns.ref_headers)-1): | |
190 distances_wrt_ref.append(ctg_alns.ref_starts[i+1] - ctg_alns.ref_starts[i]) | |
191 | |
192 # next, with respect to (wrt) the contig. | |
193 distances_wrt_ctg = [] | |
194 for i in range(len(ctg_alns.ref_headers) - 1): | |
195 distances_wrt_ctg.append(abs(ctg_alns.query_starts[i + 1] - ctg_alns.query_starts[i])) | |
196 | |
197 # Next, assign the following two identities. | |
198 # 1. When ordered by the reference, the alignments start at the beginning or the end of the query | |
199 # 2. For the alignment which will be broken on, is it on the forward or reverse strand. | |
200 | |
201 is_query_start = True | |
202 | |
203 if ctg_alns.query_starts[0] >= ctg_alns.query_lens[0]/5: | |
204 is_query_start = False | |
205 | |
206 # This conditional essentially checks if there are any break points for this contig. | |
207 # Returns None otherwise (no return statement) | |
208 if distances_wrt_ref: | |
209 if max(distances_wrt_ref) > d: | |
210 gap_index = distances_wrt_ref.index(max(distances_wrt_ref)) | |
211 a_alns_strands = ctg_alns.strands[:gap_index] | |
212 if is_query_start: | |
213 if a_alns_strands.count('-') > a_alns_strands.count('+'): | |
214 # The first subcontig is on the reverse strand | |
215 return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])]) | |
216 else: | |
217 # The first subcontig is on the forward strand. | |
218 return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])]) | |
219 else: | |
220 # The first subcontig starts at the end of the contig | |
221 if a_alns_strands.count('-') > a_alns_strands.count('+'): | |
222 # The first subcontig is on the reverse strand | |
223 return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index]), (ctg_alns.query_starts[gap_index], ctg_alns.query_lens[0])]) | |
224 else: | |
225 # The first subcontig is on the forward strand. | |
226 return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])]) | |
227 | |
228 if max(distances_wrt_ctg) > c: | |
229 gap_index = distances_wrt_ctg.index(max(distances_wrt_ctg)) + 1 | |
230 a_alns_strands = ctg_alns.strands[:gap_index] | |
231 if is_query_start: | |
232 if a_alns_strands.count('-') > a_alns_strands.count('+'): | |
233 # The first subcontig is on the reverse strand | |
234 return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])]) | |
235 else: | |
236 # The first subcontig is on the forward strand. | |
237 return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])]) | |
238 else: | |
239 # The first subcontig starts at the end of the contig | |
240 if a_alns_strands.count('-') > a_alns_strands.count('+'): | |
241 # The first subcontig is on the reverse strand | |
242 return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index-1]), (ctg_alns.query_starts[gap_index-1], ctg_alns.query_lens[0])]) | |
243 else: | |
244 # The first subcontig is on the forward strand. | |
245 return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])]) | |
246 |