Mercurial > repos > dereeper > ragoo
comparison RaGOO/Assemblytics_between_alignments.pl @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:68a9ec9ce51e | 13:b9a3aeb162ab |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 # Authors: Maria Nattestad and Mike Schatz | |
4 # Email: mnattest@cshl.edu | |
5 | |
6 use strict; | |
7 my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); | |
8 my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only"); | |
9 my @output_file_choices = ("bed","bedpe"); | |
10 | |
11 my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe "; | |
12 | |
13 my $coordsfile = shift @ARGV or die $USAGE; | |
14 my $minimum_event_size = int(shift @ARGV); | |
15 my $maximum_event_size = int(shift @ARGV); | |
16 my $chromosome_filter = shift @ARGV or die $USAGE; | |
17 my $longrange_filter = shift @ARGV or die $USAGE; | |
18 my $output_file = shift @ARGV or die $USAGE; | |
19 | |
20 | |
21 | |
22 # How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions) | |
23 my $narrow_threshold = 50; | |
24 | |
25 | |
26 | |
27 # Number of basepairs of distance in either the reference or the query before we call an SV long-range | |
28 my $longrange = $maximum_event_size; | |
29 | |
30 | |
31 | |
32 # What is the longest two alignments can map apart in the query before we throw the variant between them away? | |
33 my $max_query_dist = 100000; | |
34 | |
35 | |
36 | |
37 my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices; | |
38 my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices; | |
39 my %output_file_choices_hash = map { $_, 1 } @output_file_choices; | |
40 | |
41 if ( $chromosome_filter_choices_hash{ $chromosome_filter } && $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash { $output_file }) { | |
42 # All is well with the world | |
43 } else { | |
44 die $USAGE; | |
45 } | |
46 | |
47 if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){ | |
48 die "Cannot output bed while allowing long-range variants\n$USAGE"; | |
49 } | |
50 | |
51 # open COORDS, "./bin/show-coords -rclHT $deltafile |" | |
52 # or die "Can't process $deltafile ($!)\n"; | |
53 | |
54 open COORDS, "$coordsfile" or die "Can't process $coordsfile ($!)\n"; | |
55 | |
56 ##open COORDS, "show-coords -rclHT $deltafile |" | |
57 ## or die "Can't process $deltafile ($!)\n"; | |
58 | |
59 | |
60 | |
61 ## Require the flanking alignments are at least this long to call an SV | |
62 ## Note there is no minimum length for fusions, this is determined by how | |
63 ## the delta file was filtered | |
64 | |
65 my $MIN_SV_ALIGN = 100; | |
66 | |
67 | |
68 #my $minimum_event_size = 50; | |
69 my $approximately_zero = $narrow_threshold; | |
70 | |
71 | |
72 my %alignments; | |
73 my $numalignments = 0; | |
74 | |
75 | |
76 while (<COORDS>) | |
77 { | |
78 chomp; | |
79 my @vals = split /\s+/, $_; | |
80 | |
81 my $rid = $vals[6]; | |
82 my $qid = $vals[7]; | |
83 | |
84 my $a; | |
85 $a->{"rstart"} = $vals[0]; | |
86 $a->{"rend"} = $vals[1]; | |
87 $a->{"qstart"} = $vals[2]; | |
88 $a->{"qend"} = $vals[3]; | |
89 $a->{"rlen"} = $vals[4]; | |
90 $a->{"qlen"} = $vals[5]; | |
91 $a->{"rid"} = $vals[6]; | |
92 $a->{"qid"} = $vals[7]; | |
93 $a->{"str"} = $_; | |
94 | |
95 $a->{"qidx"} = 0; | |
96 | |
97 $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 0 : 1; | |
98 | |
99 push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment | |
100 | |
101 $numalignments++; | |
102 } | |
103 | |
104 | |
105 | |
106 my $candidatefusions = 0; | |
107 my $candidatesvs = 0; | |
108 | |
109 my $sv_id_counter = 0; | |
110 | |
111 my %svstats; | |
112 | |
113 foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash | |
114 { | |
115 my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query | |
116 my $numref = scalar @refs; | |
117 | |
118 ## scan for fusions | |
119 # if ($numref > 1) # if query aligns to multiple chromosomes | |
120 # { | |
121 # my $allrefs = join " ", @refs; # join the names together for output | |
122 | |
123 # print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes | |
124 # $candidatefusions++; | |
125 | |
126 # my $rcnt = 0; | |
127 # foreach my $rid (@refs) | |
128 # { | |
129 # print "--\n" if ($rcnt > 0); | |
130 # $rcnt++; | |
131 | |
132 # foreach my $a (@{$alignments{$qid}->{$rid}}) | |
133 # { | |
134 # my $str = $a->{"str"}; | |
135 # print "$str\n"; | |
136 # } | |
137 # } | |
138 | |
139 # print "\n"; | |
140 # } | |
141 | |
142 | |
143 ## Resort the alignments by query sort position | |
144 my @qaligns; | |
145 foreach my $rid (@refs) | |
146 { | |
147 foreach my $a (@{$alignments{$qid}->{$rid}}) | |
148 { | |
149 push @qaligns, $a; | |
150 } | |
151 } | |
152 | |
153 ## Now record the index of the sorted query indices | |
154 @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns; | |
155 for (my $i=0; $i < scalar @qaligns; $i++) | |
156 { | |
157 $qaligns[$i]->{"qidx"} = $i; | |
158 } | |
159 | |
160 ## scan for SVs | |
161 my $numalign = scalar @qaligns; | |
162 | |
163 if ($numalign > 1) # if the query has more than 1 alignment | |
164 { | |
165 ## note skip first one | |
166 for (my $j = 1; $j < $numalign; $j++) | |
167 { | |
168 my $ai = $qaligns[$j-1]; | |
169 my $aj = $qaligns[$j]; | |
170 | |
171 my $istr = $ai->{"str"}; | |
172 my $jstr = $aj->{"str"}; | |
173 | |
174 # if ($ai->{"rid"} ne $aj->{"rid"}) | |
175 # { | |
176 # ## skip the fusions for now ############################################################################################# | |
177 # next; | |
178 # } | |
179 | |
180 my $rid = $ai->{"rid"}; | |
181 | |
182 if (($ai->{"rlen"} >= $MIN_SV_ALIGN) && | |
183 ($aj->{"rlen"} >= $MIN_SV_ALIGN)) | |
184 { | |
185 ## r alignments are always forward, q alignments may be flipped | |
186 | |
187 my $rpos; | |
188 my $qpos; | |
189 my $rdist = 0; | |
190 my $qdist = 0; | |
191 my $svtype = 0; | |
192 | |
193 my $chromi = $ai->{"rid"}; | |
194 my $chromj = $aj->{"rid"}; | |
195 my $posi; | |
196 my $posj; | |
197 my $strandi; | |
198 my $strandj; | |
199 | |
200 $sv_id_counter++; | |
201 | |
202 if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0)) | |
203 { | |
204 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 | |
205 ## qi: [1 - 1000] | j: [2000 - 3000] => 1000 | |
206 | |
207 $svtype = "FF"; | |
208 | |
209 $qdist = $aj->{"qstart"} - $ai->{"qend"}; | |
210 $rdist = $aj->{"rstart"} - $ai->{"rend"}; | |
211 | |
212 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } | |
213 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } | |
214 | |
215 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); } | |
216 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); } | |
217 | |
218 # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart) | |
219 # i + - j | |
220 # ------> --------> | |
221 $posi = $ai->{"rend"}; | |
222 $posj = $aj->{"rstart"}; | |
223 $strandi = "+"; | |
224 $strandj = "-"; | |
225 | |
226 } | |
227 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1)) | |
228 { | |
229 ## ri: [2000 - 3000] | j: [1 - 1000] => 1000 | |
230 ## qi: [1000 - 1] | j: [3000 - 2000] => 1000 | |
231 | |
232 $svtype = "RR"; | |
233 | |
234 $rdist = $ai->{"rstart"} - $aj->{"rend"}; | |
235 $qdist = $aj->{"qend"} - $ai->{"qstart"}; | |
236 | |
237 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); } | |
238 else { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); } | |
239 | |
240 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); } | |
241 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); } | |
242 | |
243 # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) | |
244 # j + - i | |
245 # <------- <-------- | |
246 $posi = $ai->{"rstart"}; # rstart means first reference coordinate, not with respect to the contig | |
247 $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig | |
248 $strandi = "-"; | |
249 $strandj = "+"; | |
250 } | |
251 elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1)) | |
252 { | |
253 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 | |
254 ## qi: [1 - 1000] | j: [3000 - 2000] => 1000 | |
255 | |
256 $svtype = "FR"; | |
257 | |
258 $qdist = $aj->{"qend"} - $ai->{"qend"}; | |
259 $rdist = $aj->{"rstart"} - $ai->{"rend"}; | |
260 | |
261 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } | |
262 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } | |
263 | |
264 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); } | |
265 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); } | |
266 | |
267 # When the alignments are forward-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) | |
268 # i + j + | |
269 # -------> <-------- | |
270 $posi = $ai->{"rend"}; | |
271 $posj = $aj->{"rend"}; | |
272 $strandi = "+"; | |
273 $strandj = "+"; | |
274 | |
275 } | |
276 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0)) | |
277 { | |
278 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 | |
279 ## qi: [1000 - 1] | j: [2000 - 3000] => 1000 | |
280 | |
281 $svtype = "RF"; | |
282 | |
283 $qdist = $ai->{"qend"} - $aj->{"qend"}; | |
284 $rdist = $aj->{"rstart"} - $ai->{"rend"}; | |
285 | |
286 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } | |
287 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } | |
288 | |
289 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); } | |
290 else { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); } | |
291 | |
292 # When the alignments are reverse-forward: | |
293 # - i - j | |
294 # <------- --------> | |
295 $posi = $ai->{"rstart"}; | |
296 $posj = $aj->{"rstart"}; | |
297 $strandi = "-"; | |
298 $strandj = "-"; | |
299 } | |
300 else | |
301 { | |
302 my $irc = $ai->{"qrc"}; | |
303 my $jrc = $aj->{"qrc"}; | |
304 | |
305 print "ERROR: Unknown SV: $irc $jrc\n"; | |
306 print "$istr\n"; | |
307 print "$jstr\n"; | |
308 die "ERROR: Unknown SV: $irc $jrc\n"; | |
309 } | |
310 | |
311 my $totaldist = $rdist + $qdist; | |
312 my $typeguess = ""; | |
313 | |
314 my $abs_event_size = abs($rdist-$qdist); | |
315 | |
316 if ($chromi ne $chromj) { # interchromosomal | |
317 $typeguess = "Interchromosomal"; | |
318 $rdist = 0; | |
319 } else { # same chromosome | |
320 if ($strandi eq $strandj) { | |
321 $typeguess = "Inversion"; | |
322 $abs_event_size = $rdist; | |
323 } | |
324 elsif ($qdist > $rdist) { | |
325 # both are significantly negative: (means the size of an overlapping region got larger, so tandem element expansion) | |
326 if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero && $qdist > -1*$approximately_zero) { | |
327 $typeguess = "Insertion"; | |
328 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: insertion_in_unmapped_region | |
329 } | |
330 else { | |
331 if ($rdist < 0 || $qdist < 0) { | |
332 $typeguess = "Tandem_expansion"; | |
333 } else { | |
334 $typeguess = "Repeat_expansion"; | |
335 } | |
336 } | |
337 } | |
338 elsif ($qdist < $rdist) { | |
339 # both are significantly negative: (means the size of an overlapping region got smaller, so tandem element contraction) | |
340 if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) { | |
341 $typeguess = "Deletion"; | |
342 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: deletion_in_unmapped_region | |
343 } | |
344 else { | |
345 if ($rdist < 0 || $qdist < 0) { | |
346 $typeguess = "Tandem_contraction"; | |
347 } else { | |
348 $typeguess = "Repeat_contraction"; | |
349 } | |
350 } | |
351 } | |
352 else { | |
353 $typeguess = "None"; | |
354 } | |
355 | |
356 if ($abs_event_size > $longrange) { # || abs($rdist) > $longrange || abs($qdist) > $longrange | |
357 $typeguess = "Longrange"; | |
358 if (abs($qdist) > $max_query_dist) { | |
359 $typeguess = "None"; | |
360 } | |
361 } | |
362 # my $ratio; | |
363 # if ($qdist != 0){ | |
364 # # $ratio = abs(($rdist/$qdist)-1); | |
365 # # if ($ratio < 0.1) { | |
366 # # $typeguess = "Equilibrium"; | |
367 # # } | |
368 # if ($rdist==$qdist || abs($qdist) > $longrange) { | |
369 # $typeguess = "None"; | |
370 # } | |
371 # } | |
372 } | |
373 | |
374 # my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); | |
375 # my @longrange_filter_choices = ("include-longrange","exclude-longrange"); | |
376 | |
377 my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly | |
378 my $chromj_length = length $chromj; | |
379 if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required | |
380 if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen | |
381 if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) { | |
382 if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) { | |
383 if ($output_file eq "bedpe") { | |
384 print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n"; | |
385 } | |
386 else { | |
387 use List::Util qw(min max); | |
388 my $ref_start = min(($posi, $posj)); | |
389 my $ref_stop = max(($posi, $posj)); | |
390 if ($ref_stop eq $ref_start) { | |
391 $ref_stop = $ref_start + 1; | |
392 } | |
393 # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name" | |
394 print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n"; | |
395 } | |
396 } | |
397 } | |
398 } | |
399 #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") { | |
400 # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) { | |
401 } | |
402 $candidatesvs++; | |
403 #push @{$svstats{$svtype}}, $totaldist; | |
404 } | |
405 } | |
406 } | |
407 } | |
408 | |
409 | |
410 | |
411 # print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; | |
412 # print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; | |
413 | |
414 # foreach my $svtype (keys %svstats) | |
415 # { | |
416 # my @events = @{$svstats{$svtype}}; | |
417 # my $cnt = scalar @events; | |
418 | |
419 # my $sum = 0.0; | |
420 # foreach my $e (@events) | |
421 # { | |
422 # $sum += $e; | |
423 # } | |
424 | |
425 # my $mean = sprintf ("%0.02f", $sum/$cnt); | |
426 | |
427 # print "svtype[$svtype]: $cnt $mean\n"; | |
428 # print STDERR "svtype[$svtype]: $cnt $mean\n"; | |
429 # } | |
430 | |
431 | |
432 | |
433 | |
434 | |
435 | |
436 | |
437 | |
438 |