comparison RaGOO/Assemblytics_between_alignments.pl @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 #!/usr/bin/perl -w
2
3 # Authors: Maria Nattestad and Mike Schatz
4 # Email: mnattest@cshl.edu
5
6 use strict;
7 my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes");
8 my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only");
9 my @output_file_choices = ("bed","bedpe");
10
11 my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe ";
12
13 my $coordsfile = shift @ARGV or die $USAGE;
14 my $minimum_event_size = int(shift @ARGV);
15 my $maximum_event_size = int(shift @ARGV);
16 my $chromosome_filter = shift @ARGV or die $USAGE;
17 my $longrange_filter = shift @ARGV or die $USAGE;
18 my $output_file = shift @ARGV or die $USAGE;
19
20
21
22 # How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions)
23 my $narrow_threshold = 50;
24
25
26
27 # Number of basepairs of distance in either the reference or the query before we call an SV long-range
28 my $longrange = $maximum_event_size;
29
30
31
32 # What is the longest two alignments can map apart in the query before we throw the variant between them away?
33 my $max_query_dist = 100000;
34
35
36
37 my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices;
38 my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices;
39 my %output_file_choices_hash = map { $_, 1 } @output_file_choices;
40
41 if ( $chromosome_filter_choices_hash{ $chromosome_filter } && $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash { $output_file }) {
42 # All is well with the world
43 } else {
44 die $USAGE;
45 }
46
47 if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){
48 die "Cannot output bed while allowing long-range variants\n$USAGE";
49 }
50
51 # open COORDS, "./bin/show-coords -rclHT $deltafile |"
52 # or die "Can't process $deltafile ($!)\n";
53
54 open COORDS, "$coordsfile" or die "Can't process $coordsfile ($!)\n";
55
56 ##open COORDS, "show-coords -rclHT $deltafile |"
57 ## or die "Can't process $deltafile ($!)\n";
58
59
60
61 ## Require the flanking alignments are at least this long to call an SV
62 ## Note there is no minimum length for fusions, this is determined by how
63 ## the delta file was filtered
64
65 my $MIN_SV_ALIGN = 100;
66
67
68 #my $minimum_event_size = 50;
69 my $approximately_zero = $narrow_threshold;
70
71
72 my %alignments;
73 my $numalignments = 0;
74
75
76 while (<COORDS>)
77 {
78 chomp;
79 my @vals = split /\s+/, $_;
80
81 my $rid = $vals[6];
82 my $qid = $vals[7];
83
84 my $a;
85 $a->{"rstart"} = $vals[0];
86 $a->{"rend"} = $vals[1];
87 $a->{"qstart"} = $vals[2];
88 $a->{"qend"} = $vals[3];
89 $a->{"rlen"} = $vals[4];
90 $a->{"qlen"} = $vals[5];
91 $a->{"rid"} = $vals[6];
92 $a->{"qid"} = $vals[7];
93 $a->{"str"} = $_;
94
95 $a->{"qidx"} = 0;
96
97 $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 0 : 1;
98
99 push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment
100
101 $numalignments++;
102 }
103
104
105
106 my $candidatefusions = 0;
107 my $candidatesvs = 0;
108
109 my $sv_id_counter = 0;
110
111 my %svstats;
112
113 foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash
114 {
115 my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query
116 my $numref = scalar @refs;
117
118 ## scan for fusions
119 # if ($numref > 1) # if query aligns to multiple chromosomes
120 # {
121 # my $allrefs = join " ", @refs; # join the names together for output
122
123 # print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes
124 # $candidatefusions++;
125
126 # my $rcnt = 0;
127 # foreach my $rid (@refs)
128 # {
129 # print "--\n" if ($rcnt > 0);
130 # $rcnt++;
131
132 # foreach my $a (@{$alignments{$qid}->{$rid}})
133 # {
134 # my $str = $a->{"str"};
135 # print "$str\n";
136 # }
137 # }
138
139 # print "\n";
140 # }
141
142
143 ## Resort the alignments by query sort position
144 my @qaligns;
145 foreach my $rid (@refs)
146 {
147 foreach my $a (@{$alignments{$qid}->{$rid}})
148 {
149 push @qaligns, $a;
150 }
151 }
152
153 ## Now record the index of the sorted query indices
154 @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns;
155 for (my $i=0; $i < scalar @qaligns; $i++)
156 {
157 $qaligns[$i]->{"qidx"} = $i;
158 }
159
160 ## scan for SVs
161 my $numalign = scalar @qaligns;
162
163 if ($numalign > 1) # if the query has more than 1 alignment
164 {
165 ## note skip first one
166 for (my $j = 1; $j < $numalign; $j++)
167 {
168 my $ai = $qaligns[$j-1];
169 my $aj = $qaligns[$j];
170
171 my $istr = $ai->{"str"};
172 my $jstr = $aj->{"str"};
173
174 # if ($ai->{"rid"} ne $aj->{"rid"})
175 # {
176 # ## skip the fusions for now #############################################################################################
177 # next;
178 # }
179
180 my $rid = $ai->{"rid"};
181
182 if (($ai->{"rlen"} >= $MIN_SV_ALIGN) &&
183 ($aj->{"rlen"} >= $MIN_SV_ALIGN))
184 {
185 ## r alignments are always forward, q alignments may be flipped
186
187 my $rpos;
188 my $qpos;
189 my $rdist = 0;
190 my $qdist = 0;
191 my $svtype = 0;
192
193 my $chromi = $ai->{"rid"};
194 my $chromj = $aj->{"rid"};
195 my $posi;
196 my $posj;
197 my $strandi;
198 my $strandj;
199
200 $sv_id_counter++;
201
202 if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0))
203 {
204 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000
205 ## qi: [1 - 1000] | j: [2000 - 3000] => 1000
206
207 $svtype = "FF";
208
209 $qdist = $aj->{"qstart"} - $ai->{"qend"};
210 $rdist = $aj->{"rstart"} - $ai->{"rend"};
211
212 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
213 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
214
215 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); }
216 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); }
217
218 # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart)
219 # i + - j
220 # ------> -------->
221 $posi = $ai->{"rend"};
222 $posj = $aj->{"rstart"};
223 $strandi = "+";
224 $strandj = "-";
225
226 }
227 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1))
228 {
229 ## ri: [2000 - 3000] | j: [1 - 1000] => 1000
230 ## qi: [1000 - 1] | j: [3000 - 2000] => 1000
231
232 $svtype = "RR";
233
234 $rdist = $ai->{"rstart"} - $aj->{"rend"};
235 $qdist = $aj->{"qend"} - $ai->{"qstart"};
236
237 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); }
238 else { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); }
239
240 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); }
241 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); }
242
243 # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend)
244 # j + - i
245 # <------- <--------
246 $posi = $ai->{"rstart"}; # rstart means first reference coordinate, not with respect to the contig
247 $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig
248 $strandi = "-";
249 $strandj = "+";
250 }
251 elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1))
252 {
253 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000
254 ## qi: [1 - 1000] | j: [3000 - 2000] => 1000
255
256 $svtype = "FR";
257
258 $qdist = $aj->{"qend"} - $ai->{"qend"};
259 $rdist = $aj->{"rstart"} - $ai->{"rend"};
260
261 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
262 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
263
264 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); }
265 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); }
266
267 # When the alignments are forward-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend)
268 # i + j +
269 # -------> <--------
270 $posi = $ai->{"rend"};
271 $posj = $aj->{"rend"};
272 $strandi = "+";
273 $strandj = "+";
274
275 }
276 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0))
277 {
278 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000
279 ## qi: [1000 - 1] | j: [2000 - 3000] => 1000
280
281 $svtype = "RF";
282
283 $qdist = $ai->{"qend"} - $aj->{"qend"};
284 $rdist = $aj->{"rstart"} - $ai->{"rend"};
285
286 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
287 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
288
289 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); }
290 else { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); }
291
292 # When the alignments are reverse-forward:
293 # - i - j
294 # <------- -------->
295 $posi = $ai->{"rstart"};
296 $posj = $aj->{"rstart"};
297 $strandi = "-";
298 $strandj = "-";
299 }
300 else
301 {
302 my $irc = $ai->{"qrc"};
303 my $jrc = $aj->{"qrc"};
304
305 print "ERROR: Unknown SV: $irc $jrc\n";
306 print "$istr\n";
307 print "$jstr\n";
308 die "ERROR: Unknown SV: $irc $jrc\n";
309 }
310
311 my $totaldist = $rdist + $qdist;
312 my $typeguess = "";
313
314 my $abs_event_size = abs($rdist-$qdist);
315
316 if ($chromi ne $chromj) { # interchromosomal
317 $typeguess = "Interchromosomal";
318 $rdist = 0;
319 } else { # same chromosome
320 if ($strandi eq $strandj) {
321 $typeguess = "Inversion";
322 $abs_event_size = $rdist;
323 }
324 elsif ($qdist > $rdist) {
325 # both are significantly negative: (means the size of an overlapping region got larger, so tandem element expansion)
326 if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero && $qdist > -1*$approximately_zero) {
327 $typeguess = "Insertion";
328 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: insertion_in_unmapped_region
329 }
330 else {
331 if ($rdist < 0 || $qdist < 0) {
332 $typeguess = "Tandem_expansion";
333 } else {
334 $typeguess = "Repeat_expansion";
335 }
336 }
337 }
338 elsif ($qdist < $rdist) {
339 # both are significantly negative: (means the size of an overlapping region got smaller, so tandem element contraction)
340 if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) {
341 $typeguess = "Deletion";
342 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: deletion_in_unmapped_region
343 }
344 else {
345 if ($rdist < 0 || $qdist < 0) {
346 $typeguess = "Tandem_contraction";
347 } else {
348 $typeguess = "Repeat_contraction";
349 }
350 }
351 }
352 else {
353 $typeguess = "None";
354 }
355
356 if ($abs_event_size > $longrange) { # || abs($rdist) > $longrange || abs($qdist) > $longrange
357 $typeguess = "Longrange";
358 if (abs($qdist) > $max_query_dist) {
359 $typeguess = "None";
360 }
361 }
362 # my $ratio;
363 # if ($qdist != 0){
364 # # $ratio = abs(($rdist/$qdist)-1);
365 # # if ($ratio < 0.1) {
366 # # $typeguess = "Equilibrium";
367 # # }
368 # if ($rdist==$qdist || abs($qdist) > $longrange) {
369 # $typeguess = "None";
370 # }
371 # }
372 }
373
374 # my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes");
375 # my @longrange_filter_choices = ("include-longrange","exclude-longrange");
376
377 my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly
378 my $chromj_length = length $chromj;
379 if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required
380 if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen
381 if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) {
382 if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) {
383 if ($output_file eq "bedpe") {
384 print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n";
385 }
386 else {
387 use List::Util qw(min max);
388 my $ref_start = min(($posi, $posj));
389 my $ref_stop = max(($posi, $posj));
390 if ($ref_stop eq $ref_start) {
391 $ref_stop = $ref_start + 1;
392 }
393 # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name"
394 print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n";
395 }
396 }
397 }
398 }
399 #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") {
400 # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) {
401 }
402 $candidatesvs++;
403 #push @{$svstats{$svtype}}, $totaldist;
404 }
405 }
406 }
407 }
408
409
410
411 # print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";
412 # print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";
413
414 # foreach my $svtype (keys %svstats)
415 # {
416 # my @events = @{$svstats{$svtype}};
417 # my $cnt = scalar @events;
418
419 # my $sum = 0.0;
420 # foreach my $e (@events)
421 # {
422 # $sum += $e;
423 # }
424
425 # my $mean = sprintf ("%0.02f", $sum/$cnt);
426
427 # print "svtype[$svtype]: $cnt $mean\n";
428 # print STDERR "svtype[$svtype]: $cnt $mean\n";
429 # }
430
431
432
433
434
435
436
437
438