13
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 # Authors: Maria Nattestad and Mike Schatz
|
|
4 # Email: mnattest@cshl.edu
|
|
5
|
|
6 use strict;
|
|
7 my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes");
|
|
8 my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only");
|
|
9 my @output_file_choices = ("bed","bedpe");
|
|
10
|
|
11 my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe ";
|
|
12
|
|
13 my $coordsfile = shift @ARGV or die $USAGE;
|
|
14 my $minimum_event_size = int(shift @ARGV);
|
|
15 my $maximum_event_size = int(shift @ARGV);
|
|
16 my $chromosome_filter = shift @ARGV or die $USAGE;
|
|
17 my $longrange_filter = shift @ARGV or die $USAGE;
|
|
18 my $output_file = shift @ARGV or die $USAGE;
|
|
19
|
|
20
|
|
21
|
|
22 # How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions)
|
|
23 my $narrow_threshold = 50;
|
|
24
|
|
25
|
|
26
|
|
27 # Number of basepairs of distance in either the reference or the query before we call an SV long-range
|
|
28 my $longrange = $maximum_event_size;
|
|
29
|
|
30
|
|
31
|
|
32 # What is the longest two alignments can map apart in the query before we throw the variant between them away?
|
|
33 my $max_query_dist = 100000;
|
|
34
|
|
35
|
|
36
|
|
37 my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices;
|
|
38 my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices;
|
|
39 my %output_file_choices_hash = map { $_, 1 } @output_file_choices;
|
|
40
|
|
41 if ( $chromosome_filter_choices_hash{ $chromosome_filter } && $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash { $output_file }) {
|
|
42 # All is well with the world
|
|
43 } else {
|
|
44 die $USAGE;
|
|
45 }
|
|
46
|
|
47 if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){
|
|
48 die "Cannot output bed while allowing long-range variants\n$USAGE";
|
|
49 }
|
|
50
|
|
51 # open COORDS, "./bin/show-coords -rclHT $deltafile |"
|
|
52 # or die "Can't process $deltafile ($!)\n";
|
|
53
|
|
54 open COORDS, "$coordsfile" or die "Can't process $coordsfile ($!)\n";
|
|
55
|
|
56 ##open COORDS, "show-coords -rclHT $deltafile |"
|
|
57 ## or die "Can't process $deltafile ($!)\n";
|
|
58
|
|
59
|
|
60
|
|
61 ## Require the flanking alignments are at least this long to call an SV
|
|
62 ## Note there is no minimum length for fusions, this is determined by how
|
|
63 ## the delta file was filtered
|
|
64
|
|
65 my $MIN_SV_ALIGN = 100;
|
|
66
|
|
67
|
|
68 #my $minimum_event_size = 50;
|
|
69 my $approximately_zero = $narrow_threshold;
|
|
70
|
|
71
|
|
72 my %alignments;
|
|
73 my $numalignments = 0;
|
|
74
|
|
75
|
|
76 while (<COORDS>)
|
|
77 {
|
|
78 chomp;
|
|
79 my @vals = split /\s+/, $_;
|
|
80
|
|
81 my $rid = $vals[6];
|
|
82 my $qid = $vals[7];
|
|
83
|
|
84 my $a;
|
|
85 $a->{"rstart"} = $vals[0];
|
|
86 $a->{"rend"} = $vals[1];
|
|
87 $a->{"qstart"} = $vals[2];
|
|
88 $a->{"qend"} = $vals[3];
|
|
89 $a->{"rlen"} = $vals[4];
|
|
90 $a->{"qlen"} = $vals[5];
|
|
91 $a->{"rid"} = $vals[6];
|
|
92 $a->{"qid"} = $vals[7];
|
|
93 $a->{"str"} = $_;
|
|
94
|
|
95 $a->{"qidx"} = 0;
|
|
96
|
|
97 $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 0 : 1;
|
|
98
|
|
99 push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment
|
|
100
|
|
101 $numalignments++;
|
|
102 }
|
|
103
|
|
104
|
|
105
|
|
106 my $candidatefusions = 0;
|
|
107 my $candidatesvs = 0;
|
|
108
|
|
109 my $sv_id_counter = 0;
|
|
110
|
|
111 my %svstats;
|
|
112
|
|
113 foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash
|
|
114 {
|
|
115 my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query
|
|
116 my $numref = scalar @refs;
|
|
117
|
|
118 ## scan for fusions
|
|
119 # if ($numref > 1) # if query aligns to multiple chromosomes
|
|
120 # {
|
|
121 # my $allrefs = join " ", @refs; # join the names together for output
|
|
122
|
|
123 # print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes
|
|
124 # $candidatefusions++;
|
|
125
|
|
126 # my $rcnt = 0;
|
|
127 # foreach my $rid (@refs)
|
|
128 # {
|
|
129 # print "--\n" if ($rcnt > 0);
|
|
130 # $rcnt++;
|
|
131
|
|
132 # foreach my $a (@{$alignments{$qid}->{$rid}})
|
|
133 # {
|
|
134 # my $str = $a->{"str"};
|
|
135 # print "$str\n";
|
|
136 # }
|
|
137 # }
|
|
138
|
|
139 # print "\n";
|
|
140 # }
|
|
141
|
|
142
|
|
143 ## Resort the alignments by query sort position
|
|
144 my @qaligns;
|
|
145 foreach my $rid (@refs)
|
|
146 {
|
|
147 foreach my $a (@{$alignments{$qid}->{$rid}})
|
|
148 {
|
|
149 push @qaligns, $a;
|
|
150 }
|
|
151 }
|
|
152
|
|
153 ## Now record the index of the sorted query indices
|
|
154 @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns;
|
|
155 for (my $i=0; $i < scalar @qaligns; $i++)
|
|
156 {
|
|
157 $qaligns[$i]->{"qidx"} = $i;
|
|
158 }
|
|
159
|
|
160 ## scan for SVs
|
|
161 my $numalign = scalar @qaligns;
|
|
162
|
|
163 if ($numalign > 1) # if the query has more than 1 alignment
|
|
164 {
|
|
165 ## note skip first one
|
|
166 for (my $j = 1; $j < $numalign; $j++)
|
|
167 {
|
|
168 my $ai = $qaligns[$j-1];
|
|
169 my $aj = $qaligns[$j];
|
|
170
|
|
171 my $istr = $ai->{"str"};
|
|
172 my $jstr = $aj->{"str"};
|
|
173
|
|
174 # if ($ai->{"rid"} ne $aj->{"rid"})
|
|
175 # {
|
|
176 # ## skip the fusions for now #############################################################################################
|
|
177 # next;
|
|
178 # }
|
|
179
|
|
180 my $rid = $ai->{"rid"};
|
|
181
|
|
182 if (($ai->{"rlen"} >= $MIN_SV_ALIGN) &&
|
|
183 ($aj->{"rlen"} >= $MIN_SV_ALIGN))
|
|
184 {
|
|
185 ## r alignments are always forward, q alignments may be flipped
|
|
186
|
|
187 my $rpos;
|
|
188 my $qpos;
|
|
189 my $rdist = 0;
|
|
190 my $qdist = 0;
|
|
191 my $svtype = 0;
|
|
192
|
|
193 my $chromi = $ai->{"rid"};
|
|
194 my $chromj = $aj->{"rid"};
|
|
195 my $posi;
|
|
196 my $posj;
|
|
197 my $strandi;
|
|
198 my $strandj;
|
|
199
|
|
200 $sv_id_counter++;
|
|
201
|
|
202 if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0))
|
|
203 {
|
|
204 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000
|
|
205 ## qi: [1 - 1000] | j: [2000 - 3000] => 1000
|
|
206
|
|
207 $svtype = "FF";
|
|
208
|
|
209 $qdist = $aj->{"qstart"} - $ai->{"qend"};
|
|
210 $rdist = $aj->{"rstart"} - $ai->{"rend"};
|
|
211
|
|
212 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
|
|
213 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
|
|
214
|
|
215 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); }
|
|
216 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); }
|
|
217
|
|
218 # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart)
|
|
219 # i + - j
|
|
220 # ------> -------->
|
|
221 $posi = $ai->{"rend"};
|
|
222 $posj = $aj->{"rstart"};
|
|
223 $strandi = "+";
|
|
224 $strandj = "-";
|
|
225
|
|
226 }
|
|
227 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1))
|
|
228 {
|
|
229 ## ri: [2000 - 3000] | j: [1 - 1000] => 1000
|
|
230 ## qi: [1000 - 1] | j: [3000 - 2000] => 1000
|
|
231
|
|
232 $svtype = "RR";
|
|
233
|
|
234 $rdist = $ai->{"rstart"} - $aj->{"rend"};
|
|
235 $qdist = $aj->{"qend"} - $ai->{"qstart"};
|
|
236
|
|
237 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); }
|
|
238 else { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); }
|
|
239
|
|
240 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); }
|
|
241 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); }
|
|
242
|
|
243 # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend)
|
|
244 # j + - i
|
|
245 # <------- <--------
|
|
246 $posi = $ai->{"rstart"}; # rstart means first reference coordinate, not with respect to the contig
|
|
247 $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig
|
|
248 $strandi = "-";
|
|
249 $strandj = "+";
|
|
250 }
|
|
251 elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1))
|
|
252 {
|
|
253 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000
|
|
254 ## qi: [1 - 1000] | j: [3000 - 2000] => 1000
|
|
255
|
|
256 $svtype = "FR";
|
|
257
|
|
258 $qdist = $aj->{"qend"} - $ai->{"qend"};
|
|
259 $rdist = $aj->{"rstart"} - $ai->{"rend"};
|
|
260
|
|
261 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
|
|
262 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
|
|
263
|
|
264 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); }
|
|
265 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); }
|
|
266
|
|
267 # When the alignments are forward-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend)
|
|
268 # i + j +
|
|
269 # -------> <--------
|
|
270 $posi = $ai->{"rend"};
|
|
271 $posj = $aj->{"rend"};
|
|
272 $strandi = "+";
|
|
273 $strandj = "+";
|
|
274
|
|
275 }
|
|
276 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0))
|
|
277 {
|
|
278 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000
|
|
279 ## qi: [1000 - 1] | j: [2000 - 3000] => 1000
|
|
280
|
|
281 $svtype = "RF";
|
|
282
|
|
283 $qdist = $ai->{"qend"} - $aj->{"qend"};
|
|
284 $rdist = $aj->{"rstart"} - $ai->{"rend"};
|
|
285
|
|
286 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
|
|
287 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
|
|
288
|
|
289 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); }
|
|
290 else { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); }
|
|
291
|
|
292 # When the alignments are reverse-forward:
|
|
293 # - i - j
|
|
294 # <------- -------->
|
|
295 $posi = $ai->{"rstart"};
|
|
296 $posj = $aj->{"rstart"};
|
|
297 $strandi = "-";
|
|
298 $strandj = "-";
|
|
299 }
|
|
300 else
|
|
301 {
|
|
302 my $irc = $ai->{"qrc"};
|
|
303 my $jrc = $aj->{"qrc"};
|
|
304
|
|
305 print "ERROR: Unknown SV: $irc $jrc\n";
|
|
306 print "$istr\n";
|
|
307 print "$jstr\n";
|
|
308 die "ERROR: Unknown SV: $irc $jrc\n";
|
|
309 }
|
|
310
|
|
311 my $totaldist = $rdist + $qdist;
|
|
312 my $typeguess = "";
|
|
313
|
|
314 my $abs_event_size = abs($rdist-$qdist);
|
|
315
|
|
316 if ($chromi ne $chromj) { # interchromosomal
|
|
317 $typeguess = "Interchromosomal";
|
|
318 $rdist = 0;
|
|
319 } else { # same chromosome
|
|
320 if ($strandi eq $strandj) {
|
|
321 $typeguess = "Inversion";
|
|
322 $abs_event_size = $rdist;
|
|
323 }
|
|
324 elsif ($qdist > $rdist) {
|
|
325 # both are significantly negative: (means the size of an overlapping region got larger, so tandem element expansion)
|
|
326 if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero && $qdist > -1*$approximately_zero) {
|
|
327 $typeguess = "Insertion";
|
|
328 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: insertion_in_unmapped_region
|
|
329 }
|
|
330 else {
|
|
331 if ($rdist < 0 || $qdist < 0) {
|
|
332 $typeguess = "Tandem_expansion";
|
|
333 } else {
|
|
334 $typeguess = "Repeat_expansion";
|
|
335 }
|
|
336 }
|
|
337 }
|
|
338 elsif ($qdist < $rdist) {
|
|
339 # both are significantly negative: (means the size of an overlapping region got smaller, so tandem element contraction)
|
|
340 if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) {
|
|
341 $typeguess = "Deletion";
|
|
342 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: deletion_in_unmapped_region
|
|
343 }
|
|
344 else {
|
|
345 if ($rdist < 0 || $qdist < 0) {
|
|
346 $typeguess = "Tandem_contraction";
|
|
347 } else {
|
|
348 $typeguess = "Repeat_contraction";
|
|
349 }
|
|
350 }
|
|
351 }
|
|
352 else {
|
|
353 $typeguess = "None";
|
|
354 }
|
|
355
|
|
356 if ($abs_event_size > $longrange) { # || abs($rdist) > $longrange || abs($qdist) > $longrange
|
|
357 $typeguess = "Longrange";
|
|
358 if (abs($qdist) > $max_query_dist) {
|
|
359 $typeguess = "None";
|
|
360 }
|
|
361 }
|
|
362 # my $ratio;
|
|
363 # if ($qdist != 0){
|
|
364 # # $ratio = abs(($rdist/$qdist)-1);
|
|
365 # # if ($ratio < 0.1) {
|
|
366 # # $typeguess = "Equilibrium";
|
|
367 # # }
|
|
368 # if ($rdist==$qdist || abs($qdist) > $longrange) {
|
|
369 # $typeguess = "None";
|
|
370 # }
|
|
371 # }
|
|
372 }
|
|
373
|
|
374 # my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes");
|
|
375 # my @longrange_filter_choices = ("include-longrange","exclude-longrange");
|
|
376
|
|
377 my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly
|
|
378 my $chromj_length = length $chromj;
|
|
379 if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required
|
|
380 if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen
|
|
381 if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) {
|
|
382 if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) {
|
|
383 if ($output_file eq "bedpe") {
|
|
384 print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n";
|
|
385 }
|
|
386 else {
|
|
387 use List::Util qw(min max);
|
|
388 my $ref_start = min(($posi, $posj));
|
|
389 my $ref_stop = max(($posi, $posj));
|
|
390 if ($ref_stop eq $ref_start) {
|
|
391 $ref_stop = $ref_start + 1;
|
|
392 }
|
|
393 # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name"
|
|
394 print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n";
|
|
395 }
|
|
396 }
|
|
397 }
|
|
398 }
|
|
399 #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") {
|
|
400 # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) {
|
|
401 }
|
|
402 $candidatesvs++;
|
|
403 #push @{$svstats{$svtype}}, $totaldist;
|
|
404 }
|
|
405 }
|
|
406 }
|
|
407 }
|
|
408
|
|
409
|
|
410
|
|
411 # print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";
|
|
412 # print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";
|
|
413
|
|
414 # foreach my $svtype (keys %svstats)
|
|
415 # {
|
|
416 # my @events = @{$svstats{$svtype}};
|
|
417 # my $cnt = scalar @events;
|
|
418
|
|
419 # my $sum = 0.0;
|
|
420 # foreach my $e (@events)
|
|
421 # {
|
|
422 # $sum += $e;
|
|
423 # }
|
|
424
|
|
425 # my $mean = sprintf ("%0.02f", $sum/$cnt);
|
|
426
|
|
427 # print "svtype[$svtype]: $cnt $mean\n";
|
|
428 # print STDERR "svtype[$svtype]: $cnt $mean\n";
|
|
429 # }
|
|
430
|
|
431
|
|
432
|
|
433
|
|
434
|
|
435
|
|
436
|
|
437
|
|
438
|