Mercurial > repos > dereeper > ragoo
comparison RaGOO/Assemblytics_between_alignments.pl @ 13:b9a3aeb162ab draft default tip
Uploaded
| author | dereeper |
|---|---|
| date | Mon, 26 Jul 2021 18:22:37 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 12:68a9ec9ce51e | 13:b9a3aeb162ab |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 # Authors: Maria Nattestad and Mike Schatz | |
| 4 # Email: mnattest@cshl.edu | |
| 5 | |
| 6 use strict; | |
| 7 my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); | |
| 8 my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only"); | |
| 9 my @output_file_choices = ("bed","bedpe"); | |
| 10 | |
| 11 my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe "; | |
| 12 | |
| 13 my $coordsfile = shift @ARGV or die $USAGE; | |
| 14 my $minimum_event_size = int(shift @ARGV); | |
| 15 my $maximum_event_size = int(shift @ARGV); | |
| 16 my $chromosome_filter = shift @ARGV or die $USAGE; | |
| 17 my $longrange_filter = shift @ARGV or die $USAGE; | |
| 18 my $output_file = shift @ARGV or die $USAGE; | |
| 19 | |
| 20 | |
| 21 | |
| 22 # How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions) | |
| 23 my $narrow_threshold = 50; | |
| 24 | |
| 25 | |
| 26 | |
| 27 # Number of basepairs of distance in either the reference or the query before we call an SV long-range | |
| 28 my $longrange = $maximum_event_size; | |
| 29 | |
| 30 | |
| 31 | |
| 32 # What is the longest two alignments can map apart in the query before we throw the variant between them away? | |
| 33 my $max_query_dist = 100000; | |
| 34 | |
| 35 | |
| 36 | |
| 37 my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices; | |
| 38 my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices; | |
| 39 my %output_file_choices_hash = map { $_, 1 } @output_file_choices; | |
| 40 | |
| 41 if ( $chromosome_filter_choices_hash{ $chromosome_filter } && $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash { $output_file }) { | |
| 42 # All is well with the world | |
| 43 } else { | |
| 44 die $USAGE; | |
| 45 } | |
| 46 | |
| 47 if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){ | |
| 48 die "Cannot output bed while allowing long-range variants\n$USAGE"; | |
| 49 } | |
| 50 | |
| 51 # open COORDS, "./bin/show-coords -rclHT $deltafile |" | |
| 52 # or die "Can't process $deltafile ($!)\n"; | |
| 53 | |
| 54 open COORDS, "$coordsfile" or die "Can't process $coordsfile ($!)\n"; | |
| 55 | |
| 56 ##open COORDS, "show-coords -rclHT $deltafile |" | |
| 57 ## or die "Can't process $deltafile ($!)\n"; | |
| 58 | |
| 59 | |
| 60 | |
| 61 ## Require the flanking alignments are at least this long to call an SV | |
| 62 ## Note there is no minimum length for fusions, this is determined by how | |
| 63 ## the delta file was filtered | |
| 64 | |
| 65 my $MIN_SV_ALIGN = 100; | |
| 66 | |
| 67 | |
| 68 #my $minimum_event_size = 50; | |
| 69 my $approximately_zero = $narrow_threshold; | |
| 70 | |
| 71 | |
| 72 my %alignments; | |
| 73 my $numalignments = 0; | |
| 74 | |
| 75 | |
| 76 while (<COORDS>) | |
| 77 { | |
| 78 chomp; | |
| 79 my @vals = split /\s+/, $_; | |
| 80 | |
| 81 my $rid = $vals[6]; | |
| 82 my $qid = $vals[7]; | |
| 83 | |
| 84 my $a; | |
| 85 $a->{"rstart"} = $vals[0]; | |
| 86 $a->{"rend"} = $vals[1]; | |
| 87 $a->{"qstart"} = $vals[2]; | |
| 88 $a->{"qend"} = $vals[3]; | |
| 89 $a->{"rlen"} = $vals[4]; | |
| 90 $a->{"qlen"} = $vals[5]; | |
| 91 $a->{"rid"} = $vals[6]; | |
| 92 $a->{"qid"} = $vals[7]; | |
| 93 $a->{"str"} = $_; | |
| 94 | |
| 95 $a->{"qidx"} = 0; | |
| 96 | |
| 97 $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 0 : 1; | |
| 98 | |
| 99 push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment | |
| 100 | |
| 101 $numalignments++; | |
| 102 } | |
| 103 | |
| 104 | |
| 105 | |
| 106 my $candidatefusions = 0; | |
| 107 my $candidatesvs = 0; | |
| 108 | |
| 109 my $sv_id_counter = 0; | |
| 110 | |
| 111 my %svstats; | |
| 112 | |
| 113 foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash | |
| 114 { | |
| 115 my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query | |
| 116 my $numref = scalar @refs; | |
| 117 | |
| 118 ## scan for fusions | |
| 119 # if ($numref > 1) # if query aligns to multiple chromosomes | |
| 120 # { | |
| 121 # my $allrefs = join " ", @refs; # join the names together for output | |
| 122 | |
| 123 # print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes | |
| 124 # $candidatefusions++; | |
| 125 | |
| 126 # my $rcnt = 0; | |
| 127 # foreach my $rid (@refs) | |
| 128 # { | |
| 129 # print "--\n" if ($rcnt > 0); | |
| 130 # $rcnt++; | |
| 131 | |
| 132 # foreach my $a (@{$alignments{$qid}->{$rid}}) | |
| 133 # { | |
| 134 # my $str = $a->{"str"}; | |
| 135 # print "$str\n"; | |
| 136 # } | |
| 137 # } | |
| 138 | |
| 139 # print "\n"; | |
| 140 # } | |
| 141 | |
| 142 | |
| 143 ## Resort the alignments by query sort position | |
| 144 my @qaligns; | |
| 145 foreach my $rid (@refs) | |
| 146 { | |
| 147 foreach my $a (@{$alignments{$qid}->{$rid}}) | |
| 148 { | |
| 149 push @qaligns, $a; | |
| 150 } | |
| 151 } | |
| 152 | |
| 153 ## Now record the index of the sorted query indices | |
| 154 @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns; | |
| 155 for (my $i=0; $i < scalar @qaligns; $i++) | |
| 156 { | |
| 157 $qaligns[$i]->{"qidx"} = $i; | |
| 158 } | |
| 159 | |
| 160 ## scan for SVs | |
| 161 my $numalign = scalar @qaligns; | |
| 162 | |
| 163 if ($numalign > 1) # if the query has more than 1 alignment | |
| 164 { | |
| 165 ## note skip first one | |
| 166 for (my $j = 1; $j < $numalign; $j++) | |
| 167 { | |
| 168 my $ai = $qaligns[$j-1]; | |
| 169 my $aj = $qaligns[$j]; | |
| 170 | |
| 171 my $istr = $ai->{"str"}; | |
| 172 my $jstr = $aj->{"str"}; | |
| 173 | |
| 174 # if ($ai->{"rid"} ne $aj->{"rid"}) | |
| 175 # { | |
| 176 # ## skip the fusions for now ############################################################################################# | |
| 177 # next; | |
| 178 # } | |
| 179 | |
| 180 my $rid = $ai->{"rid"}; | |
| 181 | |
| 182 if (($ai->{"rlen"} >= $MIN_SV_ALIGN) && | |
| 183 ($aj->{"rlen"} >= $MIN_SV_ALIGN)) | |
| 184 { | |
| 185 ## r alignments are always forward, q alignments may be flipped | |
| 186 | |
| 187 my $rpos; | |
| 188 my $qpos; | |
| 189 my $rdist = 0; | |
| 190 my $qdist = 0; | |
| 191 my $svtype = 0; | |
| 192 | |
| 193 my $chromi = $ai->{"rid"}; | |
| 194 my $chromj = $aj->{"rid"}; | |
| 195 my $posi; | |
| 196 my $posj; | |
| 197 my $strandi; | |
| 198 my $strandj; | |
| 199 | |
| 200 $sv_id_counter++; | |
| 201 | |
| 202 if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0)) | |
| 203 { | |
| 204 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 | |
| 205 ## qi: [1 - 1000] | j: [2000 - 3000] => 1000 | |
| 206 | |
| 207 $svtype = "FF"; | |
| 208 | |
| 209 $qdist = $aj->{"qstart"} - $ai->{"qend"}; | |
| 210 $rdist = $aj->{"rstart"} - $ai->{"rend"}; | |
| 211 | |
| 212 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } | |
| 213 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } | |
| 214 | |
| 215 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); } | |
| 216 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); } | |
| 217 | |
| 218 # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart) | |
| 219 # i + - j | |
| 220 # ------> --------> | |
| 221 $posi = $ai->{"rend"}; | |
| 222 $posj = $aj->{"rstart"}; | |
| 223 $strandi = "+"; | |
| 224 $strandj = "-"; | |
| 225 | |
| 226 } | |
| 227 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1)) | |
| 228 { | |
| 229 ## ri: [2000 - 3000] | j: [1 - 1000] => 1000 | |
| 230 ## qi: [1000 - 1] | j: [3000 - 2000] => 1000 | |
| 231 | |
| 232 $svtype = "RR"; | |
| 233 | |
| 234 $rdist = $ai->{"rstart"} - $aj->{"rend"}; | |
| 235 $qdist = $aj->{"qend"} - $ai->{"qstart"}; | |
| 236 | |
| 237 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); } | |
| 238 else { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); } | |
| 239 | |
| 240 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); } | |
| 241 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); } | |
| 242 | |
| 243 # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) | |
| 244 # j + - i | |
| 245 # <------- <-------- | |
| 246 $posi = $ai->{"rstart"}; # rstart means first reference coordinate, not with respect to the contig | |
| 247 $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig | |
| 248 $strandi = "-"; | |
| 249 $strandj = "+"; | |
| 250 } | |
| 251 elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1)) | |
| 252 { | |
| 253 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 | |
| 254 ## qi: [1 - 1000] | j: [3000 - 2000] => 1000 | |
| 255 | |
| 256 $svtype = "FR"; | |
| 257 | |
| 258 $qdist = $aj->{"qend"} - $ai->{"qend"}; | |
| 259 $rdist = $aj->{"rstart"} - $ai->{"rend"}; | |
| 260 | |
| 261 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } | |
| 262 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } | |
| 263 | |
| 264 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); } | |
| 265 else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); } | |
| 266 | |
| 267 # When the alignments are forward-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) | |
| 268 # i + j + | |
| 269 # -------> <-------- | |
| 270 $posi = $ai->{"rend"}; | |
| 271 $posj = $aj->{"rend"}; | |
| 272 $strandi = "+"; | |
| 273 $strandj = "+"; | |
| 274 | |
| 275 } | |
| 276 elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0)) | |
| 277 { | |
| 278 ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 | |
| 279 ## qi: [1000 - 1] | j: [2000 - 3000] => 1000 | |
| 280 | |
| 281 $svtype = "RF"; | |
| 282 | |
| 283 $qdist = $ai->{"qend"} - $aj->{"qend"}; | |
| 284 $rdist = $aj->{"rstart"} - $ai->{"rend"}; | |
| 285 | |
| 286 if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } | |
| 287 else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } | |
| 288 | |
| 289 if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); } | |
| 290 else { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); } | |
| 291 | |
| 292 # When the alignments are reverse-forward: | |
| 293 # - i - j | |
| 294 # <------- --------> | |
| 295 $posi = $ai->{"rstart"}; | |
| 296 $posj = $aj->{"rstart"}; | |
| 297 $strandi = "-"; | |
| 298 $strandj = "-"; | |
| 299 } | |
| 300 else | |
| 301 { | |
| 302 my $irc = $ai->{"qrc"}; | |
| 303 my $jrc = $aj->{"qrc"}; | |
| 304 | |
| 305 print "ERROR: Unknown SV: $irc $jrc\n"; | |
| 306 print "$istr\n"; | |
| 307 print "$jstr\n"; | |
| 308 die "ERROR: Unknown SV: $irc $jrc\n"; | |
| 309 } | |
| 310 | |
| 311 my $totaldist = $rdist + $qdist; | |
| 312 my $typeguess = ""; | |
| 313 | |
| 314 my $abs_event_size = abs($rdist-$qdist); | |
| 315 | |
| 316 if ($chromi ne $chromj) { # interchromosomal | |
| 317 $typeguess = "Interchromosomal"; | |
| 318 $rdist = 0; | |
| 319 } else { # same chromosome | |
| 320 if ($strandi eq $strandj) { | |
| 321 $typeguess = "Inversion"; | |
| 322 $abs_event_size = $rdist; | |
| 323 } | |
| 324 elsif ($qdist > $rdist) { | |
| 325 # both are significantly negative: (means the size of an overlapping region got larger, so tandem element expansion) | |
| 326 if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero && $qdist > -1*$approximately_zero) { | |
| 327 $typeguess = "Insertion"; | |
| 328 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: insertion_in_unmapped_region | |
| 329 } | |
| 330 else { | |
| 331 if ($rdist < 0 || $qdist < 0) { | |
| 332 $typeguess = "Tandem_expansion"; | |
| 333 } else { | |
| 334 $typeguess = "Repeat_expansion"; | |
| 335 } | |
| 336 } | |
| 337 } | |
| 338 elsif ($qdist < $rdist) { | |
| 339 # both are significantly negative: (means the size of an overlapping region got smaller, so tandem element contraction) | |
| 340 if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) { | |
| 341 $typeguess = "Deletion"; | |
| 342 # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: deletion_in_unmapped_region | |
| 343 } | |
| 344 else { | |
| 345 if ($rdist < 0 || $qdist < 0) { | |
| 346 $typeguess = "Tandem_contraction"; | |
| 347 } else { | |
| 348 $typeguess = "Repeat_contraction"; | |
| 349 } | |
| 350 } | |
| 351 } | |
| 352 else { | |
| 353 $typeguess = "None"; | |
| 354 } | |
| 355 | |
| 356 if ($abs_event_size > $longrange) { # || abs($rdist) > $longrange || abs($qdist) > $longrange | |
| 357 $typeguess = "Longrange"; | |
| 358 if (abs($qdist) > $max_query_dist) { | |
| 359 $typeguess = "None"; | |
| 360 } | |
| 361 } | |
| 362 # my $ratio; | |
| 363 # if ($qdist != 0){ | |
| 364 # # $ratio = abs(($rdist/$qdist)-1); | |
| 365 # # if ($ratio < 0.1) { | |
| 366 # # $typeguess = "Equilibrium"; | |
| 367 # # } | |
| 368 # if ($rdist==$qdist || abs($qdist) > $longrange) { | |
| 369 # $typeguess = "None"; | |
| 370 # } | |
| 371 # } | |
| 372 } | |
| 373 | |
| 374 # my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); | |
| 375 # my @longrange_filter_choices = ("include-longrange","exclude-longrange"); | |
| 376 | |
| 377 my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly | |
| 378 my $chromj_length = length $chromj; | |
| 379 if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required | |
| 380 if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen | |
| 381 if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) { | |
| 382 if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) { | |
| 383 if ($output_file eq "bedpe") { | |
| 384 print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n"; | |
| 385 } | |
| 386 else { | |
| 387 use List::Util qw(min max); | |
| 388 my $ref_start = min(($posi, $posj)); | |
| 389 my $ref_stop = max(($posi, $posj)); | |
| 390 if ($ref_stop eq $ref_start) { | |
| 391 $ref_stop = $ref_start + 1; | |
| 392 } | |
| 393 # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name" | |
| 394 print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n"; | |
| 395 } | |
| 396 } | |
| 397 } | |
| 398 } | |
| 399 #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") { | |
| 400 # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) { | |
| 401 } | |
| 402 $candidatesvs++; | |
| 403 #push @{$svstats{$svtype}}, $totaldist; | |
| 404 } | |
| 405 } | |
| 406 } | |
| 407 } | |
| 408 | |
| 409 | |
| 410 | |
| 411 # print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; | |
| 412 # print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; | |
| 413 | |
| 414 # foreach my $svtype (keys %svstats) | |
| 415 # { | |
| 416 # my @events = @{$svstats{$svtype}}; | |
| 417 # my $cnt = scalar @events; | |
| 418 | |
| 419 # my $sum = 0.0; | |
| 420 # foreach my $e (@events) | |
| 421 # { | |
| 422 # $sum += $e; | |
| 423 # } | |
| 424 | |
| 425 # my $mean = sprintf ("%0.02f", $sum/$cnt); | |
| 426 | |
| 427 # print "svtype[$svtype]: $cnt $mean\n"; | |
| 428 # print STDERR "svtype[$svtype]: $cnt $mean\n"; | |
| 429 # } | |
| 430 | |
| 431 | |
| 432 | |
| 433 | |
| 434 | |
| 435 | |
| 436 | |
| 437 | |
| 438 |
