Mercurial > repos > dereeper > ragoo
diff RaGOO/Assemblytics_between_alignments.pl @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/Assemblytics_between_alignments.pl Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,438 @@ +#!/usr/bin/perl -w + +# Authors: Maria Nattestad and Mike Schatz +# Email: mnattest@cshl.edu + +use strict; +my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); +my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only"); +my @output_file_choices = ("bed","bedpe"); + +my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe "; + +my $coordsfile = shift @ARGV or die $USAGE; +my $minimum_event_size = int(shift @ARGV); +my $maximum_event_size = int(shift @ARGV); +my $chromosome_filter = shift @ARGV or die $USAGE; +my $longrange_filter = shift @ARGV or die $USAGE; +my $output_file = shift @ARGV or die $USAGE; + + + +# How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions) +my $narrow_threshold = 50; + + + +# Number of basepairs of distance in either the reference or the query before we call an SV long-range +my $longrange = $maximum_event_size; + + + +# What is the longest two alignments can map apart in the query before we throw the variant between them away? +my $max_query_dist = 100000; + + + +my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices; +my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices; +my %output_file_choices_hash = map { $_, 1 } @output_file_choices; + +if ( $chromosome_filter_choices_hash{ $chromosome_filter } && $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash { $output_file }) { + # All is well with the world +} else { + die $USAGE; +} + +if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){ + die "Cannot output bed while allowing long-range variants\n$USAGE"; +} + +# open COORDS, "./bin/show-coords -rclHT $deltafile |" +# or die "Can't process $deltafile ($!)\n"; + +open COORDS, "$coordsfile" or die "Can't process $coordsfile ($!)\n"; + +##open COORDS, "show-coords -rclHT $deltafile |" +## or die "Can't process $deltafile ($!)\n"; + + + +## Require the flanking alignments are at least this long to call an SV +## Note there is no minimum length for fusions, this is determined by how +## the delta file was filtered + +my $MIN_SV_ALIGN = 100; + + +#my $minimum_event_size = 50; +my $approximately_zero = $narrow_threshold; + + +my %alignments; +my $numalignments = 0; + + +while (<COORDS>) +{ + chomp; + my @vals = split /\s+/, $_; + + my $rid = $vals[6]; + my $qid = $vals[7]; + + my $a; + $a->{"rstart"} = $vals[0]; + $a->{"rend"} = $vals[1]; + $a->{"qstart"} = $vals[2]; + $a->{"qend"} = $vals[3]; + $a->{"rlen"} = $vals[4]; + $a->{"qlen"} = $vals[5]; + $a->{"rid"} = $vals[6]; + $a->{"qid"} = $vals[7]; + $a->{"str"} = $_; + + $a->{"qidx"} = 0; + + $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 0 : 1; + + push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment + + $numalignments++; +} + + + +my $candidatefusions = 0; +my $candidatesvs = 0; + +my $sv_id_counter = 0; + +my %svstats; + +foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash +{ + my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query + my $numref = scalar @refs; + + ## scan for fusions + # if ($numref > 1) # if query aligns to multiple chromosomes + # { + # my $allrefs = join " ", @refs; # join the names together for output + + # print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes + # $candidatefusions++; + + # my $rcnt = 0; + # foreach my $rid (@refs) + # { + # print "--\n" if ($rcnt > 0); + # $rcnt++; + + # foreach my $a (@{$alignments{$qid}->{$rid}}) + # { + # my $str = $a->{"str"}; + # print "$str\n"; + # } + # } + + # print "\n"; + # } + + + ## Resort the alignments by query sort position + my @qaligns; + foreach my $rid (@refs) + { + foreach my $a (@{$alignments{$qid}->{$rid}}) + { + push @qaligns, $a; + } + } + + ## Now record the index of the sorted query indices + @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns; + for (my $i=0; $i < scalar @qaligns; $i++) + { + $qaligns[$i]->{"qidx"} = $i; + } + + ## scan for SVs + my $numalign = scalar @qaligns; + + if ($numalign > 1) # if the query has more than 1 alignment + { + ## note skip first one + for (my $j = 1; $j < $numalign; $j++) + { + my $ai = $qaligns[$j-1]; + my $aj = $qaligns[$j]; + + my $istr = $ai->{"str"}; + my $jstr = $aj->{"str"}; + + # if ($ai->{"rid"} ne $aj->{"rid"}) + # { + # ## skip the fusions for now ############################################################################################# + # next; + # } + + my $rid = $ai->{"rid"}; + + if (($ai->{"rlen"} >= $MIN_SV_ALIGN) && + ($aj->{"rlen"} >= $MIN_SV_ALIGN)) + { + ## r alignments are always forward, q alignments may be flipped + + my $rpos; + my $qpos; + my $rdist = 0; + my $qdist = 0; + my $svtype = 0; + + my $chromi = $ai->{"rid"}; + my $chromj = $aj->{"rid"}; + my $posi; + my $posj; + my $strandi; + my $strandj; + + $sv_id_counter++; + + if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0)) + { + ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 + ## qi: [1 - 1000] | j: [2000 - 3000] => 1000 + + $svtype = "FF"; + + $qdist = $aj->{"qstart"} - $ai->{"qend"}; + $rdist = $aj->{"rstart"} - $ai->{"rend"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } + + if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); } + + # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart) + # i + - j + # ------> --------> + $posi = $ai->{"rend"}; + $posj = $aj->{"rstart"}; + $strandi = "+"; + $strandj = "-"; + + } + elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1)) + { + ## ri: [2000 - 3000] | j: [1 - 1000] => 1000 + ## qi: [1000 - 1] | j: [3000 - 2000] => 1000 + + $svtype = "RR"; + + $rdist = $ai->{"rstart"} - $aj->{"rend"}; + $qdist = $aj->{"qend"} - $ai->{"qstart"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); } + + if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); } + + # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) + # j + - i + # <------- <-------- + $posi = $ai->{"rstart"}; # rstart means first reference coordinate, not with respect to the contig + $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig + $strandi = "-"; + $strandj = "+"; + } + elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1)) + { + ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 + ## qi: [1 - 1000] | j: [3000 - 2000] => 1000 + + $svtype = "FR"; + + $qdist = $aj->{"qend"} - $ai->{"qend"}; + $rdist = $aj->{"rstart"} - $ai->{"rend"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } + + if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); } + + # When the alignments are forward-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) + # i + j + + # -------> <-------- + $posi = $ai->{"rend"}; + $posj = $aj->{"rend"}; + $strandi = "+"; + $strandj = "+"; + + } + elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0)) + { + ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 + ## qi: [1000 - 1] | j: [2000 - 3000] => 1000 + + $svtype = "RF"; + + $qdist = $ai->{"qend"} - $aj->{"qend"}; + $rdist = $aj->{"rstart"} - $ai->{"rend"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } + + if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); } + + # When the alignments are reverse-forward: + # - i - j + # <------- --------> + $posi = $ai->{"rstart"}; + $posj = $aj->{"rstart"}; + $strandi = "-"; + $strandj = "-"; + } + else + { + my $irc = $ai->{"qrc"}; + my $jrc = $aj->{"qrc"}; + + print "ERROR: Unknown SV: $irc $jrc\n"; + print "$istr\n"; + print "$jstr\n"; + die "ERROR: Unknown SV: $irc $jrc\n"; + } + + my $totaldist = $rdist + $qdist; + my $typeguess = ""; + + my $abs_event_size = abs($rdist-$qdist); + + if ($chromi ne $chromj) { # interchromosomal + $typeguess = "Interchromosomal"; + $rdist = 0; + } else { # same chromosome + if ($strandi eq $strandj) { + $typeguess = "Inversion"; + $abs_event_size = $rdist; + } + elsif ($qdist > $rdist) { + # both are significantly negative: (means the size of an overlapping region got larger, so tandem element expansion) + if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero && $qdist > -1*$approximately_zero) { + $typeguess = "Insertion"; + # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: insertion_in_unmapped_region + } + else { + if ($rdist < 0 || $qdist < 0) { + $typeguess = "Tandem_expansion"; + } else { + $typeguess = "Repeat_expansion"; + } + } + } + elsif ($qdist < $rdist) { + # both are significantly negative: (means the size of an overlapping region got smaller, so tandem element contraction) + if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) { + $typeguess = "Deletion"; + # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: deletion_in_unmapped_region + } + else { + if ($rdist < 0 || $qdist < 0) { + $typeguess = "Tandem_contraction"; + } else { + $typeguess = "Repeat_contraction"; + } + } + } + else { + $typeguess = "None"; + } + + if ($abs_event_size > $longrange) { # || abs($rdist) > $longrange || abs($qdist) > $longrange + $typeguess = "Longrange"; + if (abs($qdist) > $max_query_dist) { + $typeguess = "None"; + } + } + # my $ratio; + # if ($qdist != 0){ + # # $ratio = abs(($rdist/$qdist)-1); + # # if ($ratio < 0.1) { + # # $typeguess = "Equilibrium"; + # # } + # if ($rdist==$qdist || abs($qdist) > $longrange) { + # $typeguess = "None"; + # } + # } + } + +# my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); +# my @longrange_filter_choices = ("include-longrange","exclude-longrange"); + + my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly + my $chromj_length = length $chromj; + if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required + if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen + if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) { + if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) { + if ($output_file eq "bedpe") { + print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n"; + } + else { + use List::Util qw(min max); + my $ref_start = min(($posi, $posj)); + my $ref_stop = max(($posi, $posj)); + if ($ref_stop eq $ref_start) { + $ref_stop = $ref_start + 1; + } + # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name" + print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n"; + } + } + } + } + #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") { + # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) { + } + $candidatesvs++; + #push @{$svstats{$svtype}}, $totaldist; + } + } + } +} + + + +# print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; +# print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; + +# foreach my $svtype (keys %svstats) +# { +# my @events = @{$svstats{$svtype}}; +# my $cnt = scalar @events; + +# my $sum = 0.0; +# foreach my $e (@events) +# { +# $sum += $e; +# } + +# my $mean = sprintf ("%0.02f", $sum/$cnt); + +# print "svtype[$svtype]: $cnt $mean\n"; +# print STDERR "svtype[$svtype]: $cnt $mean\n"; +# } + + + + + + + + +