view RaGOO/Assemblytics_between_alignments.pl @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
line wrap: on
line source

#!/usr/bin/perl -w

# Authors: Maria Nattestad and Mike Schatz
# Email: mnattest@cshl.edu

use strict;
my @chromosome_filter_choices =  ("all-chromosomes","primary-chromosomes");
my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only");
my @output_file_choices = ("bed","bedpe");

my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe ";

my $coordsfile = shift @ARGV or die $USAGE;
my $minimum_event_size = int(shift @ARGV);
my $maximum_event_size = int(shift @ARGV);
my $chromosome_filter = shift @ARGV or die $USAGE;
my $longrange_filter = shift @ARGV or die $USAGE;
my $output_file = shift @ARGV or die $USAGE;



# How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions)
my $narrow_threshold = 50;



# Number of basepairs of distance in either the reference or the query before we call an SV long-range
my $longrange = $maximum_event_size;



# What is the longest two alignments can map apart in the query before we throw the variant between them away?
my $max_query_dist = 100000;



my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices;
my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices;
my %output_file_choices_hash = map { $_, 1 } @output_file_choices;

if ( $chromosome_filter_choices_hash{ $chromosome_filter } &&  $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash { $output_file }) {
  # All is well with the world
} else {
  die $USAGE;
}

if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){
  die "Cannot output bed while allowing long-range variants\n$USAGE";
}

# open COORDS, "./bin/show-coords -rclHT $deltafile |"
#   or die "Can't process $deltafile ($!)\n";

open COORDS, "$coordsfile"  or die "Can't process $coordsfile ($!)\n";

##open COORDS, "show-coords -rclHT $deltafile |"
##  or die "Can't process $deltafile ($!)\n";



## Require the flanking alignments are at least this long to call an SV
## Note there is no minimum length for fusions, this is determined by how 
## the delta file was filtered

my $MIN_SV_ALIGN = 100;


#my $minimum_event_size = 50;
my $approximately_zero = $narrow_threshold;


my %alignments;
my $numalignments = 0;


while (<COORDS>)
{
  chomp;
  my @vals = split /\s+/, $_;

  my $rid = $vals[6];
  my $qid = $vals[7];

  my $a;
  $a->{"rstart"} = $vals[0];
  $a->{"rend"}   = $vals[1];
  $a->{"qstart"} = $vals[2];
  $a->{"qend"}   = $vals[3];
  $a->{"rlen"}   = $vals[4];
  $a->{"qlen"}   = $vals[5];
  $a->{"rid"}    = $vals[6];
  $a->{"qid"}    = $vals[7];
  $a->{"str"}    = $_;

  $a->{"qidx"}   = 0;

  $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 0 : 1;

  push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment

  $numalignments++;
}



my $candidatefusions = 0;
my $candidatesvs     = 0;

my $sv_id_counter = 0;

my %svstats;

foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash
{
  my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query
  my $numref = scalar @refs;

  ## scan for fusions
  # if ($numref > 1) # if query aligns to multiple chromosomes
  # {
  #   my $allrefs = join " ", @refs; # join the names together for output

  #   print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes 
  #   $candidatefusions++;

  #   my $rcnt = 0;
  #   foreach my $rid (@refs)
  #   {
  #     print "--\n" if ($rcnt > 0);
  #     $rcnt++;

  #     foreach my $a (@{$alignments{$qid}->{$rid}})
  #     {
  #       my $str = $a->{"str"};
  #       print "$str\n";
  #     }
  #   }

  #   print "\n";
  # }


  ## Resort the alignments by query sort position
  my @qaligns;
  foreach my $rid (@refs)
  {
    foreach my $a (@{$alignments{$qid}->{$rid}})
    {
      push @qaligns, $a;
    }
  }

  ## Now record the index of the sorted query indices
  @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns;
  for (my $i=0; $i < scalar @qaligns; $i++)
  {
    $qaligns[$i]->{"qidx"} = $i;
  }

  ## scan for SVs
  my $numalign = scalar @qaligns;

  if ($numalign > 1) # if the query has more than 1 alignment
  {
    ## note skip first one
    for (my $j = 1; $j < $numalign; $j++)
    {
      my $ai = $qaligns[$j-1];
      my $aj = $qaligns[$j];

      my $istr = $ai->{"str"};
      my $jstr = $aj->{"str"};

      # if ($ai->{"rid"} ne $aj->{"rid"})
      # {
      #   ## skip the fusions for now #############################################################################################
      #   next;
      # }

      my $rid = $ai->{"rid"};

      if (($ai->{"rlen"} >= $MIN_SV_ALIGN) && 
          ($aj->{"rlen"} >= $MIN_SV_ALIGN))
      {
        ## r alignments are always forward, q alignments may be flipped

        my $rpos;
        my $qpos;
        my $rdist = 0;
        my $qdist = 0;
        my $svtype = 0;

        my $chromi = $ai->{"rid"};
        my $chromj = $aj->{"rid"};
        my $posi;
        my $posj;
        my $strandi;
        my $strandj;

        $sv_id_counter++;

        if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0))
        {
          ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 
          ## qi: [1 - 1000] | j: [2000 - 3000] => 1000 

          $svtype = "FF";

          $qdist = $aj->{"qstart"} - $ai->{"qend"};
          $rdist = $aj->{"rstart"} - $ai->{"rend"};

          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }

          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); }
          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); }

          # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart)
          #    i  + -   j
          # ------> -------->
          $posi = $ai->{"rend"}; 
          $posj = $aj->{"rstart"};
          $strandi = "+";
          $strandj = "-";

        }
        elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1))
        {
          ## ri: [2000 - 3000] | j: [1 - 1000] => 1000 
          ## qi: [1000 - 1]    | j: [3000 - 2000] => 1000 

          $svtype = "RR";

          $rdist = $ai->{"rstart"} - $aj->{"rend"}; 
          $qdist = $aj->{"qend"} - $ai->{"qstart"};

          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); }
          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); }

          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); }
          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); }

          # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend)
          #     j  + -    i
          # <------- <--------
          $posi = $ai->{"rstart"};  # rstart means first reference coordinate, not with respect to the contig
          $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig
          $strandi = "-";
          $strandj = "+";
        }
        elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1))
        {
          ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 
          ## qi: [1 - 1000] | j: [3000 - 2000] => 1000 

          $svtype = "FR";

          $qdist = $aj->{"qend"} - $ai->{"qend"};
          $rdist = $aj->{"rstart"} - $ai->{"rend"};

          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }

          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); }
          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); }

          # When the alignments are forward-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend)
          #    i   +     j   +
          # -------> <--------
          $posi = $ai->{"rend"}; 
          $posj = $aj->{"rend"};
          $strandi = "+";
          $strandj = "+";

        }
        elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0))
        {
          ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 
          ## qi: [1000 - 1] | j: [2000 - 3000] => 1000 

          $svtype = "RF";

          $qdist = $ai->{"qend"} - $aj->{"qend"};
          $rdist = $aj->{"rstart"} - $ai->{"rend"};

          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }

          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); }
          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); }

          # When the alignments are reverse-forward:
          # -   i    -    j   
          # <------- -------->
          $posi = $ai->{"rstart"}; 
          $posj = $aj->{"rstart"};
          $strandi = "-";
          $strandj = "-";
        }
        else
        {
          my $irc = $ai->{"qrc"};
          my $jrc = $aj->{"qrc"};

          print "ERROR: Unknown SV: $irc $jrc\n";
          print "$istr\n";
          print "$jstr\n";
          die "ERROR: Unknown SV: $irc $jrc\n";
        }
        
        my $totaldist = $rdist + $qdist;
        my $typeguess = "";

        my $abs_event_size = abs($rdist-$qdist);

        if ($chromi ne $chromj) { # interchromosomal
          $typeguess = "Interchromosomal";
          $rdist = 0;
        } else { # same chromosome
          if ($strandi eq $strandj) {
            $typeguess = "Inversion";
            $abs_event_size = $rdist;
          }
          elsif ($qdist > $rdist) {
            # both are significantly negative: (means the size of an overlapping region got larger, so tandem element expansion)
            if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero  && $qdist > -1*$approximately_zero) {
              $typeguess = "Insertion";
                # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: insertion_in_unmapped_region
            }
            else {
              if ($rdist < 0 || $qdist < 0) {
                $typeguess = "Tandem_expansion";  
              } else {
                $typeguess = "Repeat_expansion";  
              }
            }
          }
          elsif ($qdist < $rdist) {
            # both are significantly negative: (means the size of an overlapping region got smaller, so tandem element contraction)
            if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) {
              $typeguess = "Deletion";
              # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: deletion_in_unmapped_region
            }
            else {
              if ($rdist < 0 || $qdist < 0) {
                $typeguess = "Tandem_contraction";  
              } else {
                $typeguess = "Repeat_contraction";  
              }
            }
          }
          else {
            $typeguess = "None";
          }
          
          if ($abs_event_size > $longrange) {   #  || abs($rdist) > $longrange || abs($qdist) > $longrange 
            $typeguess = "Longrange";
            if (abs($qdist) > $max_query_dist) {
              $typeguess = "None";
            }
          }
          # my $ratio; 
          # if ($qdist != 0){
          # #   $ratio = abs(($rdist/$qdist)-1);
          # #   if ($ratio < 0.1) {
          # #     $typeguess = "Equilibrium";
          # #   }
            #   if ($rdist==$qdist || abs($qdist) > $longrange) {
            #     $typeguess = "None";
            #   }
          # }
        }

# my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes");
# my @longrange_filter_choices = ("include-longrange","exclude-longrange");

        my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly
        my $chromj_length = length $chromj;
        if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required
          if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen
            if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) {
              if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) {
                if ($output_file eq "bedpe") {
                  print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n";  
                }
                else {
                  use List::Util qw(min max);
                  my $ref_start = min(($posi, $posj));
                  my $ref_stop = max(($posi, $posj));
                  if ($ref_stop eq $ref_start) {
                    $ref_stop = $ref_start + 1;
                  }
                  # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name"
                  print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n";  
                }
              }
            }
          }
          #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") {
           # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) {
        }
        $candidatesvs++;
        #push @{$svstats{$svtype}}, $totaldist;
      }
    }
  }
}



# print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";
# print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";

# foreach my $svtype (keys %svstats)
# {
#   my @events = @{$svstats{$svtype}};
#   my $cnt = scalar @events;

#   my $sum = 0.0;
#   foreach my $e (@events)
#   {
#     $sum += $e;
#   }

#   my $mean = sprintf ("%0.02f", $sum/$cnt);

#   print "svtype[$svtype]: $cnt $mean\n";
#   print STDERR "svtype[$svtype]: $cnt $mean\n";
# }