changeset 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents 68a9ec9ce51e
children
files RaGOO/.gitignore RaGOO/Assemblytics_between_alignments.pl RaGOO/Assemblytics_uniq_anchor.py RaGOO/Assemblytics_within_alignment.py RaGOO/LICENSE RaGOO/README.md RaGOO/filter_gap_SVs.py RaGOO/lift_over.py RaGOO/make_agp.py RaGOO/ragoo.py RaGOO/ragoo_utilities/ContigAlignment.py RaGOO/ragoo_utilities/GFFReader.py RaGOO/ragoo_utilities/PAFReader.py RaGOO/ragoo_utilities/ReadCoverage.py RaGOO/ragoo_utilities/SeqReader.py RaGOO/ragoo_utilities/__init__.py RaGOO/ragoo_utilities/__pycache__/ContigAlignment.cpython-39.pyc RaGOO/ragoo_utilities/__pycache__/GFFReader.cpython-39.pyc RaGOO/ragoo_utilities/__pycache__/PAFReader.cpython-39.pyc RaGOO/ragoo_utilities/__pycache__/ReadCoverage.cpython-39.pyc RaGOO/ragoo_utilities/__pycache__/SeqReader.cpython-39.pyc RaGOO/ragoo_utilities/__pycache__/__init__.cpython-39.pyc RaGOO/ragoo_utilities/__pycache__/break_chimera.cpython-39.pyc RaGOO/ragoo_utilities/__pycache__/utilities.cpython-39.pyc RaGOO/ragoo_utilities/break_chimera.py RaGOO/ragoo_utilities/get_contig_borders.py RaGOO/ragoo_utilities/get_ragoo_stats.py RaGOO/ragoo_utilities/utilities.py RaGOO/sam2delta.py RaGOO/setup.py
diffstat 29 files changed, 3611 insertions(+), 0 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/.gitignore	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,4 @@
+.idea
+RaGOO.egg-info
+build
+dist
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/Assemblytics_between_alignments.pl	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,438 @@
+#!/usr/bin/perl -w
+
+# Authors: Maria Nattestad and Mike Schatz
+# Email: mnattest@cshl.edu
+
+use strict;
+my @chromosome_filter_choices =  ("all-chromosomes","primary-chromosomes");
+my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only");
+my @output_file_choices = ("bed","bedpe");
+
+my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe ";
+
+my $coordsfile = shift @ARGV or die $USAGE;
+my $minimum_event_size = int(shift @ARGV);
+my $maximum_event_size = int(shift @ARGV);
+my $chromosome_filter = shift @ARGV or die $USAGE;
+my $longrange_filter = shift @ARGV or die $USAGE;
+my $output_file = shift @ARGV or die $USAGE;
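+# Example invocation (illustrative file names):
+#   Assemblytics_between_alignments.pl out.coords.tab 50 10000 primary-chromosomes exclude-longrange bed > out.variants.bed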
+
+
+
+# How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions)
+my $narrow_threshold = 50;
+
+
+
+# Number of basepairs of distance in either the reference or the query before we call an SV long-range
+my $longrange = $maximum_event_size;
+
+
+
+# What is the longest two alignments can map apart in the query before we throw the variant between them away?
+my $max_query_dist = 100000;
+
+
+
+my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices;
+my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices;
+my %output_file_choices_hash = map { $_, 1 } @output_file_choices;
+
+if ( $chromosome_filter_choices_hash{ $chromosome_filter } && $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash{ $output_file }) {
+  # All is well with the world
+} else {
+  die $USAGE;
+}
+
+if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){
+  die "Cannot output bed while allowing long-range variants\n$USAGE";
+}
+
+# open COORDS, "./bin/show-coords -rclHT $deltafile |"
+#   or die "Can't process $deltafile ($!)\n";
+
+open COORDS, "$coordsfile"  or die "Can't process $coordsfile ($!)\n";
+
+##open COORDS, "show-coords -rclHT $deltafile |"
+##  or die "Can't process $deltafile ($!)\n";
+
+
+
+## Require the flanking alignments are at least this long to call an SV
+## Note there is no minimum length for fusions; that is determined by how
+## the delta file was filtered
+
+my $MIN_SV_ALIGN = 100;
+
+
+#my $minimum_event_size = 50;
+my $approximately_zero = $narrow_threshold;
+
+
+my %alignments;
+my $numalignments = 0;
+
+
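+# Each row of coords.tab (as written by Assemblytics_uniq_anchor.py) supplies:
+#   ref_start ref_end query_start query_end ref_length query_length ref_id query_id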
+while (<COORDS>)
+{
+  chomp;
+  my @vals = split /\s+/, $_;
+
+  my $rid = $vals[6];
+  my $qid = $vals[7];
+
+  my $a;
+  $a->{"rstart"} = $vals[0];
+  $a->{"rend"}   = $vals[1];
+  $a->{"qstart"} = $vals[2];
+  $a->{"qend"}   = $vals[3];
+  $a->{"rlen"}   = $vals[4];
+  $a->{"qlen"}   = $vals[5];
+  $a->{"rid"}    = $vals[6];
+  $a->{"qid"}    = $vals[7];
+  $a->{"str"}    = $_;
+
+  $a->{"qidx"}   = 0;
+
+  $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 0 : 1;
+
+  push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment
+
+  $numalignments++;
+}
+
+
+
+my $candidatefusions = 0;
+my $candidatesvs     = 0;
+
+my $sv_id_counter = 0;
+
+my %svstats;
+
+foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash
+{
+  my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query
+  my $numref = scalar @refs;
+
+  ## scan for fusions
+  # if ($numref > 1) # if query aligns to multiple chromosomes
+  # {
+  #   my $allrefs = join " ", @refs; # join the names together for output
+
+  #   print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes 
+  #   $candidatefusions++;
+
+  #   my $rcnt = 0;
+  #   foreach my $rid (@refs)
+  #   {
+  #     print "--\n" if ($rcnt > 0);
+  #     $rcnt++;
+
+  #     foreach my $a (@{$alignments{$qid}->{$rid}})
+  #     {
+  #       my $str = $a->{"str"};
+  #       print "$str\n";
+  #     }
+  #   }
+
+  #   print "\n";
+  # }
+
+
+  ## Resort the alignments by query sort position
+  my @qaligns;
+  foreach my $rid (@refs)
+  {
+    foreach my $a (@{$alignments{$qid}->{$rid}})
+    {
+      push @qaligns, $a;
+    }
+  }
+
+  ## Now record the index of the sorted query indices
+  @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns;
+  for (my $i=0; $i < scalar @qaligns; $i++)
+  {
+    $qaligns[$i]->{"qidx"} = $i;
+  }
+
+  ## scan for SVs
+  my $numalign = scalar @qaligns;
+
+  if ($numalign > 1) # if the query has more than 1 alignment
+  {
+    ## note skip first one
+    for (my $j = 1; $j < $numalign; $j++)
+    {
+      my $ai = $qaligns[$j-1];
+      my $aj = $qaligns[$j];
+
+      my $istr = $ai->{"str"};
+      my $jstr = $aj->{"str"};
+
+      # if ($ai->{"rid"} ne $aj->{"rid"})
+      # {
+      #   ## skip the fusions for now #############################################################################################
+      #   next;
+      # }
+
+      my $rid = $ai->{"rid"};
+
+      if (($ai->{"rlen"} >= $MIN_SV_ALIGN) && 
+          ($aj->{"rlen"} >= $MIN_SV_ALIGN))
+      {
+        ## r alignments are always forward, q alignments may be flipped
+
+        my $rpos;
+        my $qpos;
+        my $rdist = 0;
+        my $qdist = 0;
+        my $svtype = 0;
+
+        my $chromi = $ai->{"rid"};
+        my $chromj = $aj->{"rid"};
+        my $posi;
+        my $posj;
+        my $strandi;
+        my $strandj;
+
+        $sv_id_counter++;
+
+        if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0))
+        {
+          ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 
+          ## qi: [1 - 1000] | j: [2000 - 3000] => 1000 
+
+          $svtype = "FF";
+
+          $qdist = $aj->{"qstart"} - $ai->{"qend"};
+          $rdist = $aj->{"rstart"} - $ai->{"rend"};
+
+          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
+          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
+
+          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); }
+          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); }
+
+          # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart)
+          #    i  + -   j
+          # ------> -------->
+          $posi = $ai->{"rend"}; 
+          $posj = $aj->{"rstart"};
+          $strandi = "+";
+          $strandj = "-";
+
+        }
+        elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1))
+        {
+          ## ri: [2000 - 3000] | j: [1 - 1000] => 1000 
+          ## qi: [1000 - 1]    | j: [3000 - 2000] => 1000 
+
+          $svtype = "RR";
+
+          $rdist = $ai->{"rstart"} - $aj->{"rend"}; 
+          $qdist = $aj->{"qend"} - $ai->{"qstart"};
+
+          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); }
+          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); }
+
+          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); }
+          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); }
+
+          # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend)
+          #     j  + -    i
+          # <------- <--------
+          $posi = $ai->{"rstart"};  # rstart means first reference coordinate, not with respect to the contig
+          $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig
+          $strandi = "-";
+          $strandj = "+";
+        }
+        elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1))
+        {
+          ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 
+          ## qi: [1 - 1000] | j: [3000 - 2000] => 1000 
+
+          $svtype = "FR";
+
+          $qdist = $aj->{"qend"} - $ai->{"qend"};
+          $rdist = $aj->{"rstart"} - $ai->{"rend"};
+
+          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
+          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
+
+          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); }
+          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); }
+
+          # When the alignments are forward-reverse, the connection point is at the end of the first (i: rend) and at the end of the second (j: rend)
+          #    i   +     j   +
+          # -------> <--------
+          $posi = $ai->{"rend"}; 
+          $posj = $aj->{"rend"};
+          $strandi = "+";
+          $strandj = "+";
+
+        }
+        elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0))
+        {
+          ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 
+          ## qi: [1000 - 1] | j: [2000 - 3000] => 1000 
+
+          $svtype = "RF";
+
+          $qdist = $ai->{"qend"} - $aj->{"qend"};
+          $rdist = $aj->{"rstart"} - $ai->{"rend"};
+
+          if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); }
+          else             { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); }
+
+          if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); }
+          else             { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); }
+
+          # When the alignments are reverse-forward, the connection point is at the beginning of the first (i: rstart) and at the beginning of the second (j: rstart):
+          # -   i    -    j   
+          # <------- -------->
+          $posi = $ai->{"rstart"}; 
+          $posj = $aj->{"rstart"};
+          $strandi = "-";
+          $strandj = "-";
+        }
+        else
+        {
+          my $irc = $ai->{"qrc"};
+          my $jrc = $aj->{"qrc"};
+
+          print "ERROR: Unknown SV: $irc $jrc\n";
+          print "$istr\n";
+          print "$jstr\n";
+          die "ERROR: Unknown SV: $irc $jrc\n";
+        }
+        
+        my $totaldist = $rdist + $qdist;
+        my $typeguess = "";
+
+        my $abs_event_size = abs($rdist-$qdist);
+
+        if ($chromi ne $chromj) { # interchromosomal
+          $typeguess = "Interchromosomal";
+          $rdist = 0;
+        } else { # same chromosome
+          if ($strandi eq $strandj) {
+            $typeguess = "Inversion";
+            $abs_event_size = $rdist;
+          }
+          elsif ($qdist > $rdist) {
+            if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero && $qdist > -1*$approximately_zero) {
+              $typeguess = "Insertion";
+                # could split into out of nowhere (rdist ~ 0) vs. rdist > 0: insertion_in_unmapped_region
+            }
+            else {
+              # both distances significantly negative means the size of an overlapping region got larger, so tandem element expansion
+              if ($rdist < 0 || $qdist < 0) {
+                $typeguess = "Tandem_expansion";
+              } else {
+                $typeguess = "Repeat_expansion";
+              }
+            }
+          }
+          elsif ($qdist < $rdist) {
+            if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) {
+              $typeguess = "Deletion";
+              # could split into out of nowhere (rdist ~ 0) vs. rdist > 0: deletion_in_unmapped_region
+            }
+            else {
+              # both distances significantly negative means the size of an overlapping region got smaller, so tandem element contraction
+              if ($rdist < 0 || $qdist < 0) {
+                $typeguess = "Tandem_contraction";
+              } else {
+                $typeguess = "Repeat_contraction";
+              }
+            }
+          }
+          else {
+            $typeguess = "None";
+          }
+          
+          if ($abs_event_size > $longrange) {   #  || abs($rdist) > $longrange || abs($qdist) > $longrange 
+            $typeguess = "Longrange";
+            if (abs($qdist) > $max_query_dist) {
+              $typeguess = "None";
+            }
+          }
+          # my $ratio; 
+          # if ($qdist != 0){
+          # #   $ratio = abs(($rdist/$qdist)-1);
+          # #   if ($ratio < 0.1) {
+          # #     $typeguess = "Equilibrium";
+          # #   }
+            #   if ($rdist==$qdist || abs($qdist) > $longrange) {
+            #     $typeguess = "None";
+            #   }
+          # }
+        }
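+        # Worked example (illustrative): on the same chromosome with rdist ~ 0
+        # and qdist = 200, abs_event_size is 200 and $typeguess becomes
+        # "Insertion"; with rdist = 200 and qdist ~ 0 it becomes "Deletion".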
+
+
+        my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly
+        my $chromj_length = length $chromj;
+        if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required
+          if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen
+            if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) {
+              if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) {
+                if ($output_file eq "bedpe") {
+                  print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n";  
+                }
+                else {
+                  use List::Util qw(min max);
+                  my $ref_start = min(($posi, $posj));
+                  my $ref_stop = max(($posi, $posj));
+                  if ($ref_stop eq $ref_start) {
+                    $ref_stop = $ref_start + 1;
+                  }
+                  # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name"
+                  print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n";  
+                }
+              }
+            }
+          }
+          #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") {
+           # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) {
+        }
+        $candidatesvs++;
+        #push @{$svstats{$svtype}}, $totaldist;
+      }
+    }
+  }
+}
+
+
+
+# print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";
+# print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n";
+
+# foreach my $svtype (keys %svstats)
+# {
+#   my @events = @{$svstats{$svtype}};
+#   my $cnt = scalar @events;
+
+#   my $sum = 0.0;
+#   foreach my $e (@events)
+#   {
+#     $sum += $e;
+#   }
+
+#   my $mean = sprintf ("%0.02f", $sum/$cnt);
+
+#   print "svtype[$svtype]: $cnt $mean\n";
+#   print STDERR "svtype[$svtype]: $cnt $mean\n";
+# }
+
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/Assemblytics_uniq_anchor.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,440 @@
+#! /usr/bin/env python
+
+
+# Author: Maria Nattestad
+# Email: mnattest@cshl.edu
+# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer. 
+
+
+import argparse
+import gzip
+# from intervaltree import *
+import time
+
+import numpy as np
+import operator
+
+
+
+def run(args):
+    filename = args.delta
+    unique_length = args.unique_length
+    output_filename = args.out
+    keep_small_uniques = args.keep_small_uniques
+    
+    f = open(filename)
+    header1 = f.readline()
+    if header1[0:2]=="\x1f\x8b":
+        f.close()
+        f = gzip.open(filename)
+
+
+    linecounter = 0
+
+    current_query_name = ""
+    current_header = ""
+
+    lines_by_query = {}
+    header_lines_by_query = {}
+
+    before = time.time()
+    last = before
+
+    existing_query_names = set()
+
+    for line in f:
+        if line[0]==">":
+
+            fields = line.strip().split()
+            current_query_name = fields[1]
+            current_header = line.strip()
+            if current_query_name not in existing_query_names:
+                lines_by_query[current_query_name] = []
+                header_lines_by_query[current_query_name] = []
+                existing_query_names.add(current_query_name)
+        else:
+            fields = line.strip().split()
+            if len(fields) > 4:
+                # sometimes start and end are the other way around, but for this they need to be in order
+                query_min = min([int(fields[2]),int(fields[3])])
+                query_max = max([int(fields[2]),int(fields[3])])
+
+                ##########  TESTING ONLY  ###########
+                # lines_by_query[current_query_name] = (query_min,query_max)
+                # test_list = test_list + [(query_min,query_max)]
+                #####################################
+
+                lines_by_query[current_query_name].append((query_min,query_max))
+                header_lines_by_query[current_query_name].append(current_header)
+        # linecounter += 1
+        # if linecounter % 10000000 == 0:
+        #     print "%d,%f" % (linecounter, time.time()-last)
+        #     last = time.time()
+        
+
+    f.close()
+    
+
+    before = time.time()
+    alignments_to_keep = {}
+    num_queries = len(lines_by_query)
+    
+    num_query_step_to_report = num_queries // 100
+    if num_queries < 100:
+        num_query_step_to_report = num_queries // 10
+    if num_queries < 10:
+        num_query_step_to_report = 1
+
+    query_counter = 0
+
+    for query in lines_by_query:
+
+        ################   TESTING    ####################   
+
+        # results_intervaltree = summarize_intervaltree(lines_by_query[query], unique_length_required = unique_length)
+        # intervaltree_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_intervaltree)
+    
+        # results_planesweep = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length) 
+        # planesweep_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_planesweep)
+        # if intervaltree_filtered_out == planesweep_filtered_out :
+        #     num_matches += 1
+        # else:
+        #     num_mismatches += 1
+        #     print "MISMATCH:"
+        #     print "number of alignments:", len(lines_by_query[query])
+        #     print "results_intervaltree:"
+        #     print results_intervaltree
+        #     for i in results_intervaltree:
+        #         print lines_by_query[query][i]
+        #     print "results_planesweep:"
+        #     print results_planesweep
+        #     for i in results_planesweep:
+        #         print lines_by_query[query][i]
+        ################   TESTING    ####################
+
+        alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length,keep_small_uniques=keep_small_uniques)
+
+        query_counter += 1
+
+    before = time.time()
+
+
+    fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length),'w')
+    
+
+    f = open(filename)
+    header1 = f.readline()
+    if header1[0:2]=="\x1f\x8b":
+        f.close()
+        f = gzip.open(filename)
+        header1 = f.readline()
+
+    fout.write(header1) # write the first line that we read already
+    fout.write(f.readline())
+    
+    linecounter = 0
+
+    # For filtered delta file:
+    list_of_alignments_to_keep = []
+    alignment_counter = {}
+    keep_printing = False
+
+    # For coords:
+    current_query_name = ""
+    current_query_position = 0
+    fcoords_out_tab = open(output_filename + ".coords.tab",'w')
+    fcoords_out_csv = open(output_filename + ".coords.csv",'w')
+    fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")
+
+
+    # For basic assembly stats:
+    ref_sequences = set()
+    query_sequences = set()
+    ref_lengths = []
+    query_lengths = []
+
+    f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt","w")
+
+    for line in f:
+        linecounter += 1
+        if line[0]==">":
+            fields = line.strip().split()
+            
+            # For delta file output:
+            query = fields[1]
+            list_of_alignments_to_keep = alignments_to_keep[query]
+
+            header_needed = False
+            for index in list_of_alignments_to_keep:
+                if line.strip() == header_lines_by_query[query][index]:
+                    header_needed = True
+            if header_needed:
+                fout.write(line) # if we have any alignments under this header, print the header
+            alignment_counter[query] = alignment_counter.get(query,0)
+
+            # For coords:
+            current_reference_name = fields[0][1:]
+            current_query_name = fields[1]
+
+            current_reference_size = int(fields[2])
+            current_query_size = int(fields[3])
+
+            # For basic assembly stats:
+            if current_reference_name not in ref_sequences:
+                ref_lengths.append(current_reference_size)
+                ref_sequences.add(current_reference_name)
+            if current_query_name not in query_sequences:
+                query_lengths.append(current_query_size)
+                query_sequences.add(current_query_name)
+
+        else:
+            fields = line.strip().split()
+            if len(fields) > 4:
+                # For coords:
+                ref_start = int(fields[0])
+                ref_end = int(fields[1])
+                query_start = int(fields[2])
+                query_end = int(fields[3])
+                csv_tag = "repetitive"
+                if alignment_counter[query] in list_of_alignments_to_keep:
+                    fout.write(line)
+                    fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
+                    csv_tag = "unique"
+                    keep_printing = True
+                else:
+                    keep_printing = False
+                fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
+                alignment_counter[query] = alignment_counter[query] + 1
+
+            elif keep_printing:
+                fout.write(line)
+
+    fcoords_out_tab.close()
+    fcoords_out_csv.close()
+
+
+    ref_lengths.sort()
+    query_lengths.sort()
+
+    # Assembly statistics
+    ref_lengths = np.array(ref_lengths)
+    query_lengths = np.array(query_lengths)
+
+    f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
+    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
+    f_stats_out.write( "Total sequence length: %s\n" %  gig_meg(sum(ref_lengths)))
+    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
+    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(ref_lengths)))
+    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(ref_lengths)))
+    f_stats_out.write( "N50: %s\n" % gig_meg(N50(ref_lengths)))
+    f_stats_out.write( "\n\n")
+    f_stats_out.write( "Query: %s\n" % header1.split()[1].split("/")[-1])
+    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
+    f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
+    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(query_lengths)))
+    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(query_lengths)))
+    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(query_lengths)))
+    f_stats_out.write( "N50: %s\n" % gig_meg(N50(query_lengths)))
+
+
+    f.close()
+    fout.close()
+    f_stats_out.close()
+
+def N50(sorted_list):
+    # List should be sorted as increasing
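+    # Example: N50([1, 2, 3, 4, 5]) == 4, since 5 + 4 = 9 >= 15/2.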
+
+    # We flip the list around here so we start with the largest element
+    cumsum = 0
+    for length in sorted_list[::-1]:
+        cumsum += length
+        if cumsum >= sum(sorted_list)/2:
+            return length
+
+
+def gig_meg(number,digits = 2):
+    gig = 1000000000.
+    meg = 1000000.
+    kil = 1000.
+
+    if number > gig:
+        return str(round(number/gig,digits)) + " Gbp"
+    elif number > meg:
+        return str(round(number/meg,digits)) + " Mbp"
+    elif number > kil:
+        return str(round(number/kil,digits)) + " Kbp"
+    else:
+        return str(number) + " bp"
+
+def intWithCommas(x):
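+    # Example: intWithCommas(1234567) returns "1,234,567".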
+    if not isinstance(x, int):
+        raise TypeError("Parameter must be an integer.")
+    if x < 0:
+        return '-' + intWithCommas(-x)
+    result = ''
+    while x >= 1000:
+        x, r = divmod(x, 1000)
+        result = ",%03d%s" % (r, result)
+    return "%d%s" % (x, result)
+
+
+def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):
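+    # Plane sweep over alignment start/stop events: stretches of the query
+    # covered by exactly one alignment are "unique". Example (illustrative):
+    # alignments (0,100) and (50,150) yield unique stretches [0,50] and
+    # [100,150], so each alignment anchors 50 unique bp.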
+
+    alignments_to_keep = []
+    # print len(lines)
+
+    # If no alignments:
+    if len(lines)==0:
+        return []
+
+    # If only one alignment:
+    if len(lines) == 1:
+        if keep_small_uniques or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
+            return [0]
+        else:
+            return []
+
+    starts_and_stops = []
+    for query_min,query_max in lines:
+        # print query_min, query_max
+        starts_and_stops.append((query_min,"start"))
+        starts_and_stops.append((query_max,"stop"))
+
+
+    sorted_starts_and_stops = sorted(starts_and_stops,key=operator.itemgetter(0))
+    # print sorted_starts_and_stops
+
+    current_coverage = 0
+    last_position = -1
+    # sorted_unique_intervals = []
+    sorted_unique_intervals_left = []
+    sorted_unique_intervals_right = []
+    for pos,change in sorted_starts_and_stops:
+        # print sorted_starts_and_stops[i]
+        # pos = sorted_starts_and_stops[i][0]
+        # change = sorted_starts_and_stops[i][1]
+        
+        # print pos,change
+        # First alignment only:
+        # if last_position == -1:
+        #     last_position = pos
+        #     continue
+
+        # print last_position,pos,current_coverage
+
+        if current_coverage == 1:
+            # sorted_unique_intervals.append((last_position,pos))
+            sorted_unique_intervals_left.append(last_position)
+            sorted_unique_intervals_right.append(pos)
+
+        if change == "start":
+            current_coverage += 1
+        else:
+            current_coverage -= 1
+        last_position = pos
+
+
+    linecounter = 0
+    for query_min,query_max in lines:
+
+        i = binary_search(query_min,sorted_unique_intervals_left,0,len(sorted_unique_intervals_left))
+
+        exact_match = False
+        if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
+            exact_match = True
+        sum_uniq = 0
+        while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max:
+            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
+            i += 1
+
+        # print query_min,query_max,sum_uniq
+        if sum_uniq >= unique_length_required:
+            alignments_to_keep.append(linecounter)
+        elif keep_small_uniques and exact_match:
+            alignments_to_keep.append(linecounter)
+            # print "Keeping small alignment:", query_min, query_max
+            # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]
+
+        linecounter += 1
+
+    return alignments_to_keep
+
+
+
+def binary_search(query, numbers, left, right):
+    #  Returns index of the matching element or the first element to the right
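+    #  Example: binary_search(5, [1, 3, 5, 7], 0, 4) == 2 (exact match);
+    #  binary_search(4, [1, 3, 5, 7], 0, 4) == 2 (first element to the right).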
+    
+    if left >= right:
+        return right
+    mid = (right+left)//2
+    
+
+    if query == numbers[mid]:
+        return mid
+    elif query < numbers[mid]:
+        return binary_search(query,numbers,left,mid)
+    else: # if query > numbers[mid]:
+        return binary_search(query,numbers,mid+1,right)
+
+
+# def summarize_intervaltree(lines, unique_length_required):
+
+#     alignments_to_keep = []
+#     # print len(lines)
+
+#     if len(lines)==0:
+#         return alignments_to_keep
+
+#     if len(lines) == 1:
+#         if abs(lines[0][1] - lines[0][0]) >= unique_length_required:
+#             return [0]
+
+
+#     starts_and_stops = []
+#     for query_min,query_max in lines:
+#         starts_and_stops.append((query_min,query_max))
+
+#     # build full tree
+#     tree = IntervalTree.from_tuples(starts_and_stops) 
+    
+
+#     # for each interval (keeping the same order as the lines in the input file)
+#     line_counter = 0
+#     for query_min,query_max in lines:
+        
+#         # create a tree object from the current interval
+#         this_interval = IntervalTree.from_tuples([(query_min,query_max)])
+
+#         # create a copy of the tree without this one interval
+#         rest_of_tree = tree - this_interval
+
+#         # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one
+#         for other_interval in rest_of_tree:
+#             this_interval.chop(other_interval.begin, other_interval.end)
+        
+#         # loop through to count the total number of unique basepairs
+#         total_unique_length = 0
+#         for sub_interval in this_interval:
+#             total_unique_length += sub_interval.end - sub_interval.begin
+
+#         # if the total unique length is above our threshold, add the index to the list we are reporting       
+#         if total_unique_length >= unique_length_required:
+#             alignments_to_keep.append(line_counter)
+#         line_counter += 1
+
+
+#     return alignments_to_keep
+
+
+def main():
+    parser=argparse.ArgumentParser(description="Filters alignments in delta file based whether each alignment has a unique sequence anchoring it")
+    parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
+    parser.add_argument("--out",help="output file" ,dest="out", type=str, required=True)
+    parser.add_argument("--unique-length",help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000" ,dest="unique_length",type=int, default=10000)
+    parser.add_argument("--keep-small-uniques",help="Keep small aligments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places" ,dest="keep_small_uniques",action="store_true")
+    parser.set_defaults(func=run)
+    args=parser.parse_args()
+    args.func(args)
+
+if __name__=="__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/Assemblytics_within_alignment.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,105 @@
+#! /usr/bin/env python
+import argparse
+
+import gzip
+# Author: Maria Nattestad
+# Email: mnattest@cshl.edu
+# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer. 
+
+
+def run(args):
+    filename = args.delta
+    minimum_variant_size = args.minimum_variant_size
+
+    f = open(filename)
+    header1 = f.readline()
+    if header1[0:2]=="\x1f\x8b":
+        f.close()
+        f = gzip.open(filename)
+        header1 = f.readline()
+    
+    # Ignore the first two lines for now
+    f.readline()
+
+    linecounter = 0
+
+    current_reference_name = ""
+    current_reference_position = 0
+
+    current_query_name = ""
+    current_query_position = 0
+
+    variants = []
+
+    for line in f:
+        if line[0]==">":
+            # linecounter += 1
+            # if linecounter > 1:
+            #     break
+            fields = line.strip().split()
+            current_reference_name = fields[0][1:]
+            current_query_name = fields[1]
+        else:
+            fields = line.strip().split()
+            if len(fields) > 4:
+                # current_reference_position = int(fields[0])
+                current_reference_position = min(int(fields[0]),int(fields[1]))
+                # fields[1] is the reference position at the end of the alignment
+                # current_query_position = int(fields[2])
+                current_query_position = min(int(fields[2]),int(fields[3]))
+                # fields[3] is the query position at the end of the alignment
+            else:
+                tick = int(fields[0])
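+                # In the MUMmer delta encoding, each integer is the distance to
+                # the next indel: a positive tick is a base present in the
+                # reference but not the query (deletion), a negative tick is a
+                # base present in the query but not the reference (insertion).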
+                if abs(tick) == 1: # then go back and edit the last entry to add 1 more to its size
+                    report = variants[-1]
+                    report[4] = report[4] + 1 # size
+                    if tick > 0: # deletion, moves in reference
+                        report[2] = report[2] + 1 # reference end position
+                        report[7] = report[7] + 1 # reference gap size
+                        current_reference_position += 1 # update reference position after deletion
+                    elif tick < 0: # insertion, moves in query
+                        report[8] = report[8] + 1 # query gap size
+                        report[12] = report[12] + 1 # query end position
+                        current_query_position += 1 # update query position after insertion
+                else: # report the last one and continue
+                    current_reference_position += abs(tick) - 1 
+                    current_query_position += abs(tick) - 1 
+                    if tick > 0:
+                        size = 1
+                        # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position+size,len(variants)+1,size,"+","Deletion",size,0,current_query_name,"within_alignment")
+                        report = [current_reference_name,current_reference_position,current_reference_position+size,"Assemblytics_w_"+str(len(variants)+1),size,"+","Deletion",size,0,current_query_name,"within_alignment",current_query_position,current_query_position]
+                        current_reference_position += size # update reference position after deletion
+                        variants.append(report)
+                    elif tick < 0:
+                        size = 1
+                        # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position,len(variants)+1,size,"+","Insertion",0,size,current_query_name,"within_alignment")
+                        report = [current_reference_name,current_reference_position,current_reference_position,"Assemblytics_w_"+str(len(variants)+1),size,"+","Insertion",0,size,current_query_name,"within_alignment",current_query_position,current_query_position+size]
+                        current_query_position += size # update query position after insertion
+                        variants.append(report)
+                # TESTING
+                # print line, report
+                
+
+    f.close()
+
+    newcounter = 1
+    for line in variants:
+        # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % line
+        if line[4] >= minimum_variant_size:
+            line[3] = "Assemblytics_w_%d" % (newcounter)
+            print ("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10])
+            # print "\t".join(map(str,line))
+            newcounter += 1
+
+
+def main():
+    parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment")
+    parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
+    parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50)
+    parser.set_defaults(func=run)
+    args=parser.parse_args()
+    args.func(args)
+
+if __name__=="__main__":
+    main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/LICENSE	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Michael Alonge
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/README.md	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,4 @@
+[![DOI](https://zenodo.org/badge/119861637.svg)](https://zenodo.org/badge/latestdoi/119861637)
+
+
+# RaGOO is no longer supported. Please use [RagTag](https://github.com/malonge/RagTag) instead. 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/filter_gap_SVs.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+import re
+
+from intervaltree import IntervalTree
+
+from ragoo_utilities.SeqReader import SeqReader
+
+
+class BaseSequence(object):
+
+    def __init__(self, in_sequence):
+        if not isinstance(in_sequence, str):
+            raise TypeError('Only a string can be used to instantiate this class.')
+        self.sequence = in_sequence.upper()
+
+
+class GapSequence(BaseSequence):
+    def get_gap_coords(self):
+        """ Find all of the gap string indices for this sequence. """
+        return re.finditer(r'N+', self.sequence)
+
+
+def make_gaps_tree(in_file):
+    # A dictionary to store an interval tree for each chromosome header.
+    all_trees = dict()
+    x = SeqReader(in_file)
+    parse = x.parse_gzip_fasta() if in_file.endswith(".gz") else x.parse_fasta()
+    for header, sequence in parse:
+        # Remove the greater-than sign and only keep the first token if delimited by spaces
+        header = header[1:].split(' ')[0]
+        all_trees[header] = IntervalTree()
+        gap_sequence = GapSequence(sequence)
+        all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
+        for i in all_coordinates:
+            all_trees[header][i[0]:i[1]] = i
+    return all_trees
+
+
+def get_query_bed_coords(in_line):
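+    # Example: "contig_7:100-200:+" yields ("contig_7", 100, 200); only the
+    # last two colons are treated as delimiters, so the header may itself
+    # contain colons.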
+    c_loc = [m.start() for m in re.finditer(':', in_line)]
+    c1, c2 = c_loc[-1], c_loc[-2]
+    header = in_line[:c2]
+    L1 = in_line[c2+1:c1].split('-')
+    start, stop = int(L1[0]), int(L1[1])
+    return header, start, stop
+
+
+def make_svs_bed(in_trees_q, in_trees_r):
+    final_lines = []
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
+
+        # Make a new header column for the field I am about to create
+        header = f.readline().rstrip() + '\tquery_gap_ovlp'
+        final_lines.append(header)
+
+        # Calculate the percentage overlap for each structural variant.
+        for line in f:
+            pct_ovlp = 0
+            ovlp = 0
+            L1 = line.rstrip().split('\t')
+            head, start, end = get_query_bed_coords(L1[9])
+            query = in_trees_q[head][start:end]
+            if query:
+                for interval in query:
+                    gap_start, gap_end= interval.begin, interval.end
+
+                    # Calculate the amount these two intervals overlap
+                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))
+
+                pct_ovlp = ovlp/(end-start)
+
+            # Add the new value to the original line
+            L1.append(pct_ovlp)
+            L1 = [str(i) for i in L1]
+            final_lines.append('\t'.join(L1))
+
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
+        out_file.write('\n'.join(final_lines))
+
+    # Now repeat but with respect to the reference
+    final_lines = []
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
+        # Make a new header column for the field I am about to create
+        header = f.readline().rstrip() + '\tref_gap_ovlp'
+        final_lines.append(header)
+
+        # Calculate the percentage overlap for each structural variant.
+        for line in f:
+            pct_ovlp = 0
+            ovlp = 0
+            L1 = line.rstrip().split('\t')
+            head, start, end = L1[0], int(L1[1]), int(L1[2])
+            query = in_trees_r[head][start:end]
+            if query:
+                for interval in query:
+                    gap_start, gap_end = interval.begin, interval.end
+
+                    # Calculate the amount these two intervals overlap
+                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))
+
+                pct_ovlp = ovlp / (end - start)
+
+            # Add the new value to the original line
+            L1.append(pct_ovlp)
+            L1 = [str(i) for i in L1]
+            final_lines.append('\t'.join(L1))
+
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
+        out_file.write('\n'.join(final_lines))
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Annotate the SV bed file with the degree to which each SV overlaps a gap')
+    parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file")
+    args = parser.parse_args()
+    ref_file = args.ref_file
+
+    # Build gap interval trees w.r.t. the query (ragoo.fasta) and the reference
+    all_trees_q = make_gaps_tree('../ragoo.fasta')
+    all_trees_r = make_gaps_tree(ref_file)
+    make_svs_bed(all_trees_q, all_trees_r)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/lift_over.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+def get_contig_lengths(in_fai):
+    """ Get contig lengths from a fasta index. """
+    lens = dict()
+    with open(in_fai, 'r') as f:
+        for line in f:
+            L1 = line.rstrip().split('\t')
+            lens[L1[0]] = int(L1[1])
+    return lens
+
+
+def get_contig_orderings(in_groupings):
+    """
+    From the orderings files, return the following:
+
+    1. Dictionary associating a reference header with a list of ordered contig headers
+    2. Dictionary associating a contig with an orientation
+    3. Dictionary associating a contig with its assigned reference header
+    """
+    orderings = dict()
+    orientations = dict()
+    ref_chr = dict()
+
+    # Iterate through the orderings files
+    with open(in_groupings, 'r') as f1:
+        for line1 in f1:
+            header = line1.rstrip().replace('_orderings.txt', '_RaGOO')
+            header = header[header.rfind('/') + 1:]
+
+            # Initialize the list for the orderings
+            orderings[header] = []
+            with open(line1.rstrip(), 'r') as f2:
+                for line2 in f2:
+                    L1 = line2.rstrip().split('\t')
+                    orderings[header].append(L1[0])
+                    orientations[L1[0]] = L1[1]
+                    ref_chr[L1[0]] = header
+    return orderings, orientations, ref_chr
+
+
+def get_reverse_coords(start_coord, end_coord, seq_length):
+    """
+    Returns new genomic coordinates of a region that has undergone reverse complementation.
+    new start coordinate = seqLength - endCoord
+    new end coordinate = seqLength - startCoord
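+    Example: a feature at (1, 10) on a 100 bp contig maps to (91, 100).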
+    """
+    return seq_length - (end_coord - 1), seq_length - (start_coord - 1)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Lift over gff coordinates from contigs to RaGOO pseudomolecules. '
+                                                 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used. '
+                                                 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.')
+    parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over")
+    parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'")
+    parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'")
+    parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.")
+
+
+    # Get the command line arguments
+    args = parser.parse_args()
+    gff_file = args.gff
+    orderings_file = args.orderings
+    fai_file = args.fai
+    gap_size = args.g
+
+    # Get the contig lengths
+    ctg_lens = get_contig_lengths(fai_file)
+
+    # Get the contig orderings and orientations
+    ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file)
+
+    # Iterate through the GFF features and update them
+    offsets = dict()
+    with open(gff_file) as f:
+        for gff_line in f:
+            if gff_line.startswith("#"):
+                print(gff_line.rstrip())
+            else:
+                gff_fields = gff_line.rstrip().split('\t')
+                gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6]
+
+                # Check that this contig header is in our list of headers
+                # before any lookup that would raise a bare KeyError
+                if gff_header not in ctg_chr or gff_header not in ctg_lens:
+                    err_msg = """%s was not found in the orderings files or fasta index provided.
+                    Please check that you are using the correct files. If chimeric contig correction was
+                    used, please use the corrected gff and fai file.
+                    """ % gff_header
+                    raise ValueError(err_msg)
+
+                new_header = ctg_chr[gff_header]
+
+                # Check if the contig has been reverse complemented. Update accordingly
+                if ctg_orientations[gff_header] == '-':
+                    start, end = get_reverse_coords(start, end, ctg_lens[gff_header])
+
+                    # Set the opposite strand
+                    if strand == '+':
+                        strand = '-'
+                    else:
+                        strand = '+'
+
+                # Check if the offset for this contig has already been calculated
+                if gff_header in offsets:
+                    offset = offsets[gff_header]
+                else:
+                    ctg_idx = ctg_orderings[new_header].index(gff_header)
+                    offset = 0
+
+                    for ctg in ctg_orderings[new_header][:ctg_idx]:
+                        offset += ctg_lens[ctg]
+                        offset += gap_size
+
+                    # memoize the offset
+                    offsets[gff_header] = offset
+
+                new_start = start + offset
+                new_end = end + offset
+
+                gff_fields[0] = new_header
+                gff_fields[3] = str(new_start)
+                gff_fields[4] = str(new_end)
+                gff_fields[6] = strand
+                print('\t'.join(gff_fields))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/make_agp.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+import sys
+import argparse
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Produce an AGP v2.0 file describing the ordering and orienting performed by RaGOO.')
+    parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="file of orderings files ($ls ragoo_output/orderings/* > orderings.fofn)")
+    parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="index file for contigs ($samtools faidx contigs.fasta)")
+    parser.add_argument("gap_len", metavar="gap_size", type=int, help="Gap size used for pseudomolecule padding.")
+
+    # Get the command line arguments
+    args = parser.parse_args()
+    orderings_fofn = args.orderings
+    fai_file = args.fai
+    gap_len = args.gap_len
+
+    # Save the contig orderings
+    orderings = dict()
+    with open(orderings_fofn, 'r') as f1:
+        for line1 in f1:
+            chrom = line1[line1.rfind("/")+1:line1.rfind("_orderings.txt")] + "_RaGOO"
+            orderings[chrom] = []
+            with open(line1.rstrip(), 'r') as f2:
+                for line2 in f2:
+                    L1 = line2.rstrip().split('\t')
+                    # Append (ctg_header, orientation)
+                    orderings[chrom].append((L1[0], L1[1]))
+
+    # Get contig lengths
+    ctg_lens = dict()
+    with open(fai_file, 'r') as f:
+        for line in f:
+            L1 = line.split('\t')
+            ctg_lens[L1[0]] = int(L1[1])
+
+    # Write the final AGP file
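+    # Each component row below follows the nine-column AGP v2.0 layout:
+    # object, object_beg, object_end, part_number, component_type, then
+    # component_id, component_beg, component_end, orientation for W (contig)
+    # rows, or gap_length, gap_type, linkage, linkage_evidence for N (gap) rows.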
+    current_pos = 1
+    idx = 1
+    chrom_idx = 1
+    out_buff = []
+
+    sys.stdout.write("## AGP-version 2.0\n")
+    sys.stdout.write("## AGP constructed by RaGOO\n")
+    for chrom in orderings:
+        if orderings[chrom]:
+            for ctg, strand in orderings[chrom]:
+                start = current_pos
+                end = current_pos + ctg_lens[ctg] - 1
+                line_buff = list()
+
+                # Write the line for the contig
+                line_buff.append(chrom)
+                line_buff.append(str(start))
+                line_buff.append(str(end))
+                line_buff.append(str(idx))
+                line_buff.append("W")
+                line_buff.append(ctg)
+                line_buff.append("1")
+                line_buff.append(str(end - start + 1))
+                line_buff.append(strand)
+                out_buff.append("\t".join(line_buff))
+
+                # Write the line for the gap
+                idx += 1
+                current_pos += ctg_lens[ctg]
+                start = current_pos
+                end = current_pos + gap_len - 1
+                line_buff = list()
+
+                line_buff.append(chrom)
+                line_buff.append(str(start))
+                line_buff.append(str(end))
+                line_buff.append(str(idx))
+                line_buff.append("N")
+                line_buff.append(str(gap_len))
+                line_buff.append("scaffold")
+                line_buff.append("yes")
+                line_buff.append("align_genus")
+                out_buff.append("\t".join(line_buff))
+
+                idx += 1
+                current_pos += gap_len
+
+            # Pop the last gap since we don't end with padding
+            out_buff.pop()
+            idx = 1
+            chrom_idx += 1
+            current_pos = 1
+
+    sys.stdout.write("\n".join(out_buff) + "\n")
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,759 @@
+#!/usr/bin/env python
+from collections import defaultdict
+from collections import OrderedDict
+import copy
+import os  # used by the chdir/getcwd helpers below
+
+from intervaltree import IntervalTree
+
+from ragoo_utilities.PAFReader import PAFReader
+from ragoo_utilities.SeqReader import SeqReader
+from ragoo_utilities.ReadCoverage import ReadCoverage
+from ragoo_utilities.ContigAlignment import ContigAlignment
+from ragoo_utilities.ContigAlignment import UniqueContigAlignment
+from ragoo_utilities.ContigAlignment import LongestContigAlignment
+from ragoo_utilities.GFFReader import GFFReader
+from ragoo_utilities.utilities import run, log, reverse_complement, read_contigs, read_gz_contigs
+from ragoo_utilities.break_chimera import get_ref_parts, cluster_contig_alns, avoid_gff_intervals, update_gff, break_contig, get_intra_contigs
+
+
+def update_misasm_features(features, breaks, contig, ctg_len):
+
+    # Get ctg len from ReadCoverage object
+    break_list = [0] + sorted(breaks) + [ctg_len]
+    borders = []
+    for i in range(len(break_list) - 1):
+        borders.append((break_list[i], break_list[i+1]))
+
+    # Pop the features to be updated
+    contig_feats = features.pop(contig)
+
+    # Initialize lists for new broken contig headers
+    for i in range(len(borders)):
+        features[contig + '_misasm_break:' + str(borders[i][0]) + '-' + str(borders[i][1])] = []
+
+    t = IntervalTree()
+    for i in borders:
+        t[i[0]:i[1]] = i
+
+    for i in contig_feats:
+        query = t[i.start]
+        assert len(query) == 1
+        break_start = list(query)[0].begin
+        break_end = list(query)[0].end
+        query_border = (break_start, break_end)
+        break_number = borders.index(query_border)
+        i.seqname = contig + '_misasm_break:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])
+        i.start = i.start - break_start
+        i.end = i.end - break_start
+        features[
+            contig + '_misasm_break:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i)
+
+    return features
+
+
+def remove_gff_breaks(gff_ins, breaks):
+    """
+    Given a list of candidate breakpoints proposed by misassembly correction, remove any such break points that
+    fall within the interval of a gff feature. This should be called once per contig.
+    :param gff_ins: List of GFFLines
+    :param breaks: candidate break points
+    :return:
+    """
+    # Make an interval tree from the intervals of the gff lines
+    t = IntervalTree()
+    for line in gff_ins:
+        # If the interval is one bp long, skip
+        if line.start == line.end:
+            continue
+        t[line.start:line.end] = (line.start, line.end)
+
+    return [i for i in breaks if not t[i]]
+
+
+def write_misasm_broken_ctgs(contigs_file, breaks, out_prefix, in_gff=None, in_gff_name=None):
+    current_path = os.getcwd()
+    os.chdir('ctg_alignments')
+
+    if in_gff and in_gff_name:
+        with open(in_gff_name, 'w') as f:
+            for i in in_gff.keys():
+                for j in in_gff[i]:
+                    f.write(str(j) + '\n')
+
+    x = SeqReader("../../" + contigs_file)
+    f = open(out_prefix + ".misasm.break.fa", 'w')
+    for header, seq in x.parse_fasta():
+        header = header[1:]
+        if header not in breaks:
+            f.write(">" + header + "\n")
+            f.write(seq + "\n")
+        else:
+            # Break the contig
+            ctg_len = len(seq)
+            break_list = [0] + sorted(breaks[header]) + [ctg_len]
+            for i in range(len(break_list) - 1):
+                f.write(">" + header + "_misasm_break:" + str(break_list[i]) + "-" + str(break_list[i+1]) + "\n")
+                f.write(seq[break_list[i]:break_list[i+1]] + "\n")
+    os.chdir(current_path)
+
+
+def align_misasm_broken(out_prefix):
+    current_path = os.getcwd()
+    os.chdir('ctg_alignments')
+
+    ctgs_file = out_prefix + ".misasm.break.fa"
+    cmd = '{} -k19 -w19 -t{} ../../{}  {} ' \
+          '> contigs_brk_against_ref.paf 2> contigs_brk_against_ref.paf.log'.format(minimap_path, t, reference_file,
+                                                                            ctgs_file)
+    if not os.path.isfile('contigs_brk_against_ref.paf'):
+        run(cmd)
+    os.chdir(current_path)
+
+
+def write_contig_clusters(unique_dict, thresh, skip_list):
+    # Get a list of all chromosomes
+    all_chroms = set([unique_dict[i].ref_chrom for i in unique_dict.keys()])
+    current_path = os.getcwd()
+    output_path = current_path + '/groupings'
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    os.chdir('groupings')
+    for i in all_chroms:
+        open(i + '_contigs.txt', 'w').close()
+
+    for i in unique_dict.keys():
+        this_chr = unique_dict[i].ref_chrom
+        this_confidence = unique_dict[i].confidence
+        if this_confidence > thresh:
+            if i not in skip_list:
+                file_name = str(this_chr) + '_contigs.txt'
+                with open(file_name, 'a') as f:
+                    f.write(i + '\t' + str(this_confidence) + '\n')
+    os.chdir(current_path)
+
+
+def clean_alignments(in_alns, l=10000, in_exclude_file='', uniq_anchor_filter=False, merge=False):
+    # Exclude alignments to undesired reference headers and filter alignment lengths.
+    exclude_list = []
+    if in_exclude_file:
+        with open('../' + in_exclude_file) as f:
+            for line in f:
+                exclude_list.append(line.rstrip().replace('>', '').split()[0])
+
+    empty_headers = []
+    for header in in_alns.keys():
+        in_alns[header].exclude_ref_chroms(exclude_list)
+        in_alns[header].filter_lengths(l)
+        if uniq_anchor_filter:
+            in_alns[header].unique_anchor_filter()
+
+        if merge:
+            in_alns[header].merge_alns()
+
+        # Check if our filtering has removed all alignments for a contig
+        if len(in_alns[header].ref_headers) == 0:
+            empty_headers.append(header)
+
+    for header in empty_headers:
+        in_alns.pop(header)
+    return in_alns
+
+
+def read_paf_alignments(in_paf):
+    # Read in PAF alignments
+    # Initialize a dictionary where key is contig header, and value is ContigAlignment.
+    alns = dict()
+    x = PAFReader(in_paf)
+    for paf_line in x.parse_paf():
+        if paf_line.contig in alns:
+            alns[paf_line.contig].add_alignment(paf_line)
+        else:
+            alns[paf_line.contig] = ContigAlignment(paf_line.contig)
+            alns[paf_line.contig].add_alignment(paf_line)
+    return alns
+
+
+def get_contigs_from_groupings(in_file):
+    contigs = []
+    with open(in_file) as f:
+        for line in f:
+            contigs.append(line.split('\t')[0])
+    return contigs
+
+
+def get_location_confidence(in_ctg_alns):
+    # Use an interval tree to get all alignments within the reference span.
+    # Clamp any alignment whose start is less than min_pos or whose end is greater
+    # than max_pos to those boundaries. Then apply the union-of-intervals
+    # algorithm (provided by Mike) to get the fraction of the span covered.
+    min_pos = min(in_ctg_alns.ref_starts)
+    max_pos = max(in_ctg_alns.ref_ends)
+    t = IntervalTree()
+
+    # Put the reference start and end position for every alignment into the tree
+    for i in range(len(in_ctg_alns.ref_headers)):
+        t[in_ctg_alns.ref_starts[i]:in_ctg_alns.ref_ends[i]] = (in_ctg_alns.ref_starts[i], in_ctg_alns.ref_ends[i])
+
+    overlaps = t[min_pos:max_pos]
+    if not overlaps:
+        return 0
+
+    # If any intervals fall beyond the boundaries, replace the start/end with the boundary it exceeds
+    ovlp_list = [i.data for i in overlaps]
+    bounded_list = []
+    for i in ovlp_list:
+        # The intervals are tuples (immutable), so build a new clamped interval
+        bounded_list.append((max(i[0], min_pos), min(i[1], max_pos)))
+
+    # Now can just calculate the total range covered by the intervals
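+    # e.g. bounded intervals (0, 10) and (5, 20) over the span (0, 20) cover
+    # 20 bp (the 5-10 overlap is counted once), for a confidence of 20/20 = 1.0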
+    ovlp_range = 0
+    sorted_intervals = sorted(bounded_list, key=lambda tup: tup[0])
+    max_end = -1
+    for j in sorted_intervals:
+        start_new_terr = max(j[0], max_end)
+        ovlp_range += max(0, j[1] - start_new_terr)
+        max_end = max(max_end, j[1])
+
+    return ovlp_range / (max_pos - min_pos)
+
+
+def order_orient_contigs(in_unique_contigs, in_alns):
+    current_path = os.getcwd()
+    output_path = current_path + '/orderings'
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    # Get longest alignments
+    longest_contigs = dict()
+    for i in in_alns.keys():
+        # Only consider alignments to the assigned chromosome
+        uniq_aln = UniqueContigAlignment(in_alns[i])
+        best_header = uniq_aln.ref_chrom
+        ctg_alns = copy.deepcopy(in_alns[i])
+        ctg_alns.filter_ref_chroms([best_header])
+        longest_contigs[i] = LongestContigAlignment(ctg_alns)
+
+    # Save the orientations
+    final_orientations = dict()
+    for i in longest_contigs.keys():
+        final_orientations[i] = longest_contigs[i].strand
+
+    # Get the location and orientation confidence scores
+    orientation_confidence = dict()
+    location_confidence = dict()
+    forward_bp = 0
+    reverse_bp = 0
+    for i in in_alns.keys():
+        uniq_aln = UniqueContigAlignment(in_alns[i])
+        best_header = uniq_aln.ref_chrom
+        ctg_alns = copy.deepcopy(in_alns[i])
+        ctg_alns.filter_ref_chroms([best_header])
+
+        # Orientation confidence scores
+        # Every base pair votes for the orientation of the alignment in which it belongs
+        # Score is # votes for the assigned orientation over all votes
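+        # e.g. 70 kbp of '+' alignments and 30 kbp of '-' alignments gives a
+        # confidence of 0.7 for a '+' call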
+        for j in range(len(ctg_alns.ref_headers)):
+            if ctg_alns.strands[j] == '+':
+                forward_bp += ctg_alns.aln_lens[j]
+            else:
+                reverse_bp += ctg_alns.aln_lens[j]
+
+        if final_orientations[i] == '+':
+            orientation_confidence[i] = forward_bp / (forward_bp + reverse_bp)
+        else:
+            orientation_confidence[i] = reverse_bp / (forward_bp + reverse_bp)
+
+        forward_bp = 0
+        reverse_bp = 0
+
+        # Location confidence
+        location_confidence[i] = get_location_confidence(ctg_alns)
+
+    all_chroms = set([in_unique_contigs[i].ref_chrom for i in in_unique_contigs.keys()])
+
+    for this_chrom in all_chroms:
+
+        # Initialize the list of start and end positions w.r.t. the reference
+        ref_pos = []
+
+        groupings_file = 'groupings/' + this_chrom + '_contigs.txt'
+        contigs_list = get_contigs_from_groupings(groupings_file)
+
+        for i in range(len(contigs_list)):
+            # There is a scope issue here. Pass this (longest_contigs) to the method explicitly.
+            ref_pos.append((longest_contigs[contigs_list[i]].ref_start, longest_contigs[contigs_list[i]].ref_end, i))
+
+        final_order = [contigs_list[i[2]] for i in sorted(ref_pos)]
+
+        # Get ordering confidence
+        # To do this, get the max and min alignments to this reference chromosome
+        # Then within that region, what percent of bp are covered
+
+        with open('orderings/' + this_chrom + '_orderings.txt', 'w') as out_file:
+            for i in final_order:
+                # Also have a scope issue here.
+                out_file.write(i + '\t' + final_orientations[i] + '\t' + str(location_confidence[i]) + '\t' + str(orientation_confidence[i]) + '\n')
+
+
+def get_orderings(in_orderings_file):
+    all_orderings = []
+    with open(in_orderings_file) as f:
+        for line in f:
+            L1 = line.split('\t')
+            all_orderings.append((L1[0], L1[1].rstrip()))
+    return all_orderings
+
+
+def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size, chr0=True):
+    """
+    Need to make a translation table for easy lift-over.
+    :param in_contigs_file:
+    :param in_unique_contigs:
+    :param gap_size:
+    :param chr0: if True, concatenate unplaced contigs into a single Chr0 pseudomolecule;
+                 otherwise, write them out individually
+    :return:
+    """
+    # First, read all of the contigs into memory
+    remaining_contig_headers = []
+    all_seqs = OrderedDict()
+    x = SeqReader('../' + in_contigs_file)
+    if in_contigs_file.endswith(".gz"):
+        for header, seq in x.parse_gzip_fasta():
+            remaining_contig_headers.append(header.split(' ')[0])
+            all_seqs[header.split(' ')[0]] = seq
+    else:
+        for header, seq in x.parse_fasta():
+            remaining_contig_headers.append(header.split(' ')[0])
+            all_seqs[header.split(' ')[0]] = seq
+
+    # Get all reference chromosomes
+    all_chroms = sorted(list(set([in_unique_contigs[i].ref_chrom for i in in_unique_contigs.keys()])))
+
+    # Iterate through each orderings file and store sequence in a dictionary
+    all_pms = dict()
+    pad = 'N' * gap_size
+    for this_chrom in all_chroms:
+        orderings_file = 'orderings/' + this_chrom + '_orderings.txt'
+        orderings = get_orderings(orderings_file)
+        if orderings:
+            seq_list = []
+            for line in orderings:
+                # Mark that we have seen this contig
+                remaining_contig_headers.remove('>' + line[0])
+                if line[1] == '+':
+                    seq_list.append(all_seqs['>' + line[0]])
+                else:
+                    assert line[1] == '-'
+                    seq_list.append(reverse_complement(all_seqs['>' + line[0]]))
+            all_pms[this_chrom] = pad.join(seq_list)
+            all_pms[this_chrom] += '\n'
+
+    # Get unincorporated sequences and place them in Chr0
+    if remaining_contig_headers:
+        if chr0:
+            chr0_headers = []
+            chr0_seq_list = []
+            for header in remaining_contig_headers:
+                chr0_headers.append(header)
+                chr0_seq_list.append(all_seqs[header])
+            all_pms['Chr0'] = pad.join(chr0_seq_list)
+            all_pms['Chr0'] += '\n'
+
+            # Write out the list of chr0 headers
+            f_chr0_g = open('groupings/Chr0_contigs.txt', 'w')
+            f_chr0_o = open('orderings/Chr0_orderings.txt', 'w')
+            for i in chr0_headers:
+                f_chr0_g.write(i[1:] + "\t" + "0" + '\n')
+                f_chr0_o.write(i[1:] + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n')
+            f_chr0_g.close()
+            f_chr0_o.close()
+        else:
+            # Instead of making a chromosome 0, add the unplaced sequences as is.
+            for header in remaining_contig_headers:
+                all_pms[header[1:]] = all_seqs[header] + "\n"
+                f_chr0_g = open('groupings/' + header[1:] + '_contigs.txt', 'w')
+                f_chr0_o = open('orderings/' + header[1:] + '_orderings.txt', 'w')
+                f_chr0_g.write(header[1:] + "\t" + "0" + '\n')
+                f_chr0_o.write(header[1:] + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n')
+                f_chr0_g.close()
+                f_chr0_o.close()
+
+    # Write the final sequences out to a file
+    with open('ragoo.fasta', 'w') as f:
+        for out_header in all_pms:
+            f.write(">" + out_header + "_RaGOO\n")
+            f.write(all_pms[out_header])
+
+
+def write_broken_files(in_contigs, in_contigs_name, in_gff=None, in_gff_name=None):
+    current_path = os.getcwd()
+    output_path = current_path + '/chimera_break'
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    os.chdir('chimera_break')
+    if in_gff and in_gff_name:
+        with open(in_gff_name, 'w') as f:
+            for i in in_gff.keys():
+                for j in in_gff[i]:
+                    f.write(str(j) + '\n')
+
+    with open(in_contigs_name, 'w') as f:
+        for i in in_contigs.keys():
+            f.write('>' + i + '\n')
+            f.write(in_contigs[i] + '\n')
+
+    os.chdir(current_path)
+
+
+def align_breaks(break_type, m_path, in_reference_file, in_contigs_file, in_num_threads):
+    current_path = os.getcwd()
+    os.chdir('chimera_break')
+    if break_type == 'inter':
+        cmd = '{} -k19 -w19 -t{} ../../{} {} ' \
+          '> inter_contigs_against_ref.paf 2> inter_contigs_against_ref.paf.log'.format(m_path, in_num_threads, in_reference_file, in_contigs_file)
+        if not os.path.isfile('inter_contigs_against_ref.paf'):
+            run(cmd)
+    else:
+        cmd = '{} -k19 -w19 -t{} ../../{} {} ' \
+              '> intra_contigs_against_ref.paf 2> intra_contigs_against_ref.paf.log'.format(m_path, in_num_threads, in_reference_file, in_contigs_file)
+        if not os.path.isfile('intra_contigs_against_ref.paf'):
+            run(cmd)
+
+    os.chdir(current_path)
+
+
+def align_pms(m_path, num_threads, in_reference_file):
+    current_path = os.getcwd()
+    output_path = current_path + '/pm_alignments'
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    os.chdir('pm_alignments')
+
+    cmd = '{} -ax asm5 --cs -t{} ../../{} {} ' \
+          '> pm_against_ref.sam 2> pm_contigs_against_ref.sam.log'.format(m_path, num_threads,
+                                                                                        in_reference_file, '../ragoo.fasta')
+    if not os.path.isfile('pm_against_ref.sam'):
+        run(cmd)
+
+    os.chdir(current_path)
+
+
+def get_SVs(sv_min, sv_max, in_ref_file):
+    current_path = os.getcwd()
+    os.chdir('pm_alignments')
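+    # Pipeline sketch: convert the SAM to a nucmer-style delta, apply
+    # Assemblytics unique anchor filtering, call SVs between and within
+    # alignments, then filter out SVs caused by gap padding.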
+    # Change this when setup.py is ready. Just call script directly
+    cmd = 'sam2delta.py pm_against_ref.sam'
+    if not os.path.isfile('pm_against_ref.sam.delta'):
+        run(cmd)
+
+    cmd_2 = 'Assemblytics_uniq_anchor.py --delta pm_against_ref.sam.delta --unique-length 10000 --out assemblytics_out --keep-small-uniques'
+    if not os.path.isfile('assemblytics_out.Assemblytics.unique_length_filtered_l10000.delta'):
+        run(cmd_2)
+
+    cmd_3 = 'Assemblytics_between_alignments.pl assemblytics_out.coords.tab %r %r all-chromosomes exclude-longrange bed > assemblytics_out.variants_between_alignments.bed' %(sv_min, sv_max)
+    if not os.path.isfile('assemblytics_out.variants_between_alignments.bed'):
+        run(cmd_3)
+
+    cmd_4 = 'Assemblytics_within_alignment.py --delta assemblytics_out.Assemblytics.unique_length_filtered_l10000.delta --min %r > assemblytics_out.variants_within_alignments.bed' %(sv_min)
+    if not os.path.isfile('assemblytics_out.variants_within_alignments.bed'):
+        run(cmd_4)
+
+    header = "reference\tref_start\tref_stop\tID\tsize\tstrand\ttype\tref_gap_size\tquery_gap_size\tquery_coordinates\tmethod\n"
+
+    with open('assemblytics_out.variants_between_alignments.bed', 'r') as f1:
+        b1 = f1.read()
+
+    with open('assemblytics_out.variants_within_alignments.bed', 'r') as f2:
+        b2 = f2.read()
+
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as f:
+        f.write(header)
+        # Might need to add newlines here
+        f.write(b1)
+        f.write(b2)
+
+    # Filter out SVs caused by gaps
+    cmd_5 = 'filter_gap_SVs.py ../../%s' %(in_ref_file)
+    run(cmd_5)
+
+    os.chdir(current_path)
+
+
+def align_reads(m_path, num_threads, in_ctg_file, reads, tech='ont'):
+    current_path = os.getcwd()
+    output_path = current_path + '/ctg_alignments'
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    os.chdir('ctg_alignments')
+
+    if tech == 'sr':
+        cmd = '{} -x sr -t{} ../../{} ../../{} ' \
+              '> reads_against_ctg.paf 2> reads_against_ctg.paf.log'.format(m_path, num_threads, in_ctg_file, reads)
+    elif tech == 'corr':
+        cmd = '{} -x asm10 -t{} ../../{} ../../{} ' \
+              '> reads_against_ctg.paf 2> reads_against_ctg.paf.log'.format(m_path, num_threads, in_ctg_file, reads)
+    else:
+        raise ValueError("Only 'sr' or 'corr' are accepted for read type.")
+
+    if not os.path.isfile('reads_against_ctg.paf'):
+        run(cmd)
+
+    os.chdir(current_path)
+
+
+if __name__ == "__main__":
+    import os
+    import argparse
+
+    parser = argparse.ArgumentParser(description='order and orient contigs according to minimap2 alignments to a reference (v1.1)')
+    parser.add_argument("contigs", metavar="<contigs.fasta>", type=str, help="fasta file with contigs to be ordered and oriented (gzipped allowed)")
+    parser.add_argument("reference", metavar="<reference.fasta>", type=str, help="reference fasta file (gzipped allowed)")
+    #parser.add_argument("-o", metavar="PATH", type=str, default="ragoo_output", help="output directory name")
+    parser.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="single column text file of reference headers to ignore")
+    parser.add_argument("-gff", metavar="<annotations.gff>", type=str, default='', help="lift-over gff features to chimera-broken contigs")
+    parser.add_argument("-m", metavar="PATH", type=str, default="minimap2", help='path to minimap2 executable')
+    parser.add_argument("-b", action='store_true', default=False, help="Break chimeric contigs")
+    parser.add_argument("-R", metavar="<reads.fasta>", type=str, default="", help="Turns on misassembly correction. Align provided reads to the contigs to aid misassembly correction. fastq or fasta allowed. Gzipped files allowed. Turns off '-b'.")
+    parser.add_argument("-T", metavar="sr", type=str, default="", help="Type of reads provided by '-R'. 'sr' and 'corr' accepted for short reads and error corrected long reads respectively.")
+    parser.add_argument("-p", metavar="5", type=int, default=5, help=argparse.SUPPRESS)
+    parser.add_argument("-l", metavar="10000", type=int, default=10000, help=argparse.SUPPRESS)
+    parser.add_argument("-r", metavar="100000", type=int, default=100000, help="(with -b) this many bp of >1 reference sequence must be covered for a contig to be considered an interchromosomal chimera.")
+    parser.add_argument("-c", metavar="1000000", type=int, default=1000000, help="(with -b) distance threshold between consecutive alignments with respect to the contig.")
+    parser.add_argument("-d", metavar="2000000", type=int, default=2000000, help="(with -b) distance threshold between consecutive alignments with respect to the reference.")
+    parser.add_argument("-t", metavar="3", type=int, default=3, help="Number of threads when running minimap.")
+    parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules.")
+    parser.add_argument("-s", action='store_true', default=False, help="Call structural variants")
+    parser.add_argument("-a", metavar="50", type=int, default=50, help=argparse.SUPPRESS)
+    parser.add_argument("-f", metavar="10000", type=int, default=10000, help=argparse.SUPPRESS)
+    parser.add_argument("-i", metavar="0.2", type=float, default=0.2, help="Minimum grouping confidence score needed to be localized.")
+    parser.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="List of contigs to automatically put in chr0.")
+    parser.add_argument("-C", action='store_true', default=False, help="Write unplaced contigs individually instead of making a chr0")
+
+    # Get the command line arguments
+    args = parser.parse_args()
+    contigs_file = args.contigs
+    reference_file = args.reference
+    #output_path = args.o
+    exclude_file = args.e
+    minimap_path = args.m
+    break_chimeras = args.b
+    gff_file = args.gff
+    min_break_pct = args.p
+    min_len = args.l
+    min_range = args.r
+    intra_wrt_ref_min = args.d
+    intra_wrt_ctg_min = args.c
+    t = args.t
+    g = args.g
+    call_svs = args.s
+    min_assemblytics = args.a
+    max_assemblytics = args.f
+    group_score_thresh = args.i
+    skip_file = args.j
+    corr_reads = args.R
+    corr_reads_tech = args.T
+    make_chr0 = not args.C
+
+    if corr_reads:
+        log("Misassembly correction has been turned on. This automatically inactivates chimeric contig correction.")
+        break_chimeras = False
+
+    # Make sure that if -R, -T has been specified
+    if corr_reads and not corr_reads_tech:
+        raise ValueError("'-T' must be provided when using -R.")
+
+    skip_ctg = []
+    if skip_file:
+        with open(skip_file) as f:
+            for line in f:
+                skip_ctg.append(line.rstrip())
+
+    current_path = os.getcwd()
+    output_path = current_path + '/ragoo_output'
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    os.chdir(output_path)
+
+    # Run minimap2
+    cmd = '{} -k19 -w19 -t{} ../{} ../{} ' \
+          '> contigs_against_ref.paf 2> contigs_against_ref.paf.log'.format(minimap_path, t, reference_file, contigs_file)
+
+    if not os.path.isfile('contigs_against_ref.paf'):
+        run(cmd)
+
+    # Read in the minimap2 alignments just generated
+    log('Reading alignments')
+    alns = read_paf_alignments('contigs_against_ref.paf')
+    alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file)
+
+    # Process the gff file
+    if gff_file:
+        log('Getting gff features')
+        features = defaultdict(list)
+        z = GFFReader('../' + gff_file)
+        for i in z.parse_gff():
+            features[i.seqname].append(i)
+
+    # Break chimeras if desired
+    if break_chimeras:
+        # Record how many contigs are broken
+        total_inter_broken = 0
+        total_intra_broken = 0
+
+        alns = clean_alignments(alns, l=10000, in_exclude_file=exclude_file, uniq_anchor_filter=True)
+        # Process contigs
+        log('Getting contigs')
+        if contigs_file.endswith(".gz"):
+            contigs_dict = read_gz_contigs('../' + contigs_file)
+        else:
+            contigs_dict = read_contigs('../' + contigs_file)
+
+        log('Finding interchromosomally chimeric contigs')
+        all_chimeras = dict()
+        for i in alns.keys():
+            ref_parts = get_ref_parts(alns[i], min_len, min_break_pct, min_range)
+            if len(ref_parts) > 1:
+                all_chimeras[i] = ref_parts
+
+        log('Finding break points and breaking interchromosomally chimeric contigs')
+        break_intervals = dict()
+        for i in all_chimeras.keys():
+            break_intervals[i] = cluster_contig_alns(i, alns, all_chimeras[i], min_len)
+
+            # If it's just going to break it into the same thing, skip it.
+            if len(break_intervals[i]) <= 1:
+                continue
+
+            if gff_file:
+                # If desired, ensure that breakpoints don't disrupt any gff intervals
+                break_intervals[i] = avoid_gff_intervals(break_intervals[i], features[i])
+                features = update_gff(features, break_intervals[i], i)
+
+            # Break contigs according to the final break points
+            contigs_dict = break_contig(contigs_dict, i, break_intervals[i])
+            total_inter_broken += 1
+
+        # Next, need to re-align before finding intrachromosomal chimeras
+        # First, write out the interchromosomal chimera broken fasta
+        out_inter_fasta = contigs_file[:contigs_file.rfind('.')] + '.inter.chimera.broken.fa'
+        if gff_file:
+            out_gff = gff_file[:gff_file.rfind('.')] + '.inter.chimera_broken.gff'
+            write_broken_files(contigs_dict, out_inter_fasta, features, out_gff)
+        else:
+            write_broken_files(contigs_dict, out_inter_fasta)
+
+        # Next, realign the chimera broken contigs
+        align_breaks('inter', minimap_path, reference_file, out_inter_fasta, t)
+
+        # Now, use those new alignments for intrachromosomal chimeras
+        log('Reading interchromosomal chimera broken alignments')
+        inter_alns = read_paf_alignments('chimera_break/inter_contigs_against_ref.paf')
+        inter_alns = clean_alignments(inter_alns, l=1000, in_exclude_file=exclude_file)
+
+        log('Finding intrachromosomally chimeric contigs')
+        # Find intrachromosomally chimeric contigs
+        for i in inter_alns.keys():
+            intra = get_intra_contigs(inter_alns[i], 15000, intra_wrt_ref_min, intra_wrt_ctg_min)
+            if intra:
+                if gff_file:
+                    intra_break_intervals = avoid_gff_intervals(intra[1], features[intra[0]])
+                else:
+                    intra_break_intervals = intra[1]
+                # Check if the avoidance of gff intervals pushed the break point to the end of the contig.
+                if intra_break_intervals[-1][0] == intra_break_intervals[-1][1]:
+                    continue
+
+                # break the contigs and update features if desired
+                contigs_dict = break_contig(contigs_dict, intra[0], intra_break_intervals)
+                total_intra_broken += 1
+
+                if gff_file:
+                    features = update_gff(features, intra_break_intervals, intra[0])
+
+        # Write out the intrachromosomal information
+        out_intra_fasta = contigs_file[:contigs_file.rfind('.')] + '.intra.chimera.broken.fa'
+        if gff_file:
+            out_intra_gff = gff_file[:gff_file.rfind('.')] + '.intra.chimera_broken.gff'
+            write_broken_files(contigs_dict, out_intra_fasta, features, out_intra_gff)
+        else:
+            write_broken_files(contigs_dict, out_intra_fasta)
+
+        # Re align the contigs
+        # Next, realign the chimera broken contigs
+        align_breaks('intra', minimap_path, reference_file, out_intra_fasta, t)
+
+        # Read in alignments of intrachromosomal chimeras and proceed with ordering and orientation
+        log('Reading intrachromosomal chimera broken alignments')
+        alns = read_paf_alignments('chimera_break/intra_contigs_against_ref.paf')
+        alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file)
+        contigs_file = '/ragoo_output/chimera_break/' + out_intra_fasta
+        log('The total number of interchromosomally chimeric contigs broken is %r' % total_inter_broken)
+        log('The total number of intrachromosomally chimeric contigs broken is %r' % total_intra_broken)
+
+    # Check if misassembly correction is turned on. This is mutually exclusive with chimeric contig correction
+    if corr_reads:
+        # Align the raw reads to the assembly.
+        log('Aligning raw reads to contigs')
+        align_reads(minimap_path, t, contigs_file, corr_reads, corr_reads_tech)
+        log('Computing contig coverage')
+        cov_map = ReadCoverage('ctg_alignments/reads_against_ctg.paf')
+        alns = clean_alignments(alns, l=10000, in_exclude_file=exclude_file, uniq_anchor_filter=True, merge=True)
+
+        # Get the initial candidate break points.
+        candidate_breaks = dict()
+        for i in alns:
+            candidates = alns[i].get_break_candidates()
+            if candidates:
+                candidate_breaks[i] = candidates
+
+        # Validate each breakpoint by checking for excessively high or low coverage
+        # Also, if a gff is provided, check to ensure that we don't break within a gff feature interval
+        val_candidate_breaks = dict()
+        for i in candidate_breaks:
+            candidates = cov_map.check_break_cov(i, candidate_breaks[i])
+            if gff_file:
+                candidates = remove_gff_breaks(features[i], candidates)
+            if candidates:
+                val_candidate_breaks[i] = list(set(candidates))
+                if gff_file:
+                    features = update_misasm_features(features, val_candidate_breaks[i], i, cov_map.ctg_lens[i])
+
+        # Break the contigs
+        if gff_file:
+            out_misasm_gff = gff_file[:gff_file.rfind('.')] + '.misasm.broken.gff'
+            write_misasm_broken_ctgs(contigs_file, val_candidate_breaks, contigs_file[:contigs_file.rfind('.')], in_gff=features, in_gff_name=out_misasm_gff)
+        else:
+            write_misasm_broken_ctgs(contigs_file, val_candidate_breaks, contigs_file[:contigs_file.rfind('.')])
+
+        # Align the broken contigs back to the reference
+        align_misasm_broken(contigs_file[:contigs_file.rfind('.')])
+        alns = read_paf_alignments('ctg_alignments/contigs_brk_against_ref.paf')
+        alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file)
+        contigs_file = '/ragoo_output/ctg_alignments/' + contigs_file[:contigs_file.rfind('.')] + ".misasm.break.fa"
+
+    # Assign each contig to a corresponding reference chromosome.
+    log('Assigning contigs')
+    all_unique_contigs = dict()
+    for i in alns.keys():
+        all_unique_contigs[i] = UniqueContigAlignment(alns[i])
+
+    # Add to this the list of headers that did not make it
+    write_contig_clusters(all_unique_contigs, group_score_thresh, skip_ctg)
+
+    log('Ordering and orienting contigs')
+    order_orient_contigs(all_unique_contigs, alns)
+
+    log('Creating pseudomolecules')
+    create_pseudomolecules(contigs_file, all_unique_contigs, g, make_chr0)
+
+    if call_svs:
+        log('Aligning pseudomolecules to reference')
+        align_pms(minimap_path, t, reference_file)
+
+        log('Getting structural variants')
+        get_SVs(min_assemblytics, max_assemblytics, reference_file)
+
+    log('goodbye')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/ContigAlignment.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,364 @@
+import operator
+from collections import defaultdict
+
+from ragoo_utilities.utilities import summarize_planesweep, binary_search
+
+
+class ContigAlignment:
+    """
+    This object will represent a contig's alignment to a reference chromosome.
+    Specifically, this class will organize alignments in a one-to-many fashion, where
+    we will only have one object per contig, and each object will store all of the alignments
+    of that contig.
+    """
+
+    def __init__(self, in_contig):
+        """
+        Each object will refer to a single contig via its header name.
+
+        All other alignment metrics will be stored in lists, where the list offsets
+        correspond to the alignment number.
+
+        Don't actually add any alignments through object instantiation. Only use the add_alignment method.
+        """
+        self.contig = in_contig
+
+        self.query_lens = []
+        self.query_starts = []
+        self.query_ends = []
+        self.strands = []
+        self.ref_headers = []
+        self.ref_lens = []
+        self.ref_starts = []
+        self.ref_ends = []
+        self.num_matches = []
+        self.aln_lens = []
+        self.mapqs = []
+
+        self._is_unique_anchor_filtered = False
+        self._is_merged = False
+
+    def __repr__(self):
+        return '<ContigAlignment:' + self.contig + '>'
+
+    def __str__(self):
+        """ Return the alignments in sorted PAF format. """
+        self.sort_by_query()
+        all_alns = []
+        for i in range(len(self.ref_headers)):
+            all_alns.append(
+                "\t".join([
+                    self.contig,
+                    str(self.query_lens[i]),
+                    str(self.query_starts[i]),
+                    str(self.query_ends[i]),
+                    self.strands[i],
+                    self.ref_headers[i],
+                    str(self.ref_lens[i]),
+                    str(self.ref_starts[i]),
+                    str(self.ref_ends[i]),
+                    str(self.num_matches[i]),
+                    str(self.aln_lens[i]),
+                    str(self.mapqs[i])
+                ])
+            )
+        return "\n".join(all_alns)
+
+    def get_attr_lens(self):
+        all_lens = [
+            len(self.query_lens),
+            len(self.query_starts),
+            len(self.query_ends),
+            len(self.strands),
+            len(self.ref_headers),
+            len(self.ref_lens),
+            len(self.ref_starts),
+            len(self.ref_ends),
+            len(self.num_matches),
+            len(self.aln_lens),
+            len(self.mapqs)
+        ]
+        return all_lens
+
+    def add_alignment(self, paf_line):
+        """
+        The only way to add alignments for this contig. Only accepts a PAFLine object, which represents
+        a single line of a PAF file.
+        """
+        if not paf_line.contig == self.contig:
+            raise ValueError('Can only add alignments with query header %s' % self.contig)
+
+        self.query_lens.append(paf_line.query_len)
+        self.query_starts.append(paf_line.query_start)
+        self.query_ends.append(paf_line.query_end)
+        self.strands.append(paf_line.strand)
+        self.ref_headers.append(paf_line.ref_header)
+        self.ref_lens.append(paf_line.ref_len)
+        self.ref_starts.append(paf_line.ref_start)
+        self.ref_ends.append(paf_line.ref_end)
+        self.num_matches.append(paf_line.num_match)
+        self.aln_lens.append(paf_line.aln_len)
+        self.mapqs.append(paf_line.mapq)
+
+        # Check that all attribute lists have same length
+        all_lens = self.get_attr_lens()
+        assert len(set(all_lens)) == 1
+
+    def has_unique_chr_match(self):
+        return len(set(self.ref_headers)) == 1
+
+    def count_chr_matches(self):
+        s1 = set(self.ref_headers)
+        return len(s1)
+
+    def rearrange_alns(self, hits):
+        """ Generally re structure the alignments according to 'hits', an ordered list of indices. """
+        self.query_lens = [self.query_lens[i] for i in hits]
+        self.query_starts = [self.query_starts[i] for i in hits]
+        self.query_ends = [self.query_ends[i] for i in hits]
+        self.strands = [self.strands[i] for i in hits]
+        self.ref_headers = [self.ref_headers[i] for i in hits]
+        self.ref_lens = [self.ref_lens[i] for i in hits]
+        self.ref_starts = [self.ref_starts[i] for i in hits]
+        self.ref_ends = [self.ref_ends[i] for i in hits]
+        self.num_matches = [self.num_matches[i] for i in hits]
+        self.aln_lens = [self.aln_lens[i] for i in hits]
+        self.mapqs = [self.mapqs[i] for i in hits]
+
+    def filter_ref_chroms(self, in_chroms):
+        """
+        Given a list of chromosomes, mutate this object so as to only include alignments to those chromosomes.
+        """
+        hits = []
+        for i in range(len(self.ref_headers)):
+            if self.ref_headers[i] in in_chroms:
+                hits.append(i)
+
+        self.rearrange_alns(hits)
+
+    def sort_by_ref(self):
+        ref_pos = []
+        for i in range(len(self.ref_headers)):
+            ref_pos.append((self.ref_headers[i], self.ref_starts[i], self.ref_ends[i], i))
+        hits = [i[3] for i in sorted(ref_pos)]
+
+        self.rearrange_alns(hits)
+
+    def sort_by_query(self):
+        q_pos = []
+        for i in range(len(self.ref_headers)):
+            q_pos.append((self.query_starts[i], self.query_ends[i], i))
+        hits = [i[2] for i in sorted(q_pos)]
+
+        self.rearrange_alns(hits)
+
+    def exclude_ref_chroms(self, exclude_list):
+        hits = []
+        for i in range(len(self.ref_headers)):
+            if self.ref_headers[i] not in exclude_list:
+                hits.append(i)
+
+        self.rearrange_alns(hits)
+
+    def filter_lengths(self, l):
+        hits = []
+        for i in range(len(self.ref_headers)):
+            if self.aln_lens[i] > l:
+                hits.append(i)
+
+        self.rearrange_alns(hits)
+
+    def unique_anchor_filter(self):
+        """
+        The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
+        written by Maria Nattestad. The original script can be found here:
+
+        https://github.com/MariaNattestad/Assemblytics
+
+        And the publication associated with Maria's work is here:
+
+        Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
+        web analytics tool for the detection of variants from an
+        assembly." Bioinformatics 32.19 (2016): 3021-3023.
+        """
+
+        if not self._is_unique_anchor_filtered:
+            lines_by_query = []
+            for i, j in zip(self.query_starts, self.query_ends):
+                lines_by_query.append((i, j))
+
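+            # summarize_planesweep keeps alignments anchored by at least 10 kbp
+            # of query sequence not covered by any other alignment (the
+            # Assemblytics unique anchor criterion)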
+            hits = summarize_planesweep(lines_by_query, 10000)
+            self.rearrange_alns(hits)
+            self._is_unique_anchor_filtered = True
+
+    def merge_alns(self, merge_dist=100000):
+        """
+        Merge chains of alignments that are to the same target, have the same orientation, and are less than
+        merge_dist away from each other.
+        :param merge_dist:
+        :return:
+        """
+        # Sort the alignments
+        self.sort_by_ref()
+
+        # Might also want to filter out low identity alignments
+
+        # Keep track of which alignments we are comparing
+        i = 0
+        j = 1
+        while j < len(self.ref_headers):
+            if all([
+                self.ref_headers[i] == self.ref_headers[j],
+                self.strands[i] == self.strands[j],
+                self.ref_starts[j] - self.ref_ends[i] <= merge_dist
+            ]):
+                # Merge the alignments in place of the first alignment
+                self.ref_starts[i] = min(self.ref_starts[i], self.ref_starts[j])
+                self.ref_ends[i] = max(self.ref_ends[i], self.ref_ends[j])
+                self.query_starts[i] = min(self.query_starts[i], self.query_starts[j])
+                self.query_ends[i] = max(self.query_ends[i], self.query_ends[j])
+
+                self.num_matches[i] += self.num_matches[j]
+                self.aln_lens[i] = self.ref_ends[i] - self.ref_starts[i]
+                self.mapqs[i] = (self.mapqs[i] + self.mapqs[j])//2
+
+                # Remove the redundant alignment
+                self.query_lens.pop(j)
+                self.query_starts.pop(j)
+                self.query_ends.pop(j)
+                self.strands.pop(j)
+                self.ref_headers.pop(j)
+                self.ref_lens.pop(j)
+                self.ref_starts.pop(j)
+                self.ref_ends.pop(j)
+                self.num_matches.pop(j)
+                self.aln_lens.pop(j)
+                self.mapqs.pop(j)
+            else:
+                i += 1
+                j += 1
+
+        # remove contained alignments. contained w.r.t the contig
+        cont_inds = []
+        for i in range(len(self.ref_headers)):
+            for j in range(len(self.ref_headers)):
+                # Check if j is contained by i
+                if i == j:
+                    continue
+                if self.query_starts[i] <= self.query_starts[j] and self.query_ends[i] >= self.query_ends[j]:
+                    cont_inds.append(j)
+
+        hits = [i for i in range(len(self.ref_headers)) if i not in cont_inds]
+        self.rearrange_alns(hits)
+
+        self._is_merged = True
+
+    def get_break_candidates(self):
+        if not self._is_merged:
+            raise ValueError("Alignments must first be merged.")
+
+        self.sort_by_query()
+        all_candidates = []
+        for i in range(1, len(self.ref_headers)):
+            all_candidates.append(min(self.query_ends[i-1], self.query_starts[i]))
+            all_candidates.append(max(self.query_ends[i-1], self.query_starts[i]))
+
+        return all_candidates
+
+
+
+class UniqueContigAlignment:
+    """
+    A class representing the reference chromosome to which a particular contig will be assigned.
+    At first, the incoming alignments may be to multiple chromosomes, but this class will assign a single
+    reference chromosome to an input contig.
+    """
+
+    def __init__(self, in_contig_aln):
+        if not isinstance(in_contig_aln, ContigAlignment):
+            message = 'Can only make a unique alignment from a ContigAlignment object. Got type %s instead.' % str(type(in_contig_aln))
+            raise ValueError(message)
+
+        self.contig = in_contig_aln.contig
+
+        # Find the chromosome which was covered the most in these alignments.
+        self.ref_chrom = None
+        self.confidence = 0.0
+        self._get_best_ref_cov(in_contig_aln)
+
+    def __str__(self):
+        return '<UniqueContigAlignment:' + self.contig + ' - ' + self.ref_chrom + '>'
+
+    def _get_best_ref_cov(self, alns):
+        # Get the list of all reference chromosomes
+        all_chroms = set(alns.ref_headers)
+        if len(all_chroms) == 1:
+            self.ref_chrom = alns.ref_headers[0]
+            self.confidence = 1
+            return
+
+        # Initialize coverage counts for each chromosome
+        ranges = dict()
+        for i in all_chroms:
+            ranges[i] = 0
+
+        # Get all the ranges in reference to all chromosomes
+        all_intervals = defaultdict(list)
+        for i in range(len(alns.ref_headers)):
+            this_range = (alns.ref_starts[i], alns.ref_ends[i])
+            this_chrom = alns.ref_headers[i]
+            all_intervals[this_chrom].append(this_range)
+
+        # For each chromosome, sort the range and get the union interval length.
+        # Algorithm and pseudocode given by Michael Kirsche
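+        # Sweep left to right, counting only territory not already covered by an
+        # earlier interval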
+        for i in all_intervals.keys():
+            sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0])
+            max_end = -1
+            for j in sorted_intervals:
+                start_new_terr = max(j[0], max_end)
+                ranges[i] += max(0, j[1] - start_new_terr)
+                max_end = max(max_end, j[1])
+
+        assert ranges
+
+        # I convert to a list and sort the ranges.items() in order to have ties broken in a deterministic way.
+        max_chrom = max(sorted(list(ranges.items())), key=operator.itemgetter(1))[0]
+        self.ref_chrom = max_chrom
+
+        # Now get the confidence of this chromosome assignment
+        # Equal to the union coverage of the assigned chromosome over the total union coverage of all chromosomes
+        self.confidence = ranges[max_chrom]/sum(ranges.values())
+        assert self.confidence >= 0
+        assert self.confidence <= 1
+
+
+class LongestContigAlignment:
+    """
+    Given a ContigAlignment object, find the longest alignment with respect to a given reference chromosome.
+
+    1. Filter out any alignments that are not aligned to specified chromosome.
+    2. Find longest alignment remaining.
+    """
+
+    def __init__(self, in_contig_aln):
+        self.best_index = self._get_best_index(in_contig_aln)
+        self.contig = in_contig_aln.contig
+        self.ref_start = in_contig_aln.ref_starts[self.best_index]
+        self.ref_end = in_contig_aln.ref_ends[self.best_index]
+        self.aln_len = in_contig_aln.aln_lens[self.best_index]
+        self.strand = in_contig_aln.strands[self.best_index]
+        self.interval = (self.ref_start, self.ref_end)
+
+    def _get_best_index(self, aln):
+        max_index = -1
+        max_len = -1
+        for i in range(len(aln.aln_lens)):
+            if aln.aln_lens[i] > max_len:
+                max_len = aln.aln_lens[i]
+                max_index = i
+        return max_index
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/GFFReader.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+
+class GFFLine:
+
+    def __init__(self, in_fields):
+        self.seqname = in_fields[0]
+        self.source = in_fields[1]
+        self.feature = in_fields[2]
+        self.start = int(in_fields[3])
+        self.end = int(in_fields[4])
+        self.score = in_fields[5]
+        self.strand = in_fields[6]
+        self.frame = in_fields[7]
+        self.attribute = in_fields[8]
+
+    def __str__(self):
+        all_atts = [
+            self.seqname,
+            self.source,
+            self.feature,
+            self.start,
+            self.end,
+            self.score,
+            self.strand,
+            self.frame,
+            self.attribute
+        ]
+        all_atts = [str(i) for i in all_atts]
+        return '\t'.join(all_atts)
+
+
+class GFFReader:
+
+    def __init__(self, in_gff_file):
+        self.gff_file = in_gff_file
+
+    def parse_gff(self):
+        with open(self.gff_file) as f:
+            for line in f:
+                if not line.startswith('#'):
+                    L1 = line.rstrip().split('\t')
+                    yield GFFLine(L1)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/PAFReader.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,40 @@
+class PAFLine:
+    """ Object to represent a single alignment in a minimap PAF file. """
+
+    def __init__(self, in_line):
+        """
+        start positions should be before end positions for both query and target
+        """
+        self.line = in_line.rstrip().split('\t')
+        self.contig = self.line[0]
+        self.query_len = int(self.line[1])
+        self.query_start = int(self.line[2])
+        self.query_end = int(self.line[3])
+        self.strand = self.line[4]
+        self.ref_header = self.line[5]
+        self.ref_len = int(self.line[6])
+        self.ref_start = int(self.line[7])
+        self.ref_end = int(self.line[8])
+        self.num_match = int(self.line[9])
+        self.aln_len = int(self.line[10])
+        self.mapq = int(self.line[11])
+
+        assert self.query_start <= self.query_end
+        assert self.ref_start <= self.ref_end
+
+    def __str__(self):
+        return '\t'.join(self.line)
+
+    def __eq__(self, other):
+        return self.line == other.line
+
+
+class PAFReader:
+
+    def __init__(self, paf_file):
+        self.paf_file = paf_file
+
+    def parse_paf(self):
+        with open(self.paf_file) as f:
+            for line in f:
+                yield PAFLine(line)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/ReadCoverage.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,212 @@
+import bisect
+
+import numpy as np
+
+
+class ReadCoverage:
+
+    def __init__(self, in_paf):
+        self.paf = in_paf
+        self.glob_mean = None
+        self.glob_std = None
+        self.coverage_map = dict()
+        self.ctg_lens = dict()
+        self._make_coverage_map()
+        self._get_glob_mean()
+
+    @staticmethod
+    def _tabulate_coverage(cov_list):
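+        # Plane sweep over (read header, position) events sorted by position:
+        # the first occurrence of a header opens its alignment (coverage +1),
+        # the second occurrence closes it (coverage -1)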
+        current_coverage = 0
+        coverage_list = []
+        seen = set()
+
+        for header, pos in cov_list:
+            if header in seen:
+                current_coverage -= 1
+                coverage_list.append((pos, current_coverage))
+                seen.remove(header)
+            else:
+                current_coverage += 1
+                coverage_list.append((pos, current_coverage))
+                seen.add(header)
+        return coverage_list
+
+    @staticmethod
+    def _smooth_supported_breaks(sup_breaks, break_types):
+        """
+        If there are multiple low coverage breaks in close proximity,
+        merge them into one break at the lowest coverage point.
+        """
+        i = 0
+        j = 1
+        while j < len(sup_breaks):
+            if break_types[i] == 'l' and break_types[j] == 'l':
+                if abs(sup_breaks[i][0] - sup_breaks[j][0]) < 100000:
+                    # Merge these two break points
+                    sup_breaks[i] = min([sup_breaks[i], sup_breaks[j]], key=lambda x: x[1])
+                    sup_breaks.pop(j)
+                else:
+                    i += 1
+                    j += 1
+            else:
+                i += 1
+                j += 1
+
+        return [z[0] for z in sup_breaks]
+
+    def _trim_ends(self, dist=25000):
+        """ Remove the ends of the contigs from the coverage map. """
+        for i in self.coverage_map:
+            # Start with the beginning of the contig
+            start_idx = 0
+            end_idx = 0
+            for j in range(len(self.coverage_map[i])):
+                if self.coverage_map[i][j][0] < dist:
+                    start_idx = j
+
+                if self.coverage_map[i][j][0] > self.ctg_lens[i] - dist:
+                    end_idx = j-1
+                    break
+            self.coverage_map[i] = self.coverage_map[i][start_idx:end_idx]
+
+        # Remove contigs which don't have coverage info.
+        header_keys = list(self.coverage_map.keys())
+        for i in header_keys:
+            if not self.coverage_map[i]:
+                self.coverage_map.pop(i)
+
+    def _make_coverage_map(self):
+        """
+        Populate self.coverage_map. This is a dictionary that associates each contig header with a list of alignment
+        positions and their coverage levels.
+        """
+        # Associate with each contig header, a list of (query header, start), (query header, end)
+        alns_pos = dict()
+        with open(self.paf, 'r') as f:
+            for line in f:
+                L1 = line.rstrip().split('\t')
+
+                # Only consider an alignment if at least 75% of the read aligned.
+                if abs((int(L1[3]) - int(L1[2])))/int(L1[1]) >= 0.75:
+                    if L1[5] in alns_pos:
+                        alns_pos[L1[5]].append((L1[0], int(L1[7])))
+                        alns_pos[L1[5]].append((L1[0], int(L1[8])))
+                    else:
+                        alns_pos[L1[5]] = [(L1[0], int(L1[7])), (L1[0], int(L1[8]))]
+                        self.ctg_lens[L1[5]] = int(L1[6])
+
+        # Sort these coverage positions and get the coverage map
+        for i in alns_pos:
+            self.coverage_map[i] = self._tabulate_coverage(sorted(alns_pos[i], key=lambda x: x[1]))
+
+        self._trim_ends()
+
+    def _get_glob_mean(self):
+        L1 = []
+        for i in self.coverage_map:
+
+            # In the case where we have multiple coverage values for one position, take the last one.
+            last_pos = 0
+            curr_val = 0
+            for j in self.coverage_map[i]:
+                if j[0] == last_pos:
+                    curr_val = j[1]
+                else:
+                    last_pos = j[0]
+                    L1.append(curr_val)
+                    curr_val = j[1]
+
+        L1 = np.asarray(L1, dtype=np.int32)
+        self.glob_mean = np.median(L1)
+        self.glob_std = np.sqrt(self.glob_mean)
+
+    def _get_index_range(self, header, start_ind, distance):
+        """
+        Get the list of indices that are contained within a distance around the start index
+        """
+        all_inds = []
+        low_counter = 1
+
+        # Check if the start point is at the end of the contig
+        if start_ind == len(self.coverage_map[header]):
+            start_ind -= 1
+
+        start_pos = self.coverage_map[header][start_ind][0]
+        is_low = False
+
+        # Get all coverage map indices within 'distance' bp upstream of the start
+        while not is_low:
+            next_ind = start_ind-low_counter
+            if next_ind < 0:
+                is_low = True
+            else:
+                next_pos = self.coverage_map[header][next_ind][0]
+                if start_pos - next_pos > distance:
+                    is_low = True
+                else:
+                    all_inds.append(next_ind)
+                    low_counter += 1
+
+        # Repeat for 'distance' bp downstream
+        high_counter = 1
+        is_high = False
+        while not is_high:
+            next_ind = start_ind + high_counter
+            if next_ind >= len(self.coverage_map[header]):
+                is_high = True
+            else:
+                next_pos = self.coverage_map[header][next_ind][0]
+                if next_pos - start_pos > distance:
+                    is_high = True
+                else:
+                    all_inds.append(next_ind)
+                    high_counter += 1
+
+        return sorted(all_inds)
+
+    def check_break_cov(self, header, in_breaks, min_cov=None, max_cov=None):
+        """
+        Given a list of potential break points, verify if those break points occur around low or high coverage
+        areas. If so, replace the candidate break point with the low/high coverage break point.
+        :param header: contig header for these breaks
+        :param in_breaks: list of candidate break points
+        :param min_cov: break at coverage levels below this value
+        :param max_cov: break at coverage levels above this value
+        :return: list of real break points, or an empty list if no breaking is recommended
+        """
+        # Check that we have coverage info for this contig.
+        if header not in self.coverage_map:
+            return []
+
+        if min_cov is None or max_cov is None:
+            # Automatically calculate min and max coverage
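+            # Coverage is modeled as roughly Poisson (std = sqrt(median); see
+            # _get_glob_mean), so breaks are supported outside mean +/- 3 std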
+            min_cov = max(self.glob_mean - (self.glob_std*3), 0)
+            max_cov = self.glob_mean + (self.glob_std*3)
+
+        supported_breaks = []
+        break_types = []
+
+        for i in in_breaks:
+            # Get the coverage for the position closest to this potential break point
+            ins_ind = bisect.bisect_left(self.coverage_map[header], (i, 0))
+            # Get the coverage for positions within 50 kbp of the candidate coverage position
+            # Exclude positions near the ends of the sequence.
+            ind_range = self._get_index_range(header, ins_ind, 50000)
+
+            if len(set(ind_range)) > 1:
+                # Check for low coverage
+                lowest_cov = min(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1])
+                if lowest_cov[1] < min_cov:
+                    supported_breaks.append(lowest_cov)
+                    break_types.append('l')
+                    continue
+
+                # Check for high coverage
+                highest_cov = max(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1])
+                if highest_cov[1] > max_cov:
+                    supported_breaks.append(highest_cov)
+                    break_types.append('h')
+
+        return self._smooth_supported_breaks(supported_breaks, break_types)
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/SeqReader.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,76 @@
+import gzip
+
+
+class SeqReader:
+
+    def __init__(self, in_file):
+        """
+        Initialize sequence file to be parsed.
+        :param in_file:
+        """
+        if not isinstance(in_file, str):
+            raise AttributeError('Only a string can be used to instantiate a SeqReader object.')
+        self.in_file = in_file
+
+    def parse_fasta(self):
+        """
+        Generator yielding header and sequence, for each sequence
+        in the fasta file sent to the class.
+        """
+        with open(self.in_file) as fasta_file:
+            sequence = ''
+            # Find first header.
+            line = fasta_file.readline()
+            while not line.startswith('>'):
+                line = fasta_file.readline()
+                if not line:
+                    error = """ This file provided is not in proper fasta format.
+                    In addition to the usual fasta conventions, be sure that there are
+                    no blank lines in the file.
+                    """
+                    raise RuntimeError(error)
+            header = line.rstrip()
+
+            # Get sequence associated with that header.
+            for line in fasta_file:
+                if line.startswith('>'):
+                    # Once the sequence ends (the next header begins),
+                    # yield the current header and sequence.
+                    yield header, sequence
+                    header = line.rstrip()
+                    sequence = ''
+                else:
+                    sequence += ''.join(line.rstrip().split())
+        yield header, sequence
+
+    def parse_gzip_fasta(self):
+        """
+        Generator yielding header and sequence, for each sequence
+        in the fasta file sent to the class. For gzipped fasta files.
+        """
+        with gzip.open(self.in_file) as fasta_file:
+            sequence = ''
+            # Find first header.
+            line = fasta_file.readline().decode('utf-8')
+            while not line.startswith('>'):
+                line = fasta_file.readline().decode('utf-8')
+                if not line:
+                    error = """ This file provided is not in proper fasta format.
+                            In addition to the usual fasta conventions, be sure that there are
+                            no blank lines in the file.
+                            """
+                    raise RuntimeError(error)
+            header = line.rstrip()
+
+            # Get sequence associated with that header.
+            for line in fasta_file:
+                line = line.decode('utf-8')
+                if line.startswith('>'):
+                    # Once the sequence ends (the next header begins),
+                    # yield the current header and sequence.
+                    yield header, sequence
+                    header = line.rstrip()
+                    sequence = ''
+                else:
+                    sequence += ''.join(line.rstrip().split())
+        yield header, sequence
\ No newline at end of file
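+
+# A minimal usage sketch (hypothetical file name):
+#
+#   reader = SeqReader('contigs.fasta')
+#   for header, seq in reader.parse_fasta():
+#       print(header, len(seq))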
Binary file RaGOO/ragoo_utilities/__pycache__/ContigAlignment.cpython-39.pyc has changed
Binary file RaGOO/ragoo_utilities/__pycache__/GFFReader.cpython-39.pyc has changed
Binary file RaGOO/ragoo_utilities/__pycache__/PAFReader.cpython-39.pyc has changed
Binary file RaGOO/ragoo_utilities/__pycache__/ReadCoverage.cpython-39.pyc has changed
Binary file RaGOO/ragoo_utilities/__pycache__/SeqReader.cpython-39.pyc has changed
Binary file RaGOO/ragoo_utilities/__pycache__/__init__.cpython-39.pyc has changed
Binary file RaGOO/ragoo_utilities/__pycache__/break_chimera.cpython-39.pyc has changed
Binary file RaGOO/ragoo_utilities/__pycache__/utilities.cpython-39.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/break_chimera.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,246 @@
+from collections import defaultdict
+import copy
+
+from intervaltree import IntervalTree
+
+from ragoo_utilities.ContigAlignment import UniqueContigAlignment
+
+
+def get_ref_parts(alns, l, p, r):
+    """
+
+    :param alns: A ContigAlignment object for the contig in question
+    :param l: Minimum alignment length to consider
+    :param p: Minimum percentage of the contig's aligned bases that must fall on a reference chromosome for that chromosome to be considered significant
+    :param r: Minimum number of reference chromosome nucleotides needed to be covered to be considered significant
+    :return: A list of reference chromosomes to which the input contig significantly aligns
+    """
+    all_intervals = defaultdict(list)
+
+    # Iterate over all alignments
+    for i in range(len(alns.ref_headers)):
+        if alns.aln_lens[i] < l:
+            continue
+
+        all_intervals[alns.ref_headers[i]].append((alns.ref_starts[i], alns.ref_ends[i]))
+
+    ranges = dict()
+    for i in all_intervals.keys():
+        ranges[i] = 0
+
+    # For each chromosome, sort the range and get the union interval length.
+    # Algorithm and pseudocode given by Michael Kirsche
+    for i in all_intervals.keys():
+        sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0])
+        max_end = -1
+        for j in sorted_intervals:
+            start_new_terr = max(j[0], max_end)
+            ranges[i] += max(0, j[1] - start_new_terr)
+            max_end = max(max_end, j[1])
+
+    total_sum = sum(ranges.values())
+    chimeric_refs = []
+    for i in ranges.keys():
+        if (ranges[i]/total_sum)*100 > float(p) and ranges[i] > r:
+            chimeric_refs.append(i)
+
+    return chimeric_refs
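+
+# Worked example (hypothetical numbers): for intervals [(10, 50), (40, 80), (100, 120)]
+# on one chromosome, the union length is (50-10) + (80-50) + (120-100) = 90 rather
+# than the naive 40 + 40 + 20 = 100, because the 40-50 overlap is counted once.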
+
+
+def cluster_contig_alns(contig, alns, chroms, l):
+    ctg_alns = alns[contig]
+
+    # Filter contig alignments so as to only include alignments to desired chromosomes
+    ctg_alns.filter_ref_chroms(chroms)
+
+    # Initialize the list of start and end positions w.r.t. the query
+    query_pos = []
+
+    for i in range(len(ctg_alns.ref_headers)):
+        query_pos.append((ctg_alns.query_starts[i], ctg_alns.query_ends[i], i))
+
+    final_order = [i[2] for i in sorted(query_pos)]
+    final_refs = []
+    for i in final_order:
+        final_refs.append(ctg_alns.ref_headers[i])
+
+    return get_borders(final_refs, ctg_alns, final_order)
+
+
+def get_borders(ordered_refs, alns, order):
+    borders = [0]
+    current_ref = ordered_refs[0]
+    for i in range(1, len(ordered_refs)-1):
+        if ordered_refs[i] != current_ref and ordered_refs[i+1] != current_ref:
+            current_ref = ordered_refs[i]
+            borders.append(alns.query_ends[order[i-1]])
+
+    borders.append(alns.query_lens[0])
+    intervals = [(borders[i], borders[i+1]) for i in range(len(borders)-1)]
+    return intervals
+
+
+def avoid_gff_intervals(borders, gff_ins):
+    """
+    :param borders:
+    :param gff_ins: all features
+    :return:
+    """
+
+    # Make an interval tree from the intervals of features associated with this contig
+    t = IntervalTree()
+    for line in gff_ins:
+        # If the interval is one bp long, skip
+        if line.start == line.end:
+            continue
+        t[line.start:line.end] = (line.start, line.end)
+
+    new_borders = []
+    for i in borders:
+        # Returns an empty set if a border does not fall in any interval.
+        # Otherwise, returns a list of Interval objects
+        interval_query = t[i[0]]
+        if interval_query:
+            while interval_query:
+                # Make new border larger than the max end position of all intervals
+                i = (i[0] + 1, i[1])
+                interval_query = t[i[0]]
+
+        interval_query = t[i[1]]
+        if interval_query:
+            while interval_query:
+                # Make new border larger than the max end position of all intervals
+                i = (i[0], i[1] + 1)
+                interval_query = t[i[1]]
+
+        new_borders.append(i)
+
+    new_borders = new_borders[:-1] + [(new_borders[-1][0], borders[-1][1])]
+    return new_borders
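+
+# Illustration (hypothetical coordinates): if a candidate border starts at 1000
+# and a feature spans 990-1050, the border start is nudged right one bp at a
+# time until it clears the feature, ending at 1050 (tree intervals are
+# half-open, so position 1050 no longer intersects the feature).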
+
+
+def update_gff(features, borders, contig):
+    # Pop the features to be updated
+    contig_feats = features.pop(contig)
+
+    # Initialize lists for new broken contig headers
+    for i in range(len(borders)):
+        features[contig + '_chimera_broken:' + str(borders[i][0]) + '-' + str(borders[i][1])] = []
+
+    # Binary search would suffice instead of this tree since the intervals are
+    # non-overlapping, but the input is so small that either approach is trivial.
+    t = IntervalTree()
+    for i in borders:
+        t[i[0]:i[1]] = i
+
+    for i in contig_feats:
+        query = t[i.start]
+        assert len(query) == 1
+        break_start = list(query)[0].begin
+        break_end = list(query)[0].end
+        query_border = (break_start, break_end)
+        break_number = borders.index(query_border)
+        i.seqname = contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])
+        i.start = i.start - break_start
+        i.end = i.end - break_start
+        features[contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i)
+
+    return features
+
+
+def break_contig(contigs_dict, header, break_points):
+    seq = contigs_dict.pop(header)
+    test_seq = ''
+    for i in range(len(break_points)):
+        contigs_dict[header + '_chimera_broken:' + str(break_points[i][0]) + '-' + str(break_points[i][1])] = seq[break_points[i][0]: break_points[i][1]]
+        test_seq += seq[break_points[i][0]: break_points[i][1]]
+
+    assert test_seq == seq
+    return contigs_dict
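+
+# Example (hypothetical): break_contig({'ctg1': 'ACGTACGT'}, 'ctg1', [(0, 4), (4, 8)])
+# returns {'ctg1_chimera_broken:0-4': 'ACGT', 'ctg1_chimera_broken:4-8': 'ACGT'};
+# the assert guarantees the pieces concatenate back to the original sequence.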
+
+
+def get_intra_contigs(alns, l, d, c):
+    """
+    Flag contigs as being intrachromosomal chimeras
+    :param alns:
+    :param l: Minimum alignment length to consider
+    :param d: Distance between consecutive adjacent alignments with respect to the reference. If larger than this, flag
+    :param c: Distance between consecutive adjacent alignments with respect to the query. If larger than this, flag
+    :return: dict of contigs and break points.
+    """
+
+    # Get only the header to which this contig mostly aligns, and filter out smaller alignments.
+    uniq_aln = UniqueContigAlignment(alns)
+    best_header = uniq_aln.ref_chrom
+    ctg_alns = copy.deepcopy(alns)
+    ctg_alns.filter_ref_chroms([best_header])
+    ctg_alns.filter_lengths(l)
+
+    # If there are no longer any alignments after length filtering, give up
+    if not len(ctg_alns.ref_headers):
+        return
+
+    # Sort the alignments with respect to the reference start and end positions.
+    ctg_alns.sort_by_ref()
+
+    # Make a list of distance between alignments
+    # first with respect to (wrt) the reference.
+    distances_wrt_ref = []
+    for i in range(len(ctg_alns.ref_headers)-1):
+        distances_wrt_ref.append(ctg_alns.ref_starts[i+1] - ctg_alns.ref_starts[i])
+
+    # next, with respect to (wrt) the contig.
+    distances_wrt_ctg = []
+    for i in range(len(ctg_alns.ref_headers) - 1):
+        distances_wrt_ctg.append(abs(ctg_alns.query_starts[i + 1] - ctg_alns.query_starts[i]))
+
+    # Next, determine the following two properties:
+    #  1. Whether, when ordered by the reference, the alignments start at the beginning or the end of the query.
+    #  2. Whether the alignment to be broken on lies on the forward or the reverse strand.
+
+    # Treat the contig as starting at the query start only if the first
+    # alignment begins within the first fifth of the query sequence.
+    is_query_start = True
+
+    if ctg_alns.query_starts[0] >= ctg_alns.query_lens[0]/5:
+        is_query_start = False
+
+    # This conditional essentially checks if there are any break points for this contig.
+    # Returns None otherwise (no return statement)
+    if distances_wrt_ref:
+        if max(distances_wrt_ref) > d:
+            gap_index = distances_wrt_ref.index(max(distances_wrt_ref))
+            a_alns_strands = ctg_alns.strands[:gap_index]
+            if is_query_start:
+                if a_alns_strands.count('-') > a_alns_strands.count('+'):
+                    # The first subcontig is on the reverse strand
+                    return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])])
+                else:
+                    # The first subcontig is on the forward strand.
+                    return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])])
+            else:
+                # The first subcontig starts at the end of the contig
+                if a_alns_strands.count('-') > a_alns_strands.count('+'):
+                    # The first subcontig is on the reverse strand
+                    return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index]), (ctg_alns.query_starts[gap_index], ctg_alns.query_lens[0])])
+                else:
+                    # The first subcontig is on the forward strand.
+                    return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])])
+
+        if max(distances_wrt_ctg) > c:
+            gap_index = distances_wrt_ctg.index(max(distances_wrt_ctg)) + 1
+            a_alns_strands = ctg_alns.strands[:gap_index]
+            if is_query_start:
+                if a_alns_strands.count('-') > a_alns_strands.count('+'):
+                    # The first subcontig is on the reverse strand
+                    return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])])
+                else:
+                    # The first subcontig is on the forward strand.
+                    return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])])
+            else:
+                # The first subcontig starts at the end of the contig
+                if a_alns_strands.count('-') > a_alns_strands.count('+'):
+                    # The first subcontig is on the reverse strand
+                    return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index-1]), (ctg_alns.query_starts[gap_index-1], ctg_alns.query_lens[0])])
+                else:
+                    # The first subcontig is on the forward strand.
+                    return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])])
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/get_contig_borders.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,41 @@
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Given an orderings file and a contigs fasta index, print a bed file of contig placements in the pseudomolecules.')
+    parser.add_argument("orderings", metavar="<orderings.txt>", type=str, help="orderings file from RaGOO")
+    parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="index file for contigs (samtools faidx contigs.fasta)")
+    parser.add_argument("gap_len", metavar="100", type=int, help="Gap size used for pseudomolecule padding.")
+
+    # Get the command line arguments
+    args = parser.parse_args()
+    orderings_file = args.orderings
+    fai_file = args.fai
+    gap_len = args.gap_len
+
+    # Save the contig orderings
+    ctgs = []
+    with open(orderings_file, 'r') as f:
+        for line in f:
+            ctgs.append(line.rstrip().split('\t')[0])
+
+    # Get contig lengths
+    ctg_lens = dict()
+    with open(fai_file, 'r') as f:
+        for line in f:
+            L1 = line.split('\t')
+            ctg_lens[L1[0]] = int(L1[1])
+
+    # Get contig borders
+    final_bed = []
+    current_pos = 0
+
+    for ctg in ctgs:
+        start = current_pos
+        end = current_pos + ctg_lens[ctg]
+        current_pos += ctg_lens[ctg]
+        current_pos += gap_len
+        # Derive the pseudomolecule header from the orderings file name,
+        # e.g. 'path/to/chr1_orderings.txt' -> 'chr1_RaGOO'
+        pm_header = orderings_file[orderings_file.rfind('/')+1:orderings_file.rfind('_')] + '_RaGOO'
+        final_bed.append('%s\t%r\t%r' % (pm_header, start, end))
+
+    print('\n'.join(final_bed))
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/get_ragoo_stats.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,55 @@
+import argparse
+
+parser = argparse.ArgumentParser(description='Summary stats about contig scaffolding with RaGOO.')
+parser.add_argument("index", metavar="<contigs.fasta.fai>", type=str, help="Samtools fasta index file for input contigs. If chimera breaking mode was used, this must be"
+                                                                           "the index file of the chimera broken contigs, which can be found in ragoo_output/chimera_break."
+                                                                           "The correct file to use is the file with the .intra.chimera.broken.fa suffix.")
+parser.add_argument("groupings", metavar="<groupings.fofn>", type=str, help="file of file names for all *_groupings.txt produced by RaGOO. Single column with full path to each grouping file.")
+
+args = parser.parse_args()
+contigs_index = args.index
+grouping_fofn = args.groupings
+
+remaining_ctg = []
+all_ctg_len = dict()
+with open(contigs_index) as f:
+    for line in f:
+        L1 = line.split('\t')
+        all_ctg_len[L1[0]] = int(L1[1])
+        remaining_ctg.append(L1[0])
+
+grouping_files = []
+with open(grouping_fofn) as f:
+    for line in f:
+        grouping_files.append(line.rstrip())
+
+num_ctg_localized = 0
+num_bp_localized = 0
+
+for group_file in grouping_files:
+    with open(group_file) as f:
+        for line in f:
+            L1 = line.split('\t')
+            header = L1[0].rstrip()
+            num_ctg_localized += 1
+            num_bp_localized += all_ctg_len[header]
+            assert header in remaining_ctg
+            remaining_ctg.remove(header)
+
+num_ctg_unlocalized = 0
+num_bp_unlocalized = 0
+for ctg in remaining_ctg:
+    num_ctg_unlocalized += 1
+    num_bp_unlocalized += all_ctg_len[ctg]
+
+print('%r contigs were localized by RaGOO' %(num_ctg_localized))
+print('%r bp were localized by RaGOO' %(num_bp_localized))
+print('%r contigs were unlocalized by RaGOO' %(num_ctg_unlocalized))
+print('%r bp were unlocalized by RaGOO' %(num_bp_unlocalized))
+
+print('%r %% of contigs were localized by RaGOO' %((num_ctg_localized/(num_ctg_localized + num_ctg_unlocalized))*100))
+print('%r %% of bp were localized by RaGOO' %((num_bp_localized/(num_bp_localized + num_bp_unlocalized))*100))
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/utilities.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,152 @@
+import time
+import subprocess
+import operator
+
+from ragoo_utilities.SeqReader import SeqReader
+
+""" A collection of various helper functions"""
+
+complements = str.maketrans("ACGTNURYSWKMBVDHacgtnuryswkmbvdh", "TGCANAYRSWMKVBHDtgcanayrswmkvbhd")
+
+
+def reverse_complement(seq):
+    """
+    Reverse complement a nucleotide sequence.
+    :param seq: Sequence to be reverse complemented
+    :return: A reverse complemented sequence
+    """
+    return seq.translate(complements)[::-1]
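+
+# Quick sanity check: reverse_complement('ACCGT') returns 'ACGGT'.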
+
+
+def run(cmnd):
+    """ Run command and report status. """
+    log('Running : %s' % cmnd)
+    if subprocess.call(cmnd, shell=True, executable='/bin/bash') != 0:
+        raise RuntimeError('Failed : %s ' % cmnd)
+
+
+def log(message):
+    """ Log messages to standard output. """
+    print(time.ctime() + ' --- ' + message, flush=True)
+
+
+def read_contigs(in_file):
+    d = dict()
+    x = SeqReader(in_file)
+    for header, seq in x.parse_fasta():
+        d[header.replace('>', '').split(' ')[0]] = seq
+    return d
+
+
+def read_gz_contigs(in_file):
+    d = dict()
+    x = SeqReader(in_file)
+    for header, seq in x.parse_gzip_fasta():
+        d[header.replace('>', '').split(' ')[0]] = seq
+    return d
+
+
+def binary_search(query, numbers, left, right):
+    """
+    The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
+    written by Maria Nattestad. The original script can be found here:
+
+    https://github.com/MariaNattestad/Assemblytics
+
+    And the publication associated with Maria's work is here:
+
+    Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
+    web analytics tool for the detection of variants from an
+    assembly." Bioinformatics 32.19 (2016): 3021-3023.
+    """
+    #  Returns index of the matching element or the first element to the right
+
+    if left >= right:
+        return right
+    mid = (right + left) // 2
+
+    if query == numbers[mid]:
+        return mid
+    elif query < numbers[mid]:
+        return binary_search(query, numbers, left, mid)
+    else:  # if query > numbers[mid]:
+        return binary_search(query, numbers, mid + 1, right)
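+
+# Examples: binary_search(5, [1, 3, 5, 7], 0, 4) returns 2 (exact match);
+# binary_search(4, [1, 3, 5, 7], 0, 4) also returns 2, the index of the first
+# element to the right of the query.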
+
+
+def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
+    """
+    The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
+    written by Maria Nattestad. The original script can be found here:
+
+    https://github.com/MariaNattestad/Assemblytics
+
+    And the publication associated with Maria's work is here:
+
+    Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
+    web analytics tool for the detection of variants from an
+    assembly." Bioinformatics 32.19 (2016): 3021-3023.
+    """
+    alignments_to_keep = []
+    # print len(lines)
+
+    # If no alignments:
+    if len(lines) == 0:
+        return []
+
+    # If only one alignment:
+    if len(lines) == 1:
+        if keep_small_uniques or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
+            return [0]
+        else:
+            return []
+
+    starts_and_stops = []
+    for query_min, query_max in lines:
+        # print query_min, query_max
+        starts_and_stops.append((query_min, "start"))
+        starts_and_stops.append((query_max, "stop"))
+
+    sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0))
+
+    current_coverage = 0
+    last_position = -1
+    sorted_unique_intervals_left = []
+    sorted_unique_intervals_right = []
+    for pos, change in sorted_starts_and_stops:
+
+        if current_coverage == 1:
+            # sorted_unique_intervals.append((last_position,pos))
+            sorted_unique_intervals_left.append(last_position)
+            sorted_unique_intervals_right.append(pos)
+
+        if change == "start":
+            current_coverage += 1
+        else:
+            current_coverage -= 1
+        last_position = pos
+
+    linecounter = 0
+    for query_min, query_max in lines:
+
+        i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left))
+
+        exact_match = False
+        # Guard against i running past the end when query_min exceeds every recorded interval start.
+        if i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
+            exact_match = True
+        sum_uniq = 0
+        while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and \
+                        sorted_unique_intervals_right[i] <= query_max:
+            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
+            i += 1
+
+        # print query_min,query_max,sum_uniq
+        if sum_uniq >= unique_length_required:
+            alignments_to_keep.append(linecounter)
+        elif keep_small_uniques and exact_match:
+            alignments_to_keep.append(linecounter)
+            # print "Keeping small alignment:", query_min, query_max
+            # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]
+
+        linecounter += 1
+
+    return alignments_to_keep
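+
+# Worked example (hypothetical): with lines = [(0, 100), (50, 200)] and
+# unique_length_required = 60, the sweep records the singly-covered intervals
+# (0, 50) and (100, 200); alignment 0 has only 50 uniquely covered bp and is
+# dropped, while alignment 1 has 100 and is kept, so the result is [1].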
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/sam2delta.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+import argparse
+from collections import defaultdict
+
+"""
+This utility converts a SAM file to a nucmer delta file.
+
+SAM files must contain the NM tag, which minimap2 includes by default.
+"""
+
+
+class SAMAlignment:
+
+    def __init__(self, in_ref_header, in_query_header, in_ref_start, in_cigar, in_flag, in_seq_len, in_num_mismatches):
+        self.num_mismatches = in_num_mismatches
+        self.seq_len = in_seq_len
+        self.flag = int(in_flag)
+        self.is_rc = False
+
+        # Check if the query was reverse complemented
+        if self.flag & 16:
+            self.is_rc = True
+
+        self.cigar = in_cigar
+        self.parsed_cigar = []
+        self._parse_cigar()
+
+        if self.is_rc:
+            self.parsed_cigar = self.parsed_cigar[::-1]
+
+        self.ref_header = in_ref_header
+        self.query_header = in_query_header
+        self.ref_start = int(in_ref_start)
+        self.ref_end = None
+        self.query_start = None
+        self.query_end = None
+        self.query_aln_len = 0
+        self.ref_aln_len = 0
+
+        num_hard_clipped = 0
+        for i in self.parsed_cigar:
+            if i[-1] == 'H':
+                num_hard_clipped += int(i[:-1])
+
+        # If SEQ was omitted ('*'), its parsed length is 1; in that case,
+        # recompute the sequence length from the CIGAR operations that consume the query.
+        if self.seq_len == 1:
+            self.seq_len = 0
+            for i in self.parsed_cigar:
+                if i[-1] == 'M' or i[-1] == 'I' or i[-1] == 'S' or i[-1] == 'X' or i[-1] == '=':
+                    self.seq_len += int(i[:-1])
+
+        self.query_len = self.seq_len + num_hard_clipped
+
+        self._get_query_start()
+        self._get_query_aln_len()
+        self._get_ref_aln_len()
+        self._get_query_end()
+        self._get_ref_end()
+
+        if self.is_rc:
+            self.query_start, self.query_end = self.query_end, self.query_start
+            # Lazily taking care of off-by-one errors
+            self.query_end += 1
+            self.query_start -= 1
+
+        self.query_end -= 1
+        self.ref_end -= 1
+
+    def __str__(self):
+        return ' '.join([self.ref_header, self.query_header])
+
+    def __repr__(self):
+        return '<' +  ' '.join([self.ref_header, self.query_header]) + '>'
+
+    def _get_query_start(self):
+        """
+        An alignment will either start with soft clipping, hard clipping, or with one of the alignment
+        codes (e.g. Match/Mismatch (M) or InDel (I/D)). If hard or soft clipped, the start of the alignment is
+        the very next base after clipping. If the alignment has commenced, then the start of the alignment is 1.
+
+        Notes:
+        At least for now, I am only specifying this for minimap2 output, which I believe only has the following
+        characters in cigar strings:
+        {'I', 'S', 'M', 'D', 'H'}
+        so I will only handle these cases.
+        """
+        if self.parsed_cigar[0][-1] == 'I' or self.parsed_cigar[0][-1] == 'D' or self.parsed_cigar[0][-1] == 'M':
+            self.query_start = 1
+        else:
+            # The alignment has started with either soft or hard clipping
+            # Need to double check if a 1 needs to be added here
+            self.query_start = int(self.parsed_cigar[0][:-1]) + 1
+
+    def _get_query_aln_len(self):
+        """ Just the addition of all cigar codes which "consume" the query (M, I)"""
+        for i in self.parsed_cigar:
+            if i[-1] == 'M' or i[-1] == 'I':
+                self.query_aln_len += int(i[:-1])
+
+    def _get_ref_aln_len(self):
+        for i in self.parsed_cigar:
+            if i[-1] == 'M' or i[-1] == 'D':
+                self.ref_aln_len += int(i[:-1])
+
+    def _get_query_end(self):
+        """ This is the length of the alignment + query start """
+        self.query_end = self.query_start + self.query_aln_len
+
+    def _get_ref_end(self):
+        """ This is the length of the alignment + query start """
+        self.ref_end = self.ref_start + self.ref_aln_len
+
+    def _parse_cigar(self):
+        cigar_chars = {
+            'M',
+            'I',
+            'D',
+            'N',
+            'S',
+            'H',
+            'P',
+            '=',
+            'X'
+        }
+
+        this_field = ''
+        for char in self.cigar:
+            this_field += char
+            if char in cigar_chars:
+                self.parsed_cigar.append(this_field)
+                this_field = ''
+
+
+def write_delta(in_alns, in_file_name):
+    with open(in_file_name, 'w') as f:
+        # Delta header: the two input file names, then the alignment program.
+        f.write('file1 file2\n')
+        f.write('NUCMER\n')
+        for aln in in_alns.keys():
+            query_len = in_alns[aln][0].query_len
+            #print (query_len)
+            f.write('>%s %s %r %r\n' % (aln[0], aln[1], ref_chr_lens[aln[0]], query_len))
+            for i in in_alns[aln]:
+                f.write('%r %r %r %r %r %r %r\n' % (
+                    i.ref_start,
+                    i.ref_end,
+                    i.query_start,
+                    i.query_end,
+                    i.num_mismatches,
+                    i.num_mismatches,
+                    0
+                ))
+                # Continue with the cigar string
+                offsets = []
+                cigar = i.parsed_cigar
+                if cigar[0][-1] == 'S' or cigar[0][-1] == 'H':
+                    cigar = cigar[1:-1]
+                else:
+                    cigar = cigar[:-1]
+
+                counter = 1
+                for code in cigar:
+                    if code[-1] == 'M':
+                        counter += int(code[:-1])
+                    elif code[-1] == 'D':
+                        offsets.append(counter)
+                        num_D = int(code[:-1])
+                        # Use '_' to avoid shadowing the alignment variable 'i' in the enclosing loop.
+                        for _ in range(1, num_D):
+                            offsets.append(1)
+                        counter = 1
+                    elif code[-1] == 'I':
+                        offsets.append(-1*counter)
+                        num_I = int(code[:-1])
+                        for _ in range(1, num_I):
+                            offsets.append(-1)
+                        counter = 1
+                    else:
+                        raise ValueError('Unexpected CIGAR code')
+                offsets.append(0)
+                offsets = [str(i) for i in offsets]
+                f.write('\n'.join(offsets) + '\n')
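+
+# Offset encoding example (hypothetical CIGAR): '10M2D5M' becomes the delta
+# offsets 11, 1, 0 -- roughly, a 2 bp gap in the query opening 11 positions
+# into the alignment, each subsequent 1 extending the gap by one bp,
+# terminated by 0.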
+
+
+parser = argparse.ArgumentParser(description='Convert a SAM file to a nucmer delta file.')
+parser.add_argument("sam_file", metavar="<alns.sam>", type=str, help="SAM file to convert")
+
+args = parser.parse_args()
+sam_file = args.sam_file
+
+# Make a dictionary storing all alignments between a query and a reference sequence.
+# key = (reference_header, query_header)
+# value = list of alignments between those sequences. only store info needed for the delta file
+
+alns = defaultdict(list)
+
+# Read through the sam file
+ref_chr_lens = dict()
+with open(sam_file) as f:
+    for line in f:
+        # Skip SAM headers
+        if line.startswith('@SQ'):
+            header_list = line.split('\t')
+            chrom = header_list[1].replace('SN:', '')
+            chr_len = int(header_list[2].replace('LN:', ''))
+            ref_chr_lens[chrom] = chr_len
+            continue
+
+        if line.startswith('@'):
+            continue
+
+        fields = line.split('\t')
+        ref_header = fields[2]
+        query_header = fields[0]
+        ref_start = fields[3]
+        cigar = fields[5]
+        flag = fields[1]
+        seq_len = len(fields[9])
+
+        if not cigar == '*':
+            # Get the NM tag (edit distance)
+            nm = None
+            for field in fields:
+                if field.startswith('NM:i:'):
+                    nm = int(field[5:])
+                    break
+
+            if nm is None:
+                raise ValueError('SAM file must include NM tag.')
+
+            x = SAMAlignment(
+                ref_header,
+                query_header,
+                ref_start,
+                cigar,
+                flag,
+                seq_len,
+                nm
+            )
+            alns[(ref_header, query_header)].append(x)
+
+write_delta(alns, sam_file + '.delta')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/setup.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+from setuptools import setup
+import glob
+
+# Collect all top-level Python and Perl scripts for installation.
+scripts = glob.glob("*.p*")
+
+setup(
+    name='RaGOO',
+    version='v1.1',
+    description='A tool to order and orient genome assembly contigs via minimap2 alignments to a reference genome.',
+    author='Michael Alonge',
+    author_email='malonge11@gmail.com',
+    packages=['ragoo_utilities'],
+    package_dir={'ragoo_utilities': 'ragoo_utilities/'},
+    install_requires=[
+              'intervaltree',
+              'numpy',
+          ],
+    scripts=scripts,
+    zip_safe=True
+)