# HG changeset patch # User dereeper # Date 1627323757 0 # Node ID b9a3aeb162ab0ce5c6717deaa97ae59038a09bc8 # Parent 68a9ec9ce51e9e0e6e72e75db3cff646a73f9a31 Uploaded diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/.gitignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/.gitignore Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,4 @@ +.idea +RaGOO.egg-info +build +dist diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/Assemblytics_between_alignments.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/Assemblytics_between_alignments.pl Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,438 @@ +#!/usr/bin/perl -w + +# Authors: Maria Nattestad and Mike Schatz +# Email: mnattest@cshl.edu + +use strict; +my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); +my @longrange_filter_choices = ("include-longrange","exclude-longrange","longrange-only"); +my @output_file_choices = ("bed","bedpe"); + +my $USAGE = "Usage:\nAssemblytics_between_alignments.pl coords.tab minimum_event_size maximum_event_size [@chromosome_filter_choices] [@longrange_filter_choices] [@output_file_choices] > fusions.svs.bedpe "; + +my $coordsfile = shift @ARGV or die $USAGE; +my $minimum_event_size = int(shift @ARGV); +my $maximum_event_size = int(shift @ARGV); +my $chromosome_filter = shift @ARGV or die $USAGE; +my $longrange_filter = shift @ARGV or die $USAGE; +my $output_file = shift @ARGV or die $USAGE; + + + +# How close do alignments have to be in order to call deletions and insertions? (as opposed to contractions and expansions) +my $narrow_threshold = 50; + + + +# Number of basepairs of distance in either the reference or the query before we call an SV long-range +my $longrange = $maximum_event_size; + + + +# What is the longest two alignments can map apart in the query before we throw the variant between them away? +my $max_query_dist = 100000; + + + +my %chromosome_filter_choices_hash = map { $_, 1 } @chromosome_filter_choices; +my %longrange_filter_choices_hash = map { $_, 1 } @longrange_filter_choices; +my %output_file_choices_hash = map { $_, 1 } @output_file_choices; + +if ( $chromosome_filter_choices_hash{ $chromosome_filter } && $longrange_filter_choices_hash{ $longrange_filter } && $output_file_choices_hash { $output_file }) { + # All is well with the world +} else { + die $USAGE; +} + +if ($longrange_filter ne "exclude-longrange" && $output_file eq "bed"){ + die "Cannot output bed while allowing long-range variants\n$USAGE"; +} + +# open COORDS, "./bin/show-coords -rclHT $deltafile |" +# or die "Can't process $deltafile ($!)\n"; + +open COORDS, "$coordsfile" or die "Can't process $coordsfile ($!)\n"; + +##open COORDS, "show-coords -rclHT $deltafile |" +## or die "Can't process $deltafile ($!)\n"; + + + +## Require the flanking alignments are at least this long to call an SV +## Note there is no minimum length for fusions, this is determined by how +## the delta file was filtered + +my $MIN_SV_ALIGN = 100; + + +#my $minimum_event_size = 50; +my $approximately_zero = $narrow_threshold; + + +my %alignments; +my $numalignments = 0; + + +while () +{ + chomp; + my @vals = split /\s+/, $_; + + my $rid = $vals[6]; + my $qid = $vals[7]; + + my $a; + $a->{"rstart"} = $vals[0]; + $a->{"rend"} = $vals[1]; + $a->{"qstart"} = $vals[2]; + $a->{"qend"} = $vals[3]; + $a->{"rlen"} = $vals[4]; + $a->{"qlen"} = $vals[5]; + $a->{"rid"} = $vals[6]; + $a->{"qid"} = $vals[7]; + $a->{"str"} = $_; + + $a->{"qidx"} = 0; + + $a->{"qrc"} = ($a->{"qend"} > $a->{"qstart"}) ? 
0 : 1; + + push @{$alignments{$qid}->{$rid}}, $a; # a is a hash with all the info for one alignment + + $numalignments++; +} + + + +my $candidatefusions = 0; +my $candidatesvs = 0; + +my $sv_id_counter = 0; + +my %svstats; + +foreach my $qid (sort keys %alignments) # query name is the key for the alignments hash +{ + my @refs = sort keys %{$alignments{$qid}}; # grab all alignments of that query + my $numref = scalar @refs; + + ## scan for fusions + # if ($numref > 1) # if query aligns to multiple chromosomes + # { + # my $allrefs = join " ", @refs; # join the names together for output + + # print "== $qid [$numref] $allrefs\n"; # output the names of the chromosomes + # $candidatefusions++; + + # my $rcnt = 0; + # foreach my $rid (@refs) + # { + # print "--\n" if ($rcnt > 0); + # $rcnt++; + + # foreach my $a (@{$alignments{$qid}->{$rid}}) + # { + # my $str = $a->{"str"}; + # print "$str\n"; + # } + # } + + # print "\n"; + # } + + + ## Resort the alignments by query sort position + my @qaligns; + foreach my $rid (@refs) + { + foreach my $a (@{$alignments{$qid}->{$rid}}) + { + push @qaligns, $a; + } + } + + ## Now record the index of the sorted query indices + @qaligns = sort { $a->{"qstart"} <=> $b->{"qstart"}} @qaligns; + for (my $i=0; $i < scalar @qaligns; $i++) + { + $qaligns[$i]->{"qidx"} = $i; + } + + ## scan for SVs + my $numalign = scalar @qaligns; + + if ($numalign > 1) # if the query has more than 1 alignment + { + ## note skip first one + for (my $j = 1; $j < $numalign; $j++) + { + my $ai = $qaligns[$j-1]; + my $aj = $qaligns[$j]; + + my $istr = $ai->{"str"}; + my $jstr = $aj->{"str"}; + + # if ($ai->{"rid"} ne $aj->{"rid"}) + # { + # ## skip the fusions for now ############################################################################################# + # next; + # } + + my $rid = $ai->{"rid"}; + + if (($ai->{"rlen"} >= $MIN_SV_ALIGN) && + ($aj->{"rlen"} >= $MIN_SV_ALIGN)) + { + ## r alignments are always forward, q alignments may be flipped + + my $rpos; + my $qpos; + my $rdist = 0; + my $qdist = 0; + my $svtype = 0; + + my $chromi = $ai->{"rid"}; + my $chromj = $aj->{"rid"}; + my $posi; + my $posj; + my $strandi; + my $strandj; + + $sv_id_counter++; + + if (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 0)) + { + ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 + ## qi: [1 - 1000] | j: [2000 - 3000] => 1000 + + $svtype = "FF"; + + $qdist = $aj->{"qstart"} - $ai->{"qend"}; + $rdist = $aj->{"rstart"} - $ai->{"rend"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } + + if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qstart"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qstart"}, $ai->{"qend"}); } + + # When the alignments are forward-forward, the connection point is at the end of the first (i: rend) and at the beginning of the second (j: rstart) + # i + - j + # ------> --------> + $posi = $ai->{"rend"}; + $posj = $aj->{"rstart"}; + $strandi = "+"; + $strandj = "-"; + + } + elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 1)) + { + ## ri: [2000 - 3000] | j: [1 - 1000] => 1000 + ## qi: [1000 - 1] | j: [3000 - 2000] => 1000 + + $svtype = "RR"; + + $rdist = $ai->{"rstart"} - $aj->{"rend"}; + $qdist = $aj->{"qend"} - $ai->{"qstart"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $aj->{"rend"}, $ai->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $ai->{"rstart"}, $aj->{"rend"}); } + + if ($qdist >= 0) { $qpos = 
sprintf("%s:%d-%d:+", $qid, $ai->{"qstart"}, $aj->{"qend"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qstart"}); } + + # When the alignments are reverse-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) + # j + - i + # <------- <-------- + $posi = $ai->{"rstart"}; # rstart means first reference coordinate, not with respect to the contig + $posj = $aj->{"rend"}; # rend means last reference coordinate, not with respect to the contig + $strandi = "-"; + $strandj = "+"; + } + elsif (($ai->{"qrc"} == 0) && ($aj->{"qrc"} == 1)) + { + ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 + ## qi: [1 - 1000] | j: [3000 - 2000] => 1000 + + $svtype = "FR"; + + $qdist = $aj->{"qend"} - $ai->{"qend"}; + $rdist = $aj->{"rstart"} - $ai->{"rend"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } + + if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $ai->{"qend"}, $aj->{"qend"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $aj->{"qend"}, $ai->{"qend"}); } + + # When the alignments are forward-reverse, the connection point is at the beginning of the first (i: rstart) and at the end of the second (j: rend) + # i + j + + # -------> <-------- + $posi = $ai->{"rend"}; + $posj = $aj->{"rend"}; + $strandi = "+"; + $strandj = "+"; + + } + elsif (($ai->{"qrc"} == 1) && ($aj->{"qrc"} == 0)) + { + ## ri: [1 - 1000] | j: [2000 - 3000] => 1000 + ## qi: [1000 - 1] | j: [2000 - 3000] => 1000 + + $svtype = "RF"; + + $qdist = $ai->{"qend"} - $aj->{"qend"}; + $rdist = $aj->{"rstart"} - $ai->{"rend"}; + + if ($rdist >= 0) { $rpos = sprintf("%s:%d-%d:+", $rid, $ai->{"rend"}, $aj->{"rstart"}); } + else { $rpos = sprintf("%s:%d-%d:-", $rid, $aj->{"rstart"}, $ai->{"rend"}); } + + if ($qdist >= 0) { $qpos = sprintf("%s:%d-%d:+", $qid, $aj->{"qend"}, $ai->{"qend"}); } + else { $qpos = sprintf("%s:%d-%d:-", $qid, $ai->{"qend"}, $aj->{"qend"}); } + + # When the alignments are reverse-forward: + # - i - j + # <------- --------> + $posi = $ai->{"rstart"}; + $posj = $aj->{"rstart"}; + $strandi = "-"; + $strandj = "-"; + } + else + { + my $irc = $ai->{"qrc"}; + my $jrc = $aj->{"qrc"}; + + print "ERROR: Unknown SV: $irc $jrc\n"; + print "$istr\n"; + print "$jstr\n"; + die "ERROR: Unknown SV: $irc $jrc\n"; + } + + my $totaldist = $rdist + $qdist; + my $typeguess = ""; + + my $abs_event_size = abs($rdist-$qdist); + + if ($chromi ne $chromj) { # interchromosomal + $typeguess = "Interchromosomal"; + $rdist = 0; + } else { # same chromosome + if ($strandi eq $strandj) { + $typeguess = "Inversion"; + $abs_event_size = $rdist; + } + elsif ($qdist > $rdist) { + # both are significantly negative: (means the size of an overlapping region got larger, so tandem element expansion) + if ($rdist > -1*$approximately_zero && $rdist < $approximately_zero && $qdist > -1*$approximately_zero) { + $typeguess = "Insertion"; + # split into out of nowhere (rdist ~ 0) vs. 
rdist is > 0: insertion_in_unmapped_region + } + else { + if ($rdist < 0 || $qdist < 0) { + $typeguess = "Tandem_expansion"; + } else { + $typeguess = "Repeat_expansion"; + } + } + } + elsif ($qdist < $rdist) { + # both are significantly negative: (means the size of an overlapping region got smaller, so tandem element contraction) + if ($rdist > -1*$approximately_zero && $qdist > -1*$approximately_zero && $qdist < $approximately_zero) { + $typeguess = "Deletion"; + # split into out of nowhere (rdist ~ 0) vs. rdist is > 0: deletion_in_unmapped_region + } + else { + if ($rdist < 0 || $qdist < 0) { + $typeguess = "Tandem_contraction"; + } else { + $typeguess = "Repeat_contraction"; + } + } + } + else { + $typeguess = "None"; + } + + if ($abs_event_size > $longrange) { # || abs($rdist) > $longrange || abs($qdist) > $longrange + $typeguess = "Longrange"; + if (abs($qdist) > $max_query_dist) { + $typeguess = "None"; + } + } + # my $ratio; + # if ($qdist != 0){ + # # $ratio = abs(($rdist/$qdist)-1); + # # if ($ratio < 0.1) { + # # $typeguess = "Equilibrium"; + # # } + # if ($rdist==$qdist || abs($qdist) > $longrange) { + # $typeguess = "None"; + # } + # } + } + +# my @chromosome_filter_choices = ("all-chromosomes","primary-chromosomes"); +# my @longrange_filter_choices = ("include-longrange","exclude-longrange"); + + my $chromi_length = length $chromi; # length of the chromosome names: a way to filter to primary chromosomes and cut out alts and patches from the assembly + my $chromj_length = length $chromj; + if ($typeguess ne "Inversion" && $typeguess ne "None" && $abs_event_size >= $minimum_event_size) { # always required + if ($chromosome_filter eq "all-chromosomes" || ($chromi_length < 6 && $chromj_length < 6)) { # test for primary chromosomes unless "all-chromosomes" is chosen + if ($longrange_filter ne "exclude-longrange" || ($typeguess ne "Interchromosomal" && $typeguess ne "Longrange")) { + if ($longrange_filter ne "longrange-only" || ($typeguess eq "Interchromosomal" || $typeguess eq "Longrange")) { + if ($output_file eq "bedpe") { + print "$chromi\t$posi\t@{[$posi + 1]}\t$chromj\t$posj\t@{[$posj + 1]}\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t$strandi\t$strandj\t$typeguess\t$rdist\t$qdist\t$qpos\t$abs_event_size\t$svtype\tbetween_alignments\n"; + } + else { + use List::Util qw(min max); + my $ref_start = min(($posi, $posj)); + my $ref_stop = max(($posi, $posj)); + if ($ref_stop eq $ref_start) { + $ref_stop = $ref_start + 1; + } + # "chrom","start","stop","name","event.size","strand","event.type","ref.dist","query.dist","contig.name" + print "$chromi\t$ref_start\t$ref_stop\tAssemblytics_b_$sv_id_counter\t$abs_event_size\t+\t$typeguess\t$rdist\t$qdist\t$qpos\tbetween_alignments\n"; + } + } + } + } + #if ($filter_type ~~ ("primary-allsizes","primary-shortrange") { + # && $typeguess ne "Interchromosomal" && $typeguess ne "Inversion" && $chromi_length < 6 && $chromj_length < 6 && $abs_event_size >= $minimum_event_size) { + } + $candidatesvs++; + #push @{$svstats{$svtype}}, $totaldist; + } + } + } +} + + + +# print "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; +# print STDERR "Processed $numalignments alignments found $candidatefusions fusions and $candidatesvs SVs\n"; + +# foreach my $svtype (keys %svstats) +# { +# my @events = @{$svstats{$svtype}}; +# my $cnt = scalar @events; + +# my $sum = 0.0; +# foreach my $e (@events) +# { +# $sum += $e; +# } + +# my $mean = sprintf ("%0.02f", $sum/$cnt); + +# print "svtype[$svtype]: $cnt 
$mean\n"; +# print STDERR "svtype[$svtype]: $cnt $mean\n"; +# } + + + + + + + + + diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/Assemblytics_uniq_anchor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/Assemblytics_uniq_anchor.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,440 @@ +#! /usr/bin/env python + + +# Author: Maria Nattestad +# Email: mnattest@cshl.edu +# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer. + + +import argparse +import gzip +# from intervaltree import * +import time + +import numpy as np +import operator + + + +def run(args): + filename = args.delta + unique_length = args.unique_length + output_filename = args.out + keep_small_uniques = args.keep_small_uniques + + f = open(filename) + header1 = f.readline() + if header1[0:2]=="\x1f\x8b": + f.close() + f = gzip.open(filename) + + + linecounter = 0 + + current_query_name = "" + current_header = "" + + lines_by_query = {} + header_lines_by_query = {} + + before = time.time() + last = before + + existing_query_names = set() + + for line in f: + if line[0]==">": + + fields = line.strip().split() + current_query_name = fields[1] + current_header = line.strip() + if current_query_name not in existing_query_names: + lines_by_query[current_query_name] = [] + header_lines_by_query[current_query_name] = [] + existing_query_names.add(current_query_name) + else: + fields = line.strip().split() + if len(fields) > 4: + # sometimes start and end are the other way around, but for this they need to be in order + query_min = min([int(fields[2]),int(fields[3])]) + query_max = max([int(fields[2]),int(fields[3])]) + + ########## TESTING ONLY ########### + # lines_by_query[current_query_name] = (query_min,query_max) + # test_list = test_list + [(query_min,query_max)] + ##################################### + + lines_by_query[current_query_name].append((query_min,query_max)) + header_lines_by_query[current_query_name].append(current_header) + # linecounter += 1 + # if linecounter % 10000000 == 0: + # print "%d,%f" % (linecounter, time.time()-last) + # last = time.time() + + + f.close() + + + before = time.time() + alignments_to_keep = {} + num_queries = len(lines_by_query) + + num_query_step_to_report = num_queries/100 + if num_queries < 100: + num_query_step_to_report = num_queries/10 + if num_queries < 10: + num_query_step_to_report = 1 + + query_counter = 0 + + for query in lines_by_query: + + ################ TESTING #################### + + # results_intervaltree = summarize_intervaltree(lines_by_query[query], unique_length_required = unique_length) + # intervaltree_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_intervaltree) + + # results_planesweep = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length) + # planesweep_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_planesweep) + # if intervaltree_filtered_out == planesweep_filtered_out : + # num_matches += 1 + # else: + # num_mismatches += 1 + # print "MISMATCH:" + # print "number of alignments:", len(lines_by_query[query]) + # print "results_intervaltree:" + # print results_intervaltree + # for i in results_intervaltree: + # print lines_by_query[query][i] + # print "results_planesweep:" + # print results_planesweep + # for i in results_planesweep: + # print lines_by_query[query][i] + ################ TESTING #################### + + alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], 
unique_length_required = unique_length,keep_small_uniques=keep_small_uniques) + + query_counter += 1 + + before = time.time() + + + fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length),'w') + + + f = open(filename) + header1 = f.readline() + if header1[0:2]=="\x1f\x8b": + f.close() + f = gzip.open(filename) + header1 = f.readline() + + fout.write(header1) # write the first line that we read already + fout.write(f.readline()) + + linecounter = 0 + + # For filtered delta file: + list_of_alignments_to_keep = [] + alignment_counter = {} + keep_printing = False + + # For coords: + current_query_name = "" + current_query_position = 0 + fcoords_out_tab = open(output_filename + ".coords.tab",'w') + fcoords_out_csv = open(output_filename + ".coords.csv",'w') + fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n") + + + # For basic assembly stats: + ref_sequences = set() + query_sequences = set() + ref_lengths = [] + query_lengths = [] + + f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt","w") + + for line in f: + linecounter += 1 + if line[0]==">": + fields = line.strip().split() + + # For delta file output: + query = fields[1] + list_of_alignments_to_keep = alignments_to_keep[query] + + header_needed = False + for index in list_of_alignments_to_keep: + if line.strip() == header_lines_by_query[query][index]: + header_needed = True + if header_needed == True: + fout.write(line) # if we have any alignments under this header, print the header + alignment_counter[query] = alignment_counter.get(query,0) + + # For coords: + current_reference_name = fields[0][1:] + current_query_name = fields[1] + + current_reference_size = int(fields[2]) + current_query_size = int(fields[3]) + + # For basic assembly stats: + if not current_reference_name in ref_sequences: + ref_lengths.append(current_reference_size) + ref_sequences.add(current_reference_name) + if not current_query_name in query_sequences: + query_lengths.append(current_query_size) + query_sequences.add(current_query_name) + + else: + fields = line.strip().split() + if len(fields) > 4: + # For coords: + ref_start = int(fields[0]) + ref_end = int(fields[1]) + query_start = int(fields[2]) + query_end = int(fields[3]) + csv_tag = "repetitive" + if alignment_counter[query] in list_of_alignments_to_keep: + fout.write(line) + fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n") + csv_tag = "unique" + keep_printing = True + else: + keep_printing = False + fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n") + alignment_counter[query] = alignment_counter[query] + 1 + + elif keep_printing == True: + fout.write(line) + + fcoords_out_tab.close() + fcoords_out_csv.close() + + + ref_lengths.sort() + query_lengths.sort() + + # Assembly statistics + ref_lengths = np.array(ref_lengths) + query_lengths = np.array(query_lengths) + + f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1])) + f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(ref_lengths))) + f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(ref_lengths))) + f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(ref_lengths))) + f_stats_out.write( "Min: %s\n" % 
gig_meg(np.min(ref_lengths))) + f_stats_out.write( "Max: %s\n" % gig_meg(np.max(ref_lengths))) + f_stats_out.write( "N50: %s\n" % gig_meg(N50(ref_lengths))) + f_stats_out.write( "\n\n") + f_stats_out.write( "Query: %s\n" % header1.split()[1].split("/")[-1]) + f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(query_lengths))) + f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(query_lengths))) + f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(query_lengths))) + f_stats_out.write( "Min: %s\n" % gig_meg(np.min(query_lengths))) + f_stats_out.write( "Max: %s\n" % gig_meg(np.max(query_lengths))) + f_stats_out.write( "N50: %s\n" % gig_meg(N50(query_lengths))) + + + f.close() + fout.close() + f_stats_out.close() + +def N50(sorted_list): + # List should be sorted as increasing + + # We flip the list around here so we start with the largest element + cumsum = 0 + for length in sorted_list[::-1]: + cumsum += length + if cumsum >= sum(sorted_list)/2: + return length + + +def gig_meg(number,digits = 2): + gig = 1000000000. + meg = 1000000. + kil = 1000. + + if number > gig: + return str(round(number/gig,digits)) + " Gbp" + elif number > meg: + return str(round(number/meg,digits)) + " Mbp" + elif number > kil: + return str(round(number/kil,digits)) + " Kbp" + else: + return str(number) + " bp" + +def intWithCommas(x): + if type(x) not in [type(0)]: + raise TypeError("Parameter must be an integer.") + if x < 0: + return '-' + intWithCommas(-x) + result = '' + while x >= 1000: + x, r = divmod(x, 1000) + result = ",%03d%s" % (r, result) + return "%d%s" % (x, result) + + +def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False): + + alignments_to_keep = [] + # print len(lines) + + # If no alignments: + if len(lines)==0: + return [] + + # If only one alignment: + if len(lines) == 1: + if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required: + return [0] + else: + return [] + + starts_and_stops = [] + for query_min,query_max in lines: + # print query_min, query_max + starts_and_stops.append((query_min,"start")) + starts_and_stops.append((query_max,"stop")) + + + sorted_starts_and_stops = sorted(starts_and_stops,key=operator.itemgetter(0)) + # print sorted_starts_and_stops + + current_coverage = 0 + last_position = -1 + # sorted_unique_intervals = [] + sorted_unique_intervals_left = [] + sorted_unique_intervals_right = [] + for pos,change in sorted_starts_and_stops: + # print sorted_starts_and_stops[i] + # pos = sorted_starts_and_stops[i][0] + # change = sorted_starts_and_stops[i][1] + + # print pos,change + # First alignment only: + # if last_position == -1: + # last_position = pos + # continue + + # print last_position,pos,current_coverage + + if current_coverage == 1: + # sorted_unique_intervals.append((last_position,pos)) + sorted_unique_intervals_left.append(last_position) + sorted_unique_intervals_right.append(pos) + + if change == "start": + current_coverage += 1 + else: + current_coverage -= 1 + last_position = pos + + + linecounter = 0 + for query_min,query_max in lines: + + i = binary_search(query_min,sorted_unique_intervals_left,0,len(sorted_unique_intervals_left)) + + exact_match = False + if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max: + exact_match = True + sum_uniq = 0 + while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max: + sum_uniq += 
sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i] + i += 1 + + # print query_min,query_max,sum_uniq + if sum_uniq >= unique_length_required: + alignments_to_keep.append(linecounter) + elif keep_small_uniques == True and exact_match == True: + alignments_to_keep.append(linecounter) + # print "Keeping small alignment:", query_min, query_max + # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1] + + linecounter += 1 + + return alignments_to_keep + + + +def binary_search(query, numbers, left, right): + # Returns index of the matching element or the first element to the right + + if left >= right: + return right + mid = (right+left)//2 + + + if query == numbers[mid]: + return mid + elif query < numbers[mid]: + return binary_search(query,numbers,left,mid) + else: # if query > numbers[mid]: + return binary_search(query,numbers,mid+1,right) + + +# def summarize_intervaltree(lines, unique_length_required): + +# alignments_to_keep = [] +# # print len(lines) + +# if len(lines)==0: +# return alignments_to_keep + +# if len(lines) == 1: +# if abs(lines[0][1] - lines[0][0]) >= unique_length_required: +# return [0] + + +# starts_and_stops = [] +# for query_min,query_max in lines: +# starts_and_stops.append((query_min,query_max)) + +# # build full tree +# tree = IntervalTree.from_tuples(starts_and_stops) + + +# # for each interval (keeping the same order as the lines in the input file) +# line_counter = 0 +# for query_min,query_max in lines: + +# # create a tree object from the current interval +# this_interval = IntervalTree.from_tuples([(query_min,query_max)]) + +# # create a copy of the tree without this one interval +# rest_of_tree = tree - this_interval + +# # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one +# for other_interval in rest_of_tree: +# this_interval.chop(other_interval.begin, other_interval.end) + +# # loop through to count the total number of unique basepairs +# total_unique_length = 0 +# for sub_interval in this_interval: +# total_unique_length += sub_interval.end - sub_interval.begin + +# # if the total unique length is above our threshold, add the index to the list we are reporting +# if total_unique_length >= unique_length_required: +# alignments_to_keep.append(line_counter) +# line_counter += 1 + + +# return alignments_to_keep + + +def main(): + parser=argparse.ArgumentParser(description="Filters alignments in delta file based whether each alignment has a unique sequence anchoring it") + parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True) + parser.add_argument("--out",help="output file" ,dest="out", type=str, required=True) + parser.add_argument("--unique-length",help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000" ,dest="unique_length",type=int, default=10000) + parser.add_argument("--keep-small-uniques",help="Keep small aligments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places" ,dest="keep_small_uniques",action="store_true") + parser.set_defaults(func=run) + args=parser.parse_args() + args.func(args) + +if __name__=="__main__": + main() diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/Assemblytics_within_alignment.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/Assemblytics_within_alignment.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,105 @@ +#! 
/usr/bin/env python +import argparse + +import gzip +# Author: Maria Nattestad +# Email: mnattest@cshl.edu +# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer. + + +def run(args): + filename = args.delta + minimum_variant_size = args.minimum_variant_size + + f = open(filename) + header1 = f.readline() + if header1[0:2]=="\x1f\x8b": + f.close() + f = gzip.open(filename) + header1 = f.readline() + + # Ignore the first two lines for now + f.readline() + + linecounter = 0 + + current_reference_name = "" + current_reference_position = 0 + + current_query_name = "" + current_query_position = 0 + + variants = [] + + for line in f: + if line[0]==">": + # linecounter += 1 + # if linecounter > 1: + # break + fields = line.strip().split() + current_reference_name = fields[0][1:] + current_query_name = fields[1] + else: + fields = line.strip().split() + if len(fields) > 4: + # current_reference_position = int(fields[0]) + current_reference_position = min(int(fields[0]),int(fields[1])) + # fields[1] is the reference position at the end of the alignment + # current_query_position = int(fields[2]) + current_query_position = min(int(fields[2]),int(fields[3])) + # fields[3] is the query position at the end of the alignment + else: + tick = int(fields[0]) + if abs(tick) == 1: # then go back and edit the last entry to add 1 more to its size + report = variants[-1] + report[4] = report[4] + 1 # size + if tick > 0: # deletion, moves in reference + report[2] = report[2] + 1 # reference end position + report[7] = report[7] + 1 # reference gap size + current_reference_position += 1 # update reference position after deletion + elif tick < 0: # insertion, moves in query + report[8] = report[8] + 1 # query gap size + report[12] = report[12] + 1 # query end position + current_query_position += 1 # update query position after insertion + else: # report the last one and continue + current_reference_position += abs(tick) - 1 + current_query_position += abs(tick) - 1 + if tick > 0: + size = 1 + # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position+size,len(variants)+1,size,"+","Deletion",size,0,current_query_name,"within_alignment") + report = [current_reference_name,current_reference_position,current_reference_position+size,"Assemblytics_w_"+str(len(variants)+1),size,"+","Deletion",size,0,current_query_name,"within_alignment",current_query_position,current_query_position] + current_reference_position += size # update reference position after deletion + variants.append(report) + elif tick < 0: + size = 1 + # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position,len(variants)+1,size,"+","Insertion",0,size,current_query_name,"within_alignment") + report = [current_reference_name,current_reference_position,current_reference_position,"Assemblytics_w_"+str(len(variants)+1),size,"+","Insertion",0,size,current_query_name,"within_alignment",current_query_position,current_query_position+size] + current_query_position += size # update query position after insertion + variants.append(report) + # TESTING + # print line, report + + + f.close() + + newcounter = 1 + for line in variants: + # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % line + if line[4] >= minimum_variant_size: + line[3] = "Assemblytics_w_%d" % (newcounter) 
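+            # The printed row is BED-like: reference name, start, stop, renumbered ID,
+            # size, strand, type, ref gap size, query gap size, then the query name with
+            # its interval appended as "contig:qstart-qend:+" and the "within_alignment" tag.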
+ print ("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10]) + # print "\t".join(map(str,line)) + newcounter += 1 + + +def main(): + parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment") + parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True) + parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50) + parser.set_defaults(func=run) + args=parser.parse_args() + args.func(args) + +if __name__=="__main__": + main() + diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/LICENSE Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Michael Alonge + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/README.md Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,4 @@ +[![DOI](https://zenodo.org/badge/119861637.svg)](https://zenodo.org/badge/latestdoi/119861637) + + +# RaGOO is no longer supported. Please use [RagTag](https://github.com/malonge/RagTag) instead. diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/filter_gap_SVs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/filter_gap_SVs.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,136 @@ +#!/usr/bin/env python +import re + +from intervaltree import IntervalTree + +from ragoo_utilities.SeqReader import SeqReader + + +class BaseSequence(object): + + def __init__(self, in_sequence): + if not isinstance(in_sequence, str): + raise AttributeError('Only a string can be used to instantiate this class.') + self.sequence = in_sequence.upper() + + +class GapSequence(BaseSequence): + def get_gap_coords(self): + """ Find all of the gap string indices for this sequence. """ + return re.finditer(r'N+', self.sequence) + + +def make_gaps_tree(in_file): + # A dictionary to store an interval tree for each chromosome header. 
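+    # Illustration (not from the original source): for a sequence "ACGTNNNNACGT",
+    # GapSequence.get_gap_coords() matches the run of Ns at positions [4, 8), so the
+    # tree for that header would store the single interval 4:8 -> (4, 8).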
+    all_trees = dict()
+    x = SeqReader(in_file)
+    if in_file.endswith(".gz"):
+        for header, sequence in x.parse_gzip_fasta():
+            # Remove the greater than sign and only get first token if delimited by spaces
+            header = header[1:].split(' ')[0]
+            all_trees[header] = IntervalTree()
+            gap_sequence = GapSequence(sequence)
+            all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
+            for i in all_coordinates:
+                all_trees[header][i[0]:i[1]] = i
+    else:
+        for header, sequence in x.parse_fasta():
+            # Remove the greater than sign and only get first token if delimited by spaces
+            header = header[1:].split(' ')[0]
+            all_trees[header] = IntervalTree()
+            gap_sequence = GapSequence(sequence)
+            all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
+            for i in all_coordinates:
+                all_trees[header][i[0]:i[1]] = i
+    return all_trees
+
+
+def get_query_bed_coords(in_line):
+    c_loc = [m.start() for m in re.finditer(':', in_line)]
+    c1, c2 = c_loc[-1], c_loc[-2]
+    header = in_line[:c2]
+    L1 = in_line[c2+1:c1].split('-')
+    start, stop = int(L1[0]), int(L1[1])
+    return header, start, stop
+
+
+def make_svs_bed(in_trees_q, in_trees_r):
+    final_lines = []
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
+
+        # Make a new header column for the field I am about to create
+        header = f.readline().rstrip() + '\tquery_gap_ovlp'
+        final_lines.append(header)
+
+        # Calculate the percentage overlap for each structural variant.
+        for line in f:
+            pct_ovlp = 0
+            ovlp = 0
+            L1 = line.rstrip().split('\t')
+            head, start, end = get_query_bed_coords(L1[9])
+            query = in_trees_q[head][start:end]
+            if query:
+                for interval in query:
+                    gap_start, gap_end = interval.begin, interval.end
+
+                    # Calculate the amount these two intervals overlap
+                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))
+
+                pct_ovlp = ovlp/(end-start)
+
+            # Add the new value to the original line
+            L1.append(pct_ovlp)
+            L1 = [str(i) for i in L1]
+            final_lines.append('\t'.join(L1))
+
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
+        out_file.write('\n'.join(final_lines))
+
+    # Now repeat but with respect to the reference
+    final_lines = []
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
+        # Make a new header column for the field I am about to create
+        header = f.readline().rstrip() + '\tref_gap_ovlp'
+        final_lines.append(header)
+
+        # Calculate the percentage overlap for each structural variant.
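+        # The per-gap overlap below is max(0, min(end, gap_end) - max(start, gap_start)):
+        # e.g. an SV interval [10, 50) and a gap [40, 60) overlap by 50 - 40 = 10 bp,
+        # while disjoint intervals clamp to 0. Dividing the summed overlap by
+        # (end - start) yields the fraction of the SV interval covered by gap sequence.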
+        for line in f:
+            pct_ovlp = 0
+            ovlp = 0
+            L1 = line.rstrip().split('\t')
+            head, start, end = L1[0], int(L1[1]), int(L1[2])
+            query = in_trees_r[head][start:end]
+            if query:
+                for interval in query:
+                    gap_start, gap_end = interval.begin, interval.end
+
+                    # Calculate the amount these two intervals overlap
+                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))
+
+                pct_ovlp = ovlp / (end - start)
+
+            # Add the new value to the original line
+            L1.append(pct_ovlp)
+            L1 = [str(i) for i in L1]
+            final_lines.append('\t'.join(L1))
+
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
+        out_file.write('\n'.join(final_lines))
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Annotate the SV bed file with the degree to which the SV overlaps with a gap')
+    parser.add_argument("ref_file", metavar="", type=str, help="reference fasta file")
+    args = parser.parse_args()
+    ref_file = args.ref_file
+
+    # Make a new bed file w.r.t the query
+    all_trees_q = make_gaps_tree('../ragoo.fasta')
+    all_trees_r = make_gaps_tree(ref_file)
+    make_svs_bed(all_trees_q, all_trees_r)
+
+
+if __name__ == "__main__":
+    main()
diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/lift_over.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/lift_over.py Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+def get_contig_lengths(in_fai):
+    """ Get contig lengths from a fasta index. """
+    lens = dict()
+    with open(in_fai, 'r') as f:
+        for line in f:
+            L1 = line.rstrip().split('\t')
+            lens[L1[0]] = int(L1[1])
+    return lens
+
+
+def get_contig_orderings(in_groupings):
+    """
+    From the orderings files, return the following:
+
+    1. Dictionary associating a reference header with a list of ordered contig headers
+    2. Dictionary associating a contig with an orientation
+    3. Dictionary associating a contig with its assigned reference header
+    """
+    orderings = dict()
+    orientations = dict()
+    ref_chr = dict()
+
+    # Iterate through the orderings files
+    with open(in_groupings, 'r') as f1:
+        for line1 in f1:
+            header = line1.rstrip().replace('_orderings.txt', '_RaGOO')
+            header = header[header.rfind('/') + 1:]
+
+            # Initialize the list for the orderings
+            orderings[header] = []
+            with open(line1.rstrip(), 'r') as f2:
+                for line2 in f2:
+                    L1 = line2.rstrip().split('\t')
+                    orderings[header].append(L1[0])
+                    orientations[L1[0]] = L1[1]
+                    ref_chr[L1[0]] = header
+    return orderings, orientations, ref_chr
+
+
+def get_reverse_coords(start_coord, end_coord, seq_length):
+    """
+    Returns new genomic coordinates of a region that has undergone reverse complementation.
+    new start coordinate = seqLength - endCoord
+    new end coordinate = seqLength - startCoord
+    """
+    return seq_length - (end_coord - 1), seq_length - (start_coord - 1)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Lift-over gff coordinates from contigs to RaGOO pseudomolecules. '
+                                                 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used. '
+                                                 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.')
+    parser.add_argument("gff", metavar="", type=str, help="Gff file to be lifted-over")
+    parser.add_argument("orderings", metavar="", type=str, help="List of RaGOO 'orderings' files. 
'ls ragoo_output/orderings/* > orderings.fofn'") + parser.add_argument("fai", metavar="", type=str, help="Fasta index for contigs.'") + parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.") + + + # Get the command line arguments + args = parser.parse_args() + gff_file = args.gff + orderings_file = args.orderings + fai_file = args.fai + gap_size = args.g + + # Get the contig lengths + ctg_lens = get_contig_lengths(fai_file) + + # Get the contig orderings and orientations + ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file) + + #Iterate through the GFF features and update + offsets = dict() + with open(gff_file) as f: + for gff_line in f: + if gff_line.startswith("#"): + print(gff_line.rstrip()) + else: + gff_fields = gff_line.rstrip().split('\t') + gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6] + new_header = ctg_chr[gff_header] + + # Check that this contig header is in our list of headers + if gff_header not in ctg_lens.keys(): + err_msg = """ %s was not found in the list of orderings files provided. + Please check that you are using the correct files. If chimeric contig correction was + used, please use the corrected gff and fai file. + """ + raise ValueError(err_msg) + + # Check if the contig has been reverse complemented. Update accordingly + if ctg_orientations[gff_header] == '-': + start, end = get_reverse_coords(start, end, ctg_lens[gff_header]) + + # Set the opposite strand + if strand == '+': + strand = '-' + else: + strand = '+' + + # Check if the offset for this contig has already been calculated + if gff_header in offsets: + offset = offsets[gff_header] + else: + ctg_idx = ctg_orderings[new_header].index(gff_header) + offset = 0 + + for ctg in ctg_orderings[new_header][:ctg_idx]: + offset += ctg_lens[ctg] + offset += gap_size + + # memoize the offset + offsets[gff_header] = offset + + new_start = start + offset + new_end = end + offset + + gff_fields[0] = new_header + gff_fields[3] = str(new_start) + gff_fields[4] = str(new_end) + gff_fields[6] = strand + print('\t'.join(gff_fields)) diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/make_agp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/make_agp.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,90 @@ +#!/usr/bin/env python +import sys +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Produce an AGP v2.0 file describing the ordering and orienting performed by RaGOO.') + parser.add_argument("orderings", metavar="", type=str, help="file of orderings files ($ls ragoo_output/orderings/* > orderings.fofn)") + parser.add_argument("fai", metavar="", type=str, help="index file for contigs ($samtools faidx contigs.fasta)") + parser.add_argument("gap_len", metavar="gap_size", type=int, help="Gap size used for pseudomolecule padding.") + + # Get the command line arguments + args = parser.parse_args() + orderings_fofn = args.orderings + fai_file = args.fai + gap_len = args.gap_len + + # Save the contig orderings + orderings = dict() + with open(orderings_fofn, 'r') as f1: + for line1 in f1: + chrom = line1[line1.rfind("/")+1:line1.rfind("_orderings.txt")] + "_RaGOO" + orderings[chrom] = [] + with open(line1.rstrip(), 'r') as f2: + for line2 in f2: + L1 = line2.rstrip().split('\t') + # Append (ctg_header, orientation) + orderings[chrom].append((L1[0], L1[1])) + + # Get contig lengths + ctg_lens = dict() + with 
open(fai_file, 'r') as f: + for line in f: + L1 = line.split('\t') + ctg_lens[L1[0]] = int(L1[1]) + + # Write the final AGP file + current_pos = 1 + idx = 1 + chrom_idx = 1 + out_buff = [] + + sys.stdout.write("## AGP-version 2.0\n") + sys.stdout.write("## AGP constructed by RaGOO\n") + for chrom in orderings: + if orderings[chrom]: + for ctg, strand in orderings[chrom]: + start = current_pos + end = current_pos + ctg_lens[ctg] - 1 + line_buff = list() + + # Write the line for the contig + line_buff.append(chrom) + line_buff.append(str(start)) + line_buff.append(str(end)) + line_buff.append(str(idx)) + line_buff.append("W") + line_buff.append(ctg) + line_buff.append("1") + line_buff.append(str(end - start + 1)) + line_buff.append(strand) + out_buff.append("\t".join(line_buff)) + + # Write the line for the gap + idx += 1 + current_pos += ctg_lens[ctg] + start = current_pos + end = current_pos + gap_len - 1 + line_buff = list() + + line_buff.append(chrom) + line_buff.append(str(start)) + line_buff.append(str(end)) + line_buff.append(str(idx)) + line_buff.append("N") + line_buff.append(str(gap_len)) + line_buff.append("scaffold") + line_buff.append("yes") + line_buff.append("align_genus") + out_buff.append("\t".join(line_buff)) + + idx += 1 + current_pos += gap_len + + # Pop the last gap since we don't end with padding + out_buff.pop() + idx =1 + chrom_idx += 1 + current_pos = 1 + + sys.stdout.write("\n".join(out_buff) + "\n") \ No newline at end of file diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,759 @@ +#!/usr/bin/env python +from collections import defaultdict +from collections import OrderedDict +import copy + +from intervaltree import IntervalTree + +from ragoo_utilities.PAFReader import PAFReader +from ragoo_utilities.SeqReader import SeqReader +from ragoo_utilities.ReadCoverage import ReadCoverage +from ragoo_utilities.ContigAlignment import ContigAlignment +from ragoo_utilities.ContigAlignment import UniqueContigAlignment +from ragoo_utilities.ContigAlignment import LongestContigAlignment +from ragoo_utilities.GFFReader import GFFReader +from ragoo_utilities.utilities import run, log, reverse_complement, read_contigs, read_gz_contigs +from ragoo_utilities.break_chimera import get_ref_parts, cluster_contig_alns, avoid_gff_intervals, update_gff, break_contig, get_intra_contigs + + +def update_misasm_features(features, breaks, contig, ctg_len): + + # Get ctg len from ReadCoverage object + break_list = [0] + sorted(breaks) + [ctg_len] + borders = [] + for i in range(len(break_list) - 1): + borders.append((break_list[i], break_list[i+1])) + + # Pop the features to be updated + contig_feats = features.pop(contig) + + # Initialize lists for new broken contig headers + for i in range(len(borders)): + features[contig + '_misasm_break:' + str(borders[i][0]) + '-' + str(borders[i][1])] = [] + + t = IntervalTree() + for i in borders: + t[i[0]:i[1]] = i + + for i in contig_feats: + query = t[i.start] + assert len(query) == 1 + break_start = list(query)[0].begin + break_end = list(query)[0].end + query_border = (break_start, break_end) + break_number = borders.index(query_border) + i.seqname = contig + '_misasm_break:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1]) + i.start = i.start - break_start + i.end = i.end - break_start + features[ + contig + '_misasm_break:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i) + + return 
features + + +def remove_gff_breaks(gff_ins, breaks): + """ + Given a list of candidate breakpoints proposed by misassembly correction, remove any such break points that + fall within the interval of a gff feature. This should be called once per contig. + :param gff_ins: List of GFFLines + :param breaks: candidate break points + :return: + """ + # Make an interval tree from the intervals of the gff lines + t = IntervalTree() + for line in gff_ins: + # If the interval is one bp long, skip + if line.start == line.end: + continue + t[line.start:line.end] = (line.start, line.end) + + return [i for i in breaks if not t[i]] + + +def write_misasm_broken_ctgs(contigs_file, breaks, out_prefix, in_gff=None, in_gff_name=None): + current_path = os.getcwd() + os.chdir('ctg_alignments') + + if in_gff and in_gff_name: + with open(in_gff_name, 'w') as f: + for i in in_gff.keys(): + for j in in_gff[i]: + f.write(str(j) + '\n') + + x = SeqReader("../../" + contigs_file) + f = open(out_prefix + ".misasm.break.fa", 'w') + for header, seq in x.parse_fasta(): + header = header[1:] + if header not in breaks: + f.write(">" + header + "\n") + f.write(seq + "\n") + else: + # Break the contig + ctg_len = len(seq) + break_list = [0] + sorted(breaks[header]) + [ctg_len] + for i in range(len(break_list) - 1): + f.write(">" + header + "_misasm_break:" + str(break_list[i]) + "-" + str(break_list[i+1]) + "\n") + f.write(seq[break_list[i]:break_list[i+1]] + "\n") + os.chdir(current_path) + + +def align_misasm_broken(out_prefix): + current_path = os.getcwd() + os.chdir('ctg_alignments') + + ctgs_file = out_prefix + ".misasm.break.fa" + cmd = '{} -k19 -w19 -t{} ../../{} {} ' \ + '> contigs_brk_against_ref.paf 2> contigs_brk_against_ref.paf.log'.format(minimap_path, t, reference_file, + ctgs_file) + if not os.path.isfile('contigs_brk_against_ref.paf'): + run(cmd) + os.chdir(current_path) + + +def write_contig_clusters(unique_dict, thresh, skip_list): + # Get a list of all chromosomes + all_chroms = set([unique_dict[i].ref_chrom for i in unique_dict.keys()]) + current_path = os.getcwd() + output_path = current_path + '/groupings' + if not os.path.exists(output_path): + os.makedirs(output_path) + + os.chdir('groupings') + for i in all_chroms: + open(i + '_contigs.txt', 'w').close() + + for i in unique_dict.keys(): + this_chr = unique_dict[i].ref_chrom + this_confidence = unique_dict[i].confidence + if this_confidence > thresh: + if not i in skip_list: + file_name = str(this_chr) + '_contigs.txt' + with open(file_name, 'a') as f: + f.write(i + '\t' + str(this_confidence) + '\n') + os.chdir(current_path) + + +def clean_alignments(in_alns, l=10000, in_exclude_file='', uniq_anchor_filter=False, merge=False): + # Exclude alignments to undesired reference headers and filter alignment lengths. 
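+    # Filtering steps, in order: drop alignments to excluded reference headers, drop
+    # alignments shorter than l (default 10000), optionally keep only uniquely anchored
+    # alignments, optionally merge alignments, then discard any contig whose alignments
+    # were all removed.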
+ exclude_list = [] + if in_exclude_file: + with open('../' + in_exclude_file) as f: + for line in f: + exclude_list.append(line.rstrip().replace('>', '').split()[0]) + + empty_headers = [] + for header in in_alns.keys(): + in_alns[header].exclude_ref_chroms(exclude_list) + in_alns[header].filter_lengths(l) + if uniq_anchor_filter: + in_alns[header].unique_anchor_filter() + + if merge: + in_alns[header].merge_alns() + + # Check if our filtering has removed all alignments for a contig + if len(in_alns[header].ref_headers) == 0: + empty_headers.append(header) + + for header in empty_headers: + in_alns.pop(header) + return in_alns + + +def read_paf_alignments(in_paf): + # Read in PAF alignments + # Initialize a dictionary where key is contig header, and value is ContigAlignment. + alns = dict() + x = PAFReader(in_paf) + for paf_line in x.parse_paf(): + if paf_line.contig in alns: + alns[paf_line.contig].add_alignment(paf_line) + else: + alns[paf_line.contig] = ContigAlignment(paf_line.contig) + alns[paf_line.contig].add_alignment(paf_line) + return alns + + +def get_contigs_from_groupings(in_file): + contigs = [] + with open(in_file) as f: + for line in f: + contigs.append(line.split('\t')[0]) + return contigs + + +def get_location_confidence(in_ctg_alns): + # Use interval tree to get all alignments with the reference span + # Go through each of them and if any start is less than the min_pos or any end is greater than + # the max_pos, change the borders to those values. Then use the algorithm that Mike gave me. + min_pos = min(in_ctg_alns.ref_starts) + max_pos = max(in_ctg_alns.ref_ends) + t = IntervalTree() + + # Put the reference start and end position for every alignment into the tree + for i in range(len(in_ctg_alns.ref_headers)): + t[in_ctg_alns.ref_starts[i]:in_ctg_alns.ref_ends[i]] = (in_ctg_alns.ref_starts[i], in_ctg_alns.ref_ends[i]) + + overlaps = t[min_pos:max_pos] + if not overlaps: + return 0 + + # If any intervals fall beyond the boundaries, replace the start/end with the boundary it exceeds + ovlp_list = [i.data for i in overlaps] + bounded_list = [] + for i in ovlp_list: + if i[0] < min_pos: + i[0] = min_pos + if i[1] > max_pos: + i[1] = max_pos + bounded_list.append(i) + + # Now can just calculate the total range covered by the intervals + ovlp_range = 0 + sorted_intervals = sorted(bounded_list, key=lambda tup: tup[0]) + max_end = -1 + for j in sorted_intervals: + start_new_terr = max(j[0], max_end) + ovlp_range += max(0, j[1] - start_new_terr) + max_end = max(max_end, j[1]) + + return ovlp_range / (max_pos - min_pos) + + +def order_orient_contigs(in_unique_contigs, in_alns): + current_path = os.getcwd() + output_path = current_path + '/orderings' + if not os.path.exists(output_path): + os.makedirs(output_path) + + # Get longest alignments + longest_contigs = dict() + for i in in_alns.keys(): + # Only consider alignments to the assigned chromosome + uniq_aln = UniqueContigAlignment(in_alns[i]) + best_header = uniq_aln.ref_chrom + ctg_alns = copy.deepcopy(in_alns[i]) + ctg_alns.filter_ref_chroms([best_header]) + longest_contigs[i] = LongestContigAlignment(ctg_alns) + + # Save the orientations + final_orientations = dict() + for i in longest_contigs.keys(): + final_orientations[i] = longest_contigs[i].strand + + # Get the location and orientation confidence scores + orientation_confidence = dict() + location_confidence = dict() + forward_bp = 0 + reverse_bp = 0 + for i in in_alns.keys(): + uniq_aln = UniqueContigAlignment(in_alns[i]) + best_header = uniq_aln.ref_chrom + 
ctg_alns = copy.deepcopy(in_alns[i]) + ctg_alns.filter_ref_chroms([best_header]) + + # Orientation confidence scores + # Every base pair votes for the orientation of the alignment in which it belongs + # Score is # votes for the assigned orientation over all votes + for j in range(len(ctg_alns.ref_headers)): + if ctg_alns.strands[j] == '+': + forward_bp += ctg_alns.aln_lens[j] + else: + reverse_bp += ctg_alns.aln_lens[j] + + if final_orientations[i] == '+': + orientation_confidence[i] = forward_bp / (forward_bp + reverse_bp) + else: + orientation_confidence[i] = reverse_bp / (forward_bp + reverse_bp) + + forward_bp = 0 + reverse_bp = 0 + + # Location confidence + location_confidence[i] = get_location_confidence(ctg_alns) + + all_chroms = set([in_unique_contigs[i].ref_chrom for i in in_unique_contigs.keys()]) + + for this_chrom in all_chroms: + + # Intialize the list of start and end positions w.r.t the query + ref_pos = [] + + groupings_file = 'groupings/' + this_chrom + '_contigs.txt' + contigs_list = get_contigs_from_groupings(groupings_file) + + for i in range(len(contigs_list)): + # There is a scope issue here. Pass this (longest_contigs) to the method explicitly. + ref_pos.append((longest_contigs[contigs_list[i]].ref_start, longest_contigs[contigs_list[i]].ref_end, i)) + + final_order = [contigs_list[i[2]] for i in sorted(ref_pos)] + + # Get ordering confidence + # To do this, get the max and min alignments to this reference chromosome + # Then within that region, what percent of bp are covered + + with open('orderings/' + this_chrom + '_orderings.txt', 'w') as out_file: + for i in final_order: + # Also have a scope issue here. + out_file.write(i + '\t' + final_orientations[i] + '\t' + str(location_confidence[i]) + '\t' + str(orientation_confidence[i]) + '\n') + + +def get_orderings(in_orderings_file): + all_orderings = [] + with open(in_orderings_file) as f: + for line in f: + L1 = line.split('\t') + all_orderings.append((L1[0], L1[1].rstrip())) + return all_orderings + + +def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size, chr0=True): + """ + Need to make a translation table for easy lift-over. 
+ :param in_contigs_file: + :param in_unique_contigs: + :param gap_size: + :return: + """ + # First, read all of the contigs into memory + remaining_contig_headers = [] + all_seqs = OrderedDict() + x = SeqReader('../' + in_contigs_file) + if in_contigs_file.endswith(".gz"): + for header, seq in x.parse_gzip_fasta(): + remaining_contig_headers.append(header.split(' ')[0]) + all_seqs[header.split(' ')[0]] = seq + else: + for header, seq in x.parse_fasta(): + remaining_contig_headers.append(header.split(' ')[0]) + all_seqs[header.split(' ')[0]] = seq + + # Get all reference chromosomes + all_chroms = sorted(list(set([in_unique_contigs[i].ref_chrom for i in in_unique_contigs.keys()]))) + + # Iterate through each orderings file and store sequence in a dictionary + all_pms = dict() + pad = ''.join('N' for i in range(gap_size)) + for this_chrom in all_chroms: + orderings_file = 'orderings/' + this_chrom + '_orderings.txt' + orderings = get_orderings(orderings_file) + if orderings: + seq_list = [] + for line in orderings: + # Mark that we have seen this contig + remaining_contig_headers.pop(remaining_contig_headers.index('>' + line[0])) + if line[1] == '+': + seq_list.append(all_seqs['>' + line[0]]) + else: + assert line[1] == '-' + seq_list.append(reverse_complement(all_seqs['>' + line[0]])) + all_pms[this_chrom] = pad.join(seq_list) + all_pms[this_chrom] += '\n' + + # Get unincorporated sequences and place them in Chr0 + if remaining_contig_headers: + if chr0: + chr0_headers = [] + chr0_seq_list = [] + for header in remaining_contig_headers: + chr0_headers.append(header) + chr0_seq_list.append(all_seqs[header]) + all_pms['Chr0'] = pad.join(chr0_seq_list) + all_pms['Chr0'] += '\n' + + # Write out the list of chr0 headers + f_chr0_g = open('groupings/Chr0_contigs.txt', 'w') + f_chr0_o = open('orderings/Chr0_orderings.txt', 'w') + for i in chr0_headers: + f_chr0_g.write(i[1:] + "\t" + "0" + '\n') + f_chr0_o.write(i[1:] + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n') + f_chr0_g.close() + f_chr0_o.close() + else: + # Instead of making a chromosome 0, add the unplaced sequences as is. 
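+            # Each unplaced contig becomes its own pseudomolecule here, and singleton
+            # groupings/orderings entries are written so downstream tools that read
+            # those files (e.g. lift_over.py, make_agp.py) still see every sequence.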
+ for header in remaining_contig_headers: + all_pms[header[1:]] = all_seqs[header] + "\n" + f_chr0_g = open('groupings/' + header[1:] + '_contigs.txt', 'w') + f_chr0_o = open('orderings/' + header[1:] + '_orderings.txt', 'w') + f_chr0_g.write(header[1:] + "\t" + "0" + '\n') + f_chr0_o.write(header[1:] + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n') + f_chr0_g.close() + f_chr0_o.close() + + # Write the final sequences out to a file + with open('ragoo.fasta', 'w') as f: + for out_header in all_pms: + f.write(">" + out_header + "_RaGOO\n") + f.write(all_pms[out_header]) + + +def write_broken_files(in_contigs, in_contigs_name, in_gff=None, in_gff_name=None): + current_path = os.getcwd() + output_path = current_path + '/chimera_break' + if not os.path.exists(output_path): + os.makedirs(output_path) + + os.chdir('chimera_break') + if in_gff and in_gff_name: + with open(in_gff_name, 'w') as f: + for i in in_gff.keys(): + for j in in_gff[i]: + f.write(str(j) + '\n') + + with open(in_contigs_name, 'w') as f: + for i in in_contigs.keys(): + f.write('>' + i + '\n') + f.write(in_contigs[i] + '\n') + + os.chdir(current_path) + + +def align_breaks(break_type, m_path, in_reference_file, in_contigs_file, in_num_threads): + current_path = os.getcwd() + os.chdir('chimera_break') + if break_type == 'inter': + cmd = '{} -k19 -w19 -t{} ../../{} {} ' \ + '> inter_contigs_against_ref.paf 2> inter_contigs_against_ref.paf.log'.format(m_path, in_num_threads, in_reference_file, in_contigs_file) + if not os.path.isfile('inter_contigs_against_ref.paf'): + run(cmd) + else: + cmd = '{} -k19 -w19 -t{} ../../{} {} ' \ + '> intra_contigs_against_ref.paf 2> intra_contigs_against_ref.paf.log'.format(m_path, in_num_threads, in_reference_file, in_contigs_file) + if not os.path.isfile('intra_contigs_against_ref.paf'): + run(cmd) + + os.chdir(current_path) + + +def align_pms(m_path, num_threads, in_reference_file): + current_path = os.getcwd() + output_path = current_path + '/pm_alignments' + if not os.path.exists(output_path): + os.makedirs(output_path) + os.chdir('pm_alignments') + + cmd = '{} -ax asm5 --cs -t{} ../../{} {} ' \ + '> pm_against_ref.sam 2> pm_contigs_against_ref.sam.log'.format(m_path, num_threads, + in_reference_file, '../ragoo.fasta') + if not os.path.isfile('pm_against_ref.sam'): + run(cmd) + + os.chdir(current_path) + + +def get_SVs(sv_min, sv_max, in_ref_file): + current_path = os.getcwd() + os.chdir('pm_alignments') + # Change this when setup.py is ready. 
Just call script directly + cmd = 'sam2delta.py pm_against_ref.sam' + if not os.path.isfile('pm_against_ref.sam.delta'): + run(cmd) + + cmd_2 = 'Assemblytics_uniq_anchor.py --delta pm_against_ref.sam.delta --unique-length 10000 --out assemblytics_out --keep-small-uniques' + if not os.path.isfile('assemblytics_out.Assemblytics.unique_length_filtered_l10000.delta'): + run(cmd_2) + + cmd_3 = 'Assemblytics_between_alignments.pl assemblytics_out.coords.tab %r %r all-chromosomes exclude-longrange bed > assemblytics_out.variants_between_alignments.bed' %(sv_min, sv_max) + if not os.path.isfile('assemblytics_out.variants_between_alignments.bed'): + run(cmd_3) + + cmd_4 = 'Assemblytics_within_alignment.py --delta assemblytics_out.Assemblytics.unique_length_filtered_l10000.delta --min %r > assemblytics_out.variants_within_alignments.bed' %(sv_min) + if not os.path.isfile('assemblytics_out.variants_within_alignments.bed'): + run(cmd_4) + + header = "reference\tref_start\tref_stop\tID\tsize\tstrand\ttype\tref_gap_size\tquery_gap_size\tquery_coordinates\tmethod\n" + + with open('assemblytics_out.variants_between_alignments.bed', 'r')as f1: + b1 = f1.read() + + with open('assemblytics_out.variants_within_alignments.bed', 'r') as f2: + b2 = f2.read() + + with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as f: + f.write(header) + # Might need to add newlines here + f.write(b1) + f.write(b2) + + # Filter out SVs caused by gaps + cmd_5 = 'filter_gap_SVs.py ../../%s' %(in_ref_file) + run(cmd_5) + + os.chdir(current_path) + + +def align_reads(m_path, num_threads, in_ctg_file, reads, tech='ont'): + current_path = os.getcwd() + output_path = current_path + '/ctg_alignments' + if not os.path.exists(output_path): + os.makedirs(output_path) + os.chdir('ctg_alignments') + + if tech == 'sr': + cmd = '{} -x sr -t{} ../../{} ../../{} ' \ + '> reads_against_ctg.paf 2> reads_against_ctg.paf.log'.format(m_path, num_threads, in_ctg_file, reads) + elif tech == 'corr': + cmd = '{} -x asm10 -t{} ../../{} ../../{} ' \ + '> reads_against_ctg.paf 2> reads_against_ctg.paf.log'.format(m_path, num_threads, in_ctg_file, reads) + else: + raise ValueError("Only 'sr' or 'corr' are accepted for read type.") + + if not os.path.isfile('reads_against_ctg.paf'): + run(cmd) + + os.chdir(current_path) + + +if __name__ == "__main__": + import os + import argparse + + parser = argparse.ArgumentParser(description='order and orient contigs according to minimap2 alignments to a reference (v1.1)') + parser.add_argument("contigs", metavar="", type=str, help="fasta file with contigs to be ordered and oriented (gzipped allowed)") + parser.add_argument("reference", metavar="", type=str, help="reference fasta file (gzipped allowed)") + #parser.add_argument("-o", metavar="PATH", type=str, default="ragoo_output", help="output directory name") + parser.add_argument("-e", metavar="", type=str, default="", help="single column text file of reference headers to ignore") + parser.add_argument("-gff", metavar="", type=str, default='', help="lift-over gff features to chimera-broken contigs") + parser.add_argument("-m", metavar="PATH", type=str, default="minimap2", help='path to minimap2 executable') + parser.add_argument("-b", action='store_true', default=False, help="Break chimeric contigs") + parser.add_argument("-R", metavar="", type=str, default="", help="Turns on misassembly correction. Align provided reads to the contigs to aid misassembly correction. fastq or fasta allowed. Gzipped files allowed. 
Turns off '-b'.") + parser.add_argument("-T", metavar="sr", type=str, default="", help="Type of reads provided by '-R'. 'sr' and 'corr' accepted for short reads and error corrected long reads respectively.") + parser.add_argument("-p", metavar="5", type=int, default=5, help=argparse.SUPPRESS) + parser.add_argument("-l", metavar="10000", type=int, default=10000, help=argparse.SUPPRESS) + parser.add_argument("-r", metavar="100000", type=int, default=100000, help="(with -b) this many bp of >1 reference sequence must be covered for a contig to be considered an interchromosomal chimera.") + parser.add_argument("-c", metavar="1000000", type=int, default=1000000, help="(with -b) distance threshold between consecutive alignments with respect to the contig.") + parser.add_argument("-d", metavar="2000000", type=int, default=2000000, help="(with -b) distance threshold between consecutive alignments with respect to the reference.") + parser.add_argument("-t", metavar="3", type=int, default=3, help="Number of threads when running minimap.") + parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules.") + parser.add_argument("-s", action='store_true', default=False, help="Call structural variants") + parser.add_argument("-a", metavar="50", type=int, default=50, help=argparse.SUPPRESS) + parser.add_argument("-f", metavar="10000", type=int, default=10000, help=argparse.SUPPRESS) + parser.add_argument("-i", metavar="0.2", type=float, default=0.2, help="Minimum grouping confidence score needed to be localized.") + parser.add_argument("-j", metavar="", type=str, default="", help="List of contigs to automatically put in chr0.") + parser.add_argument("-C", action='store_true', default=False, help="Write unplaced contigs individually instead of making a chr0") + + # Get the command line arguments + args = parser.parse_args() + contigs_file = args.contigs + reference_file = args.reference + #output_path = args.o + exclude_file = args.e + minimap_path = args.m + break_chimeras = args.b + gff_file = args.gff + min_break_pct = args.p + min_len = args.l + min_range = args.r + intra_wrt_ref_min = args.d + intra_wrt_ctg_min = args.c + t = args.t + g = args.g + call_svs = args.s + min_assemblytics = args.a + max_assemblytics = args.f + group_score_thresh = args.i + skip_file = args.j + corr_reads = args.R + corr_reads_tech = args.T + make_chr0 = not args.C + + if corr_reads: + log("Misassembly correction has been turned on. 
This automatically inactivates chimeric contig correction.") + break_chimeras = False + + # Make sure that if -R, -T has been specified + if corr_reads and not corr_reads_tech: + raise ValueError("'-T' must be provided when using -R.") + + skip_ctg = [] + if skip_file: + with open(skip_file) as f: + for line in f: + skip_ctg.append(line.rstrip()) + + current_path = os.getcwd() + output_path = current_path + '/ragoo_output' + if not os.path.exists(output_path): + os.makedirs(output_path) + os.chdir(output_path) + + # Run minimap2 + cmd = '{} -k19 -w19 -t{} ../{} ../{} ' \ + '> contigs_against_ref.paf 2> contigs_against_ref.paf.log'.format(minimap_path, t, reference_file, contigs_file) + + if not os.path.isfile('contigs_against_ref.paf'): + run(cmd) + + # Read in the minimap2 alignments just generated + log('Reading alignments') + alns = read_paf_alignments('contigs_against_ref.paf') + alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file) + + # Process the gff file + if gff_file: + log('Getting gff features') + features = defaultdict(list) + z = GFFReader('../' + gff_file) + for i in z.parse_gff(): + features[i.seqname].append(i) + + # Break chimeras if desired + if break_chimeras: + # Record how many contigs are broken + total_inter_broken = 0 + total_intra_broken = 0 + + alns = clean_alignments(alns, l=10000, in_exclude_file=exclude_file, uniq_anchor_filter=True) + # Process contigs + log('Getting contigs') + if contigs_file.endswith(".gz"): + contigs_dict = read_gz_contigs('../' + contigs_file) + else: + contigs_dict = read_contigs('../' + contigs_file) + + log('Finding interchromosomally chimeric contigs') + all_chimeras = dict() + for i in alns.keys(): + ref_parts = get_ref_parts(alns[i], min_len, min_break_pct, min_range) + if len(ref_parts) > 1: + all_chimeras[i] = ref_parts + + log('Finding break points and breaking interchromosomally chimeric contigs') + break_intervals = dict() + for i in all_chimeras.keys(): + break_intervals[i] = cluster_contig_alns(i, alns, all_chimeras[i], min_len) + + # If its just going to break it into the same thing, skip it. 
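+        # (a single break interval spanning the whole contig would reproduce the input unchanged)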
+ if len(break_intervals[i]) <= 1: + continue + + if gff_file: + # If desired, ensure that breakpoints don't disrupt any gff intervals + break_intervals[i] = avoid_gff_intervals(break_intervals[i], features[i]) + features = update_gff(features, break_intervals[i], i) + + # Break contigs according to the final break points + contigs_dict = break_contig(contigs_dict, i, break_intervals[i]) + total_inter_broken += 1 + + # Next, need to re-align before finding intrachromosomal chimeras + # First, write out the interchromosomal chimera broken fasta + out_inter_fasta = contigs_file[:contigs_file.rfind('.')] + '.inter.chimera.broken.fa' + if gff_file: + out_gff = gff_file[:gff_file.rfind('.')] + '.inter.chimera_broken.gff' + write_broken_files(contigs_dict, out_inter_fasta, features, out_gff) + else: + write_broken_files(contigs_dict, out_inter_fasta) + + # Next, realign the chimera broken contigs + align_breaks('inter', minimap_path, reference_file, out_inter_fasta, t) + + # Now, use those new alignments for intrachromosomal chimeras + log('Reading interchromosomal chimera broken alignments') + inter_alns = read_paf_alignments('chimera_break/inter_contigs_against_ref.paf') + inter_alns = clean_alignments(inter_alns, l=1000, in_exclude_file=exclude_file) + + log('Finding intrachromosomally chimeric contigs') + # Find intrachromosomally chimeric contigs + for i in inter_alns.keys(): + intra = get_intra_contigs(inter_alns[i], 15000, intra_wrt_ref_min, intra_wrt_ctg_min) + if intra: + if gff_file: + intra_break_intervals = avoid_gff_intervals(intra[1], features[intra[0]]) + else: + intra_break_intervals = intra[1] + # Check if the avoidance of gff intervals pushed the break point to the end of the contig. + if intra_break_intervals[-1][0] == intra_break_intervals[-1][1]: + continue + + # break the contigs and update features if desired + contigs_dict = break_contig(contigs_dict, intra[0], intra_break_intervals) + total_intra_broken += 1 + + if gff_file: + features = update_gff(features, intra_break_intervals, intra[0]) + + # Write out the intrachromosomal information + out_intra_fasta = contigs_file[:contigs_file.rfind('.')] + '.intra.chimera.broken.fa' + if gff_file: + out_intra_gff = gff_file[:gff_file.rfind('.')] + '.intra.chimera_broken.gff' + write_broken_files(contigs_dict, out_intra_fasta, features, out_intra_gff) + else: + write_broken_files(contigs_dict, out_intra_fasta) + + # Re align the contigs + # Next, realign the chimera broken contigs + align_breaks('intra', minimap_path, reference_file, out_intra_fasta, t) + + # Read in alignments of intrachromosomal chimeras and proceed with ordering and orientation + log('Reading intrachromosomal chimera broken alignments') + alns = read_paf_alignments('chimera_break/intra_contigs_against_ref.paf') + alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file) + contigs_file = '/ragoo_output/chimera_break/' + out_intra_fasta + log('The total number of interchromasomally chimeric contigs broken is %r' % total_inter_broken) + log('The total number of intrachromasomally chimeric contigs broken is %r' % total_intra_broken) + + # Check if misassembly correction is turned on. This is mutually exclusive with chimeric contig correction + if corr_reads: + # Align the raw reads to the assembly. 
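+        # The resulting coverage profile is used below to validate candidate break
+        # points: a break is only kept where read coverage is unusually low or high.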
+ log('Aligning raw reads to contigs') + align_reads(minimap_path, t, contigs_file, corr_reads, corr_reads_tech) + log('Computing contig coverage') + cov_map = ReadCoverage('ctg_alignments/reads_against_ctg.paf') + alns = clean_alignments(alns, l=10000, in_exclude_file=exclude_file, uniq_anchor_filter=True, merge=True) + + # Get the initial candidate break points. + candidate_breaks = dict() + for i in alns: + candidates = alns[i].get_break_candidates() + if candidates: + candidate_breaks[i] = candidates + + # Validate each breakpoint by checking for excessively high or low coverage + # Also, if a gff is provided, check to ensure that we don't break within a gff feature interval + val_candidate_breaks = dict() + for i in candidate_breaks: + candidates = cov_map.check_break_cov(i, candidate_breaks[i]) + if gff_file: + candidates = remove_gff_breaks(features[i], candidates) + if candidates: + val_candidate_breaks[i] = list(set(candidates)) + if gff_file: + features = update_misasm_features(features, val_candidate_breaks[i], i, cov_map.ctg_lens[i]) + + # Break the contigs + if gff_file: + out_misasm_gff = gff_file[:gff_file.rfind('.')] + '.misasm.broken.gff' + write_misasm_broken_ctgs(contigs_file, val_candidate_breaks, contigs_file[:contigs_file.rfind('.')], in_gff=features, in_gff_name=out_misasm_gff) + else: + write_misasm_broken_ctgs(contigs_file, val_candidate_breaks, contigs_file[:contigs_file.rfind('.')]) + + # Align the broken contigs back to the reference + align_misasm_broken(contigs_file[:contigs_file.rfind('.')]) + alns = read_paf_alignments('ctg_alignments/contigs_brk_against_ref.paf') + alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file) + contigs_file = '/ragoo_output/ctg_alignments/' + contigs_file[:contigs_file.rfind('.')] + ".misasm.break.fa" + + # Assign each contig to a corresponding reference chromosome. + log('Assigning contigs') + all_unique_contigs = dict() + for i in alns.keys(): + all_unique_contigs[i] = UniqueContigAlignment(alns[i]) + + # Add to this the list of headers that did not make it + write_contig_clusters(all_unique_contigs, group_score_thresh, skip_ctg) + + log('Ordering and orienting contigs') + order_orient_contigs(all_unique_contigs, alns) + + log('Creating pseudomolecules') + create_pseudomolecules(contigs_file, all_unique_contigs, g, make_chr0) + + if call_svs: + log('Aligning pseudomolecules to reference') + align_pms(minimap_path, t, reference_file) + + log('Getting structural variants') + get_SVs(min_assemblytics, max_assemblytics, reference_file) + + log('goodbye') diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/ContigAlignment.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/ContigAlignment.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,364 @@ +import operator +from collections import defaultdict + +from ragoo_utilities.utilities import summarize_planesweep, binary_search + + +class ContigAlignment: + """ + This object will represent a contig's alignment to a reference chromosome. + Specifically, this class will organize alignments in a one-to-many fashion, where + we will only have one object per contig, and each object will store all of the alignments + of that contig. + + + """ + + def __init__(self, in_contig): + """ + Each object will refer to a single contig via its header name. + + All other alignment metrics will be stored in lists, where the list offsets + correspond to the alignment number. + + Don't actually add any alignments through object instantiation. 
Only use the add_alignment method. + """ + self.contig = in_contig + + self.query_lens = [] + self.query_starts = [] + self.query_ends = [] + self.strands = [] + self.ref_headers = [] + self.ref_lens = [] + self.ref_starts = [] + self.ref_ends = [] + self.num_matches = [] + self.aln_lens = [] + self.mapqs = [] + + self._is_unique_anchor_filtered = False + self._is_merged = False + + def __repr__(self): + return '' + + def __str__(self): + """ Return the alignments in sorted PAF format. """ + self.sort_by_query() + all_alns = [] + for i in range(len(self.ref_headers)): + all_alns.append( + "\t".join([ + self.contig, + str(self.query_lens[i]), + str(self.query_starts[i]), + str(self.query_ends[i]), + self.strands[i], + self.ref_headers[i], + str(self.ref_lens[i]), + str(self.ref_starts[i]), + str(self.ref_ends[i]), + str(self.num_matches[i]), + str(self.aln_lens[i]), + str(self.mapqs[i]) + ]) + ) + return "\n".join(all_alns) + + def get_attr_lens(self): + all_lens = [ + len(self.query_lens), + len(self.query_starts), + len(self.query_ends), + len(self.strands), + len(self.ref_headers), + len(self.ref_lens), + len(self.ref_starts), + len(self.ref_ends), + len(self.num_matches), + len(self.aln_lens), + len(self.mapqs) + ] + return all_lens + + def add_alignment(self, paf_line): + """ + The only way to add alignments for this contig. Only accepts a PAFLine type object which represents + as single line of a paf file. + """ + if not paf_line.contig == self.contig: + raise ValueError('Only can add alignments with query header %s' % self.contig) + + self.query_lens.append(paf_line.query_len) + self.query_starts.append(paf_line.query_start) + self.query_ends.append(paf_line.query_end) + self.strands.append(paf_line.strand) + self.ref_headers.append(paf_line.ref_header) + self.ref_lens.append(paf_line.ref_len) + self.ref_starts.append(paf_line.ref_start) + self.ref_ends.append(paf_line.ref_end) + self.num_matches.append(paf_line.num_match) + self.aln_lens.append(paf_line.aln_len) + self.mapqs.append(paf_line.mapq) + + # Check that all attribute lists have same length + all_lens = self.get_attr_lens() + assert len(set(all_lens)) == 1 + + def has_unique_chr_match(self): + s1 = set(self.ref_headers) + if len(s1) == 1: + return True + return False + + def count_chr_matches(self): + s1 = set(self.ref_headers) + return len(s1) + + def rearrange_alns(self, hits): + """ Generally re structure the alignments according to 'hits', an ordered list of indices. """ + self.query_lens = [self.query_lens[i] for i in hits] + self.query_starts = [self.query_starts[i] for i in hits] + self.query_ends = [self.query_ends[i] for i in hits] + self.strands = [self.strands[i] for i in hits] + self.ref_headers = [self.ref_headers[i] for i in hits] + self.ref_lens = [self.ref_lens[i] for i in hits] + self.ref_starts = [self.ref_starts[i] for i in hits] + self.ref_ends = [self.ref_ends[i] for i in hits] + self.num_matches = [self.num_matches[i] for i in hits] + self.aln_lens = [self.aln_lens[i] for i in hits] + self.mapqs = [self.mapqs[i] for i in hits] + + def filter_ref_chroms(self, in_chroms): + """ + Given a list of chromosomes, mutate this object so as to only include alignments to those chromosomes. 
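+
+        A hypothetical usage sketch (contig and chromosome names are illustrative only):
+
+            ca = ContigAlignment('contig_1')  # populated via add_alignment()
+            ca.filter_ref_chroms(['chr1'])    # keep only alignments to chr1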
+ """ + hits = [] + for i in range(len(self.ref_headers)): + if self.ref_headers[i] in in_chroms: + hits.append(i) + + self.rearrange_alns(hits) + + def sort_by_ref(self): + ref_pos = [] + for i in range(len(self.ref_headers)): + ref_pos.append((self.ref_headers[i], self.ref_starts[i], self.ref_ends[i], i)) + hits = [i[3] for i in sorted(ref_pos)] + + self.rearrange_alns(hits) + + def sort_by_query(self): + q_pos = [] + for i in range(len(self.ref_headers)): + q_pos.append((self.query_starts[i], self.query_ends[i], i)) + hits = [i[2] for i in sorted(q_pos)] + + self.rearrange_alns(hits) + + def exclude_ref_chroms(self, exclude_list): + hits = [] + for i in range(len(self.ref_headers)): + if self.ref_headers[i] not in exclude_list: + hits.append(i) + + self.rearrange_alns(hits) + + def filter_lengths(self, l): + hits = [] + for i in range(len(self.ref_headers)): + if self.aln_lens[i] > l: + hits.append(i) + + self.rearrange_alns(hits) + + def unique_anchor_filter(self): + """ + The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py" + written by Maria Nattestad. The original script can be found here: + + https://github.com/MariaNattestad/Assemblytics + + And the publication associated with Maria's work is here: + + Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a + web analytics tool for the detection of variants from an + assembly." Bioinformatics 32.19 (2016): 3021-3023. + """ + + if not self._is_unique_anchor_filtered: + lines_by_query = [] + for i, j in zip(self.query_starts, self.query_ends): + lines_by_query.append((i, j)) + + hits = summarize_planesweep(lines_by_query, 10000) + self.rearrange_alns(hits) + self._is_unique_anchor_filtered = True + + def merge_alns(self, merge_dist=100000): + """ + Merge chains of alignments that are to the same target, have the same orientation, and are less than + merge_dist away from each other. + :param merge_dist: + :return: + """ + # Sort the alignments + self.sort_by_ref() + + # Might also want to filter out low identity alignments + + # Keep track of which alignments we are comparing + i = 0 + j = 1 + while j < len(self.ref_headers): + if all([ + self.ref_headers[i] == self.ref_headers[j], + self.strands[i] == self.strands[j], + self.ref_starts[j] - self.ref_ends[i] <= merge_dist + ]): + # Merge the alignments in place of the first alignment + self.ref_starts[i] = min(self.ref_starts[i], self.ref_starts[j]) + self.ref_ends[i] = max(self.ref_ends[i], self.ref_ends[j]) + self.query_starts[i] = min(self.query_starts[i], self.query_starts[j]) + self.query_ends[i] = max(self.query_ends[i], self.query_ends[j]) + + self.num_matches[i] += self.num_matches[j] + self.aln_lens[i] = self.ref_ends[i] - self.ref_starts[i] + self.mapqs[i] = (self.mapqs[i] + self.mapqs[j])//2 + + # Remove the redundant alignment + self.query_lens.pop(j) + self.query_starts.pop(j) + self.query_ends.pop(j) + self.strands.pop(j) + self.ref_headers.pop(j) + self.ref_lens.pop(j) + self.ref_starts.pop(j) + self.ref_ends.pop(j) + self.num_matches.pop(j) + self.aln_lens.pop(j) + self.mapqs.pop(j) + else: + i += 1 + j += 1 + + # remove contained alignments. 
contained w.r.t the contig + cont_inds = [] + for i in range(len(self.ref_headers)): + for j in range(len(self.ref_headers)): + # Check if j is contained by i + if i == j: + continue + if self.query_starts[i] <= self.query_starts[j] and self.query_ends[i] >= self.query_ends[j]: + cont_inds.append(j) + + hits = [i for i in range(len(self.ref_headers)) if i not in cont_inds] + self.rearrange_alns(hits) + + self._is_merged = True + + def get_break_candidates(self): + if not self._is_merged: + raise ValueError("Alignments must first be merged.") + + self.sort_by_query() + all_candidates = [] + for i in range(1, len(self.ref_headers)): + all_candidates.append(min(self.query_ends[i-1], self.query_starts[i])) + all_candidates.append(max(self.query_ends[i-1], self.query_starts[i])) + + return all_candidates + + + +class UniqueContigAlignment: + """ + A class representing the reference chromosome to which a particular contig will be assigned to. + At first, the incoming alignments may be to multiple chromosomes, but this class will assign a single + reference chromosome to an input contig. + """ + + def __init__(self, in_contig_aln): + if not isinstance(in_contig_aln, ContigAlignment): + message = 'Can only make a unique alignment from a ContigAlignment object. Got type %s instead.' % str(type(in_contig_aln)) + raise ValueError(message) + + self.contig = in_contig_aln.contig + + # Find the chromosome which was covered the most in these alignments. + self.ref_chrom = None + self.confidence = 0.0 + self._get_best_ref_cov(in_contig_aln) + + def __str__(self): + return '' + + def _get_best_ref_cov(self, alns): + # Get list of all references chromosomes + all_chroms = set(alns.ref_headers) + if len(all_chroms) == 1: + self.ref_chrom = alns.ref_headers[0] + self.confidence = 1 + return + + # Initialize coverage counts for each chromosome + ranges = dict() + for i in all_chroms: + ranges[i] = 0 + + # Get all the ranges in reference to all chromosomes + all_intervals = defaultdict(list) + for i in range(len(alns.ref_headers)): + this_range = (alns.ref_starts[i], alns.ref_ends[i]) + this_chrom = alns.ref_headers[i] + all_intervals[this_chrom].append(this_range) + + # For each chromosome, sort the range and get the union interval length. + # Algorithm and pseudocode given by Michael Kirsche + for i in all_intervals.keys(): + sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0]) + max_end = -1 + for j in sorted_intervals: + start_new_terr = max(j[0], max_end) + ranges[i] += max(0, j[1] - start_new_terr) + max_end = max(max_end, j[1]) + + assert ranges + + # I convert to a list and sort the ranges.items() in order to have ties broken in a deterministic way. + max_chrom = max(sorted(list(ranges.items())), key=operator.itemgetter(1))[0] + self.ref_chrom = max_chrom + + # Now get the confidence of this chromosome assignment + # Equal to the max range over all ranges + self.confidence = ranges[max_chrom]/sum(ranges.values()) + assert self.confidence >= 0 + assert self.confidence <= 1 + + +class LongestContigAlignment: + """ + Given a ContigAlignment object, find the longest alignment with respect to a given references chromosome. + + 1. Filter out any alignments that are not aligned to specified chromosome. + 2. Find longest alignment remaining. 
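+    3. Store that alignment's reference coordinates, length, strand, and interval.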
+ """ + + def __init__(self, in_contig_aln): + self.best_index = self._get_best_index(in_contig_aln) + self.contig = in_contig_aln.contig + self.ref_start = in_contig_aln.ref_starts[self.best_index] + self.ref_end = in_contig_aln.ref_ends[self.best_index] + self.aln_len = in_contig_aln.aln_lens[self.best_index] + self.strand = in_contig_aln.strands[self.best_index] + self.interval = (self.ref_start, self.ref_end) + + def _get_best_index(self, aln): + max_index = -1 + max_len = -1 + for i in range(len(aln.aln_lens)): + if aln.aln_lens[i] > max_len: + max_len = aln.aln_lens[i] + max_index = i + return max_index \ No newline at end of file diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/GFFReader.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/GFFReader.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,43 @@ +#!/usr/bin/env python + + +class GFFLine: + + def __init__(self, in_fields): + self.seqname = in_fields[0] + self.source = in_fields[1] + self.feature = in_fields[2] + self.start = int(in_fields[3]) + self.end = int(in_fields[4]) + self.score = in_fields[5] + self.strand = in_fields[6] + self.frame = in_fields[7] + self.attribute = in_fields[8] + + def __str__(self): + all_atts = [ + self.seqname, + self.source, + self.feature, + self.start, + self.end, + self.score, + self.strand, + self.frame, + self.attribute + ] + all_atts = [str(i) for i in all_atts] + return '\t'.join(all_atts) + + +class GFFReader: + + def __init__(self, in_gff_file): + self.gff_file = in_gff_file + + def parse_gff(self): + with open(self.gff_file) as f: + for line in f: + if not line.startswith('#'): + L1 = line.rstrip().split('\t') + yield GFFLine(L1) \ No newline at end of file diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/PAFReader.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/PAFReader.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,40 @@ +class PAFLine: + """ Object to represent a single alignment in a minimap PAF file. 
""" + + def __init__(self, in_line): + """ + start positions should be before end positions for both query and target + """ + self.line = in_line.rstrip().split('\t') + self.contig = self.line[0] + self.query_len = int(self.line[1]) + self.query_start = int(self.line[2]) + self.query_end = int(self.line[3]) + self.strand = self.line[4] + self.ref_header = self.line[5] + self.ref_len = int(self.line[6]) + self.ref_start = int(self.line[7]) + self.ref_end = int(self.line[8]) + self.num_match = int(self.line[9]) + self.aln_len = int(self.line[10]) + self.mapq = int(self.line[11]) + + assert self.query_start <= self.query_end + assert self.ref_start <= self.ref_end + + def __str__(self): + return '\t'.join(self.line) + + def __eq__(self, other): + return self.line == other.line + + +class PAFReader: + + def __init__(self, paf_file): + self.paf_file = paf_file + + def parse_paf(self): + with open(self.paf_file) as f: + for line in f: + yield PAFLine(line) \ No newline at end of file diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/ReadCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/ReadCoverage.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,212 @@ +import bisect + +import numpy as np + + +class ReadCoverage: + + def __init__(self, in_paf): + self.paf = in_paf + self.glob_mean = None + self.glob_std = None + self.coverage_map = dict() + self.ctg_lens = dict() + self._make_coverage_map() + self._get_glob_mean() + + @staticmethod + def _tabulate_coverage(cov_list): + current_coverage = 0 + coverage_list = [] + seen = set() + + for header, pos in cov_list: + if header in seen: + current_coverage -= 1 + coverage_list.append((pos, current_coverage)) + seen.remove(header) + else: + current_coverage += 1 + coverage_list.append((pos, current_coverage)) + seen.add(header) + return coverage_list + + @staticmethod + def _smooth_supported_breaks(sup_breaks, break_types): + """ + If there are multiple low coverage breaks in close proximity, + merge it into one break at the lowest coverage point. + """ + i = 0 + j = 1 + while j < len(sup_breaks): + if break_types[i] == 'l' and break_types[j] == 'l': + if abs(sup_breaks[i][0] - sup_breaks[j][0]) < 100000: + # Merge these two break points + sup_breaks[i] = min([sup_breaks[i], sup_breaks[j]], key=lambda x: x[1]) + sup_breaks.pop(j) + else: + i += 1 + j += 1 + else: + i += 1 + j += 1 + + return [z[0] for z in sup_breaks] + + def _trim_ends(self, dist=25000): + """ Remove the ends of the contigs from the coverage map. """ + for i in self.coverage_map: + # Start with the beginning of the contig + start_idx = 0 + end_idx = 0 + for j in range(len(self.coverage_map[i])): + if self.coverage_map[i][j][0] < dist: + start_idx = j + + if self.coverage_map[i][j][0] > self.ctg_lens[i] - dist: + end_idx = j-1 + break + self.coverage_map[i] = self.coverage_map[i][start_idx:end_idx] + + # Remove contigs which don't have coverage info. + header_keys = list(self.coverage_map.keys()) + for i in header_keys: + if not self.coverage_map[i]: + self.coverage_map.pop(i) + + def _make_coverage_map(self): + """ + Populate self.coverage_map. This is a dictionary that associates each contig header with a list of alignment + positions and their coverage levels. + """ + # Associate with each contig header, a list of (query header, start), (query header, end) + alns_pos = dict() + with open(self.paf, 'r') as f: + for line in f: + L1 = line.rstrip().split('\t') + + # Only consider an alignment if at least 75% of the read aligned. 
+ if abs((int(L1[3]) - int(L1[2])))/int(L1[1]) >= 0.75: + if L1[5] in alns_pos: + alns_pos[L1[5]].append((L1[0], int(L1[7]))) + alns_pos[L1[5]].append((L1[0], int(L1[8]))) + else: + alns_pos[L1[5]] = [(L1[0], int(L1[7])), (L1[0], int(L1[8]))] + self.ctg_lens[L1[5]] = int(L1[6]) + + # Sort these coverage positions and get the coverage map + for i in alns_pos: + self.coverage_map[i] = self._tabulate_coverage(sorted(alns_pos[i], key=lambda x: x[1])) + + self._trim_ends() + + def _get_glob_mean(self): + L1 = [] + for i in self.coverage_map: + + # In the case where we have multiple coverage values for one position, take the last one. + last_pos = 0 + curr_val = 0 + for j in self.coverage_map[i]: + if j[0] == last_pos: + curr_val = j[1] + else: + last_pos = j[0] + L1.append(curr_val) + cur_val = j[1] + + L1 = np.asarray(L1, dtype=np.int32) + self.glob_mean = np.median(L1) + self.glob_std = np.sqrt(self.glob_mean) + + def _get_index_range(self, header, start_ind, distance): + """ + Get the list of indices that are contained within a distance around the start index + """ + all_inds = [] + low_counter = 1 + + # Check if the start point is at the end of the contig + if start_ind == len(self.coverage_map[header]): + start_ind -= 1 + + start_pos = self.coverage_map[header][start_ind][0] + is_low = False + + # Get all coverage map indices representing regions 50kbp upstream of the start + while not is_low: + next_ind = start_ind-low_counter + if next_ind < 0: + is_low = True + else: + next_pos = self.coverage_map[header][next_ind][0] + if start_pos - next_pos > distance: + is_low = True + else: + all_inds.append(next_ind) + low_counter += 1 + + # Repeat for 50kbp downstream + high_counter = 1 + is_high = False + while not is_high: + next_ind = start_ind + high_counter + if next_ind >= len(self.coverage_map[header]): + is_high = True + else: + next_pos = self.coverage_map[header][next_ind][0] + if next_pos - start_pos > distance: + is_high = True + else: + all_inds.append(next_ind) + high_counter += 1 + + return sorted(all_inds) + + def check_break_cov(self, header, in_breaks, min_cov=None, max_cov=None): + """ + Given a list of potential break points, verify if those break points occur around low or high coverage + areas. If so, replace the candidate break point with the low/high coverage break point. + :param header: contig header for these breaks + :param in_breaks: list of candidate break points + :param min_cov: break at coverage levels below this value + :param max_cov: break at coverage levels above this value + :return: list of real break points, or empty list if not breaking is recommended + """ + # Check that we have coverage info for this contig. + if header not in self.coverage_map: + return [] + + if min_cov is None or max_cov is None: + # Automatically calculate min and max coverage + min_cov = max(self.glob_mean - (self.glob_std*3), 0) + max_cov = self.glob_mean + (self.glob_std*3) + + supported_breaks = [] + break_types = [] + + for i in in_breaks: + # Get the coverage for the position closest to this potential break point + ins_ind = bisect.bisect_left(self.coverage_map[header], (i, 0)) + # Get the coverage for positions within 50 kbp of the candidate coverage position + # Exclude positions near the ends of the sequence. 
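+            # (_trim_ends has already removed coverage positions within 25 kbp of each contig end)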
+ ind_range = self._get_index_range(header, ins_ind, 50000) + + if len(set(ind_range)) > 1: + # Check for low coverage + lowest_cov = min(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1]) + if lowest_cov[1] < min_cov: + supported_breaks.append(lowest_cov) + break_types.append('l') + continue + + # Check for high coverage + highest_cov = max(self.coverage_map[header][ind_range[0]:ind_range[-1]], key=lambda x: x[1]) + if highest_cov[1] > max_cov: + supported_breaks.append(highest_cov) + break_types.append('h') + + return self._smooth_supported_breaks(supported_breaks, break_types) + + diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/SeqReader.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/SeqReader.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,76 @@ +import gzip + + +class SeqReader: + + def __init__(self, in_file): + """ + Initialize sequence file to be parsed. + :param in_file: + """ + if not isinstance(in_file, str): + raise AttributeError('Only a string can be used to instantiate a SeqReader object.') + self.in_file = in_file + + def parse_fasta(self): + """ + Generator yielding header and sequence, for each sequence + in the fasta file sent to the class. + """ + with open(self.in_file) as fasta_file: + sequence = '' + # Find first header. + line = fasta_file.readline() + while not line.startswith('>'): + line = fasta_file.readline() + if not line: + error = """ This file provided is not in proper fasta format. + In addition to the usual fasta conventions, be sure that there are + no blank lines in the file. + """ + raise RuntimeError(error) + header = line.rstrip() + + # Get sequence associated with that header. + for line in fasta_file: + if line.startswith('>'): + # Once the sequence is over, (next header begins), + # yield initial header and sequence. + yield header, sequence + header = line.rstrip() + sequence = '' + else: + sequence += ''.join(line.rstrip().split()) + yield header, sequence + + def parse_gzip_fasta(self): + """ + Generator yielding header and sequence, for each sequence + in the fasta file sent to the class. For gzipped fasta files. + """ + with gzip.open(self.in_file) as fasta_file: + sequence = '' + # Find first header. + line = fasta_file.readline().decode('utf-8') + while not line.startswith('>'): + line = fasta_file.readline().decode('utf-8') + if not line: + error = """ This file provided is not in proper fasta format. + In addition to the usual fasta conventions, be sure that there are + no blank lines in the file. + """ + raise RuntimeError(error) + header = line.rstrip() + + # Get sequence associated with that header. + for line in fasta_file: + line = line.decode('utf-8') + if line.startswith('>'): + # Once the sequence is over, (next header begins), + # yield initial header and sequence. 
+ yield header, sequence + header = line.rstrip() + sequence = '' + else: + sequence += ''.join(line.rstrip().split()) + yield header, sequence \ No newline at end of file diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__init__.py diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/ContigAlignment.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/ContigAlignment.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/GFFReader.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/GFFReader.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/PAFReader.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/PAFReader.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/ReadCoverage.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/ReadCoverage.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/SeqReader.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/SeqReader.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/__init__.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/__init__.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/break_chimera.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/break_chimera.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/__pycache__/utilities.cpython-39.pyc Binary file RaGOO/ragoo_utilities/__pycache__/utilities.cpython-39.pyc has changed diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/break_chimera.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/break_chimera.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,246 @@ +from collections import defaultdict +import copy + +from intervaltree import IntervalTree + +from ragoo_utilities.ContigAlignment import UniqueContigAlignment + + +def get_ref_parts(alns, l, p, r): + """ + + :param alns: A ContigAlignment object for the contig in question + :param l: Minimum alignment length to consider + :param p: Minimum percentage of a reference chromosome that must be covered to be considered significant + :param r: Minimum number of reference chromosome nucleotides needed to be covered to be considered significant + :return: A list of reference chromosomes to which the input contig significantly aligns to + """ + all_intervals = defaultdict(list) + + # Iterate over all alignments + for i in range(len(alns.ref_headers)): + if alns.aln_lens[i] < l: + continue + + all_intervals[alns.ref_headers[i]].append((alns.ref_starts[i], alns.ref_ends[i])) + + ranges = dict() + for i in all_intervals.keys(): + ranges[i] = 0 + + # For each chromosome, sort the range and get the union interval length. 
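+    # e.g. the intervals (0, 100) and (50, 120) cover a union of 120 bp, not 170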
+ # Algorithm and pseudocode given by Michael Kirsche + for i in all_intervals.keys(): + sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0]) + max_end = -1 + for j in sorted_intervals: + start_new_terr = max(j[0], max_end) + ranges[i] += max(0, j[1] - start_new_terr) + max_end = max(max_end, j[1]) + + total_sum = sum(ranges.values()) + chimeric_refs = [] + for i in ranges.keys(): + if (ranges[i]/total_sum)*100 > float(p) and ranges[i] > r: + chimeric_refs.append(i) + + return chimeric_refs + + +def cluster_contig_alns(contig, alns, chroms, l): + ctg_alns = alns[contig] + + # Filter contig alignments so as to only include alignments to desired chromosomes + ctg_alns.filter_ref_chroms(chroms) + + # Intialize the list of start and end positions w.r.t the query + query_pos = [] + + for i in range(len(ctg_alns.ref_headers)): + query_pos.append((ctg_alns.query_starts[i], ctg_alns.query_ends[i], i)) + + final_order = [i[2] for i in sorted(query_pos)] + final_refs = [] + for i in final_order: + final_refs.append(ctg_alns.ref_headers[i]) + + return get_borders(final_refs, ctg_alns, final_order) + + +def get_borders(ordered_refs, alns, order): + borders = [0] + current_ref = ordered_refs[0] + for i in range(1, len(ordered_refs)-1): + if ordered_refs[i] != current_ref and ordered_refs[i+1] != current_ref: + current_ref = ordered_refs[i] + borders.append(alns.query_ends[order[i-1]]) + + borders.append(alns.query_lens[0]) + intervals = [(borders[i], borders[i+1]) for i in range(len(borders)-1)] + return intervals + + +def avoid_gff_intervals(borders, gff_ins): + """ + :param borders: + :param gff_ins: all features + :return: + """ + + # Make an interval tree from the intervals of features associated with this contig + t = IntervalTree() + for line in gff_ins: + # If the interval is one bp long, skip + if line.start == line.end: + continue + t[line.start:line.end] = (line.start, line.end) + + new_borders = [] + for i in borders: + # Returns an empty set if a border does not fall in any interval. + # Otherwise, returns a list of Interval objects + interval_query = t[i[0]] + if interval_query: + while interval_query: + # Make new border larger than the max end position of all intervals + i = (i[0] + 1, i[1]) + interval_query = t[i[0]] + + interval_query = t[i[1]] + if interval_query: + while interval_query: + # Make new border larger than the max end position of all intervals + i = (i[0], i[1] + 1) + interval_query = t[i[1]] + + new_borders.append(i) + + new_borders = new_borders[:-1] + [(new_borders[-1][0], borders[-1][1])] + return new_borders + + +def update_gff(features, borders, contig): + # Pop the features to be updated + contig_feats = features.pop(contig) + + # Initialize lists for new broken contig headers + for i in range(len(borders)): + features[contig + '_chimera_broken:' + str(borders[i][0]) + '-' + str(borders[i][1])] = [] + + # Could just use binary search instead of this tree since intervals are non-overlapping. 
+ # but input so small both are trivial + t = IntervalTree() + for i in borders: + t[i[0]:i[1]] = i + + for i in contig_feats: + query = t[i.start] + assert len(query) == 1 + break_start = list(query)[0].begin + break_end = list(query)[0].end + query_border = (break_start, break_end) + break_number = borders.index(query_border) + i.seqname = contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1]) + i.start = i.start - break_start + i.end = i.end - break_start + features[contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i) + + return features + + +def break_contig(contigs_dict, header, break_points): + seq = contigs_dict.pop(header) + test_seq = '' + for i in range(len(break_points)): + contigs_dict[header + '_chimera_broken:' + str(break_points[i][0]) + '-' + str(break_points[i][1])] = seq[break_points[i][0]: break_points[i][1]] + test_seq += seq[break_points[i][0]: break_points[i][1]] + + assert test_seq == seq + return contigs_dict + + +def get_intra_contigs(alns, l, d, c): + """ + Flag contigs as being intrachromosomal chimeras + :param alns: + :param l: Minimum alignment length to consider + :param d: Distance between consecutive adjacent alignments with respect to the reference. If larger than this, flag + :param c: Distance between consecutive adjacent alignments with respect to the query. If larger than this, flag + :return: dict of contigs and break points. + """ + + # Get only the header to which this contig mostly aligns to and filter out smaller alignments. + uniq_aln = UniqueContigAlignment(alns) + best_header = uniq_aln.ref_chrom + ctg_alns = copy.deepcopy(alns) + ctg_alns.filter_ref_chroms([best_header]) + ctg_alns.filter_lengths(l) + + # If there are no longer any alignments after length filtering, give up + if not len(ctg_alns.ref_headers): + return + + # Sort the alignments with respect to the reference start and end positions. + ctg_alns.sort_by_ref() + + # Make a list of distance between alignments + # first with respect to (wrt) the reference. + distances_wrt_ref = [] + for i in range(len(ctg_alns.ref_headers)-1): + distances_wrt_ref.append(ctg_alns.ref_starts[i+1] - ctg_alns.ref_starts[i]) + + # next, with respect to (wrt) the contig. + distances_wrt_ctg = [] + for i in range(len(ctg_alns.ref_headers) - 1): + distances_wrt_ctg.append(abs(ctg_alns.query_starts[i + 1] - ctg_alns.query_starts[i])) + + # Next, assign the following two identities. + # 1. When ordered by the reference, the alignments start at the beginning or the end of the query + # 2. For the alignment which will be broken on, is it on the forward or reverse strand. + + is_query_start = True + + if ctg_alns.query_starts[0] >= ctg_alns.query_lens[0]/5: + is_query_start = False + + # This conditional essentially checks if there are any break points for this contig. + # Returns None otherwise (no return statement) + if distances_wrt_ref: + if max(distances_wrt_ref) > d: + gap_index = distances_wrt_ref.index(max(distances_wrt_ref)) + a_alns_strands = ctg_alns.strands[:gap_index] + if is_query_start: + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. 
+ return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])]) + else: + # The first subcontig starts at the end of the contig + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index]), (ctg_alns.query_starts[gap_index], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. + return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])]) + + if max(distances_wrt_ctg) > c: + gap_index = distances_wrt_ctg.index(max(distances_wrt_ctg)) + 1 + a_alns_strands = ctg_alns.strands[:gap_index] + if is_query_start: + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. + return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])]) + else: + # The first subcontig starts at the end of the contig + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index-1]), (ctg_alns.query_starts[gap_index-1], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. + return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])]) + diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/get_contig_borders.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/get_contig_borders.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,41 @@ + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='given and orderings file and a contigs fasta index, print a bed file of contig placements in the pseudomolecules.') + parser.add_argument("orderings", metavar="", type=str, help="orderings file from RaGOO") + parser.add_argument("fai", metavar="", type=str, help="index file for contigs (samtools faidx contigs.fasta)") + parser.add_argument("gap_len", metavar="100", type=int, help="Gap size used for pseudomolecule padding.") + + # Get the command line arguments + args = parser.parse_args() + orderings_file = args.orderings + fai_file = args.fai + gap_len = args.gap_len + + # Save the contig orderings + ctgs = [] + with open(orderings_file, 'r') as f: + for line in f: + ctgs.append(line.rstrip().split('\t')[0]) + + # Get contig lengths + ctg_lens = dict() + with open(fai_file, 'r') as f: + for line in f: + L1 = line.split('\t') + ctg_lens[L1[0]] = int(L1[1]) + + # Get contig borders + final_bed = [] + current_pos = 0 + + for ctg in ctgs: + start = current_pos + end = current_pos + ctg_lens[ctg] + current_pos += ctg_lens[ctg] + current_pos += gap_len + pm_header = orderings_file[orderings_file.rfind('/')+1:orderings_file.rfind('_')] + '_RaGOO' + final_bed.append('%s\t%r\t%r' % (pm_header, start, end)) + + print('\n'.join(final_bed)) \ No newline at end of file diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/get_ragoo_stats.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/get_ragoo_stats.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,55 @@ +import argparse + +parser = argparse.ArgumentParser(description='Summary stats about contig 
scaffolding with RaGOO.') +parser.add_argument("index", metavar="", type=str, help="Samtools fasta index file for input contigs. If chimera breaking mode was used, this must be" + "the index file of the chimera broken contigs, which can be found in ragoo_output/chimera_break." + "The correct file to use is the file with the .intra.chimera.broken.fa suffix.") +parser.add_argument("groupings", metavar="", type=str, help="file of file names for all *_groupings.txt produced by RaGOO. Single column with full path to each grouping file.") + +args = parser.parse_args() +contigs_index = args.index +grouping_fofn = args.groupings + +remaining_ctg = [] +all_ctg_len = dict() +with open(contigs_index) as f: + for line in f: + L1 = line.split('\t') + all_ctg_len[L1[0]] = int(L1[1]) + remaining_ctg.append(L1[0]) + +grouping_files = [] +with open(grouping_fofn) as f: + for line in f: + grouping_files.append(line.rstrip()) + +num_ctg_localized = 0 +num_bp_localized = 0 + +for group_file in grouping_files: + with open(group_file) as f: + for line in f: + L1 = line.split('\t') + header = L1[0].rstrip() + num_ctg_localized += 1 + num_bp_localized += all_ctg_len[header] + assert header in remaining_ctg + remaining_ctg.pop(remaining_ctg.index(header)) + +num_ctg_unlocalized = 0 +num_bp_unlocalized = 0 +for ctg in remaining_ctg: + num_ctg_unlocalized += 1 + num_bp_unlocalized += all_ctg_len[ctg] + +print('%r contigs were localized by RaGOO' %(num_ctg_localized)) +print('%r bp were localized by RaGOO' %(num_bp_localized)) +print('%r contigs were unlocalized by RaGOO' %(num_ctg_unlocalized)) +print('%r bp were unlocalized by RaGOO' %(num_bp_unlocalized)) + +print('%r %% of contigs were localized by RaGOO' %((num_ctg_localized/(num_ctg_localized + num_ctg_unlocalized))*100)) +print('%r %% of bp were localized by RaGOO' %((num_bp_localized/(num_bp_localized + num_bp_unlocalized))*100)) + + + + diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/ragoo_utilities/utilities.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/utilities.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,152 @@ +import time +import subprocess +import operator + +from ragoo_utilities.SeqReader import SeqReader + +""" A collection of various helper functions""" + +complements = str.maketrans("ACGTNURYSWKMBVDHacgtnuryswkmbvdh", "TGCANAYRSWMKVBHDtgcanayrswmkvbhd") + + +def reverse_complement(seq): + """ + Reverse complement a nucleotide sequence. + :param seq: Sequence to be reverse complemented + :return: A reverse complemented sequence + """ + return seq.translate(complements)[::-1] + + +def run(cmnd): + """ Run command and report status. """ + log('Running : %s' % cmnd) + if subprocess.call(cmnd, shell=True, executable='/bin/bash') != 0: + raise RuntimeError('Failed : %s ' % cmnd) + + +def log(message): + """ Log messages to standard output. """ + print(time.ctime() + ' --- ' + message, flush=True) + + +def read_contigs(in_file): + d = dict() + x = SeqReader(in_file) + for header, seq in x.parse_fasta(): + d[header.replace('>', '').split(' ')[0]] = seq + return d + + +def read_gz_contigs(in_file): + d = dict() + x = SeqReader(in_file) + for header, seq in x.parse_gzip_fasta(): + d[header.replace('>', '').split(' ')[0]] = seq + return d + + +def binary_search(query, numbers, left, right): + """ + The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py" + written by Maria Nattestad. 
The original script can be found here: + + https://github.com/MariaNattestad/Assemblytics + + And the publication associated with Maria's work is here: + + Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a + web analytics tool for the detection of variants from an + assembly." Bioinformatics 32.19 (2016): 3021-3023. + """ + # Returns index of the matching element or the first element to the right + + if left >= right: + return right + mid = (right + left) // 2 + + if query == numbers[mid]: + return mid + elif query < numbers[mid]: + return binary_search(query, numbers, left, mid) + else: # if query > numbers[mid]: + return binary_search(query, numbers, mid + 1, right) + + +def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False): + """ + The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py" + written by Maria Nattestad. The original script can be found here: + + https://github.com/MariaNattestad/Assemblytics + + And the publication associated with Maria's work is here: + + Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a + web analytics tool for the detection of variants from an + assembly." Bioinformatics 32.19 (2016): 3021-3023. + """ + alignments_to_keep = [] + # print len(lines) + + # If no alignments: + if len(lines) == 0: + return [] + + # If only one alignment: + if len(lines) == 1: + if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required: + return [0] + else: + return [] + + starts_and_stops = [] + for query_min, query_max in lines: + # print query_min, query_max + starts_and_stops.append((query_min, "start")) + starts_and_stops.append((query_max, "stop")) + + sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0)) + + current_coverage = 0 + last_position = -1 + sorted_unique_intervals_left = [] + sorted_unique_intervals_right = [] + for pos, change in sorted_starts_and_stops: + + if current_coverage == 1: + # sorted_unique_intervals.append((last_position,pos)) + sorted_unique_intervals_left.append(last_position) + sorted_unique_intervals_right.append(pos) + + if change == "start": + current_coverage += 1 + else: + current_coverage -= 1 + last_position = pos + + linecounter = 0 + for query_min, query_max in lines: + + i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left)) + + exact_match = False + if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max: + exact_match = True + sum_uniq = 0 + while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and \ + sorted_unique_intervals_right[i] <= query_max: + sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i] + i += 1 + + # print query_min,query_max,sum_uniq + if sum_uniq >= unique_length_required: + alignments_to_keep.append(linecounter) + elif keep_small_uniques == True and exact_match == True: + alignments_to_keep.append(linecounter) + # print "Keeping small alignment:", query_min, query_max + # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1] + + linecounter += 1 + + return alignments_to_keep diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/sam2delta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/sam2delta.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,239 @@ +#!/usr/bin/env python +import argparse +from collections import defaultdict + +""" +This utility converts a SAM file to a nucmer delta file. 
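+
+The delta format is the native alignment format of the MUMmer suite (the output
+format of nucmer); see write_delta() below for the exact records emitted.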
diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/sam2delta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/sam2delta.py Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+import argparse
+from collections import defaultdict
+
+"""
+This utility converts a SAM file to a nucmer delta file.
+
+SAM files must contain an NM tag, which minimap2 alignments include by default.
+"""
+
+
+class SAMAlignment:
+
+    def __init__(self, in_ref_header, in_query_header, in_ref_start, in_cigar, in_flag, in_seq_len, in_num_mismatches):
+        self.num_mismatches = in_num_mismatches
+        self.seq_len = in_seq_len
+        self.flag = int(in_flag)
+        self.is_rc = False
+
+        # Check if the query was reverse complemented (SAM flag bit 16)
+        if self.flag & 16:
+            self.is_rc = True
+
+        self.cigar = in_cigar
+        self.parsed_cigar = []
+        self._parse_cigar()
+
+        if self.is_rc:
+            self.parsed_cigar = self.parsed_cigar[::-1]
+
+        self.ref_header = in_ref_header
+        self.query_header = in_query_header
+        self.ref_start = int(in_ref_start)
+        self.ref_end = None
+        self.query_start = None
+        self.query_end = None
+        self.query_aln_len = 0
+        self.ref_aln_len = 0
+
+        num_hard_clipped = 0
+        for i in self.parsed_cigar:
+            if i[-1] == 'H':
+                num_hard_clipped += int(i[:-1])
+
+        # A SEQ field of '*' has length 1; in that case, reconstruct the
+        # sequence length from the CIGAR string instead.
+        if self.seq_len == 1:
+            self.seq_len = 0
+            for i in self.parsed_cigar:
+                if i[-1] == 'M' or i[-1] == 'I' or i[-1] == 'S' or i[-1] == 'X' or i[-1] == '=':
+                    self.seq_len += int(i[:-1])
+
+        self.query_len = self.seq_len + num_hard_clipped
+
+        self._get_query_start()
+        self._get_query_aln_len()
+        self._get_ref_aln_len()
+        self._get_query_end()
+        self._get_ref_end()
+
+        if self.is_rc:
+            self.query_start, self.query_end = self.query_end, self.query_start
+            # Lazily taking care of off-by-one errors
+            self.query_end += 1
+            self.query_start -= 1
+
+        self.query_end -= 1
+        self.ref_end -= 1
+
+    def __str__(self):
+        return ' '.join([self.ref_header, self.query_header])
+
+    def __repr__(self):
+        return '<' + ' '.join([self.ref_header, self.query_header]) + '>'
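+    # Illustrative example (added comment, not in the original source):
+    # _parse_cigar() below splits a CIGAR string at each operation code, so
+    # "5S10M2D3M" becomes ['5S', '10M', '2D', '3M'], and _get_query_start()
+    # then reports query_start == 6, the first base after the 5 soft-clipped
+    # bases.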
+    def _get_query_start(self):
+        """
+        An alignment will either start with soft clipping, hard clipping, or with one of the alignment
+        codes (e.g. match (M) or indel (I/D)). If hard or soft clipped, the start of the alignment is
+        the very next base after clipping. Otherwise, the alignment starts at the first base of the query.
+
+        Notes:
+        At least for now, this is only specified for minimap2 output, which should only contain the
+        following characters in its CIGAR strings:
+        {'I', 'S', 'M', 'D', 'H'}
+        so only these cases are handled.
+        """
+        if self.parsed_cigar[0][-1] == 'I' or self.parsed_cigar[0][-1] == 'D' or self.parsed_cigar[0][-1] == 'M':
+            self.query_start = 1
+        else:
+            # The alignment has started with either soft or hard clipping
+            # Need to double check if a 1 needs to be added here
+            self.query_start = int(self.parsed_cigar[0][:-1]) + 1
+
+    def _get_query_aln_len(self):
+        """ The sum of all CIGAR codes which "consume" the query (M, I). """
+        for i in self.parsed_cigar:
+            if i[-1] == 'M' or i[-1] == 'I':
+                self.query_aln_len += int(i[:-1])
+
+    def _get_ref_aln_len(self):
+        """ The sum of all CIGAR codes which "consume" the reference (M, D). """
+        for i in self.parsed_cigar:
+            if i[-1] == 'M' or i[-1] == 'D':
+                self.ref_aln_len += int(i[:-1])
+
+    def _get_query_end(self):
+        """ The query start + the query alignment length. """
+        self.query_end = self.query_start + self.query_aln_len
+
+    def _get_ref_end(self):
+        """ The reference start + the reference alignment length. """
+        self.ref_end = self.ref_start + self.ref_aln_len
+
+    def _parse_cigar(self):
+        """ Split the CIGAR string into its constituent operations. """
+        cigar_chars = {
+            'M',
+            'I',
+            'D',
+            'N',
+            'S',
+            'H',
+            'P',
+            '=',
+            'X'
+        }
+
+        this_field = ''
+        for char in self.cigar:
+            this_field += char
+            if char in cigar_chars:
+                self.parsed_cigar.append(this_field)
+                this_field = ''
+
+
+def write_delta(in_alns, in_file_name):
+    # Note: ref_chr_lens is a module-level dict populated from the SAM header below
+    with open(in_file_name, 'w') as f:
+        f.write('file1 file2\n')
+        f.write('NUCMER\n')
+        for aln in in_alns.keys():
+            query_len = in_alns[aln][0].query_len
+            f.write('>%s %s %r %r\n' % (aln[0], aln[1], ref_chr_lens[aln[0]], query_len))
+            for i in in_alns[aln]:
+                f.write('%r %r %r %r %r %r %r\n' % (
+                    i.ref_start,
+                    i.ref_end,
+                    i.query_start,
+                    i.query_end,
+                    i.num_mismatches,
+                    i.num_mismatches,
+                    0
+                ))
+                # Continue with the cigar string. Remove any leading clipping,
+                # and drop the final element: a trailing clip must not be
+                # counted, and a trailing match adds nothing to the offsets.
+                offsets = []
+                cigar = i.parsed_cigar
+                if cigar[0][-1] == 'S' or cigar[0][-1] == 'H':
+                    cigar = cigar[1:-1]
+                else:
+                    cigar = cigar[:-1]
+
+                counter = 1
+                for code in cigar:
+                    if code[-1] == 'M':
+                        counter += int(code[:-1])
+                    elif code[-1] == 'D':
+                        offsets.append(counter)
+                        num_ops = int(code[:-1])
+                        for _ in range(1, num_ops):
+                            offsets.append(1)
+                        counter = 1
+                    elif code[-1] == 'I':
+                        offsets.append(-1*counter)
+                        num_ops = int(code[:-1])
+                        for _ in range(1, num_ops):
+                            offsets.append(-1)
+                        counter = 1
+                    else:
+                        raise ValueError('Unexpected CIGAR code')
+                offsets.append(0)
+                offsets = [str(i) for i in offsets]
+                f.write('\n'.join(offsets) + '\n')
+
+
+parser = argparse.ArgumentParser(description='Convert a SAM file to a nucmer delta file.')
+parser.add_argument("sam_file", metavar="", type=str, help="SAM file to convert")
+
+args = parser.parse_args()
+sam_file = args.sam_file
+
+# Make a dictionary storing all alignments between a query and a reference sequence.
+# key = (reference_header, query_header)
+# value = list of alignments between those sequences; only store info needed for the delta file
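+#
+# Worked example (added comment, not in the original source): for an alignment
+# whose CIGAR is "10M2D5M1I20M", write_delta() above drops the trailing "20M"
+# and encodes the indels as the nucmer-style offsets [11, 1, -6, 0]: the 2 bp
+# deletion as 11 then 1, the 1 bp insertion as -6, and a terminating 0.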
+alns = defaultdict(list)
+
+# Read through the sam file
+ref_chr_lens = dict()
+with open(sam_file) as f:
+    for line in f:
+        # Parse the @SQ header lines to get reference sequence lengths
+        if line.startswith('@SQ'):
+            header_list = line.split('\t')
+            chrom = header_list[1].replace('SN:', '')
+            chr_len = int(header_list[2].replace('LN:', ''))
+            ref_chr_lens[chrom] = chr_len
+            continue
+
+        # Skip all other SAM header lines
+        if line.startswith('@'):
+            continue
+
+        fields = line.split('\t')
+        ref_header = fields[2]
+        query_header = fields[0]
+        ref_start = fields[3]
+        cigar = fields[5]
+        flag = fields[1]
+        seq_len = len(fields[9])
+
+        if cigar != '*':
+            # Get the NM tag
+            nm = None
+            for i in fields:
+                if i.startswith('NM:i'):
+                    nm = int(i[5:])
+                    break
+
+            if nm is None:
+                raise ValueError('SAM file must include NM tag.')
+
+            x = SAMAlignment(
+                ref_header,
+                query_header,
+                ref_start,
+                cigar,
+                flag,
+                seq_len,
+                nm
+            )
+            alns[(ref_header, query_header)].append(x)
+
+write_delta(alns, sam_file + '.delta')
diff -r 68a9ec9ce51e -r b9a3aeb162ab RaGOO/setup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/setup.py Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+from setuptools import setup
+import glob
+
+# Collect all top-level Python and Perl scripts (*.py, *.pl)
+scripts = glob.glob("*.p*")
+
+setup(
+    name='RaGOO',
+    version='v1.1',
+    description='A tool to order and orient genome assembly contigs via minimap2 alignments to a reference genome.',
+    author='Michael Alonge',
+    author_email='malonge11@gmail.com',
+    packages=['ragoo_utilities'],
+    package_dir={'ragoo_utilities': 'ragoo_utilities/'},
+    install_requires=[
+        'intervaltree',
+        'numpy',
+    ],
+    scripts=scripts,
+    zip_safe=True
+)
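+
+# Illustrative usage (added comment, not in the original patch):
+#     python setup.py install
+# places the top-level scripts collected above on the PATH and installs the
+# ragoo_utilities package alongside the intervaltree and numpy dependencies.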