diff RaGOO/Assemblytics_uniq_anchor.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/Assemblytics_uniq_anchor.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,440 @@
+#! /usr/bin/env python
+
+
+# Author: Maria Nattestad
+# Email: mnattest@cshl.edu
+# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer. 
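+#
+# Typical usage (a sketch; the file names here are only placeholders):
+#   Assemblytics_uniq_anchor.py --delta alignments.delta --out my_sample \
+#       --unique-length 10000 --keep-small-uniques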
+
+
+import argparse
+import gzip
+# from intervaltree import *
+import time
+
+import numpy as np
+import operator
+
+
+
+def run(args):
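+    # Pass 1: collect the query-side interval and header line of every
+    # alignment, grouped by query sequence. Pass 2: re-read the delta file
+    # and keep only the alignments that summarize_planesweep() reports as
+    # having enough unique (single-coverage) anchor sequence, while also
+    # writing coords tables and basic assembly statistics.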
+    filename = args.delta
+    unique_length = args.unique_length
+    output_filename = args.out
+    keep_small_uniques = args.keep_small_uniques
+    
+    # Sniff for the gzip magic number (\x1f\x8b) and reopen with gzip if found
+    f = open(filename)
+    header1 = f.readline()
+    if header1[0:2]=="\x1f\x8b":
+        f.close()
+        f = gzip.open(filename)
+        header1 = f.readline()
+
+
+    linecounter = 0
+
+    current_query_name = ""
+    current_header = ""
+
+    lines_by_query = {}
+    header_lines_by_query = {}
+
+    before = time.time()
+    last = before
+
+    existing_query_names = set()
+
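+    # A nucmer delta file has one ">ref query ref_len query_len" header per
+    # aligned sequence pair, each followed by alignment records whose first
+    # line carries at least seven integers (ref start/end, query start/end,
+    # error counts); the single-integer indel-offset lines that follow are
+    # skipped by the len(fields) > 4 test below.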
+    for line in f:
+        if line[0]==">":
+
+            fields = line.strip().split()
+            current_query_name = fields[1]
+            current_header = line.strip()
+            if current_query_name not in existing_query_names:
+                lines_by_query[current_query_name] = []
+                header_lines_by_query[current_query_name] = []
+                existing_query_names.add(current_query_name)
+        else:
+            fields = line.strip().split()
+            if len(fields) > 4:
+                # query start/end can be reversed (reverse-strand alignments), but here they need to be in increasing order
+                query_min = min([int(fields[2]),int(fields[3])])
+                query_max = max([int(fields[2]),int(fields[3])])
+
+                ##########  TESTING ONLY  ###########
+                # lines_by_query[current_query_name] = (query_min,query_max)
+                # test_list = test_list + [(query_min,query_max)]
+                #####################################
+
+                lines_by_query[current_query_name].append((query_min,query_max))
+                header_lines_by_query[current_query_name].append(current_header)
+        # linecounter += 1
+        # if linecounter % 10000000 == 0:
+        #     print "%d,%f" % (linecounter, time.time()-last)
+        #     last = time.time()
+        
+
+    f.close()
+    
+
+    before = time.time()
+    alignments_to_keep = {}
+    num_queries = len(lines_by_query)
+    
+    num_query_step_to_report = num_queries//100 # floor division keeps the step an integer
+    if num_queries < 100:
+        num_query_step_to_report = num_queries//10
+    if num_queries < 10:
+        num_query_step_to_report = 1
+
+    query_counter = 0
+
+    for query in lines_by_query:
+
+        ################   TESTING    ####################   
+
+        # results_intervaltree = summarize_intervaltree(lines_by_query[query], unique_length_required = unique_length)
+        # intervaltree_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_intervaltree)
+    
+        # results_planesweep = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length) 
+        # planesweep_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_planesweep)
+        # if intervaltree_filtered_out == planesweep_filtered_out :
+        #     num_matches += 1
+        # else:
+        #     num_mismatches += 1
+        #     print "MISMATCH:"
+        #     print "number of alignments:", len(lines_by_query[query])
+        #     print "results_intervaltree:"
+        #     print results_intervaltree
+        #     for i in results_intervaltree:
+        #         print lines_by_query[query][i]
+        #     print "results_planesweep:"
+        #     print results_planesweep
+        #     for i in results_planesweep:
+        #         print lines_by_query[query][i]
+        ################   TESTING    ####################
+
+        alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length,keep_small_uniques=keep_small_uniques)
+
+        query_counter += 1
+
+    before = time.time()
+
+
+    fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length),'w')
+    
+
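+    # Second pass: re-open the delta file (same gzip sniff as above)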
+    f = open(filename)
+    header1 = f.readline()
+    if header1[0:2]=="\x1f\x8b":
+        f.close()
+        f = gzip.open(filename)
+        header1 = f.readline()
+
+    fout.write(header1) # write the first line that we read already
+    fout.write(f.readline()) # also copy the second header line (the NUCMER/PROMER tag)
+    
+    linecounter = 0
+
+    # For filtered delta file:
+    list_of_alignments_to_keep = []
+    alignment_counter = {}
+    keep_printing = False
+
+    # For coords:
+    current_query_name = ""
+    current_query_position = 0
+    fcoords_out_tab = open(output_filename + ".coords.tab",'w')
+    fcoords_out_csv = open(output_filename + ".coords.csv",'w')
+    fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")
+
+
+    # For basic assembly stats:
+    ref_sequences = set()
+    query_sequences = set()
+    ref_lengths = []
+    query_lengths = []
+
+    f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt","w")
+
+    for line in f:
+        linecounter += 1
+        if line[0]==">":
+            fields = line.strip().split()
+            
+            # For delta file output:
+            query = fields[1]
+            list_of_alignments_to_keep = alignments_to_keep[query]
+
+            header_needed = False
+            for index in list_of_alignments_to_keep:
+                if line.strip() == header_lines_by_query[query][index]:
+                    header_needed = True
+            if header_needed:
+                fout.write(line) # if we have any alignments under this header, print the header
+            alignment_counter[query] = alignment_counter.get(query,0)
+
+            # For coords:
+            current_reference_name = fields[0][1:]
+            current_query_name = fields[1]
+
+            current_reference_size = int(fields[2])
+            current_query_size = int(fields[3])
+
+            # For basic assembly stats:
+            if current_reference_name not in ref_sequences:
+                ref_lengths.append(current_reference_size)
+                ref_sequences.add(current_reference_name)
+            if current_query_name not in query_sequences:
+                query_lengths.append(current_query_size)
+                query_sequences.add(current_query_name)
+
+        else:
+            fields = line.strip().split()
+            if len(fields) > 4:
+                # For coords:
+                ref_start = int(fields[0])
+                ref_end = int(fields[1])
+                query_start = int(fields[2])
+                query_end = int(fields[3])
+                csv_tag = "repetitive"
+                if alignment_counter[query] in list_of_alignments_to_keep:
+                    fout.write(line)
+                    fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
+                    csv_tag = "unique"
+                    keep_printing = True
+                else:
+                    keep_printing = False
+                fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
+                alignment_counter[query] = alignment_counter[query] + 1
+
+            elif keep_printing:
+                fout.write(line)
+
+    fcoords_out_tab.close()
+    fcoords_out_csv.close()
+
+
+    ref_lengths.sort()
+    query_lengths.sort()
+
+    # Assembly statistics
+    ref_lengths = np.array(ref_lengths)
+    query_lengths = np.array(query_lengths)
+
+    f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
+    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
+    f_stats_out.write( "Total sequence length: %s\n" %  gig_meg(sum(ref_lengths)))
+    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
+    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(ref_lengths)))
+    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(ref_lengths)))
+    f_stats_out.write( "N50: %s\n" % gig_meg(N50(ref_lengths)))
+    f_stats_out.write( "\n\n")
+    f_stats_out.write( "Query: %s\n" % header1.split()[1].split("/")[-1])
+    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
+    f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
+    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(query_lengths)))
+    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(query_lengths)))
+    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(query_lengths)))
+    f_stats_out.write( "N50: %s\n" % gig_meg(N50(query_lengths)))
+
+
+    f.close()
+    fout.close()
+    f_stats_out.close()
+
+def N50(sorted_list):
+    # The input list must be sorted in increasing order
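+    # Worked example: for lengths [2, 3, 4, 5, 6] (total 20), walking from
+    # the largest gives 6, then 6+5 = 11 >= 10, so the N50 is 5.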
+
+    # We flip the list around here so we start with the largest element;
+    # the total is computed once instead of on every iteration
+    total = sum(sorted_list)
+    cumsum = 0
+    for length in sorted_list[::-1]:
+        cumsum += length
+        if cumsum >= total/2:
+            return length
+
+
+def gig_meg(number,digits = 2):
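+    # Pretty-print a base-pair count, e.g. 123456789 -> "123.46 Mbp"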
+    gig = 1000000000.
+    meg = 1000000.
+    kil = 1000.
+
+    if number > gig:
+        return str(round(number/gig,digits)) + " Gbp"
+    elif number > meg:
+        return str(round(number/meg,digits)) + " Mbp"
+    elif number > kil:
+        return str(round(number/kil,digits)) + " Kbp"
+    else:
+        return str(number) + " bp"
+
+def intWithCommas(x):
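+    # Format an integer with thousands separators, e.g. 1234567 -> "1,234,567"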
+    if not isinstance(x, int):
+        raise TypeError("Parameter must be an integer.")
+    if x < 0:
+        return '-' + intWithCommas(-x)
+    result = ''
+    while x >= 1000:
+        x, r = divmod(x, 1000)
+        result = ",%03d%s" % (r, result)
+    return "%d%s" % (x, result)
+
+
+def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):
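+    # Plane sweep over the query coordinates of all alignments of one query
+    # sequence: sort the interval endpoints, sweep left to right tracking
+    # coverage depth, and record every stretch covered by exactly one
+    # alignment. For example, alignments (0,10) and (5,20) give unique
+    # stretches (0,5) and (10,20). An alignment is kept if it contains at
+    # least unique_length_required bp of such single-coverage sequence, or,
+    # with keep_small_uniques, if it is entirely single-coverage. Returns
+    # the indices (in input order) of the alignments to keep.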
+
+    alignments_to_keep = []
+    # print len(lines)
+
+    # If no alignments:
+    if len(lines)==0:
+        return []
+
+    # If only one alignment:
+    if len(lines) == 1:
+        if keep_small_uniques or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
+            return [0]
+        else:
+            return []
+
+    starts_and_stops = []
+    for query_min,query_max in lines:
+        # print query_min, query_max
+        starts_and_stops.append((query_min,"start"))
+        starts_and_stops.append((query_max,"stop"))
+
+
+    sorted_starts_and_stops = sorted(starts_and_stops,key=operator.itemgetter(0))
+    # print sorted_starts_and_stops
+
+    current_coverage = 0
+    last_position = -1
+    # sorted_unique_intervals = []
+    sorted_unique_intervals_left = []
+    sorted_unique_intervals_right = []
+    for pos,change in sorted_starts_and_stops:
+        # print sorted_starts_and_stops[i]
+        # pos = sorted_starts_and_stops[i][0]
+        # change = sorted_starts_and_stops[i][1]
+        
+        # print pos,change
+        # First alignment only:
+        # if last_position == -1:
+        #     last_position = pos
+        #     continue
+
+        # print last_position,pos,current_coverage
+
+        if current_coverage == 1:
+            # sorted_unique_intervals.append((last_position,pos))
+            sorted_unique_intervals_left.append(last_position)
+            sorted_unique_intervals_right.append(pos)
+
+        if change == "start":
+            current_coverage += 1
+        else:
+            current_coverage -= 1
+        last_position = pos
+
+
+    linecounter = 0
+    for query_min,query_max in lines:
+
+        i = binary_search(query_min,sorted_unique_intervals_left,0,len(sorted_unique_intervals_left))
+
+        exact_match = False
+        if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
+            exact_match = True
+        sum_uniq = 0
+        while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max:
+            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
+            i += 1
+
+        # print query_min,query_max,sum_uniq
+        if sum_uniq >= unique_length_required:
+            alignments_to_keep.append(linecounter)
+        elif keep_small_uniques and exact_match:
+            alignments_to_keep.append(linecounter)
+            # print "Keeping small alignment:", query_min, query_max
+            # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]
+
+        linecounter += 1
+
+    return alignments_to_keep
+
+
+
+def binary_search(query, numbers, left, right):
+    #  Returns index of the matching element or the first element to the right
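+    # e.g. binary_search(5, [0, 4, 8], 0, 3) returns 2: 8 is the first
+    # element to the right of 5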
+    
+    if left >= right:
+        return right
+    mid = (right+left)//2
+    
+
+    if query == numbers[mid]:
+        return mid
+    elif query < numbers[mid]:
+        return binary_search(query,numbers,left,mid)
+    else: # if query > numbers[mid]:
+        return binary_search(query,numbers,mid+1,right)
+
+
+# def summarize_intervaltree(lines, unique_length_required):
+
+#     alignments_to_keep = []
+#     # print len(lines)
+
+#     if len(lines)==0:
+#         return alignments_to_keep
+
+#     if len(lines) == 1:
+#         if abs(lines[0][1] - lines[0][0]) >= unique_length_required:
+#             return [0]
+
+
+#     starts_and_stops = []
+#     for query_min,query_max in lines:
+#         starts_and_stops.append((query_min,query_max))
+
+#     # build full tree
+#     tree = IntervalTree.from_tuples(starts_and_stops) 
+    
+
+#     # for each interval (keeping the same order as the lines in the input file)
+#     line_counter = 0
+#     for query_min,query_max in lines:
+        
+#         # create a tree object from the current interval
+#         this_interval = IntervalTree.from_tuples([(query_min,query_max)])
+
+#         # create a copy of the tree without this one interval
+#         rest_of_tree = tree - this_interval
+
+#         # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one
+#         for other_interval in rest_of_tree:
+#             this_interval.chop(other_interval.begin, other_interval.end)
+        
+#         # loop through to count the total number of unique basepairs
+#         total_unique_length = 0
+#         for sub_interval in this_interval:
+#             total_unique_length += sub_interval.end - sub_interval.begin
+
+#         # if the total unique length is above our threshold, add the index to the list we are reporting       
+#         if total_unique_length >= unique_length_required:
+#             alignments_to_keep.append(line_counter)
+#         line_counter += 1
+
+
+#     return alignments_to_keep
+
+
+def main():
+    parser=argparse.ArgumentParser(description="Filters alignments in a delta file based on whether each alignment is anchored by unique sequence")
+    parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
+    parser.add_argument("--out",help="output file" ,dest="out", type=str, required=True)
+    parser.add_argument("--unique-length",help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000" ,dest="unique_length",type=int, default=10000)
+    parser.add_argument("--keep-small-uniques",help="Keep small aligments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places" ,dest="keep_small_uniques",action="store_true")
+    parser.set_defaults(func=run)
+    args=parser.parse_args()
+    args.func(args)
+
+if __name__=="__main__":
+    main()