view RaGOO/filter_gap_SVs.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env python
import re

from intervaltree import IntervalTree

from ragoo_utilities.SeqReader import SeqReader


class BaseSequence(object):

    def __init__(self, in_sequence):
        if not isinstance(in_sequence, str):
            raise AttributeError('Only a string can be used to instantiate this class.')
        self.sequence = in_sequence.upper()


class GapSequence(BaseSequence):
    def get_gap_coords(self):
        """ Find all of the gap string indices for this sequence. """
        return re.finditer(r'N+', self.sequence)


def make_gaps_tree(in_file):
    # A dictionary to store an interval tree for each chromosome header.
    all_trees = dict()
    x = SeqReader(in_file)
    if in_file.endswith(".gz"):
        for header, sequence in x.parse_gzip_fasta():
            # Remove the greater than sign and only get first token if delimited by spaces
            header = header[1:].split(' ')[0]
            all_trees[header] = IntervalTree()
            gap_sequence = GapSequence(sequence)
            all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
            for i in all_coordinates:
                all_trees[header][i[0]:i[1]] = i
    else:
        for header, sequence in x.parse_fasta():
            # Remove the greater than sign and only get first token if delimited by spaces
            header = header[1:].split(' ')[0]
            all_trees[header] = IntervalTree()
            gap_sequence = GapSequence(sequence)
            all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
            for i in all_coordinates:
                all_trees[header][i[0]:i[1]] = i
    return all_trees


def get_query_bed_coords(in_line):
    c_loc = [m.start() for m in re.finditer(':', in_line)]
    c1, c2 = c_loc[-1], c_loc[-2]
    header = in_line[:c2]
    L1 = in_line[c2+1:c1].split('-')
    start, stop = int(L1[0]), int(L1[1])
    return header, start, stop


def make_svs_bed(in_trees_q, in_trees_r):
    final_lines = []
    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:

        # Make a new header column for the field I am about to create
        header = f.readline().rstrip() + '\tquery_gap_ovlp'
        final_lines.append(header)

        # Calculate the percentage overlap for each structural variant.
        for line in f:
            pct_ovlp = 0
            ovlp = 0
            L1 = line.rstrip().split('\t')
            head, start, end = get_query_bed_coords(L1[9])
            query = in_trees_q[head][start:end]
            if query:
                for interval in query:
                    gap_start, gap_end= interval.begin, interval.end

                    # Calculate the amount these to intervals overlap
                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))

                pct_ovlp = ovlp/(end-start)

            # Add the new value to the original line
            L1.append(pct_ovlp)
            L1 = [str(i) for i in L1]
            final_lines.append('\t'.join(L1))

    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
        out_file.write('\n'.join(final_lines))

    # Now repeat but with respect to the reference
    final_lines = []
    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
        # Make a new header column for the field I am about to create
        header = f.readline().rstrip() + '\tref_gap_ovlp'
        final_lines.append(header)

        # Calculate the percentage overlap for each structural variant.
        for line in f:
            pct_ovlp = 0
            ovlp = 0
            L1 = line.rstrip().split('\t')
            head, start, end = L1[0], int(L1[1]), int(L1[2])
            query = in_trees_r[head][start:end]
            if query:
                for interval in query:
                    gap_start, gap_end = interval.begin, interval.end

                    # Calculate the amount these to intervals overlap
                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))

                pct_ovlp = ovlp / (end - start)

            # Add the new value to the original line
            L1.append(pct_ovlp)
            L1 = [str(i) for i in L1]
            final_lines.append('\t'.join(L1))

        with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
            out_file.write('\n'.join(final_lines))


def main():
    import argparse

    parser = argparse.ArgumentParser(description='Annotate the SV bed file with the amount degree in which the SV overlaps with a gap')
    parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file")
    args = parser.parse_args()
    ref_file = args.ref_file

    # Make a new bed file w.r.t the query
    all_trees_q = make_gaps_tree('../ragoo.fasta')
    all_trees_r = make_gaps_tree(ref_file)
    make_svs_bed(all_trees_q, all_trees_r)


if __name__ == "__main__":
    main()