diff RaGOO/filter_gap_SVs.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/filter_gap_SVs.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+import re
+
+from intervaltree import IntervalTree
+
+from ragoo_utilities.SeqReader import SeqReader
+
+
+class BaseSequence(object):
+
+    def __init__(self, in_sequence):
+        if not isinstance(in_sequence, str):
+            raise AttributeError('Only a string can be used to instantiate this class.')
+        self.sequence = in_sequence.upper()
+
+
+class GapSequence(BaseSequence):
+    def get_gap_coords(self):
+        """ Find all of the gap string indices for this sequence. """
+        return re.finditer(r'N+', self.sequence)
+
+
+def make_gaps_tree(in_file):
+    # A dictionary to store an interval tree for each chromosome header.
+    all_trees = dict()
+    x = SeqReader(in_file)
+    if in_file.endswith(".gz"):
+        for header, sequence in x.parse_gzip_fasta():
+            # Remove the greater than sign and only get first token if delimited by spaces
+            header = header[1:].split(' ')[0]
+            all_trees[header] = IntervalTree()
+            gap_sequence = GapSequence(sequence)
+            all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
+            for i in all_coordinates:
+                all_trees[header][i[0]:i[1]] = i
+    else:
+        for header, sequence in x.parse_fasta():
+            # Remove the greater than sign and only get first token if delimited by spaces
+            header = header[1:].split(' ')[0]
+            all_trees[header] = IntervalTree()
+            gap_sequence = GapSequence(sequence)
+            all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
+            for i in all_coordinates:
+                all_trees[header][i[0]:i[1]] = i
+    return all_trees
+
+
+def get_query_bed_coords(in_line):
+    c_loc = [m.start() for m in re.finditer(':', in_line)]
+    c1, c2 = c_loc[-1], c_loc[-2]
+    header = in_line[:c2]
+    L1 = in_line[c2+1:c1].split('-')
+    start, stop = int(L1[0]), int(L1[1])
+    return header, start, stop
+
+
+def make_svs_bed(in_trees_q, in_trees_r):
+    final_lines = []
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
+
+        # Make a new header column for the field I am about to create
+        header = f.readline().rstrip() + '\tquery_gap_ovlp'
+        final_lines.append(header)
+
+        # Calculate the percentage overlap for each structural variant.
+        for line in f:
+            pct_ovlp = 0
+            ovlp = 0
+            L1 = line.rstrip().split('\t')
+            head, start, end = get_query_bed_coords(L1[9])
+            query = in_trees_q[head][start:end]
+            if query:
+                for interval in query:
+                    gap_start, gap_end= interval.begin, interval.end
+
+                    # Calculate the amount these to intervals overlap
+                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))
+
+                pct_ovlp = ovlp/(end-start)
+
+            # Add the new value to the original line
+            L1.append(pct_ovlp)
+            L1 = [str(i) for i in L1]
+            final_lines.append('\t'.join(L1))
+
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
+        out_file.write('\n'.join(final_lines))
+
+    # Now repeat but with respect to the reference
+    final_lines = []
+    with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
+        # Make a new header column for the field I am about to create
+        header = f.readline().rstrip() + '\tref_gap_ovlp'
+        final_lines.append(header)
+
+        # Calculate the percentage overlap for each structural variant.
+        for line in f:
+            pct_ovlp = 0
+            ovlp = 0
+            L1 = line.rstrip().split('\t')
+            head, start, end = L1[0], int(L1[1]), int(L1[2])
+            query = in_trees_r[head][start:end]
+            if query:
+                for interval in query:
+                    gap_start, gap_end = interval.begin, interval.end
+
+                    # Calculate the amount these to intervals overlap
+                    ovlp += max(0, min(end, gap_end) - max(start, gap_start))
+
+                pct_ovlp = ovlp / (end - start)
+
+            # Add the new value to the original line
+            L1.append(pct_ovlp)
+            L1 = [str(i) for i in L1]
+            final_lines.append('\t'.join(L1))
+
+        with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
+            out_file.write('\n'.join(final_lines))
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Annotate the SV bed file with the amount degree in which the SV overlaps with a gap')
+    parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file")
+    args = parser.parse_args()
+    ref_file = args.ref_file
+
+    # Make a new bed file w.r.t the query
+    all_trees_q = make_gaps_tree('../ragoo.fasta')
+    all_trees_r = make_gaps_tree(ref_file)
+    make_svs_bed(all_trees_q, all_trees_r)
+
+
+if __name__ == "__main__":
+    main()