comparison RaGOO/filter_gap_SVs.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 #!/usr/bin/env python
2 import re
3
4 from intervaltree import IntervalTree
5
6 from ragoo_utilities.SeqReader import SeqReader
7
8
9 class BaseSequence(object):
10
11 def __init__(self, in_sequence):
12 if not isinstance(in_sequence, str):
13 raise AttributeError('Only a string can be used to instantiate this class.')
14 self.sequence = in_sequence.upper()
15
16
17 class GapSequence(BaseSequence):
18 def get_gap_coords(self):
19 """ Find all of the gap string indices for this sequence. """
20 return re.finditer(r'N+', self.sequence)
21
22
23 def make_gaps_tree(in_file):
24 # A dictionary to store an interval tree for each chromosome header.
25 all_trees = dict()
26 x = SeqReader(in_file)
27 if in_file.endswith(".gz"):
28 for header, sequence in x.parse_gzip_fasta():
29 # Remove the greater than sign and only get first token if delimited by spaces
30 header = header[1:].split(' ')[0]
31 all_trees[header] = IntervalTree()
32 gap_sequence = GapSequence(sequence)
33 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
34 for i in all_coordinates:
35 all_trees[header][i[0]:i[1]] = i
36 else:
37 for header, sequence in x.parse_fasta():
38 # Remove the greater than sign and only get first token if delimited by spaces
39 header = header[1:].split(' ')[0]
40 all_trees[header] = IntervalTree()
41 gap_sequence = GapSequence(sequence)
42 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
43 for i in all_coordinates:
44 all_trees[header][i[0]:i[1]] = i
45 return all_trees
46
47
48 def get_query_bed_coords(in_line):
49 c_loc = [m.start() for m in re.finditer(':', in_line)]
50 c1, c2 = c_loc[-1], c_loc[-2]
51 header = in_line[:c2]
52 L1 = in_line[c2+1:c1].split('-')
53 start, stop = int(L1[0]), int(L1[1])
54 return header, start, stop
55
56
57 def make_svs_bed(in_trees_q, in_trees_r):
58 final_lines = []
59 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
60
61 # Make a new header column for the field I am about to create
62 header = f.readline().rstrip() + '\tquery_gap_ovlp'
63 final_lines.append(header)
64
65 # Calculate the percentage overlap for each structural variant.
66 for line in f:
67 pct_ovlp = 0
68 ovlp = 0
69 L1 = line.rstrip().split('\t')
70 head, start, end = get_query_bed_coords(L1[9])
71 query = in_trees_q[head][start:end]
72 if query:
73 for interval in query:
74 gap_start, gap_end= interval.begin, interval.end
75
76 # Calculate the amount these to intervals overlap
77 ovlp += max(0, min(end, gap_end) - max(start, gap_start))
78
79 pct_ovlp = ovlp/(end-start)
80
81 # Add the new value to the original line
82 L1.append(pct_ovlp)
83 L1 = [str(i) for i in L1]
84 final_lines.append('\t'.join(L1))
85
86 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
87 out_file.write('\n'.join(final_lines))
88
89 # Now repeat but with respect to the reference
90 final_lines = []
91 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
92 # Make a new header column for the field I am about to create
93 header = f.readline().rstrip() + '\tref_gap_ovlp'
94 final_lines.append(header)
95
96 # Calculate the percentage overlap for each structural variant.
97 for line in f:
98 pct_ovlp = 0
99 ovlp = 0
100 L1 = line.rstrip().split('\t')
101 head, start, end = L1[0], int(L1[1]), int(L1[2])
102 query = in_trees_r[head][start:end]
103 if query:
104 for interval in query:
105 gap_start, gap_end = interval.begin, interval.end
106
107 # Calculate the amount these to intervals overlap
108 ovlp += max(0, min(end, gap_end) - max(start, gap_start))
109
110 pct_ovlp = ovlp / (end - start)
111
112 # Add the new value to the original line
113 L1.append(pct_ovlp)
114 L1 = [str(i) for i in L1]
115 final_lines.append('\t'.join(L1))
116
117 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
118 out_file.write('\n'.join(final_lines))
119
120
121 def main():
122 import argparse
123
124 parser = argparse.ArgumentParser(description='Annotate the SV bed file with the amount degree in which the SV overlaps with a gap')
125 parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file")
126 args = parser.parse_args()
127 ref_file = args.ref_file
128
129 # Make a new bed file w.r.t the query
130 all_trees_q = make_gaps_tree('../ragoo.fasta')
131 all_trees_r = make_gaps_tree(ref_file)
132 make_svs_bed(all_trees_q, all_trees_r)
133
134
135 if __name__ == "__main__":
136 main()