annotate RaGOO/filter_gap_SVs.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/env python
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
2 import re
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
3
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
4 from intervaltree import IntervalTree
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
5
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
6 from ragoo_utilities.SeqReader import SeqReader
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
7
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
8
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
9 class BaseSequence(object):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
10
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
11 def __init__(self, in_sequence):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
12 if not isinstance(in_sequence, str):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
13 raise AttributeError('Only a string can be used to instantiate this class.')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
14 self.sequence = in_sequence.upper()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
15
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
16
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
17 class GapSequence(BaseSequence):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
18 def get_gap_coords(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
19 """ Find all of the gap string indices for this sequence. """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
20 return re.finditer(r'N+', self.sequence)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
21
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
22
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
23 def make_gaps_tree(in_file):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
24 # A dictionary to store an interval tree for each chromosome header.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
25 all_trees = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
26 x = SeqReader(in_file)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
27 if in_file.endswith(".gz"):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
28 for header, sequence in x.parse_gzip_fasta():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
29 # Remove the greater than sign and only get first token if delimited by spaces
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
30 header = header[1:].split(' ')[0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
31 all_trees[header] = IntervalTree()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
32 gap_sequence = GapSequence(sequence)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
33 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
34 for i in all_coordinates:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
35 all_trees[header][i[0]:i[1]] = i
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
36 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
37 for header, sequence in x.parse_fasta():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
38 # Remove the greater than sign and only get first token if delimited by spaces
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
39 header = header[1:].split(' ')[0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
40 all_trees[header] = IntervalTree()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
41 gap_sequence = GapSequence(sequence)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
42 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
43 for i in all_coordinates:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
44 all_trees[header][i[0]:i[1]] = i
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
45 return all_trees
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
46
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
47
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
48 def get_query_bed_coords(in_line):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
49 c_loc = [m.start() for m in re.finditer(':', in_line)]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
50 c1, c2 = c_loc[-1], c_loc[-2]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
51 header = in_line[:c2]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
52 L1 = in_line[c2+1:c1].split('-')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
53 start, stop = int(L1[0]), int(L1[1])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
54 return header, start, stop
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
55
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
56
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
57 def make_svs_bed(in_trees_q, in_trees_r):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
58 final_lines = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
59 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
60
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
61 # Make a new header column for the field I am about to create
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
62 header = f.readline().rstrip() + '\tquery_gap_ovlp'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
63 final_lines.append(header)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
64
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
65 # Calculate the percentage overlap for each structural variant.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
66 for line in f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
67 pct_ovlp = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
68 ovlp = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
69 L1 = line.rstrip().split('\t')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
70 head, start, end = get_query_bed_coords(L1[9])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
71 query = in_trees_q[head][start:end]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
72 if query:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
73 for interval in query:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
74 gap_start, gap_end= interval.begin, interval.end
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
75
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
76 # Calculate the amount these to intervals overlap
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
77 ovlp += max(0, min(end, gap_end) - max(start, gap_start))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
78
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
79 pct_ovlp = ovlp/(end-start)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
80
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
81 # Add the new value to the original line
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
82 L1.append(pct_ovlp)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
83 L1 = [str(i) for i in L1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
84 final_lines.append('\t'.join(L1))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
85
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
86 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
87 out_file.write('\n'.join(final_lines))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
88
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
89 # Now repeat but with respect to the reference
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
90 final_lines = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
91 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
92 # Make a new header column for the field I am about to create
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
93 header = f.readline().rstrip() + '\tref_gap_ovlp'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
94 final_lines.append(header)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
95
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
96 # Calculate the percentage overlap for each structural variant.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
97 for line in f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
98 pct_ovlp = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
99 ovlp = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
100 L1 = line.rstrip().split('\t')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
101 head, start, end = L1[0], int(L1[1]), int(L1[2])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
102 query = in_trees_r[head][start:end]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
103 if query:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
104 for interval in query:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
105 gap_start, gap_end = interval.begin, interval.end
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
106
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
107 # Calculate the amount these to intervals overlap
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
108 ovlp += max(0, min(end, gap_end) - max(start, gap_start))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
109
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
110 pct_ovlp = ovlp / (end - start)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
111
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
112 # Add the new value to the original line
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
113 L1.append(pct_ovlp)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
114 L1 = [str(i) for i in L1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
115 final_lines.append('\t'.join(L1))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
116
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
117 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
118 out_file.write('\n'.join(final_lines))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
119
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
120
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
121 def main():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
122 import argparse
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
123
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
124 parser = argparse.ArgumentParser(description='Annotate the SV bed file with the amount degree in which the SV overlaps with a gap')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
125 parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
126 args = parser.parse_args()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
127 ref_file = args.ref_file
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
128
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
129 # Make a new bed file w.r.t the query
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
130 all_trees_q = make_gaps_tree('../ragoo.fasta')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
131 all_trees_r = make_gaps_tree(ref_file)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
132 make_svs_bed(all_trees_q, all_trees_r)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
133
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
134
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
135 if __name__ == "__main__":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
136 main()