comparison RaGOO/Assemblytics_uniq_anchor.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 #! /usr/bin/env python
2
3
4 # Author: Maria Nattestad
5 # Email: mnattest@cshl.edu
6 # This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
7
8
9 import argparse
10 import gzip
11 # from intervaltree import *
12 import time
13
14 import numpy as np
15 import operator
16
17
18
def run(args):
    """Filter a MUMmer delta file down to alignments anchored by unique query sequence.

    Reads args.delta twice: a first pass collects the query-side interval of every
    alignment grouped by query sequence, summarize_planesweep() then selects the
    alignments with at least args.unique_length bp of unique anchor sequence, and
    a second pass writes the filtered delta file plus .coords.tab / .coords.csv
    tables and a basic assembly-statistics text report, all named from args.out.
    """
    filename = args.delta
    unique_length = args.unique_length
    output_filename = args.out
    keep_small_uniques = args.keep_small_uniques

    f = open(filename)
    header1 = f.readline()
    # "\x1f\x8b" is the gzip magic number: reopen through gzip if compressed.
    # NOTE(review): this comparison assumes Python 2 text semantics; under
    # Python 3, readline() yields str while gzip.open() yields bytes — confirm
    # which interpreter this script is run with.
    if header1[0:2]=="\x1f\x8b":
        f.close()
        f = gzip.open(filename)


    linecounter = 0

    current_query_name = ""
    current_header = ""

    # query name -> list of (query_min, query_max) alignment intervals, and the
    # matching ">" header line for each of those intervals.
    lines_by_query = {}
    header_lines_by_query = {}

    before = time.time()
    last = before

    existing_query_names = set()

    # First pass: collect the query-side interval of every alignment, per query.
    for line in f:
        if line[0]==">":

            # Delta header line: ">ref_name query_name ref_len query_len"
            fields = line.strip().split()
            current_query_name = fields[1]
            current_header = line.strip()
            if current_query_name not in existing_query_names:
                lines_by_query[current_query_name] = []
                header_lines_by_query[current_query_name] = []
                existing_query_names.add(current_query_name)
        else:
            fields = line.strip().split()
            # Alignment records have 7 fields; shorter lines are indel offsets.
            if len(fields) > 4:
                # sometimes start and end are the other way around, but for this they need to be in order
                query_min = min([int(fields[2]),int(fields[3])])
                query_max = max([int(fields[2]),int(fields[3])])

                ########## TESTING ONLY ###########
                # lines_by_query[current_query_name] = (query_min,query_max)
                # test_list = test_list + [(query_min,query_max)]
                #####################################

                lines_by_query[current_query_name].append((query_min,query_max))
                header_lines_by_query[current_query_name].append(current_header)
                # linecounter += 1
                # if linecounter % 10000000 == 0:
                #     print "%d,%f" % (linecounter, time.time()-last)
                #     last = time.time()


    f.close()


    before = time.time()
    alignments_to_keep = {}
    num_queries = len(lines_by_query)

    # Progress-report step size; currently unused below.
    # NOTE(review): under Python 3 "/" here yields a float; harmless while unused.
    num_query_step_to_report = num_queries/100
    if num_queries < 100:
        num_query_step_to_report = num_queries/10
    if num_queries < 10:
        num_query_step_to_report = 1

    query_counter = 0

    for query in lines_by_query:

        ################ TESTING ####################

        # results_intervaltree = summarize_intervaltree(lines_by_query[query], unique_length_required = unique_length)
        # intervaltree_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_intervaltree)

        # results_planesweep = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length)
        # planesweep_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_planesweep)
        # if intervaltree_filtered_out == planesweep_filtered_out :
        #     num_matches += 1
        # else:
        #     num_mismatches += 1
        #     print "MISMATCH:"
        #     print "number of alignments:", len(lines_by_query[query])
        #     print "results_intervaltree:"
        #     print results_intervaltree
        #     for i in results_intervaltree:
        #         print lines_by_query[query][i]
        #     print "results_planesweep:"
        #     print results_planesweep
        #     for i in results_planesweep:
        #         print lines_by_query[query][i]
        ################ TESTING ####################

        # Indices (per query) of alignments with enough unique anchor sequence.
        alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length,keep_small_uniques=keep_small_uniques)

        query_counter += 1

    before = time.time()


    fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length),'w')


    # Second pass: reopen the input (with the same transparent gzip handling).
    f = open(filename)
    header1 = f.readline()
    if header1[0:2]=="\x1f\x8b":
        f.close()
        f = gzip.open(filename)
        header1 = f.readline()

    fout.write(header1) # write the first line that we read already
    fout.write(f.readline())

    linecounter = 0

    # For filtered delta file:
    list_of_alignments_to_keep = []
    alignment_counter = {}
    keep_printing = False

    # For coords:
    current_query_name = ""
    current_query_position = 0
    fcoords_out_tab = open(output_filename + ".coords.tab",'w')
    fcoords_out_csv = open(output_filename + ".coords.csv",'w')
    fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")


    # For basic assembly stats:
    ref_sequences = set()
    query_sequences = set()
    ref_lengths = []
    query_lengths = []

    f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt","w")

    for line in f:
        linecounter += 1
        if line[0]==">":
            fields = line.strip().split()

            # For delta file output:
            query = fields[1]
            list_of_alignments_to_keep = alignments_to_keep[query]

            # Emit this header only if at least one kept alignment belongs to it.
            header_needed = False
            for index in list_of_alignments_to_keep:
                if line.strip() == header_lines_by_query[query][index]:
                    header_needed = True
            if header_needed == True:
                fout.write(line) # if we have any alignments under this header, print the header
            # Initialize the per-query running alignment index without resetting
            # it when the same query appears under several headers.
            alignment_counter[query] = alignment_counter.get(query,0)

            # For coords:
            current_reference_name = fields[0][1:]
            current_query_name = fields[1]

            current_reference_size = int(fields[2])
            current_query_size = int(fields[3])

            # For basic assembly stats:
            if not current_reference_name in ref_sequences:
                ref_lengths.append(current_reference_size)
                ref_sequences.add(current_reference_name)
            if not current_query_name in query_sequences:
                query_lengths.append(current_query_size)
                query_sequences.add(current_query_name)

        else:
            fields = line.strip().split()
            if len(fields) > 4:
                # For coords:
                ref_start = int(fields[0])
                ref_end = int(fields[1])
                query_start = int(fields[2])
                query_end = int(fields[3])
                csv_tag = "repetitive"
                # Alignments were collected in file order, so the running
                # counter lines up with the indices chosen in pass one.
                if alignment_counter[query] in list_of_alignments_to_keep:
                    fout.write(line)
                    fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
                    csv_tag = "unique"
                    keep_printing = True
                else:
                    keep_printing = False
                fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
                alignment_counter[query] = alignment_counter[query] + 1

            elif keep_printing == True:
                # Indel-offset lines that belong to a kept alignment record.
                fout.write(line)

    fcoords_out_tab.close()
    fcoords_out_csv.close()


    ref_lengths.sort()
    query_lengths.sort()

    # Assembly statistics
    ref_lengths = np.array(ref_lengths)
    query_lengths = np.array(query_lengths)

    # header1 is the delta header listing the reference and query file paths.
    f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
    f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(ref_lengths)))
    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(ref_lengths)))
    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(ref_lengths)))
    f_stats_out.write( "N50: %s\n" % gig_meg(N50(ref_lengths)))
    f_stats_out.write( "\n\n")
    f_stats_out.write( "Query: %s\n" % header1.split()[1].split("/")[-1])
    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
    f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(query_lengths)))
    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(query_lengths)))
    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(query_lengths)))
    f_stats_out.write( "N50: %s\n" % gig_meg(N50(query_lengths)))


    f.close()
    fout.close()
    f_stats_out.close()
243
def N50(sorted_list):
    """Return the N50 of a list of sequence lengths sorted in increasing order.

    The N50 is the length L such that all sequences of length >= L together
    cover at least half of the total sequence length. Returns None for an
    empty list.
    """
    # Hoisted out of the loop: the original recomputed sum(sorted_list) on
    # every iteration, making the walk accidentally O(n^2).
    half_total = sum(sorted_list) / 2

    # Walk from the largest element downward until half the total is covered.
    cumsum = 0
    for length in sorted_list[::-1]:
        cumsum += length
        if cumsum >= half_total:
            return length
    return None  # empty input (explicit; the original fell off the end)
254
def gig_meg(number, digits=2):
    """Render a base-pair count with a human-readable unit suffix (Gbp/Mbp/Kbp/bp)."""
    # (divisor, suffix) pairs from largest to smallest; strictly-greater
    # comparisons so exact multiples fall through to the smaller unit.
    for divisor, suffix in ((1000000000., " Gbp"), (1000000., " Mbp"), (1000., " Kbp")):
        if number > divisor:
            return str(round(number / divisor, digits)) + suffix
    return str(number) + " bp"
268
def intWithCommas(x):
    """Format an integer with thousands separators, e.g. 1234567 -> '1,234,567'.

    Raises TypeError for any non-int input. The exact-type check mirrors the
    original's strictness (it also rejects bools and integer subclasses).
    """
    if type(x) is not int:
        raise TypeError("Parameter must be an integer.")
    # The "," format option does the grouping (and the sign) that the original
    # implemented by hand with divmod and recursion.
    return "{:,}".format(x)
279
280
def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):
    """Return the indices of alignments in `lines` that are anchored by enough
    unique (singly-covered) query sequence.

    `lines` is a list of (query_min, query_max) intervals. An index is kept
    when its interval contains at least `unique_length_required` bp covered by
    no other interval, or — with `keep_small_uniques` — when the interval is
    itself an exact singly-covered region.
    """
    kept_indices = []

    # No alignments at all:
    if not lines:
        return []

    # A lone alignment only has to clear the length bar (or merely exist when
    # keep_small_uniques is on).
    if len(lines) == 1:
        if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
            return [0]
        else:
            return []

    # Plane sweep: one event per interval endpoint, ordered by position.
    events = []
    for lo, hi in lines:
        events.append((lo, "start"))
        events.append((hi, "stop"))
    events.sort(key=operator.itemgetter(0))

    # Walk the events, recording every stretch covered by exactly one interval.
    coverage = 0
    prev_pos = -1
    uniq_left = []
    uniq_right = []
    for pos, kind in events:
        if coverage == 1:
            uniq_left.append(prev_pos)
            uniq_right.append(pos)

        if kind == "start":
            coverage += 1
        else:
            coverage -= 1
        prev_pos = pos

    # For each alignment, total the unique stretches that fall inside it.
    for idx, (lo, hi) in enumerate(lines):
        i = binary_search(lo, uniq_left, 0, len(uniq_left))

        exact_match = uniq_left[i] == lo and uniq_right[i] == hi

        unique_total = 0
        while i < len(uniq_left) and uniq_left[i] >= lo and uniq_right[i] <= hi:
            unique_total += uniq_right[i] - uniq_left[i]
            i += 1

        if unique_total >= unique_length_required:
            kept_indices.append(idx)
        elif keep_small_uniques == True and exact_match == True:
            # Entirely unique alignment, just shorter than the threshold.
            kept_indices.append(idx)

    return kept_indices
361
362
363
def binary_search(query, numbers, left, right):
    """Search numbers[left:right] (sorted ascending) for query.

    Returns the index of a matching element, or the index of the first element
    to its right when there is no exact match (which may equal `right`).
    """
    # Iterative form of the original recursion; same halving, same returns.
    while left < right:
        mid = (left + right) // 2
        pivot = numbers[mid]
        if pivot == query:
            return mid
        if query < pivot:
            right = mid
        else:
            left = mid + 1
    return right
378
379
380 # def summarize_intervaltree(lines, unique_length_required):
381
382 # alignments_to_keep = []
383 # # print len(lines)
384
385 # if len(lines)==0:
386 # return alignments_to_keep
387
388 # if len(lines) == 1:
389 # if abs(lines[0][1] - lines[0][0]) >= unique_length_required:
390 # return [0]
391
392
393 # starts_and_stops = []
394 # for query_min,query_max in lines:
395 # starts_and_stops.append((query_min,query_max))
396
397 # # build full tree
398 # tree = IntervalTree.from_tuples(starts_and_stops)
399
400
401 # # for each interval (keeping the same order as the lines in the input file)
402 # line_counter = 0
403 # for query_min,query_max in lines:
404
405 # # create a tree object from the current interval
406 # this_interval = IntervalTree.from_tuples([(query_min,query_max)])
407
408 # # create a copy of the tree without this one interval
409 # rest_of_tree = tree - this_interval
410
411 # # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one
412 # for other_interval in rest_of_tree:
413 # this_interval.chop(other_interval.begin, other_interval.end)
414
415 # # loop through to count the total number of unique basepairs
416 # total_unique_length = 0
417 # for sub_interval in this_interval:
418 # total_unique_length += sub_interval.end - sub_interval.begin
419
420 # # if the total unique length is above our threshold, add the index to the list we are reporting
421 # if total_unique_length >= unique_length_required:
422 # alignments_to_keep.append(line_counter)
423 # line_counter += 1
424
425
426 # return alignments_to_keep
427
428
def main():
    """Parse command-line arguments and dispatch to run().

    Fixes two typos in the user-facing help text of the original
    ("based whether" -> "based on whether", "aligments" -> "alignments");
    option names, defaults, and behavior are unchanged.
    """
    parser=argparse.ArgumentParser(description="Filters alignments in delta file based on whether each alignment has a unique sequence anchoring it")
    parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
    parser.add_argument("--out",help="output file" ,dest="out", type=str, required=True)
    parser.add_argument("--unique-length",help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000" ,dest="unique_length",type=int, default=10000)
    parser.add_argument("--keep-small-uniques",help="Keep small alignments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places" ,dest="keep_small_uniques",action="store_true")
    parser.set_defaults(func=run)
    args=parser.parse_args()
    args.func(args)

if __name__=="__main__":
    main()