annotate RaGOO/Assemblytics_uniq_anchor.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
1 #! /usr/bin/env python
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
2
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
3
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
4 # Author: Maria Nattestad
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
5 # Email: mnattest@cshl.edu
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
6 # This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
7
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
8
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
9 import argparse
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
10 import gzip
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
11 # from intervaltree import *
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
12 import time
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
14 import numpy as np
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
15 import operator
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
16
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
17
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
18
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
19 def run(args):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
20 filename = args.delta
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
21 unique_length = args.unique_length
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
22 output_filename = args.out
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
23 keep_small_uniques = args.keep_small_uniques
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
24
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
25 f = open(filename)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
26 header1 = f.readline()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
27 if header1[0:2]=="\x1f\x8b":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
28 f.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
29 f = gzip.open(filename)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
30
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
31
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
32 linecounter = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
33
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
34 current_query_name = ""
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
35 current_header = ""
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
36
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
37 lines_by_query = {}
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
38 header_lines_by_query = {}
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
39
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
40 before = time.time()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
41 last = before
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
42
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
43 existing_query_names = set()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
44
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
45 for line in f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
46 if line[0]==">":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
47
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
48 fields = line.strip().split()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
49 current_query_name = fields[1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
50 current_header = line.strip()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
51 if current_query_name not in existing_query_names:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
52 lines_by_query[current_query_name] = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
53 header_lines_by_query[current_query_name] = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
54 existing_query_names.add(current_query_name)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
55 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
56 fields = line.strip().split()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
57 if len(fields) > 4:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
58 # sometimes start and end are the other way around, but for this they need to be in order
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
59 query_min = min([int(fields[2]),int(fields[3])])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
60 query_max = max([int(fields[2]),int(fields[3])])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
61
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
62 ########## TESTING ONLY ###########
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
63 # lines_by_query[current_query_name] = (query_min,query_max)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
64 # test_list = test_list + [(query_min,query_max)]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
65 #####################################
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
66
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
67 lines_by_query[current_query_name].append((query_min,query_max))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
68 header_lines_by_query[current_query_name].append(current_header)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
69 # linecounter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
70 # if linecounter % 10000000 == 0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
71 # print "%d,%f" % (linecounter, time.time()-last)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
72 # last = time.time()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
73
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
74
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
75 f.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
76
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
77
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
78 before = time.time()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
79 alignments_to_keep = {}
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
80 num_queries = len(lines_by_query)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
81
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
82 num_query_step_to_report = num_queries/100
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
83 if num_queries < 100:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
84 num_query_step_to_report = num_queries/10
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
85 if num_queries < 10:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
86 num_query_step_to_report = 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
87
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
88 query_counter = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
89
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
90 for query in lines_by_query:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
91
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
92 ################ TESTING ####################
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
93
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
94 # results_intervaltree = summarize_intervaltree(lines_by_query[query], unique_length_required = unique_length)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
95 # intervaltree_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_intervaltree)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
96
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
97 # results_planesweep = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
98 # planesweep_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_planesweep)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
99 # if intervaltree_filtered_out == planesweep_filtered_out :
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
100 # num_matches += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
101 # else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
102 # num_mismatches += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
103 # print "MISMATCH:"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
104 # print "number of alignments:", len(lines_by_query[query])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
105 # print "results_intervaltree:"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
106 # print results_intervaltree
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
107 # for i in results_intervaltree:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
108 # print lines_by_query[query][i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
109 # print "results_planesweep:"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
110 # print results_planesweep
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
111 # for i in results_planesweep:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
112 # print lines_by_query[query][i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
113 ################ TESTING ####################
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
114
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
115 alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length,keep_small_uniques=keep_small_uniques)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
116
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
117 query_counter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
118
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
119 before = time.time()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
120
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
121
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
122 fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length),'w')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
123
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
124
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
125 f = open(filename)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
126 header1 = f.readline()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
127 if header1[0:2]=="\x1f\x8b":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
128 f.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
129 f = gzip.open(filename)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
130 header1 = f.readline()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
131
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
132 fout.write(header1) # write the first line that we read already
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
133 fout.write(f.readline())
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
134
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
135 linecounter = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
136
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
137 # For filtered delta file:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
138 list_of_alignments_to_keep = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
139 alignment_counter = {}
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
140 keep_printing = False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
141
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
142 # For coords:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
143 current_query_name = ""
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
144 current_query_position = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
145 fcoords_out_tab = open(output_filename + ".coords.tab",'w')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
146 fcoords_out_csv = open(output_filename + ".coords.csv",'w')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
147 fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
148
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
149
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
150 # For basic assembly stats:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
151 ref_sequences = set()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
152 query_sequences = set()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
153 ref_lengths = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
154 query_lengths = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
155
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
156 f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt","w")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
157
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
158 for line in f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
159 linecounter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
160 if line[0]==">":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
161 fields = line.strip().split()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
162
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
163 # For delta file output:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
164 query = fields[1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
165 list_of_alignments_to_keep = alignments_to_keep[query]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
166
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
167 header_needed = False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
168 for index in list_of_alignments_to_keep:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
169 if line.strip() == header_lines_by_query[query][index]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
170 header_needed = True
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
171 if header_needed == True:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
172 fout.write(line) # if we have any alignments under this header, print the header
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
173 alignment_counter[query] = alignment_counter.get(query,0)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
174
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
175 # For coords:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
176 current_reference_name = fields[0][1:]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
177 current_query_name = fields[1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
178
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
179 current_reference_size = int(fields[2])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
180 current_query_size = int(fields[3])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
181
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
182 # For basic assembly stats:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
183 if not current_reference_name in ref_sequences:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
184 ref_lengths.append(current_reference_size)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
185 ref_sequences.add(current_reference_name)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
186 if not current_query_name in query_sequences:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
187 query_lengths.append(current_query_size)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
188 query_sequences.add(current_query_name)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
189
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
190 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
191 fields = line.strip().split()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
192 if len(fields) > 4:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
193 # For coords:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
194 ref_start = int(fields[0])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
195 ref_end = int(fields[1])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
196 query_start = int(fields[2])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
197 query_end = int(fields[3])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
198 csv_tag = "repetitive"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
199 if alignment_counter[query] in list_of_alignments_to_keep:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
200 fout.write(line)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
201 fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
202 csv_tag = "unique"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
203 keep_printing = True
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
204 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
205 keep_printing = False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
206 fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
207 alignment_counter[query] = alignment_counter[query] + 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
208
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
209 elif keep_printing == True:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
210 fout.write(line)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
211
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
212 fcoords_out_tab.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
213 fcoords_out_csv.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
214
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
215
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
216 ref_lengths.sort()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
217 query_lengths.sort()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
218
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
219 # Assembly statistics
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
220 ref_lengths = np.array(ref_lengths)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
221 query_lengths = np.array(query_lengths)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
222
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
223 f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
224 f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
225 f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(ref_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
226 f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
227 f_stats_out.write( "Min: %s\n" % gig_meg(np.min(ref_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
228 f_stats_out.write( "Max: %s\n" % gig_meg(np.max(ref_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
229 f_stats_out.write( "N50: %s\n" % gig_meg(N50(ref_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
230 f_stats_out.write( "\n\n")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
231 f_stats_out.write( "Query: %s\n" % header1.split()[1].split("/")[-1])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
232 f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
233 f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
234 f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(query_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
235 f_stats_out.write( "Min: %s\n" % gig_meg(np.min(query_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
236 f_stats_out.write( "Max: %s\n" % gig_meg(np.max(query_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
237 f_stats_out.write( "N50: %s\n" % gig_meg(N50(query_lengths)))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
238
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
239
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
240 f.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
241 fout.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
242 f_stats_out.close()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
243
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
244 def N50(sorted_list):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
245 # List should be sorted as increasing
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
246
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
247 # We flip the list around here so we start with the largest element
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
248 cumsum = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
249 for length in sorted_list[::-1]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
250 cumsum += length
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
251 if cumsum >= sum(sorted_list)/2:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
252 return length
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
253
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
254
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
255 def gig_meg(number,digits = 2):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
256 gig = 1000000000.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
257 meg = 1000000.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
258 kil = 1000.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
259
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
260 if number > gig:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
261 return str(round(number/gig,digits)) + " Gbp"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
262 elif number > meg:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
263 return str(round(number/meg,digits)) + " Mbp"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
264 elif number > kil:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
265 return str(round(number/kil,digits)) + " Kbp"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
266 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
267 return str(number) + " bp"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
268
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
269 def intWithCommas(x):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
270 if type(x) not in [type(0)]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
271 raise TypeError("Parameter must be an integer.")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
272 if x < 0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
273 return '-' + intWithCommas(-x)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
274 result = ''
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
275 while x >= 1000:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
276 x, r = divmod(x, 1000)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
277 result = ",%03d%s" % (r, result)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
278 return "%d%s" % (x, result)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
279
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
280
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
281 def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
282
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
283 alignments_to_keep = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
284 # print len(lines)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
285
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
286 # If no alignments:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
287 if len(lines)==0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
288 return []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
289
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
290 # If only one alignment:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
291 if len(lines) == 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
292 if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
293 return [0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
294 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
295 return []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
296
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
297 starts_and_stops = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
298 for query_min,query_max in lines:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
299 # print query_min, query_max
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
300 starts_and_stops.append((query_min,"start"))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
301 starts_and_stops.append((query_max,"stop"))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
302
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
303
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
304 sorted_starts_and_stops = sorted(starts_and_stops,key=operator.itemgetter(0))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
305 # print sorted_starts_and_stops
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
306
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
307 current_coverage = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
308 last_position = -1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
309 # sorted_unique_intervals = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
310 sorted_unique_intervals_left = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
311 sorted_unique_intervals_right = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
312 for pos,change in sorted_starts_and_stops:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
313 # print sorted_starts_and_stops[i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
314 # pos = sorted_starts_and_stops[i][0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
315 # change = sorted_starts_and_stops[i][1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
316
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
317 # print pos,change
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
318 # First alignment only:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
319 # if last_position == -1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
320 # last_position = pos
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
321 # continue
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
322
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
323 # print last_position,pos,current_coverage
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
324
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
325 if current_coverage == 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
326 # sorted_unique_intervals.append((last_position,pos))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
327 sorted_unique_intervals_left.append(last_position)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
328 sorted_unique_intervals_right.append(pos)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
329
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
330 if change == "start":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
331 current_coverage += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
332 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
333 current_coverage -= 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
334 last_position = pos
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
335
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
336
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
337 linecounter = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
338 for query_min,query_max in lines:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
339
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
340 i = binary_search(query_min,sorted_unique_intervals_left,0,len(sorted_unique_intervals_left))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
341
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
342 exact_match = False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
343 if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
344 exact_match = True
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
345 sum_uniq = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
346 while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
347 sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
348 i += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
349
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
350 # print query_min,query_max,sum_uniq
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
351 if sum_uniq >= unique_length_required:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
352 alignments_to_keep.append(linecounter)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
353 elif keep_small_uniques == True and exact_match == True:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
354 alignments_to_keep.append(linecounter)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
355 # print "Keeping small alignment:", query_min, query_max
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
356 # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
357
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
358 linecounter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
359
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
360 return alignments_to_keep
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
361
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
362
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
363
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
364 def binary_search(query, numbers, left, right):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
365 # Returns index of the matching element or the first element to the right
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
366
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
367 if left >= right:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
368 return right
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
369 mid = (right+left)//2
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
370
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
371
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
372 if query == numbers[mid]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
373 return mid
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
374 elif query < numbers[mid]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
375 return binary_search(query,numbers,left,mid)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
376 else: # if query > numbers[mid]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
377 return binary_search(query,numbers,mid+1,right)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
378
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
379
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
380 # def summarize_intervaltree(lines, unique_length_required):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
381
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
382 # alignments_to_keep = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
383 # # print len(lines)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
384
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
385 # if len(lines)==0:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
386 # return alignments_to_keep
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
387
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
388 # if len(lines) == 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
389 # if abs(lines[0][1] - lines[0][0]) >= unique_length_required:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
390 # return [0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
391
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
392
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
393 # starts_and_stops = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
394 # for query_min,query_max in lines:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
395 # starts_and_stops.append((query_min,query_max))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
396
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
397 # # build full tree
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
398 # tree = IntervalTree.from_tuples(starts_and_stops)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
399
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
400
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
401 # # for each interval (keeping the same order as the lines in the input file)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
402 # line_counter = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
403 # for query_min,query_max in lines:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
404
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
405 # # create a tree object from the current interval
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
406 # this_interval = IntervalTree.from_tuples([(query_min,query_max)])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
407
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
408 # # create a copy of the tree without this one interval
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
409 # rest_of_tree = tree - this_interval
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
410
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
411 # # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
412 # for other_interval in rest_of_tree:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
413 # this_interval.chop(other_interval.begin, other_interval.end)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
414
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
415 # # loop through to count the total number of unique basepairs
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
416 # total_unique_length = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
417 # for sub_interval in this_interval:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
418 # total_unique_length += sub_interval.end - sub_interval.begin
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
419
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
420 # # if the total unique length is above our threshold, add the index to the list we are reporting
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
421 # if total_unique_length >= unique_length_required:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
422 # alignments_to_keep.append(line_counter)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
423 # line_counter += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
424
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
425
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
426 # return alignments_to_keep
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
427
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
428
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
429 def main():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
430 parser=argparse.ArgumentParser(description="Filters alignments in delta file based whether each alignment has a unique sequence anchoring it")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
431 parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
432 parser.add_argument("--out",help="output file" ,dest="out", type=str, required=True)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
433 parser.add_argument("--unique-length",help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000" ,dest="unique_length",type=int, default=10000)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
434 parser.add_argument("--keep-small-uniques",help="Keep small aligments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places" ,dest="keep_small_uniques",action="store_true")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
435 parser.set_defaults(func=run)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
436 args=parser.parse_args()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
437 args.func(args)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
438
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
439 if __name__=="__main__":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
440 main()