Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_align_utils.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 from bs_utils.utils import * | |
2 import re | |
3 | |
4 | |
5 BAM_MATCH = 0 | |
6 BAM_INS = 1 | |
7 BAM_DEL = 2 | |
8 BAM_SOFTCLIP = 4 | |
9 | |
10 CIGAR_OPS = {'M' : BAM_MATCH, 'I' : BAM_INS, 'D' : BAM_DEL, 'S' : BAM_SOFTCLIP} | |
11 | |
12 | |
13 def N_MIS(r,g): | |
14 mismatches = 0 | |
15 if len(r)==len(g): | |
16 for i in xrange(len(r)): | |
17 if r[i] != g[i] and r[i] != "N" and g[i] != "N" and not(r[i] == 'T' and g[i] == 'C'): | |
18 mismatches += 1 | |
19 return mismatches | |
20 | |
21 | |
22 #---------------------------------------------------------------- | |
23 | |
24 """ | |
25 Exmaple: | |
26 ======== | |
27 Read : ACCGCGTTGATCGAGTACGTACGTGGGTC | |
28 Adapter : ....................ACGTGGGTCCCG | |
29 ======== | |
30 | |
31 no_mismatch : the maximum number allowed for mismatches | |
32 | |
33 Algorithm: (allowing 1 mismatch) | |
34 ======== | |
35 -Step 1: | |
36 ACCGCGTTGATCGAGTACGTACGTGGGTC | |
37 ||XX | |
38 ACGTGGGTCCCG | |
39 -Step 2: | |
40 ACCGCGTTGATCGAGTACGTACGTGGGTC | |
41 X||X | |
42 .ACGTGGGTCCCG | |
43 -Step 3: | |
44 ACCGCGTTGATCGAGTACGTACGTGGGTC | |
45 XX | |
46 ..ACGTGGGTCCCG | |
47 -Step ... | |
48 -Step N: | |
49 ACCGCGTTGATCGAGTACGTACGTGGGTC | |
50 ||||||||| | |
51 ....................ACGTGGGTCCCG | |
52 Success & return! | |
53 ======== | |
54 | |
55 """ | |
56 | |
57 def RemoveAdapter ( read, adapter, no_mismatch ) : | |
58 lr = len(read) | |
59 la = len(adapter) | |
60 for i in xrange( lr - no_mismatch ) : | |
61 read_pos = i | |
62 adapter_pos = 0 | |
63 count_no_mis = 0 | |
64 while (adapter_pos < la) and (read_pos < lr) : | |
65 if (read[read_pos] == adapter[adapter_pos]) : | |
66 read_pos = read_pos + 1 | |
67 adapter_pos = adapter_pos + 1 | |
68 else : | |
69 count_no_mis = count_no_mis + 1 | |
70 if count_no_mis > no_mismatch : | |
71 break | |
72 else : | |
73 read_pos = read_pos + 1 | |
74 adapter_pos = adapter_pos + 1 | |
75 # while_end | |
76 | |
77 if adapter_pos == la or read_pos == lr : | |
78 return read[:i] | |
79 # for_end | |
80 return read | |
81 | |
82 | |
83 def Remove_5end_Adapter ( read, adapter, no_mismatch) : | |
84 lr = len(read) | |
85 la = len(adapter) | |
86 for i in xrange (la - no_mismatch) : | |
87 read_pos = 0 | |
88 adapter_pos = i | |
89 count_no_mis = 0 | |
90 while (adapter_pos < la) and (read_pos < lr) : | |
91 if (read[read_pos] == adapter[adapter_pos]) : | |
92 adapter_pos = adapter_pos + 1 | |
93 read_pos = read_pos + 1 | |
94 else : | |
95 count_no_mis = count_no_mis + 1 | |
96 if count_no_mis > no_mismatch : | |
97 break | |
98 else : | |
99 read_pos = read_pos + 1 | |
100 adapter_pos = adapter_pos + 1 | |
101 # while_end | |
102 if adapter_pos == la : | |
103 return read[(la-i):] | |
104 | |
105 | |
106 def next_nuc(seq, pos, n): | |
107 """ Returns the nucleotide that is n places from pos in seq. Skips gap symbols. | |
108 """ | |
109 i = pos + 1 | |
110 while i < len(seq): | |
111 if seq[i] != '-': | |
112 n -= 1 | |
113 if n == 0: break | |
114 i += 1 | |
115 if i < len(seq) : | |
116 return seq[i] | |
117 else : | |
118 return 'N' | |
119 | |
120 | |
121 | |
122 def methy_seq(read, genome): | |
123 H = ['A', 'C', 'T'] | |
124 m_seq = [] | |
125 xx = "-" | |
126 for i in xrange(len(read)): | |
127 | |
128 if genome[i] == '-': | |
129 continue | |
130 | |
131 elif read[i] != 'C' and read[i] != 'T': | |
132 xx = "-" | |
133 | |
134 elif read[i] == "T" and genome[i] == "C": #(unmethylated): | |
135 nn1 = next_nuc(genome, i, 1) | |
136 if nn1 == "G": | |
137 xx = "x" | |
138 elif nn1 in H : | |
139 nn2 = next_nuc(genome, i, 2) | |
140 if nn2 == "G": | |
141 xx = "y" | |
142 elif nn2 in H : | |
143 xx = "z" | |
144 | |
145 elif read[i] == "C" and genome[i] == "C": #(methylated): | |
146 nn1 = next_nuc(genome, i, 1) | |
147 | |
148 if nn1 == "G": | |
149 xx = "X" | |
150 elif nn1 in H : | |
151 nn2 = next_nuc(genome, i, 2) | |
152 | |
153 if nn2 == "G": | |
154 xx = "Y" | |
155 elif nn2 in H: | |
156 xx = "Z" | |
157 else: | |
158 xx = "-" | |
159 m_seq.append(xx) | |
160 | |
161 return ''.join(m_seq) | |
162 | |
163 def mcounts(mseq, mlst, ulst): | |
164 out_mlst=[mlst[0]+mseq.count("X"), mlst[1]+mseq.count("Y"), mlst[2]+mseq.count("Z")] | |
165 out_ulst=[ulst[0]+mseq.count("x"), ulst[1]+mseq.count("y"), ulst[2]+mseq.count("z")] | |
166 return out_mlst, out_ulst | |
167 | |
168 | |
169 | |
170 def process_aligner_output(filename, pair_end = False): | |
171 | |
172 #m = re.search(r'-('+'|'.join(supported_aligners) +')-TMP', filename) | |
173 m = re.search(r'-('+'|'.join(supported_aligners) +')-.*TMP', filename) | |
174 if m is None: | |
175 error('The temporary folder path should contain the name of one of the supported aligners: ' + filename) | |
176 | |
177 format = m.group(1) | |
178 try : | |
179 input = open(filename) | |
180 except IOError: | |
181 print "[Error] Cannot open file %s" % filename | |
182 exit(-1) | |
183 | |
184 QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL = range(11) | |
185 def parse_SAM(line): | |
186 buf = line.split() | |
187 # print buf | |
188 flag = int(buf[FLAG]) | |
189 | |
190 # skip reads that are not mapped | |
191 # skip reads that have probability of being non-unique higher than 1/10 | |
192 if flag & 0x4 : # or int(buf[MAPQ]) < 10: | |
193 return None, None, None, None, None, None | |
194 # print "format = ", format | |
195 if format == BOWTIE: | |
196 mismatches = int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'NM:i:'][0]) # get the edit distance | |
197 # --- bug fixed ------ | |
198 elif format == BOWTIE2: | |
199 if re.search(r'(.)*-e2e-TMP(.*)', filename) is None : # local model | |
200 mismatches = 1-int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'AS:i:'][0]) | |
201 # print "====local=====\n" | |
202 ## bowtie2 use AS tag (score) to evaluate the mapping. The higher, the better. | |
203 else : # end-to-end model | |
204 # print "end-to-end\n" | |
205 mismatches = int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'XM:i:'][0]) | |
206 # --- Weilong --------- | |
207 elif format == SOAP: | |
208 mismatches = 1-buf[MAPQ] | |
209 # mismatches = 1/float(buf[MAPQ]) | |
210 ## downstream might round (0,1) to 0, so use integer instead | |
211 ## fixed by Weilong | |
212 elif format == RMAP: | |
213 # chr16 75728107 75728147 read45 9 - | |
214 # chr16 67934919 67934959 read45 9 - | |
215 mismatches = buf[4] | |
216 | |
217 return (buf[QNAME], # read ID | |
218 buf[RNAME], # reference ID | |
219 int(buf[POS]) - 1, # position, 0 based (SAM is 1 based) | |
220 mismatches, # number of mismatches | |
221 parse_cigar(buf[CIGAR]), # the parsed cigar string | |
222 flag & 0x40 # true if it is the first mate in a pair, false if it is the second mate | |
223 ) | |
224 | |
225 SOAP_QNAME, SOAP_SEQ, SOAP_QUAL, SOAP_NHITS, SOAP_AB, SOAP_LEN, SOAP_STRAND, SOAP_CHR, SOAP_LOCATION, SOAP_MISMATCHES = range(10) | |
226 def parse_SOAP(line): | |
227 buf = line.split() | |
228 return (buf[SOAP_QNAME], | |
229 buf[SOAP_CHR], | |
230 int(buf[SOAP_LOCATION]) - 1, | |
231 int(buf[SOAP_MISMATCHES]), | |
232 buf[SOAP_AB], | |
233 buf[SOAP_STRAND], | |
234 parse_cigar(buf[SOAP_LEN]+'M') | |
235 ) | |
236 | |
237 # chr16 75728107 75728147 read45 9 - | |
238 RMAP_CHR, RMAP_START, RMAP_END, RMAP_QNAME, RMAP_MISMATCH, RMAP_STRAND = range(6) | |
239 def parse_RMAP(line): | |
240 buf = line.split() | |
241 return ( buf[RMAP_QNAME], | |
242 buf[RMAP_CHR], | |
243 int(buf[RMAP_START]), # to check -1 or not | |
244 int(buf[RMAP_END]) - int(buf[RMAP_START]) + 1, | |
245 int(buf[RMAP_MISMATCH]), | |
246 buf[RMAP_STRAND] | |
247 ) | |
248 | |
249 if format == BOWTIE or format == BOWTIE2: | |
250 if pair_end: | |
251 for line in input: | |
252 header1, chr1, location1, no_mismatch1, cigar1, _ = parse_SAM(line) | |
253 header2, _, location2, no_mismatch2, cigar2, mate_no2 = parse_SAM(input.next()) | |
254 | |
255 | |
256 if header1 and header2: | |
257 # flip the location info if the second mate comes first in the alignment file | |
258 if mate_no2: | |
259 location1, location2 = location2, location1 | |
260 cigar1, cigar2 = cigar2, cigar1 | |
261 | |
262 | |
263 yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2 | |
264 else: | |
265 for line in input: | |
266 header, chr, location, no_mismatch, cigar, _ = parse_SAM(line) | |
267 if header is not None: | |
268 yield header, chr, location, no_mismatch, cigar | |
269 elif format == SOAP: | |
270 if pair_end: | |
271 for line in input: | |
272 header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line) | |
273 header2, _ , location2, no_mismatch2, _, strand2, cigar2 = parse_SOAP(input.next()) | |
274 | |
275 if mate1 == 'b': | |
276 location1, location2 = location2, location1 | |
277 strand1, strand2 = strand2, strand1 | |
278 ciga1, cigar2 = cigar2, cigar1 | |
279 | |
280 | |
281 if header1 and header2 and strand1 == '+' and strand2 == '-': | |
282 yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2 | |
283 | |
284 else: | |
285 for line in input: | |
286 header, chr, location, no_mismatch, _, strand, cigar = parse_SOAP(line) | |
287 if header and strand == '+': | |
288 yield header, chr, location, no_mismatch, cigar | |
289 elif format == RMAP : | |
290 if pair_end : | |
291 todo = 0 | |
292 # to do | |
293 else : | |
294 for line in input: | |
295 header, chr, location, read_len, no_mismatch, strand = parse_RMAP(line) | |
296 cigar = str(read_len) + "M" | |
297 yield header, chr, location, no_mismatch, cigar | |
298 | |
299 input.close() | |
300 | |
301 | |
302 def parse_cigar(cigar_string): | |
303 i = 0 | |
304 prev_i = 0 | |
305 cigar = [] | |
306 while i < len(cigar_string): | |
307 if cigar_string[i] in CIGAR_OPS: | |
308 cigar.append((CIGAR_OPS[cigar_string[i]], int(cigar_string[prev_i:i]))) | |
309 prev_i = i + 1 | |
310 i += 1 | |
311 return cigar | |
312 | |
313 def get_read_start_end_and_genome_length(cigar): | |
314 r_start = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0 | |
315 r_end = r_start | |
316 g_len = 0 | |
317 for edit_op, count in cigar: | |
318 if edit_op == BAM_MATCH: | |
319 r_end += count | |
320 g_len += count | |
321 elif edit_op == BAM_INS: | |
322 r_end += count | |
323 elif edit_op == BAM_DEL: | |
324 g_len += count | |
325 return r_start, r_end, g_len # return the start and end in the read and the length of the genomic sequence | |
326 # r_start : start position on the read | |
327 # r_end : end position on the read | |
328 # g_len : length of the mapped region on genome | |
329 | |
330 | |
331 def cigar_to_alignment(cigar, read_seq, genome_seq): | |
332 """ Reconstruct the pairwise alignment based on the CIGAR string and the two sequences | |
333 """ | |
334 | |
335 # reconstruct the alignment | |
336 r_pos = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0 | |
337 g_pos = 0 | |
338 r_aln = '' | |
339 g_aln = '' | |
340 for edit_op, count in cigar: | |
341 if edit_op == BAM_MATCH: | |
342 r_aln += read_seq[r_pos : r_pos + count] | |
343 g_aln += genome_seq[g_pos : g_pos + count] | |
344 r_pos += count | |
345 g_pos += count | |
346 elif edit_op == BAM_DEL: | |
347 r_aln += '-'*count | |
348 g_aln += genome_seq[g_pos : g_pos + count] | |
349 g_pos += count | |
350 elif edit_op == BAM_INS: | |
351 r_aln += read_seq[r_pos : r_pos + count] | |
352 g_aln += '-'*count | |
353 r_pos += count | |
354 | |
355 return r_aln, g_aln | |
356 | |
357 | |
358 | |
359 # return sequence is [start, end), not include 'end' | |
360 def get_genomic_sequence(genome, start, end, strand = '+'): | |
361 if strand != '+' and strand != '-' : | |
362 print "[Bug] get_genomic_sequence input should be \'+\' or \'-\'." | |
363 exit(-1) | |
364 if start > 1: | |
365 prev = genome[start-2:start] | |
366 elif start == 1: | |
367 prev = 'N'+genome[0] | |
368 else: | |
369 prev = 'NN' | |
370 | |
371 if end < len(genome) - 1: | |
372 next = genome[end: end + 2] | |
373 elif end == len(genome) - 1: | |
374 next = genome[end] + 'N' | |
375 else: | |
376 next = 'NN' | |
377 origin_genome = genome[start:end] | |
378 | |
379 if strand == '-': | |
380 # reverse complement everything if strand is '-' | |
381 revc = reverse_compl_seq('%s%s%s' % (prev, origin_genome, next)) | |
382 prev, origin_genome, next = revc[:2], revc[2:-2], revc[-2:] | |
383 | |
384 return origin_genome, next, '%s_%s_%s' % (prev, origin_genome, next) | |
385 # next : next two nucleotides |