0
|
1 from bs_utils.utils import *
|
|
2 import re
|
|
3
|
|
4
|
|
# Numeric codes for the CIGAR operations this module understands.
BAM_MATCH = 0       # 'M'
BAM_INS = 1         # 'I'
BAM_DEL = 2         # 'D'
BAM_SOFTCLIP = 4    # 'S'

# Maps a CIGAR operation letter to its numeric code; letters that are not
# listed here are not recognised by parse_cigar.
CIGAR_OPS = {
    'M': BAM_MATCH,
    'I': BAM_INS,
    'D': BAM_DEL,
    'S': BAM_SOFTCLIP,
}
|
|
11
|
|
12
|
|
def N_MIS(r, g):
    """Count the mismatches between a read and a genome sequence.

    A position counts as a mismatch when the two bases differ, neither of
    them is "N", and the pair is not read 'T' over genome 'C' (that pair is
    tolerated -- presumably a bisulfite-converted cytosine; confirm).

    r : read sequence
    g : genome sequence

    Returns the number of mismatches. When the two sequences differ in
    length nothing is compared and 0 is returned.
    """
    if len(r) != len(g):
        return 0

    mismatches = 0
    for read_base, genome_base in zip(r, g):
        if read_base == genome_base:
            continue
        if read_base == "N" or genome_base == "N":
            continue
        if read_base == 'T' and genome_base == 'C':
            continue
        mismatches += 1
    return mismatches
|
|
20
|
|
21
|
|
22 #----------------------------------------------------------------
|
|
23
|
|
24 """
|
|
25 Example:
|
|
26 ========
|
|
27 Read : ACCGCGTTGATCGAGTACGTACGTGGGTC
|
|
28 Adapter : ....................ACGTGGGTCCCG
|
|
29 ========
|
|
30
|
|
31 no_mismatch : the maximum number allowed for mismatches
|
|
32
|
|
33 Algorithm: (allowing 1 mismatch)
|
|
34 ========
|
|
35 -Step 1:
|
|
36 ACCGCGTTGATCGAGTACGTACGTGGGTC
|
|
37 ||XX
|
|
38 ACGTGGGTCCCG
|
|
39 -Step 2:
|
|
40 ACCGCGTTGATCGAGTACGTACGTGGGTC
|
|
41 X||X
|
|
42 .ACGTGGGTCCCG
|
|
43 -Step 3:
|
|
44 ACCGCGTTGATCGAGTACGTACGTGGGTC
|
|
45 XX
|
|
46 ..ACGTGGGTCCCG
|
|
47 -Step ...
|
|
48 -Step N:
|
|
49 ACCGCGTTGATCGAGTACGTACGTGGGTC
|
|
50 |||||||||
|
|
51 ....................ACGTGGGTCCCG
|
|
52 Success & return!
|
|
53 ========
|
|
54
|
|
55 """
|
|
56
|
1
|
57 # Remove the adapter from 3' end
|
|
def RemoveAdapter ( read, adapter, no_mismatch, rm_back=0) :
    """Trim a 3' adapter, and everything after it, from a read.

    The adapter is slid along the read from left to right. At each start
    position the overlapping bases are compared; the position is accepted
    when the comparison reaches the end of the adapter or the end of the
    read with at most `no_mismatch` mismatching bases.

    read        : read sequence
    adapter     : adapter sequence; its first base is taken to be the extra
                  'A' in front of the adapter proper
    no_mismatch : maximum number of mismatches allowed in the overlap
    rm_back     : number of extra bases to cut in front of the adapter

    Returns the trimmed read; "" when nothing is left (or when the read
    consists of the adapter alone); the unchanged read when no adapter is
    found or when `adapter` is empty.
    """
    lr = len(read)
    la = len(adapter)

    # An empty adapter carries no information: leave the read untouched
    # instead of "finding" it at position 0 and erasing the read.
    if la == 0 :
        return read

    # Check the empty insert, namely, the read starts with the 2nd base of
    # the adapter, not including the 'A' base in front of the adapter.
    # Both sides of the comparison are la-1 bases long; the original compared
    # adapter[2:] (la-2 bases) and therefore never matched a full-length read.
    if la > 1 and adapter[1:] == read[0:(la-1)] :
        return ""

    for i in range(lr - no_mismatch) :
        read_pos = i
        adapter_pos = 0
        count_no_mis = 0
        while (adapter_pos < la) and (read_pos < lr) :
            if read[read_pos] != adapter[adapter_pos] :
                count_no_mis += 1
                if count_no_mis > no_mismatch :
                    break
            # Matching bases and tolerated mismatches both advance the overlap.
            read_pos += 1
            adapter_pos += 1

        # The overlap ran to the end of the adapter or of the read without
        # exceeding the mismatch budget: the adapter starts at position i.
        # Cut the extra bases before the adapter
        #    --C|CG G--   => --CNN+A+<adapter>
        #    --G GC|C--      --GGC
        if adapter_pos == la or read_pos == lr :
            if i <= rm_back :
                return ''
            return read[:(i-rm_back)]

    return read
|
|
93
|
|
94
|
|
def Remove_5end_Adapter ( read, adapter, no_mismatch) :
    """Trim the tail of an adapter from the 5' end of a read.

    For every offset i into the adapter, adapter[i:] is compared with the
    start of the read. The offset is accepted when the comparison reaches
    the end of the adapter with at most `no_mismatch` mismatching bases;
    the la-i matched bases are then cut from the front of the read.

    read        : read sequence
    adapter     : adapter sequence
    no_mismatch : maximum number of mismatches allowed in the overlap

    Returns the trimmed read, or the unchanged read when no adapter tail is
    found (the original fell through and returned None in that case).
    """
    lr = len(read)
    la = len(adapter)
    for i in range(la - no_mismatch) :
        read_pos = 0
        adapter_pos = i
        count_no_mis = 0
        while (adapter_pos < la) and (read_pos < lr) :
            if read[read_pos] != adapter[adapter_pos] :
                count_no_mis += 1
                if count_no_mis > no_mismatch :
                    break
            # Matching bases and tolerated mismatches both advance the overlap.
            adapter_pos += 1
            read_pos += 1

        if adapter_pos == la :
            return read[(la-i):]

    return read
|
|
116
|
|
117
|
|
def next_nuc(seq, pos, n):
    """ Return the n-th non-gap symbol of seq after position pos.

    Gap symbols ('-') are skipped and not counted. 'N' is returned when seq
    ends before n non-gap symbols have been seen.
    """
    remaining = n
    for idx in range(pos + 1, len(seq)):
        if seq[idx] == '-':
            continue
        remaining -= 1
        if remaining == 0:
            return seq[idx]
    return 'N'
|
|
131
|
|
132
|
|
133
|
|
def methy_seq(read, genome):
    """Build the methylation call string for an aligned read.

    read   : aligned read sequence
    genome : aligned genome sequence, gaps written as '-'; it must be at
             least as long as read

    Positions where genome has a gap produce no output. Every other
    position produces one symbol:

        X / x : genome C followed by G          (read C / read T)
        Y / y : genome C, then one of A/C/T, then G
        Z / z : genome C, then two of A/C/T
        -     : anything else, including a C whose following bases cannot
                be classified (e.g. 'N')

    Upper case means the read shows C (methylated), lower case means the
    read shows T (unmethylated).
    """
    H = ('A', 'C', 'T')
    m_seq = []
    for i in range(len(read)):
        if genome[i] == '-':
            continue

        # Start every position from "no call". The original kept the symbol
        # of the previous position whenever the context could not be
        # classified, leaking a stale call into the output.
        xx = "-"
        if genome[i] == "C" and read[i] in ("C", "T"):
            nn1 = next_nuc(genome, i, 1)
            if nn1 == "G":
                xx = "x"
            elif nn1 in H:
                nn2 = next_nuc(genome, i, 2)
                if nn2 == "G":
                    xx = "y"
                elif nn2 in H:
                    xx = "z"
            if read[i] == "C":
                # methylated: same context letter, upper case ('-' stays '-')
                xx = xx.upper()
        m_seq.append(xx)

    return ''.join(m_seq)
|
|
174
|
|
def mcounts(mseq, mlst, ulst):
    """Add the calls found in mseq to running methylation counters.

    mseq : methylation call string
    mlst : three counters for the upper-case calls X, Y, Z
    ulst : three counters for the lower-case calls x, y, z

    Returns two new lists (out_mlst, out_ulst); the inputs are not modified.
    """
    out_mlst = [mlst[k] + mseq.count(symbol) for k, symbol in enumerate("XYZ")]
    out_ulst = [ulst[k] + mseq.count(symbol) for k, symbol in enumerate("xyz")]
    return out_mlst, out_ulst
|
|
179
|
|
180
|
|
181
|
|
def process_aligner_output(filename, pair_end = False):
    """Iterate over the alignments stored in an aligner output file.

    The aligner is recognised from `filename`, which must contain
    "-<aligner>-...TMP" with <aligner> one of supported_aligners.

    filename : path of the aligner output file
    pair_end : True when consecutive lines hold the two mates of a pair

    This is a generator. It yields, for single-end input,
        (header, chr, location, no_mismatch, cigar)
    and for paired-end input,
        (header, chr, no_mismatch1 + no_mismatch2,
         location1, cigar1, location2, cigar2)
    where locations are 0-based for SAM and SOAP input and cigars are lists
    as returned by parse_cigar. Unmapped SAM records are skipped; SOAP
    records are kept only for the '+' strand (single-end) or for a '+'/'-'
    pair (paired-end). Paired-end RMAP is not implemented and yields nothing.

    The file is closed when the generator finishes or is discarded.
    """
    m = re.search(r'-('+'|'.join(supported_aligners) +')-.*TMP', filename)
    if m is None:
        error('The temporary folder path should contain the name of one of the supported aligners: ' + filename)

    aligner = m.group(1)
    try :
        aln_file = open(filename)
    except IOError:
        print("[Error] Cannot open file %s" % filename)
        exit(-1)

    # Bowtie2 runs are treated as local mode unless the path says "-e2e-TMP".
    bowtie2_local = re.search(r'(.)*-e2e-TMP(.*)', filename) is None

    QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL = range(11)

    def sam_int_tag(buf, prefix):
        # Value of the first optional SAM field starting with `prefix`
        # (e.g. 'NM:i:'); raises IndexError when the tag is absent.
        return int([field[5:] for field in buf[11:] if field[:5] == prefix][0])

    def parse_SAM(line):
        buf = line.split()
        flag = int(buf[FLAG])

        # skip reads that are not mapped
        if flag & 0x4 :
            return None, None, None, None, None, None

        if aligner == BOWTIE:
            mismatches = sam_int_tag(buf, 'NM:i:') # get the edit distance
        elif aligner == BOWTIE2:
            if bowtie2_local : # local mode
                ## bowtie2 uses the AS tag (score) to evaluate the mapping.
                ## The higher, the better.
                mismatches = 1 - sam_int_tag(buf, 'AS:i:')
            else : # end-to-end mode
                mismatches = sam_int_tag(buf, 'XM:i:')
        elif aligner == SOAP:
            ## downstream might round (0,1) to 0, so use an integer instead
            mismatches = 1 - int(buf[MAPQ])
        elif aligner == RMAP:
            # chr16 75728107 75728147 read45 9 -
            mismatches = int(buf[4])

        return (buf[QNAME],                # read ID
                buf[RNAME],                # reference ID
                int(buf[POS]) - 1,         # position, 0 based (SAM is 1 based)
                mismatches,                # number of mismatches
                parse_cigar(buf[CIGAR]),   # the parsed cigar string
                flag & 0x40                # true if it is the first mate in a pair, false if it is the second mate
                )

    SOAP_QNAME, SOAP_SEQ, SOAP_QUAL, SOAP_NHITS, SOAP_AB, SOAP_LEN, SOAP_STRAND, SOAP_CHR, SOAP_LOCATION, SOAP_MISMATCHES = range(10)

    def parse_SOAP(line):
        buf = line.split()
        return (buf[SOAP_QNAME],
                buf[SOAP_CHR],
                int(buf[SOAP_LOCATION]) - 1,
                int(buf[SOAP_MISMATCHES]),
                buf[SOAP_AB],
                buf[SOAP_STRAND],
                parse_cigar(buf[SOAP_LEN]+'M')
                )

    # chr16 75728107 75728147 read45 9 -
    RMAP_CHR, RMAP_START, RMAP_END, RMAP_QNAME, RMAP_MISMATCH, RMAP_STRAND = range(6)

    def parse_RMAP(line):
        buf = line.split()
        return (buf[RMAP_QNAME],
                buf[RMAP_CHR],
                int(buf[RMAP_START]),      # to check -1 or not
                int(buf[RMAP_END]) - int(buf[RMAP_START]) + 1,
                int(buf[RMAP_MISMATCH]),
                buf[RMAP_STRAND]
                )

    try:
        if aligner == BOWTIE or aligner == BOWTIE2:
            if pair_end:
                for line in aln_file:
                    header1, chr1, location1, no_mismatch1, cigar1, _ = parse_SAM(line)
                    # A trailing unpaired line ends the iteration quietly.
                    mate_line = next(aln_file, None)
                    if mate_line is None:
                        break
                    header2, _, location2, no_mismatch2, cigar2, mate_no2 = parse_SAM(mate_line)

                    if header1 and header2:
                        # flip the location info if the second mate comes first in the alignment file
                        if mate_no2:
                            location1, location2 = location2, location1
                            cigar1, cigar2 = cigar2, cigar1

                        yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
            else:
                for line in aln_file:
                    header, chrom, location, no_mismatch, cigar, _ = parse_SAM(line)
                    if header is not None:
                        yield header, chrom, location, no_mismatch, cigar

        elif aligner == SOAP:
            if pair_end:
                for line in aln_file:
                    header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line)
                    mate_line = next(aln_file, None)
                    if mate_line is None:
                        break
                    header2, _, location2, no_mismatch2, _, strand2, cigar2 = parse_SOAP(mate_line)

                    if mate1 == 'b':
                        location1, location2 = location2, location1
                        strand1, strand2 = strand2, strand1
                        # was "ciga1, cigar2 = ...": the typo left cigar1
                        # untouched and overwrote cigar2 with it
                        cigar1, cigar2 = cigar2, cigar1

                    if header1 and header2 and strand1 == '+' and strand2 == '-':
                        yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
            else:
                for line in aln_file:
                    header, chrom, location, no_mismatch, _, strand, cigar = parse_SOAP(line)
                    if header and strand == '+':
                        yield header, chrom, location, no_mismatch, cigar

        elif aligner == RMAP :
            if pair_end :
                # to do: paired-end RMAP output is not supported yet
                pass
            else :
                for line in aln_file:
                    header, chrom, location, read_len, no_mismatch, strand = parse_RMAP(line)
                    # Parsed like every other branch; the original yielded
                    # the raw "<len>M" string here.
                    cigar = parse_cigar(str(read_len) + "M")
                    yield header, chrom, location, no_mismatch, cigar
    finally:
        aln_file.close()
|
|
312
|
|
313
|
|
def parse_cigar(cigar_string):
    """Convert a CIGAR string into a list of (operation code, count) tuples.

    Only the letters listed in CIGAR_OPS end an entry. Any other character
    is left in place and becomes part of the count text of the next
    recognised operation.
    NOTE(review): an unlisted operation letter (e.g. 'H' or 'N') therefore
    makes int() raise ValueError on the following entry -- confirm that the
    aligners in use never emit one.
    """
    ops = []
    count_start = 0
    for idx, symbol in enumerate(cigar_string):
        if symbol not in CIGAR_OPS:
            continue
        ops.append((CIGAR_OPS[symbol], int(cigar_string[count_start:idx])))
        count_start = idx + 1
    return ops
|
|
324
|
|
def get_read_start_end_and_genome_length(cigar):
    """Derive the aligned span of the read and of the genome from a cigar.

    cigar : non-empty list of (operation code, count) tuples

    Returns (r_start, r_end, g_len):
        r_start : start position on the read (length of a leading soft
                  clip, otherwise 0)
        r_end   : end position on the read (r_start plus all matched and
                  inserted bases)
        g_len   : length of the mapped region on the genome (all matched
                  and deleted bases)
    """
    r_start = 0
    if cigar[0][0] == BAM_SOFTCLIP:
        r_start = cigar[0][1]

    read_span = 0
    g_len = 0
    for op, n in cigar:
        if op == BAM_MATCH or op == BAM_INS:
            read_span += n      # consumes read bases
        if op == BAM_MATCH or op == BAM_DEL:
            g_len += n          # consumes genome bases

    return r_start, r_start + read_span, g_len
|
|
341
|
|
342
|
|
def cigar_to_alignment(cigar, read_seq, genome_seq):
    """ Rebuild the gapped pairwise alignment described by a cigar.

    cigar      : non-empty list of (operation code, count) tuples
    read_seq   : read sequence
    genome_seq : genome sequence starting at the mapped position

    Returns (r_aln, g_aln), the read and the genome written with '-' at
    deletions (in the read) and insertions (in the genome). Bases covered
    by a leading soft clip are skipped in the read.
    """
    read_parts = []
    genome_parts = []

    # start behind a leading soft clip
    r_pos = 0
    if cigar[0][0] == BAM_SOFTCLIP:
        r_pos = cigar[0][1]
    g_pos = 0

    for op, n in cigar:
        if op == BAM_MATCH:
            read_parts.append(read_seq[r_pos : r_pos + n])
            genome_parts.append(genome_seq[g_pos : g_pos + n])
            r_pos += n
            g_pos += n
        elif op == BAM_DEL:
            read_parts.append('-' * n)
            genome_parts.append(genome_seq[g_pos : g_pos + n])
            g_pos += n
        elif op == BAM_INS:
            read_parts.append(read_seq[r_pos : r_pos + n])
            genome_parts.append('-' * n)
            r_pos += n

    return ''.join(read_parts), ''.join(genome_parts)
|
|
368
|
|
369
|
|
370
|
|
371 # return sequence is [start, end), not include 'end'
|
|
def get_genomic_sequence(genome, start, end, strand = '+'):
    """Extract genome[start:end] together with its two flanking bases.

    genome : reference sequence
    start  : 0-based start of the region (inclusive)
    end    : 0-based end of the region (exclusive)
    strand : '+' or '-'; for '-' the region and both flanks are reverse
             complemented as a whole with reverse_compl_seq

    Returns (origin_genome, next_two, tagged) where next_two holds the two
    nucleotides following the region and tagged is
    "<previous two>_<region>_<next two>". Flanks that fall outside the
    genome are filled with 'N'.

    Prints an error and exits with status -1 for any other strand value.
    """
    if strand not in ('+', '-') :
        print("[Bug] get_genomic_sequence input should be '+' or '-'.")
        raise SystemExit(-1)

    # two bases upstream of the region, padded with 'N' at the genome start
    if start > 1:
        prev = genome[start-2:start]
    elif start == 1:
        prev = 'N' + genome[0]
    else:
        prev = 'NN'

    # two bases downstream of the region, padded with 'N' at the genome end
    last_index = len(genome) - 1
    if end < last_index:
        next_two = genome[end : end + 2]
    elif end == last_index:
        next_two = genome[end] + 'N'
    else:
        next_two = 'NN'

    origin_genome = genome[start:end]

    if strand == '-':
        # reverse complement everything if strand is '-'
        revc = reverse_compl_seq('%s%s%s' % (prev, origin_genome, next_two))
        prev, origin_genome, next_two = revc[:2], revc[2:-2], revc[-2:]

    return origin_genome, next_two, '%s_%s_%s' % (prev, origin_genome, next_two)
|
|
397 # next : next two nucleotides |