annotate BSseeker2/bs_align/bs_align_utils.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 from bs_utils.utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2 import re
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
# Numeric codes for the CIGAR operations handled by this module; they are the
# first element of every (operation, length) tuple produced by parse_cigar.
BAM_MATCH = 0
BAM_INS = 1
BAM_DEL = 2
BAM_SOFTCLIP = 4

# CIGAR operation letter -> numeric code.
CIGAR_OPS = {
    'M': BAM_MATCH,
    'I': BAM_INS,
    'D': BAM_DEL,
    'S': BAM_SOFTCLIP,
}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def N_MIS(r, g):
    """Count the mismatching positions between read r and genome sequence g.

    A position is NOT counted when the two symbols are equal, when either
    symbol is "N", or when the read has 'T' where the genome has 'C'.

    Returns 0 without comparing anything when r and g differ in length.
    """
    if len(r) != len(g):
        return 0
    return sum(
        1
        for read_base, genome_base in zip(r, g)
        if read_base != genome_base
        and "N" not in (read_base, genome_base)
        and (read_base, genome_base) != ('T', 'C')
    )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25 Example:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
26 ========
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 Read : ACCGCGTTGATCGAGTACGTACGTGGGTC
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28 Adapter : ....................ACGTGGGTCCCG
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29 ========
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 no_mismatch : the maximum number allowed for mismatches
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 Algorithm: (allowing 1 mismatch)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34 ========
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35 -Step 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 ACCGCGTTGATCGAGTACGTACGTGGGTC
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37 ||XX
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 ACGTGGGTCCCG
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 -Step 2:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40 ACCGCGTTGATCGAGTACGTACGTGGGTC
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41 X||X
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42 .ACGTGGGTCCCG
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 -Step 3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44 ACCGCGTTGATCGAGTACGTACGTGGGTC
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45 XX
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46 ..ACGTGGGTCCCG
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47 -Step ...
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48 -Step N:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49 ACCGCGTTGATCGAGTACGTACGTGGGTC
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50 |||||||||
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51 ....................ACGTGGGTCCCG
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 Success & return!
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 ========
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def RemoveAdapter(read, adapter, no_mismatch):
    """Trim an adapter (or the leading part of one) from the end of read.

    The adapter is slid along the read from offset 0 upwards.  At each offset
    the overlapping symbols are compared in lockstep; the offset is accepted
    when the comparison reaches the end of the adapter or the end of the read
    with at most no_mismatch differing symbols.

    read        : sequence to trim
    adapter     : adapter sequence to look for
    no_mismatch : maximum number of mismatches allowed in the overlap

    Returns read[:offset] for the first accepted offset, or read unchanged
    when no offset is accepted or when adapter is empty.
    """
    lr = len(read)
    la = len(adapter)
    if la == 0:
        # An empty adapter would otherwise be "found" at offset 0 and the
        # whole read would be thrown away.
        return read
    # The last no_mismatch offsets are skipped: an overlap that short could
    # consist of mismatches only and still be accepted.
    for i in range(lr - no_mismatch):
        mismatches = 0
        for offset in range(min(la, lr - i)):
            if read[i + offset] != adapter[offset]:
                mismatches += 1
                if mismatches > no_mismatch:
                    break
        else:
            # Overlap exhausted without exceeding the mismatch budget.
            return read[:i]
    return read
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def Remove_5end_Adapter(read, adapter, no_mismatch):
    """Trim an adapter (or the trailing part of one) from the start of read.

    For i = 0, 1, ... the adapter suffix adapter[i:] is compared in lockstep
    with the beginning of the read.  The suffix is accepted when the whole of
    it was compared with at most no_mismatch differing symbols.

    read        : sequence to trim
    adapter     : adapter sequence to look for
    no_mismatch : maximum number of mismatches allowed in the overlap

    Returns the read with the matched len(adapter) - i leading symbols
    removed, or read unchanged when no suffix is accepted.
    """
    lr = len(read)
    la = len(adapter)
    # Suffixes of no_mismatch symbols or fewer are not tried: they could
    # consist of mismatches only and still be accepted.
    for i in range(la - no_mismatch):
        mismatches = 0
        read_pos = 0
        adapter_pos = i
        while adapter_pos < la and read_pos < lr:
            if read[read_pos] != adapter[adapter_pos]:
                mismatches += 1
                if mismatches > no_mismatch:
                    break
            read_pos += 1
            adapter_pos += 1
        if adapter_pos == la:
            return read[la - i:]
    # No adapter found: hand the read back instead of falling through to None.
    return read
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def next_nuc(seq, pos, n):
    """Return the n-th symbol after index pos in seq, not counting '-' gaps.

    'N' is returned when seq ends before n non-gap symbols have been seen.
    """
    for idx in range(pos + 1, len(seq)):
        if seq[idx] == '-':
            continue
        n -= 1
        if n == 0:
            return seq[idx]
    return 'N'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
119
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
120
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
121
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def methy_seq(read, genome):
    """Build the methylation-call string for an aligned read.

    read and genome are walked index by index over len(read); genome must
    therefore be at least as long as read (it may carry extra trailing
    symbols, which are only used as sequence context).

    Positions where genome has '-' produce no output symbol.  Every other
    position produces one symbol:

        x / X : genomic C followed by G           (read T / read C)
        y / Y : genomic C followed by H then G    (read T / read C)
        z / Z : genomic C followed by H then H    (read T / read C)
        -     : anything else, including a C whose following context
                cannot be classified (e.g. it contains 'N')

    where H is one of A, C, T and "followed by" skips '-' gap symbols.
    Lower case marks an unmethylated call (read T), upper case a methylated
    call (read C).
    """
    H = ('A', 'C', 'T')
    m_seq = []
    for i in range(len(read)):
        if genome[i] == '-':
            continue

        # Start from '-' at every position so an unclassifiable context can
        # never inherit the call made at an earlier position.
        call = '-'
        if genome[i] == 'C' and read[i] in ('C', 'T'):
            nn1 = next_nuc(genome, i, 1)
            if nn1 == 'G':
                call = 'x'
            elif nn1 in H:
                nn2 = next_nuc(genome, i, 2)
                if nn2 == 'G':
                    call = 'y'
                elif nn2 in H:
                    call = 'z'
            if read[i] == 'C':
                # methylated: same context letter, upper case ('-' stays '-')
                call = call.upper()
        m_seq.append(call)

    return ''.join(m_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def mcounts(mseq, mlst, ulst):
    """Add the call counts found in mseq to two running 3-element tallies.

    mlst is increased by the number of "X", "Y" and "Z" symbols in mseq and
    ulst by the number of "x", "y" and "z" symbols, position by position.
    The inputs are not modified; two new lists are returned as
    (out_mlst, out_ulst).
    """
    out_mlst = [mlst[k] + mseq.count(symbol) for k, symbol in enumerate("XYZ")]
    out_ulst = [ulst[k] + mseq.count(symbol) for k, symbol in enumerate("xyz")]
    return out_mlst, out_ulst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def process_aligner_output(filename, pair_end = False):
    """Generate the alignments stored in an aligner's temporary output file.

    The aligner is taken from filename, which must contain
    '-<aligner>-...TMP' with <aligner> one of supported_aligners; otherwise
    error() is called (presumably terminating the program - confirm in
    bs_utils.utils).  If the file cannot be opened a message is printed and
    the process exits with status -1.

    filename : path of the aligner output (SAM for bowtie/bowtie2, native
               text output for SOAP and RMAP)
    pair_end : when true, consecutive lines are read as mate pairs

    Yields, single-end:
        (read ID, reference ID, position, mismatches, cigar)
    Yields, paired-end:
        (read ID, reference ID, mismatches1 + mismatches2,
         position1, cigar1, position2, cigar2)

    Every cigar is the list of (operation, length) tuples built by
    parse_cigar.  Positions from SAM and SOAP records are converted to
    0-based; RMAP start positions are passed through unchanged.

    Record selection:
      * SAM     - records flagged unmapped (0x4) are skipped; a pair is
                  skipped unless both mates are mapped.
      * SOAP    - single-end: only '+' strand hits; paired-end: only pairs
                  whose first mate is on '+' and second mate on '-'.
      * RMAP    - paired-end is not implemented and yields nothing.

    The "mismatches" value for bowtie is the NM tag, for bowtie2 end-to-end
    runs ('-e2e-TMP' in filename) the XM tag, and for bowtie2 local runs
    1 - AS, so that a better alignment score gives a smaller value.
    """
    m = re.search(r'-('+'|'.join(supported_aligners) +')-.*TMP', filename)
    if m is None:
        error('The temporary folder path should contain the name of one of the supported aligners: ' + filename)

    aligner = m.group(1)
    # bowtie2 mode is encoded in the path; decide once, not once per record
    end_to_end = '-e2e-TMP' in filename

    try:
        handle = open(filename)
    except IOError:
        print("[Error] Cannot open file %s" % filename)
        exit(-1)

    QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL = range(11)

    def int_tag(fields, prefix):
        # integer value of the first optional SAM field starting with prefix
        return int([field[5:] for field in fields[11:] if field[:5] == prefix][0])

    def parse_SAM(line):
        # Only used for bowtie / bowtie2 output (see the dispatch below).
        buf = line.split()
        flag = int(buf[FLAG])

        # skip reads that are not mapped
        if flag & 0x4:
            return None, None, None, None, None, None

        if aligner == BOWTIE:
            mismatches = int_tag(buf, 'NM:i:')  # edit distance
        elif aligner == BOWTIE2:
            if end_to_end:
                mismatches = int_tag(buf, 'XM:i:')
            else:
                # local mode: bowtie2 scores with the AS tag, higher is better
                mismatches = 1 - int_tag(buf, 'AS:i:')

        return (buf[QNAME],               # read ID
                buf[RNAME],               # reference ID
                int(buf[POS]) - 1,        # position, 0 based (SAM is 1 based)
                mismatches,               # number of mismatches
                parse_cigar(buf[CIGAR]),  # the parsed cigar string
                flag & 0x40               # non-zero if this is the first mate of a pair
                )

    SOAP_QNAME, SOAP_SEQ, SOAP_QUAL, SOAP_NHITS, SOAP_AB, SOAP_LEN, SOAP_STRAND, SOAP_CHR, SOAP_LOCATION, SOAP_MISMATCHES = range(10)

    def parse_SOAP(line):
        buf = line.split()
        return (buf[SOAP_QNAME],
                buf[SOAP_CHR],
                int(buf[SOAP_LOCATION]) - 1,
                int(buf[SOAP_MISMATCHES]),
                buf[SOAP_AB],
                buf[SOAP_STRAND],
                parse_cigar(buf[SOAP_LEN]+'M')
                )

    # chr16 75728107 75728147 read45 9 -
    RMAP_CHR, RMAP_START, RMAP_END, RMAP_QNAME, RMAP_MISMATCH, RMAP_STRAND = range(6)

    def parse_RMAP(line):
        buf = line.split()
        return (buf[RMAP_QNAME],
                buf[RMAP_CHR],
                int(buf[RMAP_START]),  # to check -1 or not
                int(buf[RMAP_END]) - int(buf[RMAP_START]) + 1,
                int(buf[RMAP_MISMATCH]),
                buf[RMAP_STRAND]
                )

    # try/finally so the file is also closed when the consumer stops early
    # or a record fails to parse.
    try:
        if aligner == BOWTIE or aligner == BOWTIE2:
            if pair_end:
                for line in handle:
                    header1, chr1, location1, no_mismatch1, cigar1, _ = parse_SAM(line)
                    mate_line = next(handle, None)
                    if mate_line is None:
                        break  # odd number of records: the last one has no mate
                    header2, _, location2, no_mismatch2, cigar2, mate_no2 = parse_SAM(mate_line)

                    if header1 and header2:
                        # flip the location info if the second mate comes first in the alignment file
                        if mate_no2:
                            location1, location2 = location2, location1
                            cigar1, cigar2 = cigar2, cigar1

                        yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
            else:
                for line in handle:
                    header, chrom, location, no_mismatch, cigar, _ = parse_SAM(line)
                    if header is not None:
                        yield header, chrom, location, no_mismatch, cigar

        elif aligner == SOAP:
            if pair_end:
                for line in handle:
                    header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line)
                    mate_line = next(handle, None)
                    if mate_line is None:
                        break  # odd number of records: the last one has no mate
                    header2, _, location2, no_mismatch2, _, strand2, cigar2 = parse_SOAP(mate_line)

                    if mate1 == 'b':
                        # second mate listed first: put everything in mate order
                        location1, location2 = location2, location1
                        strand1, strand2 = strand2, strand1
                        cigar1, cigar2 = cigar2, cigar1

                    if header1 and header2 and strand1 == '+' and strand2 == '-':
                        yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
            else:
                for line in handle:
                    header, chrom, location, no_mismatch, _, strand, cigar = parse_SOAP(line)
                    if header and strand == '+':
                        yield header, chrom, location, no_mismatch, cigar

        elif aligner == RMAP:
            if pair_end:
                pass  # to do: paired-end RMAP output is not supported yet
            else:
                for line in handle:
                    header, chrom, location, read_len, no_mismatch, strand = parse_RMAP(line)
                    # same parsed form as every other branch yields
                    cigar = parse_cigar(str(read_len) + "M")
                    yield header, chrom, location, no_mismatch, cigar
    finally:
        handle.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
300
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
301
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def parse_cigar(cigar_string):
    """Convert a CIGAR string into a list of (operation code, length) tuples.

    Only the operation letters listed in CIGAR_OPS are recognised; the text
    between two recognised letters is converted with int().  Any other
    character is therefore treated as part of the next number, and int()
    raises ValueError if that text is not a valid integer.
    """
    parsed = []
    number_start = 0
    for idx, symbol in enumerate(cigar_string):
        if symbol not in CIGAR_OPS:
            continue
        length = int(cigar_string[number_start:idx])
        parsed.append((CIGAR_OPS[symbol], length))
        number_start = idx + 1
    return parsed
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
312
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def get_read_start_end_and_genome_length(cigar):
    """Summarise a parsed cigar as (r_start, r_end, g_len).

    cigar is a non-empty list of (operation, length) tuples.

    r_start : start position on the read - the length of a leading soft
              clip, or 0 when the first operation is not a soft clip
    r_end   : end position on the read - r_start plus the lengths of all
              match and insertion operations
    g_len   : length of the mapped region on the genome - the lengths of all
              match and deletion operations
    """
    first_op, first_len = cigar[0]
    r_start = first_len if first_op == BAM_SOFTCLIP else 0
    read_span = sum(count for op, count in cigar if op in (BAM_MATCH, BAM_INS))
    g_len = sum(count for op, count in cigar if op in (BAM_MATCH, BAM_DEL))
    return r_start, r_start + read_span, g_len
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
329
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
330
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def cigar_to_alignment(cigar, read_seq, genome_seq):
    """Rebuild the gapped pairwise alignment described by a parsed cigar.

    cigar      : non-empty list of (operation, length) tuples
    read_seq   : the read; reading starts after a leading soft clip, if any
    genome_seq : the genomic sequence, read from its first symbol

    Returns (r_aln, g_aln), two strings of equal length in which deletions
    appear as '-' in the read and insertions as '-' in the genome.
    Operations other than match, insertion and deletion contribute nothing.
    """
    r_pos = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0
    g_pos = 0
    read_parts = []
    genome_parts = []
    for op, length in cigar:
        uses_read = op in (BAM_MATCH, BAM_INS)
        uses_genome = op in (BAM_MATCH, BAM_DEL)
        if not (uses_read or uses_genome):
            continue

        if uses_read:
            read_parts.append(read_seq[r_pos:r_pos + length])
            r_pos += length
        else:
            read_parts.append('-' * length)

        if uses_genome:
            genome_parts.append(genome_seq[g_pos:g_pos + length])
            g_pos += length
        else:
            genome_parts.append('-' * length)

    return ''.join(read_parts), ''.join(genome_parts)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
356
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
357
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
358
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
359 # return sequence is [start, end), not include 'end'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def get_genomic_sequence(genome, start, end, strand = '+'):
    """Extract genome[start:end] together with two flanking bases per side.

    genome : sequence of one reference
    start  : first position of the region (included)
    end    : end position of the region (excluded)
    strand : '+' or '-'; any other value prints a message and exits with -1

    Flanking bases that fall outside the sequence are replaced by 'N'.
    For strand '-' the flank + region + flank string is reverse-complemented
    with reverse_compl_seq and split again into 2 / middle / 2 symbols.

    Returns (region, next two bases, 'prev_region_next').
    """
    if strand not in ('+', '-'):
        print("[Bug] get_genomic_sequence input should be \'+\' or \'-\'.")
        exit(-1)

    # two bases in front of the region
    if start <= 0:
        upstream = 'NN'
    elif start == 1:
        upstream = 'N' + genome[0]
    else:
        upstream = genome[start - 2:start]

    # two bases behind the region
    last_index = len(genome) - 1
    if end > last_index:
        downstream = 'NN'
    elif end == last_index:
        downstream = genome[end] + 'N'
    else:
        downstream = genome[end:end + 2]

    region = genome[start:end]

    if strand == '-':
        # reverse complement everything if strand is '-'
        flipped = reverse_compl_seq(upstream + region + downstream)
        upstream = flipped[:2]
        region = flipped[2:-2]
        downstream = flipped[-2:]

    return region, downstream, '_'.join((upstream, region, downstream))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
385 # next : next two nucleotides