0
|
1 #include <unistd.h>
|
|
2 #include <string.h>
|
|
3 #include <stdio.h>
|
|
4 #include <stdlib.h>
|
|
5 #include <math.h>
|
|
6 #include <time.h>
|
|
7 #include "stdaln.h"
|
|
8 #include "bwase.h"
|
|
9 #include "bwtaln.h"
|
|
10 #include "bntseq.h"
|
|
11 #include "utils.h"
|
|
12 #include "kstring.h"
|
|
13
|
|
14 static int g_log_n[256];
|
|
15
|
|
16 void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
|
|
17 {
|
|
18 int i, cnt, best;
|
|
19 if (n_aln == 0) {
|
|
20 s->type = BWA_TYPE_NO_MATCH;
|
|
21 s->c1 = s->c2 = 0;
|
|
22 return;
|
|
23 }
|
|
24
|
|
25 if (set_main) {
|
|
26 best = aln[0].score;
|
|
27 for (i = cnt = 0; i < n_aln; ++i) {
|
|
28 const bwt_aln1_t *p = aln + i;
|
|
29 if (p->score > best) break;
|
|
30 if (drand48() * (p->l - p->k + 1) > (double)cnt) {
|
|
31 s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a;
|
|
32 s->score = p->score;
|
|
33 s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
|
|
34 }
|
|
35 cnt += p->l - p->k + 1;
|
|
36 }
|
|
37 s->c1 = cnt;
|
|
38 for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
|
|
39 s->c2 = cnt - s->c1;
|
|
40 s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
|
|
41 }
|
|
42
|
|
43 if (n_multi) {
|
|
44 int k, rest, n_occ, z = 0;
|
|
45 for (k = n_occ = 0; k < n_aln; ++k) {
|
|
46 const bwt_aln1_t *q = aln + k;
|
|
47 n_occ += q->l - q->k + 1;
|
|
48 }
|
|
49 if (s->multi) free(s->multi);
|
|
50 if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
|
|
51 s->multi = 0; s->n_multi = 0;
|
|
52 return;
|
|
53 }
|
|
54 /* The following code is more flexible than what is required
|
|
55 * here. In principle, due to the requirement above, we can
|
|
56 * simply output all hits, but the following samples "rest"
|
|
57 * number of random hits. */
|
|
58 rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
|
|
59 s->multi = calloc(rest, rest * sizeof(bwt_multi1_t));
|
|
60 for (k = 0; k < n_aln; ++k) {
|
|
61 const bwt_aln1_t *q = aln + k;
|
|
62 if (q->l - q->k + 1 <= rest) {
|
|
63 bwtint_t l;
|
|
64 for (l = q->k; l <= q->l; ++l) {
|
|
65 s->multi[z].pos = l;
|
|
66 s->multi[z].gap = q->n_gapo + q->n_gape;
|
|
67 s->multi[z].mm = q->n_mm;
|
|
68 s->multi[z++].strand = q->a;
|
|
69 }
|
|
70 rest -= q->l - q->k + 1;
|
|
71 } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
|
|
72 int j, i, k;
|
|
73 for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
|
|
74 double p = 1.0, x = drand48();
|
|
75 while (x < p) p -= p * j / (i--);
|
|
76 s->multi[z].pos = q->l - i;
|
|
77 s->multi[z].gap = q->n_gapo + q->n_gape;
|
|
78 s->multi[z].mm = q->n_mm;
|
|
79 s->multi[z++].strand = q->a;
|
|
80 }
|
|
81 rest = 0;
|
|
82 break;
|
|
83 }
|
|
84 }
|
|
85 s->n_multi = z;
|
|
86 for (k = z = 0; k < s->n_multi; ++k)
|
|
87 if (s->multi[k].pos != s->sa)
|
|
88 s->multi[z++] = s->multi[k];
|
|
89 s->n_multi = z < n_multi? z : n_multi;
|
|
90 }
|
|
91 }
|
|
92
|
|
93 void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
|
|
94 {
|
|
95 bwa_aln2seq_core(n_aln, aln, s, 1, 0);
|
|
96 }
|
|
97
|
|
98 /* *aln points to alignments found for the current sequence, n_aln is the
|
|
99 size of the array pointed to by *aln. Array *s of size n_seq is a collection
|
|
100 of SAM records that must be replicas, i.e. initialized with the same current sequence.
|
|
101 This method updates sequence records in *s with placements recorded in *aln,
|
|
102 until all n_seq replicas are updated. For each separate alignment record in array *aln,
|
|
103 each placement corresponding to this record will be assigned to a separate record in *s
|
|
104 if there are enough elements remaining in *s, otherwise a random subset of the placements will
|
|
105 be assigned to the remaining elements in *s. The total number of best placements and total
|
|
106 number of non-best placements will be computed from the whole array *aln (regardless of whether it
|
|
107 fits completely into *s or not) and assigned to each updated record in *s.
|
|
108 */
|
|
109 void bwa_aln2seq_all(int n_aln, const bwt_aln1_t *aln, int n_seq, bwa_seq_t *s)
|
|
110 {
|
|
111 int i, cnt1, cnt2, j, best, N;
|
|
112 if (n_aln == 0) {
|
|
113 /* there is no match found for *s */
|
|
114 s->type = BWA_TYPE_NO_MATCH;
|
|
115 s->c1 = s->c2 = 0;
|
|
116 return;
|
|
117 }
|
|
118
|
|
119 N = n_seq; // remember the size of the array
|
|
120 best = aln[0].score;
|
|
121
|
|
122 cnt1 = 0; // total number of already processed alignments (i.e. distinct placements, NOT alignment records) with best score
|
|
123 cnt2 = 0; // total number of already processed alignments with inferior score(s)
|
|
124 for (i = 0; i < n_aln && n_seq > 0 ; ++i) {
|
|
125 const bwt_aln1_t *p_aln = aln + i;
|
|
126
|
|
127 int N_aligns = p_aln->l-p_aln->k +1 ; // number of placements (alignments) in the current alignment record p_aln
|
|
128
|
|
129 if (N_aligns <= n_seq) { /* we have space to save all the alignments stored in 'p_aln' */
|
|
130
|
|
131 for ( j = 0 ; j < N_aligns ; j++ ) {
|
|
132
|
|
133 bwa_seq_t * seq = s + cnt1+ cnt2+j ;
|
|
134 seq->n_mm = p_aln->n_mm; seq->n_gapo = p_aln->n_gapo; seq->n_gape = p_aln->n_gape; seq->strand = p_aln->a;
|
|
135 seq->score = p_aln->score;
|
|
136 seq->sa = p_aln->k + j;
|
|
137 }
|
|
138 n_seq -= N_aligns; // we have n_seq slots remaining to store more alignments
|
|
139 } else { // See also: http://code.activestate.com/recipes/272884/
|
|
140 // we have to truncate, so let's select few remaining alignments randomly:
|
|
141 int xj, xi, xk;
|
|
142 for (xj = n_seq, xi = N_aligns, xk = 0; xj > 0; --xj, ++xk) {
|
|
143 double p = 1.0, x = drand48();
|
|
144 while (x < p) p -= p * xj / (xi--);
|
|
145
|
|
146 bwa_seq_t * seq = s+cnt1+cnt2+xk ;
|
|
147 seq->n_mm = p_aln->n_mm; seq->n_gapo = p_aln->n_gapo; seq->n_gape = p_aln->n_gape; seq->strand = p_aln->a;
|
|
148 seq->score = p_aln->score;
|
|
149 seq->sa = p_aln->l - xi;
|
|
150
|
|
151 }
|
|
152 n_seq = 0;
|
|
153 }
|
|
154 // cnt1 + cnt2 is the total count of hits processed so far:
|
|
155 if ( p_aln->score == best ) cnt1 += N_aligns; // we found N_aligns more placements with best score
|
|
156 else cnt2 += N_aligns; // N_aligns more placements with inferior score
|
|
157 }
|
|
158
|
|
159 // we filled all available slots in the array *s, but there can be more alignments
|
|
160 // left; we need to count them:
|
|
161 for (; i < n_aln; ++i) {
|
|
162 if ( aln[i].score == best ) cnt1 += aln[i].l-aln[i].k+1;
|
|
163 else cnt2 += aln[i].l-aln[i].k+1;
|
|
164 }
|
|
165
|
|
166 // now cnt1 is the total number of found alignments (placements) with best score
|
|
167 // and cnt2 is the total number of found placements with worse score
|
|
168
|
|
169 /* set counts and flags for all hits: */
|
|
170 for (i = 0; i < N ; ++i) {
|
|
171 bwa_seq_t * seq = s+i ;
|
|
172
|
|
173 seq->c1 = cnt1;
|
|
174 seq->c2 = cnt2;
|
|
175 seq->type = seq->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
|
|
176 }
|
|
177 }
|
|
178
|
|
179
|
|
180
|
|
181 int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
|
|
182 {
|
|
183 int n;
|
|
184 if (p->c1 == 0) return 23;
|
|
185 if (p->c1 > 1) return 0;
|
|
186 if (p->n_mm == mm) return 25;
|
|
187 if (p->c2 == 0) return 37;
|
|
188 n = (p->c2 >= 255)? 255 : p->c2;
|
|
189 return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
|
|
190 }
|
|
191
|
|
192 /**
|
|
193 * Derive the actual position in the read from the given suffix array
|
|
194 * coordinates. Note that the position will be approximate based on
|
|
195 * whether indels appear in the read and whether calculations are
|
|
196 * performed from the start or end of the read.
|
|
197 */
|
|
198 void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, int n_seqs, bwa_seq_t *s, const int max_mm, const float fnr)
|
|
199 {
|
|
200 int max_diff;
|
|
201 bwa_seq_t *seq;
|
|
202 int i;
|
|
203
|
|
204 for ( i = 0 ; i < n_seqs ; i++ ) {
|
|
205 seq = s + i;
|
|
206 if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) continue;
|
|
207
|
|
208 max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
|
|
209 if (seq->strand) { // reverse strand only
|
|
210 seq->pos = bwt_sa(forward_bwt, seq->sa);
|
|
211 seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
|
212 } else { // forward strand only
|
|
213 /* NB: For gapped alignment, p->pos may not be correct, which
|
|
214 * will be fixed in refine_gapped_core(). This line also
|
|
215 * determines the way "x" is calculated in
|
|
216 * refine_gapped_core() when (ext < 0 && is_end == 0). */
|
|
217 seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
|
|
218 seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
|
219 }
|
|
220 }
|
|
221 }
|
|
222
|
|
223 void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
|
|
224 {
|
|
225 int i, j;
|
|
226 char str[1024];
|
|
227 bwt_t *bwt;
|
|
228 // load forward SA
|
|
229 strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
|
230 strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
|
231 for (i = 0; i != n_seqs; ++i) {
|
|
232 if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, 1, &seqs[i], max_mm, fnr);
|
|
233 for (j = 0; j < seqs[i].n_multi; ++j) {
|
|
234 bwt_multi1_t *p = seqs[i].multi + j;
|
|
235 if (p->strand) p->pos = bwt_sa(bwt, p->pos);
|
|
236 }
|
|
237 }
|
|
238 bwt_destroy(bwt);
|
|
239 // load reverse BWT and SA
|
|
240 strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str);
|
|
241 strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt);
|
|
242 for (i = 0; i != n_seqs; ++i) {
|
|
243 if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, 1, &seqs[i], max_mm, fnr);
|
|
244 for (j = 0; j < seqs[i].n_multi; ++j) {
|
|
245 bwt_multi1_t *p = seqs[i].multi + j;
|
|
246 if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len);
|
|
247 }
|
|
248 }
|
|
249 bwt_destroy(bwt);
|
|
250 }
|
|
251
|
|
252 /* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
|
|
253 * forward strand. This happens when p->pos is calculated by
|
|
254 * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
|
|
255 * coordinate. This happens only for color-converted alignment. */
|
|
256 static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
|
|
257 int ext, int *n_cigar, int is_end_correct)
|
|
258 {
|
|
259 bwa_cigar_t *cigar = 0;
|
|
260 ubyte_t *ref_seq;
|
|
261 int l = 0, path_len, ref_len;
|
|
262 AlnParam ap = aln_param_bwa;
|
|
263 path_t *path;
|
|
264 int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
|
|
265
|
|
266 ref_len = len + abs(ext);
|
|
267 if (ext > 0) {
|
|
268 ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
|
269 for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
|
|
270 ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
|
271 } else {
|
|
272 int64_t x = __pos + (is_end_correct? len : ref_len);
|
|
273 ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
|
274 for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
|
|
275 ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
|
276 }
|
|
277 path = (path_t*)calloc(l+len, sizeof(path_t));
|
|
278
|
|
279 aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
|
|
280 cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
|
|
281
|
|
282 if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
|
|
283 for (l = k = 0; k < *n_cigar; ++k) {
|
|
284 if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
|
|
285 else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
|
|
286 }
|
|
287 __pos += l;
|
|
288 }
|
|
289
|
|
290 if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
|
|
291 __pos += __cigar_len(cigar[0]);
|
|
292 for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
|
|
293 --(*n_cigar);
|
|
294 }
|
|
295 if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
|
|
296
|
|
297 // change "I" at either end of the read to S. just in case. This should rarely happen...
|
|
298 if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
|
|
299 if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
|
|
300
|
|
301 *_pos = (bwtint_t)__pos;
|
|
302 free(ref_seq); free(path);
|
|
303 return cigar;
|
|
304 }
|
|
305
|
|
306 char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
|
|
307 bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
|
|
308 {
|
|
309 bwtint_t x, y;
|
|
310 int z, u, c, nm = 0;
|
|
311 str->l = 0; // reset
|
|
312 x = pos; y = 0;
|
|
313 if (cigar) {
|
|
314 int k, l;
|
|
315 for (k = u = 0; k < n_cigar; ++k) {
|
|
316 l = __cigar_len(cigar[k]);
|
|
317 if (__cigar_op(cigar[k]) == FROM_M) {
|
|
318 for (z = 0; z < l && x+z < l_pac; ++z) {
|
|
319 c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
|
|
320 if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
|
|
321 ksprintf(str, "%d", u);
|
|
322 kputc("ACGTN"[c], str);
|
|
323 ++nm;
|
|
324 u = 0;
|
|
325 } else ++u;
|
|
326 }
|
|
327 x += l; y += l;
|
|
328 /* } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { */
|
|
329 } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
|
|
330 y += l;
|
|
331 if (__cigar_op(cigar[k]) == FROM_I) nm += l;
|
|
332 } else if (__cigar_op(cigar[k]) == FROM_D) {
|
|
333 ksprintf(str, "%d", u);
|
|
334 kputc('^', str);
|
|
335 for (z = 0; z < l && x+z < l_pac; ++z)
|
|
336 kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
|
|
337 u = 0;
|
|
338 x += l; nm += l;
|
|
339 }
|
|
340 }
|
|
341 } else { // no gaps
|
|
342 for (z = u = 0; z < (bwtint_t)len; ++z) {
|
|
343 c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
|
|
344 if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
|
|
345 ksprintf(str, "%d", u);
|
|
346 kputc("ACGTN"[c], str);
|
|
347 ++nm;
|
|
348 u = 0;
|
|
349 } else ++u;
|
|
350 }
|
|
351 }
|
|
352 ksprintf(str, "%d", u);
|
|
353 *_nm = nm;
|
|
354 return strdup(str->s);
|
|
355 }
|
|
356
|
|
357 void bwa_correct_trimmed(bwa_seq_t *s)
|
|
358 {
|
|
359 if (s->len == s->full_len) return;
|
|
360 if (s->strand == 0) { // forward
|
|
361 if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S
|
|
362 s->cigar[s->n_cigar-1] += s->full_len - s->len;
|
|
363 } else {
|
|
364 if (s->cigar == 0) {
|
|
365 s->n_cigar = 2;
|
|
366 s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
|
|
367 s->cigar[0] = __cigar_create(0, s->len);
|
|
368 } else {
|
|
369 ++s->n_cigar;
|
|
370 s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
|
|
371 }
|
|
372 s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
|
|
373 }
|
|
374 } else { // reverse
|
|
375 if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S
|
|
376 s->cigar[0] += s->full_len - s->len;
|
|
377 } else {
|
|
378 if (s->cigar == 0) {
|
|
379 s->n_cigar = 2;
|
|
380 s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
|
|
381 s->cigar[1] = __cigar_create(0, s->len);
|
|
382 } else {
|
|
383 ++s->n_cigar;
|
|
384 s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
|
|
385 memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
|
|
386 }
|
|
387 s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
|
|
388 }
|
|
389 }
|
|
390 s->len = s->full_len;
|
|
391 }
|
|
392
|
|
393 void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
|
|
394 {
|
|
395 ubyte_t *pacseq, *ntpac = 0;
|
|
396 int i, j;
|
|
397 kstring_t *str;
|
|
398
|
|
399 if (ntbns) { // in color space
|
|
400 ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
|
|
401 rewind(ntbns->fp_pac);
|
|
402 fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
|
|
403 }
|
|
404
|
|
405 if (!_pacseq) {
|
|
406 pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
|
|
407 rewind(bns->fp_pac);
|
|
408 fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
|
|
409 } else pacseq = _pacseq;
|
|
410 for (i = 0; i != n_seqs; ++i) {
|
|
411 bwa_seq_t *s = seqs + i;
|
|
412 seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
|
|
413 for (j = 0; j < s->n_multi; ++j) {
|
|
414 bwt_multi1_t *q = s->multi + j;
|
|
415 int n_cigar;
|
|
416 if (q->gap == 0) continue;
|
|
417 q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
|
|
418 (q->strand? 1 : -1) * q->gap, &n_cigar, 1);
|
|
419 q->n_cigar = n_cigar;
|
|
420 }
|
|
421 if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
|
|
422 s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
|
423 (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
|
|
424 }
|
|
425
|
|
426 if (ntbns) { // in color space
|
|
427 for (i = 0; i < n_seqs; ++i) {
|
|
428 bwa_seq_t *s = seqs + i;
|
|
429 bwa_cs2nt_core(s, bns->l_pac, ntpac);
|
|
430 for (j = 0; j < s->n_multi; ++j) {
|
|
431 bwt_multi1_t *q = s->multi + j;
|
|
432 int n_cigar;
|
|
433 if (q->gap == 0) continue;
|
|
434 free(q->cigar);
|
|
435 q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
|
|
436 (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
|
|
437 q->n_cigar = n_cigar;
|
|
438 }
|
|
439 if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
|
|
440 free(s->cigar);
|
|
441 s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
|
442 (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
|
|
443 }
|
|
444 }
|
|
445 }
|
|
446
|
|
447 // generate MD tag
|
|
448 str = (kstring_t*)calloc(1, sizeof(kstring_t));
|
|
449 for (i = 0; i != n_seqs; ++i) {
|
|
450 bwa_seq_t *s = seqs + i;
|
|
451 if (s->type != BWA_TYPE_NO_MATCH) {
|
|
452 int nm;
|
|
453 s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
|
|
454 bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
|
|
455 s->nm = nm;
|
|
456 }
|
|
457 }
|
|
458 free(str->s); free(str);
|
|
459
|
|
460 // correct for trimmed reads
|
|
461 if (!ntbns) // trimming is only enabled for Illumina reads
|
|
462 for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
|
|
463
|
|
464 if (!_pacseq) free(pacseq);
|
|
465 free(ntpac);
|
|
466 }
|
|
467
|
|
468 int64_t pos_end(const bwa_seq_t *p)
|
|
469 {
|
|
470 if (p->cigar) {
|
|
471 int j;
|
|
472 int64_t x = p->pos;
|
|
473 for (j = 0; j != p->n_cigar; ++j) {
|
|
474 int op = __cigar_op(p->cigar[j]);
|
|
475 if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
|
476 }
|
|
477 return x;
|
|
478 } else return p->pos + p->len;
|
|
479 }
|
|
480
|
|
481 int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
|
|
482 {
|
|
483 if (p->cigar) {
|
|
484 int j;
|
|
485 int64_t x = p->pos;
|
|
486 for (j = 0; j != p->n_cigar; ++j) {
|
|
487 int op = __cigar_op(p->cigar[j]);
|
|
488 if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
|
489 }
|
|
490 return x;
|
|
491 } else return p->pos + len;
|
|
492 }
|
|
493
|
|
494 static int64_t pos_5(const bwa_seq_t *p)
|
|
495 {
|
|
496 if (p->type != BWA_TYPE_NO_MATCH)
|
|
497 return p->strand? pos_end(p) : p->pos;
|
|
498 return -1;
|
|
499 }
|
|
500
|
|
501 /* Prints <bases>\t<quals> of the sequence *p into STDOUT;
|
|
502 */
|
|
503 void bwa_print_seq_and_qual(bwa_seq_t *p) {
|
|
504 int j;
|
|
505
|
|
506 if (p->strand == 0)
|
|
507 for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
|
|
508 else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
|
|
509 putchar('\t');
|
|
510 if (p->qual) {
|
|
511 if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
|
|
512 printf("%s", p->qual);
|
|
513 } else printf("*");
|
|
514
|
|
515 }
|
|
516
|
|
517
|
|
518
|
|
519 void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
|
|
520 {
|
|
521 int j;
|
|
522 if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
|
|
523 int seqid, nn, am = 0, flag = p->extra_flag;
|
|
524 char XT;
|
|
525
|
|
526 if (p->type == BWA_TYPE_NO_MATCH) {
|
|
527 p->pos = mate->pos;
|
|
528 p->strand = mate->strand;
|
|
529 flag |= SAM_FSU;
|
|
530 j = 1;
|
|
531 } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
|
|
532
|
|
533 // get seqid
|
|
534 nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
|
|
535 if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
|
|
536 flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
|
|
537
|
|
538 // update flag and print it
|
|
539 if (p->strand) flag |= SAM_FSR;
|
|
540 if (mate) {
|
|
541 if (mate->type != BWA_TYPE_NO_MATCH) {
|
|
542 if (mate->strand) flag |= SAM_FMR;
|
|
543 } else flag |= SAM_FMU;
|
|
544 }
|
|
545 printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
|
|
546 printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
|
|
547
|
|
548 // print CIGAR
|
|
549 if (p->cigar) {
|
|
550 for (j = 0; j != p->n_cigar; ++j)
|
|
551 printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
|
|
552 } else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
|
|
553 else printf("%dM", p->len);
|
|
554
|
|
555 // print mate coordinate
|
|
556 if (mate && mate->type != BWA_TYPE_NO_MATCH) {
|
|
557 int m_seqid, m_is_N;
|
|
558 long long isize;
|
|
559 am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
|
|
560 // redundant calculation here, but should not matter too much
|
|
561 m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
|
|
562 printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
|
|
563 isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
|
|
564 if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
|
|
565 printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
|
|
566 } else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
|
|
567 else printf("\t*\t0\t0\t");
|
|
568
|
|
569 // print sequence and quality
|
|
570 bwa_print_seq_and_qual(p);
|
|
571
|
|
572 if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
|
573 if (p->type != BWA_TYPE_NO_MATCH) {
|
|
574 int i;
|
|
575 // calculate XT tag
|
|
576 XT = "NURM"[p->type];
|
|
577 if (nn > 10) XT = 'N';
|
|
578 // print tags
|
|
579 printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
|
|
580 if (nn) printf("\tXN:i:%d", nn);
|
|
581 if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
|
|
582 if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
|
|
583 printf("\tX0:i:%d", p->c1);
|
|
584 if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2);
|
|
585 }
|
|
586 printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
|
|
587 if (p->md) printf("\tMD:Z:%s", p->md);
|
|
588 // print multiple hits
|
|
589 if (p->n_multi) {
|
|
590 printf("\tXA:Z:");
|
|
591 for (i = 0; i < p->n_multi; ++i) {
|
|
592 bwt_multi1_t *q = p->multi + i;
|
|
593 int k;
|
|
594 j = pos_end_multi(q, p->len) - q->pos;
|
|
595 nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
|
|
596 printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
|
|
597 (int)(q->pos - bns->anns[seqid].offset + 1));
|
|
598 if (q->cigar) {
|
|
599 for (k = 0; k < q->n_cigar; ++k)
|
|
600 printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
|
|
601 } else printf("%dM", p->len);
|
|
602 printf(",%d;", q->gap + q->mm);
|
|
603 }
|
|
604 }
|
|
605 }
|
|
606 putchar('\n');
|
|
607 } else { // this read has no match
|
|
608 ubyte_t *s = p->strand? p->rseq : p->seq;
|
|
609 int flag = p->extra_flag | SAM_FSU;
|
|
610 if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
|
|
611 printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
|
|
612 for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
|
|
613 putchar('\t');
|
|
614 if (p->qual) {
|
|
615 if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
|
|
616 printf("%s", p->qual);
|
|
617 } else printf("*");
|
|
618 if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
|
619 putchar('\n');
|
|
620 }
|
|
621 }
|
|
622
|
|
623 /* UNUSED
|
|
624 void bwa_print_partial_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
|
|
625 {
|
|
626 int j;
|
|
627 if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
|
|
628 int seqid, nn, am = 0, flag = p->extra_flag;
|
|
629 char XT;
|
|
630
|
|
631 if (p->type == BWA_TYPE_NO_MATCH) {
|
|
632 p->pos = mate->pos;
|
|
633 p->strand = mate->strand;
|
|
634 flag |= SAM_FSU;
|
|
635 j = 1;
|
|
636 } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
|
|
637
|
|
638 // get seqid
|
|
639 nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
|
|
640 if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
|
|
641 flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
|
|
642
|
|
643 // update flag and print it
|
|
644 if (p->strand) flag |= SAM_FSR;
|
|
645 if (mate) {
|
|
646 if (mate->type != BWA_TYPE_NO_MATCH) {
|
|
647 if (mate->strand) flag |= SAM_FMR;
|
|
648 } else flag |= SAM_FMU;
|
|
649 }
|
|
650 printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
|
|
651 printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
|
|
652
|
|
653 // print CIGAR
|
|
654 if (p->cigar) {
|
|
655 for (j = 0; j != p->n_cigar; ++j)
|
|
656 printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
|
|
657 } else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
|
|
658 else printf("%dM", p->len);
|
|
659
|
|
660 // print mate coordinate
|
|
661 if (mate && mate->type != BWA_TYPE_NO_MATCH) {
|
|
662 int m_seqid, m_is_N;
|
|
663 long long isize;
|
|
664 am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
|
|
665 // redundant calculation here, but should not matter too much
|
|
666 m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
|
|
667 printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
|
|
668 isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
|
|
669 if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
|
|
670 printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
|
|
671 } else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
|
|
672 else printf("\t*\t0\t0\t");
|
|
673
|
|
674
|
|
675 if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
|
676 if (p->type != BWA_TYPE_NO_MATCH) {
|
|
677 int i;
|
|
678 // calculate XT tag
|
|
679 XT = "NURM"[p->type];
|
|
680 if (nn > 10) XT = 'N';
|
|
681 // print tags
|
|
682 printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
|
|
683 if (nn) printf("\tXN:i:%d", nn);
|
|
684 if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
|
|
685 printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
|
|
686 if (p->md) printf("\tMD:Z:%s", p->md);
|
|
687 }
|
|
688 putchar('\n');
|
|
689 } else { // this read has no match
|
|
690 ubyte_t *s = p->strand? p->rseq : p->seq;
|
|
691 int flag = p->extra_flag | SAM_FSU;
|
|
692 if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
|
|
693 printf("%d\t*\t0\t0\t*\t*\t0\t0\t", flag);
|
|
694 if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
|
695 putchar('\n');
|
|
696 }
|
|
697 }
|
|
698
|
|
699 */
|
|
700
|
|
701 bntseq_t *bwa_open_nt(const char *prefix)
|
|
702 {
|
|
703 bntseq_t *ntbns;
|
|
704 char *str;
|
|
705 str = (char*)calloc(strlen(prefix) + 10, 1);
|
|
706 strcat(strcpy(str, prefix), ".nt");
|
|
707 ntbns = bns_restore(str);
|
|
708 free(str);
|
|
709 return ntbns;
|
|
710 }
|
|
711
|
|
712 void bwa_print_sam_SQ(const bntseq_t *bns)
|
|
713 {
|
|
714 int i;
|
|
715 for (i = 0; i < bns->n_seqs; ++i)
|
|
716 printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
|
717 }
|
|
718
|
|
719 void bwase_initialize()
|
|
720 {
|
|
721 int i;
|
|
722 for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
|
723 }
|
|
724
|
|
725 void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
|
|
726 {
|
|
727 int i, n_seqs, tot_seqs = 0, m_aln;
|
|
728 bwt_aln1_t *aln = 0;
|
|
729 bwa_seq_t *seqs;
|
|
730 bwa_seqio_t *ks;
|
|
731 clock_t t;
|
|
732 bntseq_t *bns, *ntbns = 0;
|
|
733 FILE *fp_sa;
|
|
734 gap_opt_t opt;
|
|
735
|
|
736 // initialization
|
|
737 bwase_initialize();
|
|
738 bns = bns_restore(prefix);
|
|
739 srand48(bns->seed);
|
|
740 ks = bwa_seq_open(fn_fa);
|
|
741 fp_sa = xopen(fn_sa, "r");
|
|
742
|
|
743 // core loop
|
|
744 m_aln = 0;
|
|
745 fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
|
746 if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
|
|
747 ntbns = bwa_open_nt(prefix);
|
|
748 bwa_print_sam_SQ(bns);
|
|
749 while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) {
|
|
750 tot_seqs += n_seqs;
|
|
751 t = clock();
|
|
752
|
|
753 // read alignment
|
|
754 for (i = 0; i < n_seqs; ++i) {
|
|
755 bwa_seq_t *p = seqs + i;
|
|
756 int n_aln;
|
|
757 fread(&n_aln, 4, 1, fp_sa);
|
|
758 if (n_aln > m_aln) {
|
|
759 m_aln = n_aln;
|
|
760 aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
|
|
761 }
|
|
762 fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
|
|
763 bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
|
|
764 }
|
|
765
|
|
766 fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
|
|
767 bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
|
|
768 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
|
769
|
|
770 fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
|
771 bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
|
|
772 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
|
773
|
|
774 fprintf(stderr, "[bwa_aln_core] print alignments... ");
|
|
775 for (i = 0; i < n_seqs; ++i)
|
|
776 bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
|
|
777 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
|
778
|
|
779 bwa_free_read_seq(n_seqs, seqs);
|
|
780 fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
|
781 }
|
|
782
|
|
783 // destroy
|
|
784 bwa_seq_close(ks);
|
|
785 if (ntbns) bns_destroy(ntbns);
|
|
786 bns_destroy(bns);
|
|
787 fclose(fp_sa);
|
|
788 free(aln);
|
|
789 }
|
|
790
|
|
791 void bwa_print_all_hits(const char *prefix, const char *fn_sa, const char *fn_fa, int max_extra_occ)
|
|
792 {
|
|
793 int i, n_seqs, tot_seqs = 0, m_aln, m_rest;
|
|
794 bwt_aln1_t *aln = 0;
|
|
795 bwa_seq_t *seqs;
|
|
796 bwa_seqio_t *ks;
|
|
797 clock_t t,t_convert, t_refine, t_write;;
|
|
798 bntseq_t *bns, *ntbns = 0;
|
|
799 FILE *fp_sa;
|
|
800 gap_opt_t opt;
|
|
801
|
|
802 //****** below modified (added) for multiple hit printout:
|
|
803
|
|
804 bwa_seq_t * rest_seqs = 0; // this array will keep (shallow) replicas of the current sequence;
|
|
805 // each of the replicas will be updated with its own alignment
|
|
806 // selected from all the (multiple) alignmens available for the current seq.
|
|
807
|
|
808 bwt_t *bwt[2];
|
|
809 char str[1024];
|
|
810 ubyte_t *pacseq;
|
|
811
|
|
812 t = clock();
|
|
813 fprintf(stderr, "[bwa_aln_core] Data structures initialized: ");
|
|
814
|
|
815
|
|
816 strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
|
817 strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
|
|
818
|
|
819 // load reverse BWT and SA
|
|
820 strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
|
821 strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
|
|
822 //***************
|
|
823
|
|
824 // initialization
|
|
825 bwase_initialize();
|
|
826 bns = bns_restore(prefix);
|
|
827 srand48(bns->seed);
|
|
828 ks = bwa_seq_open(fn_fa);
|
|
829 fp_sa = xopen(fn_sa, "r");
|
|
830
|
|
831 pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
|
|
832 rewind(bns->fp_pac);
|
|
833 fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
|
|
834
|
|
835
|
|
836 // core loop
|
|
837 m_aln = 0;
|
|
838 m_rest = 0;
|
|
839
|
|
840 fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
|
841 if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
|
|
842 ntbns = bwa_open_nt(prefix);
|
|
843 bwa_print_sam_SQ(bns);
|
|
844
|
|
845 fprintf(stderr, "%.2f sec\n", (float)(clock()-t) / CLOCKS_PER_SEC);
|
|
846
|
|
847 t = clock();
|
|
848
|
|
849 max_extra_occ++; // now this variable holds TOTAL number of alignments we want to print (1+requested extra).
|
|
850
|
|
851 while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) {
|
|
852 tot_seqs += n_seqs;
|
|
853 t_convert = 0;
|
|
854 t_refine = 0;
|
|
855 t_write = 0;
|
|
856
|
|
857 fprintf(stderr, "[bwa_aln_core] %d sequences loaded: ",n_seqs);
|
|
858 fprintf(stderr, "%.2f sec\n", (float)(clock()-t) / CLOCKS_PER_SEC);
|
|
859
|
|
860
|
|
861 // read alignment
|
|
862 for (i = 0; i < n_seqs; ++i) {
|
|
863
|
|
864 bwa_seq_t *p = seqs + i;
|
|
865 int n_aln, n_occ, k, rest;
|
|
866 fread(&n_aln, 4, 1, fp_sa);
|
|
867 if (n_aln > m_aln) {
|
|
868 m_aln = n_aln;
|
|
869 aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
|
|
870 }
|
|
871
|
|
872 fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
|
|
873 for ( k = n_occ = 0 ; k < n_aln; ++k ) {
|
|
874 const bwt_aln1_t *q = aln + k;
|
|
875 n_occ += q->l - q->k + 1;
|
|
876 } /* n_occ is now keeping total number of available alignments to the reference
|
|
877 (i.e. placements, NOT bwa records, each of which can describe few placements)
|
|
878 */
|
|
879
|
|
880 // we are going to keep and print 'rest' alignments:
|
|
881 rest = ((n_occ > max_extra_occ)? max_extra_occ : n_occ);
|
|
882
|
|
883 if ( rest == 0 ) rest++; /* we need at least one record, even if it is going to say "UNMAPPED" */
|
|
884
|
|
885 if ( rest > m_rest ) {
|
|
886 // reallocate rest_seqs array (only if needed) to ensure it can keep 'rest' records
|
|
887 m_rest = rest;
|
|
888 rest_seqs = (bwa_seq_t*)realloc(rest_seqs,sizeof(bwa_seq_t)*m_rest);
|
|
889 }
|
|
890 // initialize 'rest' replicas of the current sequence record
|
|
891 for ( k = 0 ; k < rest ; k++ ) {
|
|
892 rest_seqs[k] = *p; /* clone current sequence p; IMPORTANT: it's a shallow copy */
|
|
893 }
|
|
894
|
|
895 bwa_aln2seq_all(n_aln, aln, rest,rest_seqs);
|
|
896 // now each of the replicas carries its own bwa alignment selected from all alignments
|
|
897 // available for the current sequence *p.
|
|
898
|
|
899 /* compute positions of the alignments on the ref: */
|
|
900 t = clock();
|
|
901
|
|
902 bwa_cal_pac_pos_core(bwt[0],bwt[1], rest, rest_seqs, opt.max_diff, opt.fnr );
|
|
903 t_convert += ( clock() - t );
|
|
904
|
|
905 /* compute positions of the alignments on the ref: */
|
|
906 t = clock();
|
|
907
|
|
908 bwa_cal_pac_pos_core(bwt[0],bwt[1], rest, rest_seqs, opt.max_diff, opt.fnr );
|
|
909 t_convert += ( clock() - t );
|
|
910
|
|
911 t = clock();
|
|
912
|
|
913 bwa_refine_gapped(bns,rest,rest_seqs,pacseq,ntbns); // refine all gapped aligns in our replicas;
|
|
914 // side effect: cigars will be allocated for each replica
|
|
915 t_refine += ( clock() - t );
|
|
916
|
|
917 t = clock();
|
|
918 // for ( k = 0 ; k < n_seqs ; k++ ) {
|
|
919 for ( k = 0 ; k < rest ; k++ ) {
|
|
920
|
|
921 bwa_print_sam1(bns, rest_seqs + k, 0, opt.mode, opt.max_top2);
|
|
922 // cigar was allocated for us in every replica as a side effect, free it now:
|
|
923 free ( (rest_seqs+k)->cigar );
|
|
924 }
|
|
925 t_write+= ( clock()-t);
|
|
926
|
|
927 }
|
|
928
|
|
929 bwa_free_read_seq(n_seqs, seqs);
|
|
930
|
|
931 fprintf(stderr, "[bwa_aln_core] convert %d sequences to sequence coordinate: ",n_seqs);
|
|
932 fprintf(stderr, "%.2f sec\n", (float)t_convert / CLOCKS_PER_SEC);
|
|
933 fprintf(stderr, "[bwa_aln_core] refine gapped alignments for %d sequences: ", n_seqs);
|
|
934 fprintf(stderr, "%.2f sec\n", (float)t_refine / CLOCKS_PER_SEC);
|
|
935 fprintf(stderr, "[bwa_aln_core] print alignments for %d sequences: ", n_seqs);
|
|
936 fprintf(stderr, "%.2f sec\n", (float)t_write/ CLOCKS_PER_SEC);
|
|
937 fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
|
938
|
|
939 t = clock();
|
|
940 }
|
|
941
|
|
942 // destroy
|
|
943 bwt_destroy(bwt[0]);
|
|
944 bwt_destroy(bwt[1]);
|
|
945
|
|
946 free(rest_seqs);
|
|
947 free(pacseq);
|
|
948
|
|
949 bwa_seq_close(ks);
|
|
950 if (ntbns) bns_destroy(ntbns);
|
|
951 bns_destroy(bns);
|
|
952 fclose(fp_sa);
|
|
953 free(aln);
|
|
954 }
|
|
955
|
|
956 int bwa_sai2sam_se(int argc, char *argv[])
|
|
957 {
|
|
958 int c, n_occ = 3;
|
|
959 int do_full_sam = 0;
|
|
960 while ((c = getopt(argc, argv, "hsn:f:")) >= 0) {
|
|
961 switch (c) {
|
|
962 case 'h': break;
|
|
963 case 's': do_full_sam = 1; break;
|
|
964 case 'n': n_occ = atoi(optarg); break;
|
|
965 case 'f': freopen(optarg, "w", stdout); break;
|
|
966 default: return 1;
|
|
967 }
|
|
968 }
|
|
969
|
|
970 if (optind + 3 > argc) {
|
|
971 fprintf(stderr, "Usage: bwa samse [-n max_occ [-s] ] [-f out.sam] <prefix> <in.sai> <in.fq>\n");
|
|
972 return 1;
|
|
973 }
|
|
974 if ( do_full_sam ) bwa_print_all_hits(argv[optind], argv[optind+1], argv[optind+2], n_occ);
|
|
975 else bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ);
|
|
976 return 0;
|
|
977 }
|