# HG changeset patch # User xilinxu # Date 1408005902 14400 # Node ID dfe9332138cfd3a79485ae5513f6a002df266099 # Parent abdbc8fe98ddfc656682ef0ca2fec2c1499fe2b9 Deleted selected files diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/Makefile --- a/bwa-0.7.9a/Makefile Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -CC= gcc -#CC= clang --analyze -CFLAGS= -g -Wall -Wno-unused-function -O2 -WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS -AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o -AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o kopen.o pemerge.o \ - bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o fastmap.o bwtsw2_pair.o -PROG= bwa -INCLUDES= -LIBS= -lm -lz -lpthread -SUBDIRS= . - -.SUFFIXES:.c .o .cc - -.c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ - -all:$(PROG) - -bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) - -bwamem-lite:libbwa.a example.o - $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS) - -libbwa.a:$(LOBJS) - $(AR) -csru $@ $(LOBJS) - -clean: - rm -f gmon.out *.o a.out $(PROG) *~ *.a - -depend: - ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c ) - -# DO NOT DELETE THIS LINE -- make depend depends on it. - -QSufSort.o: QSufSort.h -bamlite.o: bamlite.h malloc_wrap.h -bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h -bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h -bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h -bwamem.o: ksort.h utils.h kbtree.h -bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h -bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h -bwamem_pair.o: utils.h ksw.h -bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h -bwape.o: ksw.h khash.h -bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h -bwase.o: bwa.h ksw.h -bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h -bwt.o: utils.h bwt.h kvec.h malloc_wrap.h -bwt_gen.o: QSufSort.h malloc_wrap.h -bwt_lite.o: bwt_lite.h malloc_wrap.h -bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h -bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h -bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h -bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h -bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h -bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h -bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h -bwtsw2_core.o: khash.h ksort.h -bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h -bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h -bwtsw2_pair.o: malloc_wrap.h ksw.h -example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h -fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h -is.o: malloc_wrap.h -kopen.o: malloc_wrap.h -kstring.o: kstring.h malloc_wrap.h -ksw.o: ksw.h malloc_wrap.h -main.o: kstring.h malloc_wrap.h utils.h -malloc_wrap.o: malloc_wrap.h -pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h -test-extend.o: ksw.h -utils.o: utils.h ksort.h malloc_wrap.h kseq.h diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem.c --- a/bwa-0.7.9a/bwamem.c Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1190 +0,0 @@ -#include -#include -#include -#include -#include -#ifdef HAVE_PTHREAD -#include -#endif - -#include "kstring.h" -#include "bwamem.h" -#include "bntseq.h" -#include "ksw.h" -#include "kvec.h" -#include "ksort.h" -#include "utils.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -/* Theory on probability and scoring *ungapped* alignment - * - * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution - * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate - * - * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x - * - * If the matching score is x and mismatch penalty is -y, we can compute error rate e: - * e = .75 * exp[-log(4) * y/x] - * - * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)} - * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l) - * - * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale: - * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x) - * - * - * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) - * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) - * - * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR) - */ - -static const bntseq_t *global_bns = 0; // for debugging only - -mem_opt_t *mem_opt_init() -{ - mem_opt_t *o; - o = calloc(1, sizeof(mem_opt_t)); - o->flag = 0; - o->a = 1; o->b = 4; - o->o_del = o->o_ins = 6; - o->e_del = o->e_ins = 1; - o->w = 100; - o->T = 30; - o->zdrop = 100; - o->pen_unpaired = 17; - o->pen_clip5 = o->pen_clip3 = 5; - o->min_seed_len = 19; - o->split_width = 10; - o->max_occ = 500; - o->max_chain_gap = 10000; - o->max_ins = 10000; - o->mask_level = 0.50; - o->drop_ratio = 0.50; - o->XA_drop_ratio = 0.80; - o->split_factor = 1.5; - o->chunk_size = 10000000; - o->n_threads = 1; - o->max_hits = 5; - o->max_matesw = 50; - o->mask_level_redun = 0.95; - o->min_chain_weight = 0; - o->max_chain_extend = 1<<30; - o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); - bwa_fill_scmat(o->a, o->b, o->mat); - return o; -} - -/*************************** - * Collection SA invervals * - ***************************/ - -#define intv_lt(a, b) ((a).info < (b).info) -KSORT_INIT(mem_intv, bwtintv_t, intv_lt) - -typedef struct { - bwtintv_v mem, mem1, *tmpv[2]; -} smem_aux_t; - -static smem_aux_t *smem_aux_init() -{ - smem_aux_t *a; - a = calloc(1, sizeof(smem_aux_t)); - a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); - a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); - return a; -} - -static void smem_aux_destroy(smem_aux_t *a) -{ - free(a->tmpv[0]->a); free(a->tmpv[0]); - free(a->tmpv[1]->a); free(a->tmpv[1]); - free(a->mem.a); free(a->mem1.a); - free(a); -} - -static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) -{ - int i, k, x = 0, old_n; - int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1; - int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); - a->mem.n = 0; - // first pass: find all SMEMs - while (x < len) { - if (seq[x] < 4) { - x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); - for (i = 0; i < a->mem1.n; ++i) { - bwtintv_t *p = &a->mem1.a[i]; - int slen = (uint32_t)p->info - (p->info>>32); // seed length - if (slen >= opt->min_seed_len) - kv_push(bwtintv_t, a->mem, *p); - } - } else ++x; - } - // second pass: find MEMs inside a long SMEM - old_n = a->mem.n; - for (k = 0; k < old_n; ++k) { - bwtintv_t *p = &a->mem.a[k]; - int start = p->info>>32, end = (int32_t)p->info; - if (end - start < split_len || p->x[2] > opt->split_width) continue; - bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); - for (i = 0; i < a->mem1.n; ++i) - if ((a->mem1.a[i].info>>32) - (uint32_t)a->mem1.a[i].info >= opt->min_seed_len) - kv_push(bwtintv_t, a->mem, a->mem1.a[i]); - } - // sort - ks_introsort(mem_intv, a->mem.n, a->mem.a); -} - -/************ - * Chaining * - ************/ - -typedef struct { - int64_t rbeg; - int32_t qbeg, len; - int score; -} mem_seed_t; // unaligned memory - -typedef struct { - int n, m, first, rid; - uint32_t w:30, kept:2; - float frac_rep; - int64_t pos; - mem_seed_t *seeds; -} mem_chain_t; - -typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; - -#include "kbtree.h" - -#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) -KBTREE_INIT(chn, mem_chain_t, chain_cmp) - -// return 1 if the seed is merged into the chain -static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid) -{ - int64_t qend, rend, x, y; - const mem_seed_t *last = &c->seeds[c->n-1]; - qend = last->qbeg + last->len; - rend = last->rbeg + last->len; - if (seed_rid != c->rid) return 0; // different chr; request a new chain - if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) - return 1; // contained seed; do nothing - if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand - x = p->qbeg - last->qbeg; // always non-negtive - y = p->rbeg - last->rbeg; - if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain - if (c->n == c->m) { - c->m <<= 1; - c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); - } - c->seeds[c->n++] = *p; - return 1; - } - return 0; // request to add a new chain -} - -int mem_chain_weight(const mem_chain_t *c) -{ - int64_t end; - int j, w = 0, tmp; - for (j = 0, end = 0; j < c->n; ++j) { - const mem_seed_t *s = &c->seeds[j]; - if (s->qbeg >= end) w += s->len; - else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; - end = end > s->qbeg + s->len? end : s->qbeg + s->len; - } - tmp = w; w = 0; - for (j = 0, end = 0; j < c->n; ++j) { - const mem_seed_t *s = &c->seeds[j]; - if (s->rbeg >= end) w += s->len; - else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; - end = end > s->rbeg + s->len? end : s->rbeg + s->len; - } - w = w < tmp? w : tmp; - return w < 1<<30? w : (1<<30)-1; -} - -void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) -{ - int i, j; - for (i = 0; i < chn->n; ++i) { - mem_chain_t *p = &chn->a[i]; - err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p)); - for (j = 0; j < p->n; ++j) { - bwtint_t pos; - int is_rev; - pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); - if (is_rev) pos -= p->seeds[j].len - 1; - err_printf("\t%d;%d;%d,%ld(%s:%c%ld)", p->seeds[j].score, p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[p->rid].name, "+-"[is_rev], (long)(pos - bns->anns[p->rid].offset) + 1); - } - err_putchar('\n'); - } -} - -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf) -{ - int i, b, e, l_rep; - int64_t l_pac = bns->l_pac; - mem_chain_v chain; - kbtree_t(chn) *tree; - smem_aux_t *aux; - - kv_init(chain); - if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match - tree = kb_init(chn, KB_DEFAULT_SIZE); - - aux = buf? (smem_aux_t*)buf : smem_aux_init(); - mem_collect_intv(opt, bwt, len, seq, aux); - for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep - bwtintv_t *p = &aux->mem.a[i]; - int sb = (p->info>>32), se = (uint32_t)p->info; - if (p->x[2] <= opt->max_occ) continue; - if (sb > e) l_rep += e - b, b = sb, e = se; - else e = e > se? e : se; - } - l_rep += e - b; - for (i = 0; i < aux->mem.n; ++i) { - bwtintv_t *p = &aux->mem.a[i]; - int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length - int64_t k; - if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive - step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; - for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { - mem_chain_t tmp, *lower, *upper; - mem_seed_t s; - int rid, to_add = 0; - s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference - s.qbeg = p->info>>32; - s.score= s.len = slen; - rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); - if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary - if (kb_size(tree)) { - kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1; - } else to_add = 1; - if (to_add) { // add the seed as a new chain - tmp.n = 1; tmp.m = 4; - tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); - tmp.seeds[0] = s; - tmp.rid = rid; - kb_putp(chn, tree, &tmp); - } - } - } - if (buf == 0) smem_aux_destroy(aux); - - kv_resize(mem_chain_t, chain, kb_size(tree)); - - #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) - __kb_traverse(mem_chain_t, tree, traverse_func); - #undef traverse_func - - for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len; - if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); - - kb_destroy(chn, tree); - return chain; -} - -/******************** - * Filtering chains * - ********************/ - -#define chn_beg(ch) ((ch).seeds->qbeg) -#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len) - -#define flt_lt(a, b) ((a).w > (b).w) -KSORT_INIT(mem_flt, mem_chain_t, flt_lt) - -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) -{ - int i, k; - kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains - if (n_chn == 0) return 0; // no need to filter - // compute the weight of each chain and drop chains with small weight - for (i = k = 0; i < n_chn; ++i) { - mem_chain_t *c = &a[i]; - c->first = -1; c->kept = 0; - c->w = mem_chain_weight(c); - if (c->w < opt->min_chain_weight) free(c->seeds); - else a[k++] = *c; - } - n_chn = k; - ks_introsort(mem_flt, n_chn, a); - // pairwise chain comparisons - a[0].kept = 3; - kv_push(int, chains, 0); - for (i = 1; i < n_chn; ++i) { - int large_ovlp = 0; - for (k = 0; k < chains.n; ++k) { - int j = chains.a[k]; - int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); - int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]); - if (e_min > b_max) { // have overlap - int li = chn_end(a[i]) - chn_beg(a[i]); - int lj = chn_end(a[j]) - chn_beg(a[j]); - int min_l = li < lj? li : lj; - if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap - large_ovlp = 1; - if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate - if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) - break; - } - } - } - if (k == chains.n) { - kv_push(int, chains, i); - a[i].kept = large_ovlp? 2 : 3; - } - } - for (i = 0; i < chains.n; ++i) { - mem_chain_t *c = &a[chains.a[i]]; - if (c->first >= 0) a[c->first].kept = 1; - } - free(chains.a); - for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains - if (a[i].kept == 0 || a[i].kept == 3) continue; - if (++k >= opt->max_chain_extend) break; - } - for (; i < n_chn; ++i) - if (a[i].kept < 3) a[i].kept = 0; - for (i = k = 0; i < n_chn; ++i) { // free discarded chains - mem_chain_t *c = &a[i]; - if (c->kept == 0) free(c->seeds); - else a[k++] = a[i]; - } - return k; -} - -/****************************** - * De-overlap single-end hits * - ******************************/ - -#define alnreg_slt2(a, b) ((a).re < (b).re) -KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) - -#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) -KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) - -#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) -KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) - -#define PATCH_MAX_R_BW 0.05f -#define PATCH_MIN_SC_RATIO 0.90f - -int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *a, const mem_alnreg_t *b, int *_w) -{ - int w, score, q_s, r_s; - double r; - if (bns == 0 || pac == 0 || query == 0) return 0; - assert(a->rid == b->rid && a->rb <= b->rb); - if (a->qb >= b->qb || a->qe >= b->qe || a->re >= b->re) return 0; // not colinear - w = (a->re - b->rb) - (a->qe - b->qb); // required bandwidth - w = w > 0? w : -w; // l = abs(l) - r = (double)(a->re - b->rb) / (b->re - a->rb) - (double)(a->qe - b->qb) / (b->qe - a->qb); // relative bandwidth - r = r > 0.? r : -r; // r = fabs(r) - if (bwa_verbose >= 4) - printf("* potential hit merge between [%d,%d)<=>[%ld,%ld) and [%d,%d)<=>[%ld,%ld), @ %s; w=%d, r=%.4g\n", - a->qb, a->qe, (long)a->rb, (long)a->re, b->qb, b->qe, (long)b->rb, (long)b->re, bns->anns[a->rid].name, w, r); - if (a->re < b->rb || a->qe < b->qb) { // no overlap on query or on ref - if (w > opt->w<<1 || r >= PATCH_MAX_R_BW) return 0; // the bandwidth or the relative bandwidth is too large - } else if (w > opt->w<<2 || r >= PATCH_MAX_R_BW*2) return 0; // more permissive if overlapping on both ref and query - // global alignment - w += a->w + b->w; - w = w < opt->w<<2? w : opt->w<<2; - if (bwa_verbose >= 4) printf("* test potential hit merge with global alignment; w=%d\n", w); - bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w, bns->l_pac, pac, b->qe - a->qb, query + a->qb, a->rb, b->re, &score, 0, 0); - q_s = (int)((double)(b->qe - a->qb) / ((b->qe - b->qb) + (a->qe - a->qb)) * (b->score + a->score) + .499); // predicted score from query - r_s = (int)((double)(b->re - a->rb) / ((b->re - b->rb) + (a->re - a->rb)) * (b->score + a->score) + .499); // predicted score from ref - if (bwa_verbose >= 4) printf("* score=%d;(%d,%d)\n", score, q_s, r_s); - if ((double)score / (q_s > r_s? q_s : r_s) < PATCH_MIN_SC_RATIO) return 0; - *_w = w; - return score; -} - -int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a) -{ - int m, i, j; - if (n <= 1) return n; - ks_introsort(mem_ars2, n, a); // sort by the END position, not START! - for (i = 0; i < n; ++i) a[i].n_comp = 1; - for (i = 1; i < n; ++i) { - mem_alnreg_t *p = &a[i]; - if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap) continue; // then no need to go into the loop below - for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) { - mem_alnreg_t *q = &a[j]; - int64_t or, oq, mr, mq; - int score, w; - if (q->qe == q->qb) continue; // a[j] has been excluded - or = q->re - p->rb; // overlap length on the reference - oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query - mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment - mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment - if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant - if (p->score < q->score) { - p->qe = p->qb; - break; - } else q->qe = q->qb; - } else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p - p->n_comp += q->n_comp + 1; - p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov; - p->sub = p->sub > q->sub? p->sub : q->sub; - p->csub = p->csub > q->csub? p->csub : q->csub; - p->qb = q->qb, p->rb = q->rb; - p->truesc = p->score = score; - p->w = w; - q->qb = q->qe; - } - } - } - for (i = 0, m = 0; i < n; ++i) // exclude identical hits - if (a[i].qe > a[i].qb) { - if (m != i) a[m++] = a[i]; - else ++m; - } - n = m; - ks_introsort(mem_ars, n, a); - for (i = 1; i < n; ++i) { // mark identical hits - if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) - a[i].qe = a[i].qb; - } - for (i = 1, m = 1; i < n; ++i) // exclude identical hits - if (a[i].qe > a[i].qb) { - if (m != i) a[m++] = a[i]; - else ++m; - } - return m; -} - -int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen) -{ - if (!(opt->flag & MEM_F_SELF_OVLP) || n == 0 || a->truesc != qlen * opt->a) return n; - memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t)); - return n - 1; -} - -void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) -{ // similar to the loop in mem_chain_flt() - int i, k, tmp; - kvec_t(int) z; - if (n == 0) return; - kv_init(z); - for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); - ks_introsort(mem_ars_hash, n, a); - tmp = opt->a + opt->b; - tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp; - tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp; - kv_push(int, z, 0); - for (i = 1; i < n; ++i) { - for (k = 0; k < z.n; ++k) { - int j = z.a[k]; - int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; - int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; - if (e_min > b_max) { // have overlap - int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb; - if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap - if (a[j].sub == 0) a[j].sub = a[i].score; - if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; - break; - } - } - } - if (k == z.n) kv_push(int, z, i); - else a[i].secondary = z.a[k]; - } - free(z.a); -} - -/********************************* - * Test if a seed is good enough * - *********************************/ - -#define MEM_SHORT_EXT 50 -#define MEM_SHORT_LEN 200 - -#define MEM_HSP_COEF 1.1 - -int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s) -{ - int qb, qe, rid; - int64_t rb, re, mid, l_pac = bns->l_pac; - uint8_t *rseq = 0; - kswr_t x; - - qb = s->qbeg, qe = s->qbeg + s->len; - rb = s->rbeg, re = s->rbeg + s->len; - mid = (rb + re) >> 1; - qb -= MEM_SHORT_EXT; qb = qb > 0? qb : 0; - qe += MEM_SHORT_EXT; qe = qe < l_query? qe : l_query; - rb -= MEM_SHORT_EXT; rb = rb > 0? rb : 0; - re += MEM_SHORT_EXT; re = re < l_pac<<1? re : l_pac<<1; - if (rb < l_pac && l_pac < re) { - if (mid < l_pac) re = l_pac; - else rb = l_pac; - } - if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return -1; - - rseq = bns_fetch_seq(bns, pac, &rb, mid, &re, &rid); - x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); - free(rseq); - return x.score; -} - -void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a) -{ - int i, j, k, min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499); - for (i = 0; i < n_chn; ++i) { - mem_chain_t *c = &a[i]; - for (j = k = 0; j < c->n; ++j) { - mem_seed_t *s = &c->seeds[j]; - s->score = mem_seed_sw(opt, bns, pac, l_query, query, s); - if (s->score < 0 || s->score >= min_HSP_score) - c->seeds[k++] = *s; - } - c->n = k; - } -} - -/**************************************** - * Construct the alignment from a chain * - ****************************************/ - -/* mem_chain2aln() vs mem_chain2aln_short() - * - * mem_chain2aln() covers all the functionality of mem_chain2aln_short(). - * However, it may waste time on extracting the reference sequences given a - * very long query. mem_chain2aln_short() is faster for very short chains in a - * long query. It may fail when the matches are long or reach the end of the - * query. In this case, mem_chain2aln() will be called again. - * mem_chain2aln_short() is almost never used for short-read alignment. - */ - -int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) -{ - int i, qb, qe, xtra, rid; - int64_t rb, re, l_pac = bns->l_pac; - uint8_t *rseq = 0; - mem_alnreg_t a; - kswr_t x; - - if (c->n == 0) return -1; - qb = l_query; qe = 0; - rb = l_pac<<1; re = 0; - memset(&a, 0, sizeof(mem_alnreg_t)); - for (i = 0; i < c->n; ++i) { - const mem_seed_t *s = &c->seeds[i]; - qb = qb < s->qbeg? qb : s->qbeg; - qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len; - rb = rb < s->rbeg? rb : s->rbeg; - re = re > s->rbeg + s->len? re : s->rbeg + s->len; - a.seedcov += s->len; - } - qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT; - if (qb <= 10 || qe >= l_query - 10) return 1; // because ksw_align() does not support end-to-end alignment - rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT; - rb = rb > 0? rb : 0; - re = re < l_pac<<1? re : l_pac<<1; - if (rb < l_pac && l_pac < re) { - if (c->seeds[0].rbeg < l_pac) re = l_pac; - else rb = l_pac; - } - if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1; - if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; - if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; - - rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid); - assert(c->rid == rid); - xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); - x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); - free(rseq); - a.rb = rb + x.tb; a.re = rb + x.te + 1; - a.qb = qb + x.qb; a.qe = qb + x.qe + 1; - a.score = x.score; - a.csub = x.score2; - a.rid = c->rid; - a.frac_rep = c->frac_rep; - if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); - if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; - kv_push(mem_alnreg_t, *av, a); - return 0; -} - -static inline int cal_max_gap(const mem_opt_t *opt, int qlen) -{ - int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.); - int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.); - int l = l_del > l_ins? l_del : l_ins; - l = l > 1? l : 1; - return l < opt->w<<1? l : opt->w<<1; -} - -#define MAX_BAND_TRY 2 - -void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) -{ - int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension - int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; - const mem_seed_t *s; - uint8_t *rseq = 0; - uint64_t *srt; - - if (c->n == 0) return; - // get the max possible span - rmax[0] = l_pac<<1; rmax[1] = 0; - for (i = 0; i < c->n; ++i) { - int64_t b, e; - const mem_seed_t *t = &c->seeds[i]; - b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg)); - e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); - rmax[0] = rmax[0] < b? rmax[0] : b; - rmax[1] = rmax[1] > e? rmax[1] : e; - if (t->len > max) max = t->len; - } - rmax[0] = rmax[0] > 0? rmax[0] : 0; - rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1; - if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side - if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand - else rmax[0] = l_pac; - } - // retrieve the reference sequence - rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); - assert(c->rid == rid); - - srt = malloc(c->n * 8); - for (i = 0; i < c->n; ++i) - srt[i] = (uint64_t)c->seeds[i].score<<32 | i; - ks_introsort_64(c->n, srt); - - for (k = c->n - 1; k >= 0; --k) { - mem_alnreg_t *a; - s = &c->seeds[(uint32_t)srt[k]]; - - for (i = 0; i < av->n; ++i) { // test whether extension has been made before - mem_alnreg_t *p = &av->a[i]; - int64_t rd; - int qd, w, max_gap; - if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained - if (s->len - p->seedlen0 > .1 * l_query) continue; // this seed may give a better alignment - // qd: distance ahead of the seed on query; rd: on reference - qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; - max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed - w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width - if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit - // similar to the previous four lines, but this time we look at the region behind - qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); - max_gap = cal_max_gap(opt, qd < rd? qd : rd); - w = max_gap < opt->w? max_gap : opt->w; - if (qd - rd < w && rd - qd < w) break; - } - if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln - if (bwa_verbose >= 4) - printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment [%d,%d) <=> [%ld,%ld)\n", - k, (long)s->len, (long)s->qbeg, (long)s->rbeg, av->a[i].qb, av->a[i].qe, (long)av->a[i].rb, (long)av->a[i].re); - for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain - const mem_seed_t *t; - if (srt[i] == 0) continue; - t = &c->seeds[(uint32_t)srt[i]]; - if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping - if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; - if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; - } - if (i == c->n) { // no overlapping seeds; then skip extension - srt[k] = 0; // mark that seed extension has not been performed - continue; - } - if (bwa_verbose >= 4) - printf("** Seed(%d) might lead to a different alignment even though it is contained. Extension will be performed.\n", k); - } - - a = kv_pushp(mem_alnreg_t, *av); - memset(a, 0, sizeof(mem_alnreg_t)); - a->w = aw[0] = aw[1] = opt->w; - a->score = a->truesc = -1; - a->rid = c->rid; - - if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name); - if (s->qbeg) { // left extension - uint8_t *rs, *qs; - int qle, tle, gtle, gscore; - qs = malloc(s->qbeg); - for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; - tmp = s->rbeg - rmax[0]; - rs = malloc(tmp); - for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - for (i = 0; i < MAX_BAND_TRY; ++i) { - int prev = a->score; - aw[0] = opt->w << i; - if (bwa_verbose >= 4) { - int j; - printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); - printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); - } - a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); - if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } - if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; - } - // check whether we prefer to reach the end of the query - if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension - a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; - a->truesc = a->score; - } else { // to-end extension - a->qb = 0, a->rb = s->rbeg - gtle; - a->truesc = gscore; - } - free(qs); free(rs); - } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; - - if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore, sc0 = a->score; - qe = s->qbeg + s->len; - re = s->rbeg + s->len - rmax[0]; - assert(re >= 0); - for (i = 0; i < MAX_BAND_TRY; ++i) { - int prev = a->score; - aw[1] = opt->w << i; - if (bwa_verbose >= 4) { - int j; - printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); - printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); - } - a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); - if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } - if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; - } - // similar to the above - if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension - a->qe = qe + qle, a->re = rmax[0] + re + tle; - a->truesc += a->score - sc0; - } else { // to-end extension - a->qe = l_query, a->re = rmax[0] + re + gtle; - a->truesc += gscore - sc0; - } - } else a->qe = l_query, a->re = s->rbeg + s->len; - if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]); - - // compute seedcov - for (i = 0, a->seedcov = 0; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained - a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough - } - a->w = aw[0] > aw[1]? aw[0] : aw[1]; - a->seedlen0 = s->len; - - a->frac_rep = c->frac_rep; - } - free(srt); free(rseq); -} - -/***************************** - * Basic hit->SAM conversion * - *****************************/ - -static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) -{ - int w; - if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps - w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.); - if (w < abs(l1 - l2)) w = abs(l1 - l2); - return w; -} - -static inline int get_rlen(int n_cigar, const uint32_t *cigar) -{ - int k, l; - for (k = l = 0; k < n_cigar; ++k) { - int op = cigar[k]&0xf; - if (op == 0 || op == 2) - l += cigar[k]>>4; - } - return l; -} - -void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all) -{ - int i, l_name; - mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert - - if (m_) mtmp = *m_, m = &mtmp; - // set flag - p->flag |= m? 0x1 : 0; // is paired in sequencing - p->flag |= p->rid < 0? 0x4 : 0; // is mapped - p->flag |= m && m->rid < 0? 0x8 : 0; // is mate mapped - if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment - p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0; - if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate - m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0; - p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand - p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand - - // print up to CIGAR - l_name = strlen(s->name); - ks_resize(str, str->l + s->l_seq + l_name + (s->qual? s->l_seq : 0) + 20); - kputsn(s->name, l_name, str); kputc('\t', str); // QNAME - kputw((p->flag&0xffff) | (p->flag&0x10000? 0x100 : 0), str); kputc('\t', str); // FLAG - if (p->rid >= 0) { // with coordinate - kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME - kputl(p->pos + 1, str); kputc('\t', str); // POS - kputw(p->mapq, str); kputc('\t', str); // MAPQ - if (p->n_cigar) { // aligned - for (i = 0; i < p->n_cigar; ++i) { - int c = p->cigar[i]&0xf; - if (!softclip_all && (c == 3 || c == 4)) - c = which? 4 : 3; // use hard clipping for supplementary alignments - kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); - } - } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) - } else kputsn("*\t0\t0\t*", 7, str); // without coordinte - kputc('\t', str); - - // print the mate position if applicable - if (m && m->rid >= 0) { - if (p->rid == m->rid) kputc('=', str); - else kputs(bns->anns[m->rid].name, str); - kputc('\t', str); - kputl(m->pos + 1, str); kputc('\t', str); - if (p->rid == m->rid) { - int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0); - int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0); - if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str); - else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); - } else kputc('0', str); - } else kputsn("*\t0\t0", 5, str); - kputc('\t', str); - - // print SEQ and QUAL - if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL - kputsn("*\t*", 3, str); - } else if (!p->is_rev) { // the forward strand - int i, qb = 0, qe = s->l_seq; - if (p->n_cigar && which && !softclip_all) { // have cigar && not the primary alignment && not softclip all - if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4; - if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4; - } - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; - kputc('\t', str); - if (s->qual) { // printf qual - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; - str->s[str->l] = 0; - } else kputc('*', str); - } else { // the reverse strand - int i, qb = 0, qe = s->l_seq; - if (p->n_cigar && which && !softclip_all) { - if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4; - if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4; - } - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; - kputc('\t', str); - if (s->qual) { // printf qual - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; - str->s[str->l] = 0; - } else kputc('*', str); - } - - // print optional tags - if (p->n_cigar) { - kputsn("\tNM:i:", 6, str); kputw(p->NM, str); - kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str); - } - if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } - if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } - if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - if (!(p->flag & 0x100)) { // not multi-hit - for (i = 0; i < n; ++i) - if (i != which && !(list[i].flag&0x100)) break; - if (i < n) { // there are other primary hits; output them - kputsn("\tSA:Z:", 6, str); - for (i = 0; i < n; ++i) { - const mem_aln_t *r = &list[i]; - int k; - if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit - kputs(bns->anns[r->rid].name, str); kputc(',', str); - kputl(r->pos+1, str); kputc(',', str); - kputc("+-"[r->is_rev], str); kputc(',', str); - for (k = 0; k < r->n_cigar; ++k) { - kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); - } - kputc(',', str); kputw(r->mapq, str); - kputc(',', str); kputw(r->NM, str); - kputc(';', str); - } - } - } - if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } - if (s->comment) { kputc('\t', str); kputs(s->comment, str); } - kputc('\n', str); -} - -/************************ - * Integrated interface * - ************************/ - -int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) -{ - int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; - double identity; - sub = a->csub > sub? a->csub : sub; - if (sub >= a->score) return 0; - l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; - identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; - if (a->score == 0) { - mapq = 0; - } else if (opt->mapQ_coef_len > 0) { - double tmp; - tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l); - tmp *= identity * identity; - mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499); - } else { - mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499); - mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - } - if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); - if (mapq > 60) mapq = 60; - if (mapq < 0) mapq = 0; - mapq = (int)(mapq * (1. - a->frac_rep) + .499); - return mapq; -} - -// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible -void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) -{ - extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); - kstring_t str; - kvec_t(mem_aln_t) aa; - int k; - char **XA = 0; - - if (!(opt->flag & MEM_F_ALL)) - XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); - kv_init(aa); - str.l = str.m = 0; str.s = 0; - for (k = 0; k < a->n; ++k) { - mem_alnreg_t *p = &a->a[k]; - mem_aln_t *q; - if (p->score < opt->T) continue; - if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; - q = kv_pushp(mem_aln_t, aa); - *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); - assert(q->rid >= 0); // this should not happen with the new code - q->XA = XA? XA[k] : 0; - q->flag |= extra_flag; // flag secondary - if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score - if (k && p->secondary < 0) // if supplementary - q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; - if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; - } - if (aa.n == 0) { // no alignments good enough; then write an unaligned record - mem_aln_t t; - t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); - t.flag |= extra_flag; - mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP); - } else { - for (k = 0; k < aa.n; ++k) - mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP); - for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); - free(aa.a); - } - s->sam = str.s; - if (XA) { - for (k = 0; k < a->n; ++k) free(XA[k]); - free(XA); - } -} - -mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf) -{ - int i; - mem_chain_v chn; - mem_alnreg_v regs; - - for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so - seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; - - chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, buf); - chn.n = mem_chain_flt(opt, chn.n, chn.a); - if (opt->min_chain_weight > 0) mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chn.n, chn.a); - if (bwa_verbose >= 4) mem_print_chain(bns, &chn); - - kv_init(regs); - for (i = 0; i < chn.n; ++i) { - mem_chain_t *p = &chn.a[i]; - int ret = 1; - if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - if (opt->min_chain_weight == 0) - ret = mem_chain2aln_short(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s); - if (ret > 0) mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s); - free(chn.a[i].seeds); - } - free(chn.a); - regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a); - if (opt->flag & MEM_F_SELF_OVLP) - regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); - if (bwa_verbose >= 4) { - err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); - for (i = 0; i < regs.n; ++i) { - mem_alnreg_t *p = ®s.a[i]; - printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); - } - } - return regs; -} - -mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) -{ - mem_aln_t a; - int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; - int64_t pos, rb, re; - uint8_t *query; - - memset(&a, 0, sizeof(mem_aln_t)); - if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record - a.rid = -1; a.pos = -1; a.flag |= 0x4; - return a; - } - qb = ar->qb, qe = ar->qe; - rb = ar->rb, re = ar->re; - query = malloc(l_query); - for (i = 0; i < l_query; ++i) // convert to the nt4 encoding - query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; - a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; - if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); - w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); - w2 = w2 > tmp? w2 : tmp; - if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); - if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; - i = 0; a.cigar = 0; - do { - free(a.cigar); - w2 = w2 < opt->w<<2? w2 : opt->w<<2; - a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); - if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); - if (score == last_sc || w2 == opt->w<<2) break; // it is possible that global alignment and local alignment give different scores - last_sc = score; - w2 <<= 1; - } while (++i < 3 && score < ar->truesc - opt->a); - l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; - a.NM = NM; - pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); - a.is_rev = is_rev; - if (a.n_cigar > 0) { // squeeze out leading or trailing deletions - if ((a.cigar[0]&0xf) == 2) { - pos += a.cigar[0]>>4; - --a.n_cigar; - memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD); - } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) { - --a.n_cigar; - memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly - } - } - if (qb != 0 || qe != l_query) { // add clipping to CIGAR - int clip5, clip3; - clip5 = is_rev? l_query - qe : qb; - clip3 = is_rev? qb : l_query - qe; - a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD); - if (clip5) { - memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping - a.cigar[0] = clip5<<4 | 3; - ++a.n_cigar; - } - if (clip3) { - memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping - a.cigar[a.n_cigar++] = clip3<<4 | 3; - } - } - a.rid = bns_pos2rid(bns, pos); - assert(a.rid == ar->rid); - a.pos = pos - bns->anns[a.rid].offset; - a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; - free(query); - return a; -} - -typedef struct { - const mem_opt_t *opt; - const bwt_t *bwt; - const bntseq_t *bns; - const uint8_t *pac; - const mem_pestat_t *pes; - smem_aux_t **aux; - bseq1_t *seqs; - mem_alnreg_v *regs; - int64_t n_processed; -} worker_t; - -static void worker1(void *data, int i, int tid) -{ - worker_t *w = (worker_t*)data; - if (!(w->opt->flag&MEM_F_PE)) { - if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); - w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); - } else { - if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); - w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); - if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); - w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); - } -} - -static void worker2(void *data, int i, int tid) -{ - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); - extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); - worker_t *w = (worker_t*)data; - if (!(w->opt->flag&MEM_F_PE)) { - if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); - if (w->opt->flag & MEM_F_ALN_REG) { - mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); - } else { - mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); - mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); - } - free(w->regs[i].a); - } else { - if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); - free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); - } -} - -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) -{ - extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); - worker_t w; - mem_alnreg_v *regs; - mem_pestat_t pes[4]; - double ctime, rtime; - int i; - - ctime = cputime(); rtime = realtime(); - global_bns = bns; - regs = malloc(n * sizeof(mem_alnreg_v)); - w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; - w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; - w.pes = &pes[0]; - w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); - for (i = 0; i < opt->n_threads; ++i) - w.aux[i] = smem_aux_init(); - kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions - for (i = 0; i < opt->n_threads; ++i) - smem_aux_destroy(w.aux[i]); - free(w.aux); - if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided - if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 - else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data - } - kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment - free(regs); - if (bwa_verbose >= 3) - fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); -} diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem.h --- a/bwa-0.7.9a/bwamem.h Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,179 +0,0 @@ -#ifndef BWAMEM_H_ -#define BWAMEM_H_ - -#include "bwt.h" -#include "bntseq.h" -#include "bwa.h" - -#define MEM_MAPQ_COEF 30.0 -#define MEM_MAPQ_MAX 60 - -struct __smem_i; -typedef struct __smem_i smem_i; - -#define MEM_F_PE 0x2 -#define MEM_F_NOPAIRING 0x4 -#define MEM_F_ALL 0x8 -#define MEM_F_NO_MULTI 0x10 -#define MEM_F_NO_RESCUE 0x20 -#define MEM_F_SELF_OVLP 0x40 -#define MEM_F_ALN_REG 0x80 -#define MEM_F_SOFTCLIP 0x200 - -typedef struct { - int a, b; // match score and mismatch penalty - int o_del, e_del; - int o_ins, e_ins; - int pen_unpaired; // phred-scaled penalty for unpaired reads - int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. - int w; // band width - int zdrop; // Z-dropoff - - int T; // output score threshold; only affecting output - int flag; // see MEM_F_* macros - int min_seed_len; // minimum seed length - int min_chain_weight; - int max_chain_extend; - float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor - int split_width; // split into a seed if its occurence is smaller than this value - int max_occ; // skip a seed if its occurence is larger than this value - int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed - int n_threads; // number of threads - int chunk_size; // process chunk_size-bp sequences in a batch - float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits - float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain - float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag - float mask_level_redun; - float mapQ_coef_len; - int mapQ_coef_fac; - int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value - int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end - int max_hits; // if there are max_hits or fewer, output them all - int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset -} mem_opt_t; - -typedef struct { - int64_t rb, re; // [rb,re): reference sequence in the alignment - int qb, qe; // [qb,qe): query sequence in the alignment - int rid; // reference seq ID - int score; // best local SW score - int truesc; // actual score corresponding to the aligned region; possibly smaller than $score - int sub; // 2nd best SW score - int csub; // SW score of a tandem hit - int sub_n; // approximate number of suboptimal hits - int w; // actual band width used in extension - int seedcov; // length of regions coverged by seeds - int secondary; // index of the parent hit shadowing the current hit; <0 if primary - int seedlen0; // length of the starting seed - int n_comp; // number of sub-alignments chained together - float frac_rep; - uint64_t hash; -} mem_alnreg_t; - -typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; - -typedef struct { - int low, high; // lower and upper bounds within which a read pair is considered to be properly paired - int failed; // non-zero if the orientation is not supported by sufficient data - double avg, std; // mean and stddev of the insert size distribution -} mem_pestat_t; - -typedef struct { // This struct is only used for the convenience of API. - int64_t pos; // forward strand 5'-end mapping position - int rid; // reference sequence index in bntseq_t; <0 for unmapped - int flag; // extra flag - uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance - int n_cigar; // number of CIGAR operations - uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 - char *XA; // alternative mappings - - int score, sub; -} mem_aln_t; - -#ifdef __cplusplus -extern "C" { -#endif - - smem_i *smem_itr_init(const bwt_t *bwt); - void smem_itr_destroy(smem_i *itr); - void smem_set_query(smem_i *itr, int len, const uint8_t *query); - const bwtintv_v *smem_next(smem_i *itr); - - mem_opt_t *mem_opt_init(void); - void mem_fill_scmat(int a, int b, int8_t mat[25]); - - /** - * Align a batch of sequences and generate the alignments in the SAM format - * - * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. - * Note that $seqs[i].sam may consist of several SAM lines if the - * corresponding sequence has multiple primary hits. - * - * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query - * sequences must be interleaved: $n must be an even number and the 2i-th - * sequence and the (2i+1)-th sequence constitute a read pair. In this - * mode, there should be enough (typically >50) unique pairs for the - * routine to infer the orientation and insert size. - * - * @param opt alignment parameters - * @param bwt FM-index of the reference sequence - * @param bns Information of the reference - * @param pac 2-bit encoded reference - * @param n number of query sequences - * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call - * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, - * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. - */ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); - - /** - * Find the aligned regions for one query sequence - * - * Note that this routine does not generate CIGAR. CIGAR should be - * generated later by mem_reg2aln() below. - * - * @param opt alignment parameters - * @param bwt FM-index of the reference sequence - * @param bns Information of the reference - * @param pac 2-bit encoded reference - * @param l_seq length of query sequence - * @param seq query sequence - * - * @return list of aligned regions. - */ - mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); - - /** - * Generate CIGAR and forward-strand position from alignment region - * - * @param opt alignment parameters - * @param bns Information of the reference - * @param pac 2-bit encoded reference - * @param l_seq length of query sequence - * @param seq query sequence - * @param ar one alignment region - * - * @return CIGAR, strand, mapping quality and forward-strand position - */ - mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); - mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); - - /** - * Infer the insert size distribution from interleaved alignment regions - * - * This function can be called after mem_align1(), as long as paired-end - * reads are properly interleaved. - * - * @param opt alignment parameters - * @param l_pac length of concatenated reference sequence - * @param n number of query sequences; must be an even number - * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair - * @param pes inferred insert size distribution (output) - */ - void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); - -#ifdef __cplusplus -} -#endif - -#endif diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem_extra.c --- a/bwa-0.7.9a/bwamem_extra.c Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,147 +0,0 @@ -#include "bwa.h" -#include "bwamem.h" -#include "bntseq.h" -#include "kstring.h" - -/*************************** - * SMEM iterator interface * - ***************************/ - -struct __smem_i { - const bwt_t *bwt; - const uint8_t *query; - int start, len; - bwtintv_v *matches; // matches; to be returned by smem_next() - bwtintv_v *sub; // sub-matches inside the longest match; temporary - bwtintv_v *tmpvec[2]; // temporary arrays -}; - -smem_i *smem_itr_init(const bwt_t *bwt) -{ - smem_i *itr; - itr = calloc(1, sizeof(smem_i)); - itr->bwt = bwt; - itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - itr->matches = calloc(1, sizeof(bwtintv_v)); - itr->sub = calloc(1, sizeof(bwtintv_v)); - return itr; -} - -void smem_itr_destroy(smem_i *itr) -{ - free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); - free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); - free(itr->matches->a); free(itr->matches); - free(itr->sub->a); free(itr->sub); - free(itr); -} - -void smem_set_query(smem_i *itr, int len, const uint8_t *query) -{ - itr->query = query; - itr->start = 0; - itr->len = len; -} - -const bwtintv_v *smem_next(smem_i *itr) -{ - int i, max, max_i, ori_start; - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; - if (itr->start >= itr->len || itr->start < 0) return 0; - while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return 0; - ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM - if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match - bwtintv_t *p = &itr->matches->a[i]; - int len = (uint32_t)p->info - (p->info>>32); - if (max < len) max = len, max_i = i; - } - return itr->matches; -} - -/*********************** - *** Extra functions *** - ***********************/ - -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) -{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence - extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); - extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); - mem_alnreg_v ar; - char *seq; - seq = malloc(l_seq); - memcpy(seq, seq_, l_seq); // makes a copy of seq_ - ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0); - mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); - free(seq); - return ar; -} - -void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) -{ - int i; - kstring_t str = {0,0,0}; - for (i = 0; i < a->n; ++i) { - const mem_alnreg_t *p = &a->a[i]; - int is_rev, rid, qb = p->qb, qe = p->qe; - int64_t pos, rb = p->rb, re = p->re; - pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); - rid = bns_pos2rid(bns, pos); - assert(rid == p->rid); - pos -= bns->anns[rid].offset; - kputs(s->name, &str); kputc('\t', &str); - kputw(s->l_seq, &str); kputc('\t', &str); - if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap - kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str); - kputs(bns->anns[rid].name, &str); kputc('\t', &str); - kputw(bns->anns[rid].len, &str); kputc('\t', &str); - kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); - ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb)); - kputc('\n', &str); - } - s->sam = str.s; -} - -// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. -char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() -{ - int i, k, *cnt, tot; - kstring_t *aln = 0; - char **XA = 0; - - cnt = calloc(a->n, sizeof(int)); - for (i = 0, tot = 0; i < a->n; ++i) { - int j = a->a[i].secondary; - if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio) - ++cnt[j], ++tot; - } - if (tot == 0) goto end_gen_alt; - aln = calloc(a->n, sizeof(kstring_t)); - for (i = 0; i < a->n; ++i) { - mem_aln_t t; - int j = a->a[i].secondary; - if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later - if (cnt[j] > opt->max_hits) continue; - t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); - kputs(bns->anns[t.rid].name, &aln[j]); - kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]); - kputc(',', &aln[j]); - for (k = 0; k < t.n_cigar; ++k) { - kputw(t.cigar[k]>>4, &aln[j]); - kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]); - } - kputc(',', &aln[j]); kputw(t.NM, &aln[j]); - kputc(';', &aln[j]); - free(t.cigar); - } - XA = calloc(a->n, sizeof(char*)); - for (k = 0; k < a->n; ++k) - XA[k] = aln[k].s; - -end_gen_alt: - free(cnt); free(aln); - return XA; -} diff -r abdbc8fe98dd -r dfe9332138cf bwa-0.7.9a/bwamem_pair.c --- a/bwa-0.7.9a/bwamem_pair.c Thu Aug 14 04:44:51 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,362 +0,0 @@ -#include -#include -#include -#include -#include "kstring.h" -#include "bwamem.h" -#include "kvec.h" -#include "utils.h" -#include "ksw.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - - -#define MIN_RATIO 0.8 -#define MIN_DIR_CNT 10 -#define MIN_DIR_RATIO 0.05 -#define OUTLIER_BOUND 2.0 -#define MAPPING_BOUND 3.0 -#define MAX_STDDEV 4.0 - -static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) -{ - int64_t p2; - int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); - p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand - *dist = p2 > b1? p2 - b1 : b1 - p2; - return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); -} - -static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) -{ - int j; - for (j = 1; j < r->n; ++j) { // choose unique alignment - int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb; - int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe; - if (e_min > b_max) { // have overlap - int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; - if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap - } - } - return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; -} - -void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) -{ - int i, d, max; - uint64_v isize[4]; - memset(pes, 0, 4 * sizeof(mem_pestat_t)); - memset(isize, 0, sizeof(kvec_t(int)) * 4); - for (i = 0; i < n>>1; ++i) { - int dir; - int64_t is; - mem_alnreg_v *r[2]; - r[0] = (mem_alnreg_v*)®s[i<<1|0]; - r[1] = (mem_alnreg_v*)®s[i<<1|1]; - if (r[0]->n == 0 || r[1]->n == 0) continue; - if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; - if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; - if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr - dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); - if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); - } - if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); - for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. - mem_pestat_t *r = &pes[d]; - uint64_v *q = &isize[d]; - int p25, p50, p75, x; - if (q->n < MIN_DIR_CNT) { - fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); - r->failed = 1; - continue; - } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); - ks_introsort_64(q->n, q->a); - p25 = q->a[(int)(.25 * q->n + .499)]; - p50 = q->a[(int)(.50 * q->n + .499)]; - p75 = q->a[(int)(.75 * q->n + .499)]; - r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); - if (r->low < 1) r->low = 1; - r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); - fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); - fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high); - for (i = x = 0, r->avg = 0; i < q->n; ++i) - if (q->a[i] >= r->low && q->a[i] <= r->high) - r->avg += q->a[i], ++x; - r->avg /= x; - for (i = 0, r->std = 0; i < q->n; ++i) - if (q->a[i] >= r->low && q->a[i] <= r->high) - r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg); - r->std = sqrt(r->std / x); - fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std); - r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499); - r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499); - if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499); - if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); - if (r->low < 1) r->low = 1; - fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); - free(q->a); - } - for (d = 0, max = 0; d < 4; ++d) - max = max > isize[d].n? max : isize[d].n; - for (d = 0; d < 4; ++d) - if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) { - pes[d].failed = 1; - fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); - } -} - -int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) -{ - extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a); - int64_t l_pac = bns->l_pac; - int i, r, skip[4], n = 0, rid; - for (r = 0; r < 4; ++r) - skip[r] = pes[r].failed? 1 : 0; - for (i = 0; i < ma->n; ++i) { // check which orinentation has been found - int64_t dist; - r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); - if (dist >= pes[r].low && dist <= pes[r].high) - skip[r] = 1; - } - if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW - for (r = 0; r < 4; ++r) { - int is_rev, is_larger; - uint8_t *seq, *rev = 0, *ref = 0; - int64_t rb, re; - if (skip[r]) continue; - is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate - is_larger = !(r>>1); // whether the mate has larger coordinate - if (is_rev) { - rev = malloc(l_ms); // this is the reverse complement of $ms - for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4; - seq = rev; - } else seq = (uint8_t*)ms; - if (!is_rev) { - rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high; - re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length - } else { - rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands - re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low; - } - if (rb < 0) rb = 0; - if (re > l_pac<<1) re = l_pac<<1; - if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid); - if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening - kswr_t aln; - mem_alnreg_t b; - int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); - aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); - memset(&b, 0, sizeof(mem_alnreg_t)); - if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 - b.rid = a->rid; - b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; - b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; - b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; - b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; - b.score = aln.score; - b.csub = aln.score2; - b.secondary = -1; - b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; -// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); - kv_push(mem_alnreg_t, *ma, b); // make room for a new element - // move b s.t. ma is sorted - for (i = 0; i < ma->n - 1; ++i) // find the insertion point - if (ma->a[i].score < b.score) break; - tmp = i; - for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; - ma->a[i] = b; - } - ++n; - } - if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a); - if (rev) free(rev); - free(ref); - } - return n; -} - -int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) -{ - pair64_v v, u; - int r, i, k, y[4], ret; // y[] keeps the last hit - int64_t l_pac = bns->l_pac; - kv_init(v); kv_init(u); - for (r = 0; r < 2; ++r) { // loop through read number - for (i = 0; i < a[r].n; ++i) { - pair64_t key; - mem_alnreg_t *e = &a[r].a[i]; - key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position - key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset); - key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; - kv_push(pair64_t, v, key); - } - } - ks_introsort_128(v.n, v.a); - y[0] = y[1] = y[2] = y[3] = -1; - //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); - for (i = 0; i < v.n; ++i) { - for (r = 0; r < 2; ++r) { // loop through direction - int dir = r<<1 | (v.a[i].y>>1&1), which; - if (pes[dir].failed) continue; // invalid orientation - which = r<<1 | ((v.a[i].y&1)^1); - if (y[which] < 0) continue; // no previous hits - for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) - int64_t dist; - int q; - double ns; - pair64_t *p; - if ((v.a[k].y&3) != which) continue; - dist = (int64_t)v.a[i].x - v.a[k].x; - //printf("%d: %lld\n", k, dist); - if (dist > pes[dir].high) break; - if (dist < pes[dir].low) continue; - ns = (dist - pes[dir].avg) / pes[dir].std; - q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4) - if (q < 0) q = 0; - p = kv_pushp(pair64_t, u); - p->y = (uint64_t)k<<32 | i; - p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); - //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); - } - } - y[v.a[i].y&3] = i; - } - if (u.n) { // found at least one proper pair - int tmp = opt->a + opt->b; - tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del; - tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins; - ks_introsort_128(u.n, u.a); - i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; - z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair - z[v.a[k].y&1] = v.a[k].y<<32>>34; - ret = u.a[u.n-1].x >> 32; - *sub = u.n > 1? u.a[u.n-2].x>>32 : 0; - for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) - if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; - } else ret = 0, *sub = 0, *n_sub = 0; - free(u.a); free(v.a); - return ret; -} - -#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) - -int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) -{ - extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); - extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); - extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); - extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all); - extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); - - int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; - kstring_t str; - mem_aln_t h[2]; - - str.l = str.m = 0; str.s = 0; - if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment - mem_alnreg_v b[2]; - kv_init(b[0]); kv_init(b[1]); - for (i = 0; i < 2; ++i) - for (j = 0; j < a[i].n; ++j) - if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) - kv_push(mem_alnreg_t, b[i], a[i].a[j]); - for (i = 0; i < 2; ++i) - for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) - n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); - free(b[0].a); free(b[1].a); - } - mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); - mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); - if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; - // pairing single-end hits - if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { - int is_multi[2], q_pe, score_un, q_se[2]; - char **XA[2]; - // check if an end has multiple hits even after mate-SW - for (i = 0; i < 2; ++i) { - for (j = 1; j < a[i].n; ++j) - if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; - is_multi[i] = j < a[i].n? 1 : 0; - } - if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score - // compute mapQ for the best SE hit - score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; - //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; - subo = subo > score_un? subo : score_un; - q_pe = raw_mapq(o - subo, opt->a); - if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); - if (q_pe < 0) q_pe = 0; - if (q_pe > 60) q_pe = 60; - q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499); - // the following assumes no split hits - if (o > score_un) { // paired alignment is preferred - mem_alnreg_t *c[2]; - c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]]; - for (i = 0; i < 2; ++i) { - if (c[i]->secondary >= 0) - c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; - q_se[i] = mem_approx_mapq_se(opt, c[i]); - } - q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; - q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40; - extra_flag |= 2; - // cap at the tandem repeat score - q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); - q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); - } else { // the unpaired alignment is preferred - z[0] = z[1] = 0; - q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); - q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); - } - // suboptimal hits - if (!(opt->flag & MEM_F_ALL)) { - for (i = 0; i < 2; ++i) { - int k = a[i].a[z[i]].secondary; - if (k >= 0) { // switch secondary and primary - assert(a[i].a[k].secondary < 0); - for (j = 0; j < a[i].n; ++j) - if (a[i].a[j].secondary == k || j == k) - a[i].a[j].secondary = z[i]; - a[i].a[z[i]].secondary = -1; - } - XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq); - } - } else XA[0] = XA[1] = 0; - // write SAM - h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0; - h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0; - mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0; - mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s; - if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); - free(h[0].cigar); // h[0].XA will be freed in the following block - free(h[1].cigar); - // free XA - for (i = 0; i < 2; ++i) { - if (XA[i]) { - for (j = 0; j < a[i].n; ++j) free(XA[i][j]); - free(XA[i]); - } - } - } else goto no_pairing; - return n; - -no_pairing: - for (i = 0; i < 2; ++i) { - if (a[i].n && a[i].a[0].score >= opt->T) - h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); - else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); - } - if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. - int64_t dist; - int d; - d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); - if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; - } - mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); - mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); - if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); - free(h[0].cigar); free(h[1].cigar); - return n; -}