Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdupse.c @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #include <math.h> | |
| 2 #include "sam.h" | |
| 3 #include "khash.h" | |
| 4 #include "klist.h" | |
| 5 | |
| 6 #define QUEUE_CLEAR_SIZE 0x100000 | |
| 7 #define MAX_POS 0x7fffffff | |
| 8 | |
| 9 typedef struct { | |
| 10 int endpos; | |
| 11 uint32_t score:31, discarded:1; | |
| 12 bam1_t *b; | |
| 13 } elem_t, *elem_p; | |
| 14 #define __free_elem(p) bam_destroy1((p)->data.b) | |
| 15 KLIST_INIT(q, elem_t, __free_elem) | |
| 16 typedef klist_t(q) queue_t; | |
| 17 | |
| 18 KHASH_MAP_INIT_INT(best, elem_p) | |
| 19 typedef khash_t(best) besthash_t; | |
| 20 | |
| 21 typedef struct { | |
| 22 uint64_t n_checked, n_removed; | |
| 23 besthash_t *left, *rght; | |
| 24 } lib_aux_t; | |
| 25 KHASH_MAP_INIT_STR(lib, lib_aux_t) | |
| 26 | |
| 27 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) | |
| 28 { | |
| 29 khint_t k = kh_get(lib, aux, lib); | |
| 30 if (k == kh_end(aux)) { | |
| 31 int ret; | |
| 32 char *p = strdup(lib); | |
| 33 lib_aux_t *q; | |
| 34 k = kh_put(lib, aux, p, &ret); | |
| 35 q = &kh_val(aux, k); | |
| 36 q->left = kh_init(best); | |
| 37 q->rght = kh_init(best); | |
| 38 q->n_checked = q->n_removed = 0; | |
| 39 return q; | |
| 40 } else return &kh_val(aux, k); | |
| 41 } | |
| 42 | |
| 43 static inline int sum_qual(const bam1_t *b) | |
| 44 { | |
| 45 int i, q; | |
| 46 uint8_t *qual = bam1_qual(b); | |
| 47 for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; | |
| 48 return q; | |
| 49 } | |
| 50 | |
| 51 static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) | |
| 52 { | |
| 53 elem_t *p = kl_pushp(q, queue); | |
| 54 p->discarded = 0; | |
| 55 p->endpos = endpos; p->score = score; | |
| 56 if (p->b == 0) p->b = bam_init1(); | |
| 57 bam_copy1(p->b, b); | |
| 58 return p; | |
| 59 } | |
| 60 | |
| 61 static void clear_besthash(besthash_t *h, int32_t pos) | |
| 62 { | |
| 63 khint_t k; | |
| 64 for (k = kh_begin(h); k != kh_end(h); ++k) | |
| 65 if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) | |
| 66 kh_del(best, h, k); | |
| 67 } | |
| 68 | |
| 69 static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) | |
| 70 { | |
| 71 if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { | |
| 72 khint_t k; | |
| 73 while (1) { | |
| 74 elem_t *q; | |
| 75 if (queue->head == queue->tail) break; | |
| 76 q = &kl_val(queue->head); | |
| 77 if (q->discarded) { | |
| 78 q->b->data_len = 0; | |
| 79 kl_shift(q, queue, 0); | |
| 80 continue; | |
| 81 } | |
| 82 if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; | |
| 83 samwrite(out, q->b); | |
| 84 q->b->data_len = 0; | |
| 85 kl_shift(q, queue, 0); | |
| 86 } | |
| 87 for (k = kh_begin(h); k != kh_end(h); ++k) { | |
| 88 if (kh_exist(h, k)) { | |
| 89 clear_besthash(kh_val(h, k).left, pos); | |
| 90 clear_besthash(kh_val(h, k).rght, pos); | |
| 91 } | |
| 92 } | |
| 93 } | |
| 94 } | |
| 95 | |
| 96 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) | |
| 97 { | |
| 98 bam1_t *b; | |
| 99 queue_t *queue; | |
| 100 khint_t k; | |
| 101 int last_tid = -2; | |
| 102 khash_t(lib) *aux; | |
| 103 | |
| 104 aux = kh_init(lib); | |
| 105 b = bam_init1(); | |
| 106 queue = kl_init(q); | |
| 107 while (samread(in, b) >= 0) { | |
| 108 bam1_core_t *c = &b->core; | |
| 109 int endpos = bam_calend(c, bam1_cigar(b)); | |
| 110 int score = sum_qual(b); | |
| 111 | |
| 112 if (last_tid != c->tid) { | |
| 113 if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); | |
| 114 last_tid = c->tid; | |
| 115 } else dump_alignment(out, queue, c->pos, aux); | |
| 116 if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { | |
| 117 push_queue(queue, b, endpos, score); | |
| 118 } else { | |
| 119 const char *lib; | |
| 120 lib_aux_t *q; | |
| 121 besthash_t *h; | |
| 122 uint32_t key; | |
| 123 int ret; | |
| 124 lib = bam_get_library(in->header, b); | |
| 125 q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); | |
| 126 ++q->n_checked; | |
| 127 h = (c->flag&BAM_FREVERSE)? q->rght : q->left; | |
| 128 key = (c->flag&BAM_FREVERSE)? endpos : c->pos; | |
| 129 k = kh_put(best, h, key, &ret); | |
| 130 if (ret == 0) { // in the hash table | |
| 131 elem_t *p = kh_val(h, k); | |
| 132 ++q->n_removed; | |
| 133 if (p->score < score) { | |
| 134 if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue | |
| 135 p->discarded = 1; | |
| 136 kh_val(h, k) = push_queue(queue, b, endpos, score); | |
| 137 } else { // replace | |
| 138 p->score = score; p->endpos = endpos; | |
| 139 bam_copy1(p->b, b); | |
| 140 } | |
| 141 } // otherwise, discard the alignment | |
| 142 } else kh_val(h, k) = push_queue(queue, b, endpos, score); | |
| 143 } | |
| 144 } | |
| 145 dump_alignment(out, queue, MAX_POS, aux); | |
| 146 | |
| 147 for (k = kh_begin(aux); k != kh_end(aux); ++k) { | |
| 148 if (kh_exist(aux, k)) { | |
| 149 lib_aux_t *q = &kh_val(aux, k); | |
| 150 fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, | |
| 151 (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); | |
| 152 kh_destroy(best, q->left); kh_destroy(best, q->rght); | |
| 153 free((char*)kh_key(aux, k)); | |
| 154 } | |
| 155 } | |
| 156 kh_destroy(lib, aux); | |
| 157 bam_destroy1(b); | |
| 158 kl_destroy(q, queue); | |
| 159 } |
