Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdupse.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <math.h> | |
2 #include "sam.h" | |
3 #include "khash.h" | |
4 #include "klist.h" | |
5 | |
6 #define QUEUE_CLEAR_SIZE 0x100000 | |
7 #define MAX_POS 0x7fffffff | |
8 | |
9 typedef struct { | |
10 int endpos; | |
11 uint32_t score:31, discarded:1; | |
12 bam1_t *b; | |
13 } elem_t, *elem_p; | |
14 #define __free_elem(p) bam_destroy1((p)->data.b) | |
15 KLIST_INIT(q, elem_t, __free_elem) | |
16 typedef klist_t(q) queue_t; | |
17 | |
18 KHASH_MAP_INIT_INT(best, elem_p) | |
19 typedef khash_t(best) besthash_t; | |
20 | |
21 typedef struct { | |
22 uint64_t n_checked, n_removed; | |
23 besthash_t *left, *rght; | |
24 } lib_aux_t; | |
25 KHASH_MAP_INIT_STR(lib, lib_aux_t) | |
26 | |
27 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) | |
28 { | |
29 khint_t k = kh_get(lib, aux, lib); | |
30 if (k == kh_end(aux)) { | |
31 int ret; | |
32 char *p = strdup(lib); | |
33 lib_aux_t *q; | |
34 k = kh_put(lib, aux, p, &ret); | |
35 q = &kh_val(aux, k); | |
36 q->left = kh_init(best); | |
37 q->rght = kh_init(best); | |
38 q->n_checked = q->n_removed = 0; | |
39 return q; | |
40 } else return &kh_val(aux, k); | |
41 } | |
42 | |
43 static inline int sum_qual(const bam1_t *b) | |
44 { | |
45 int i, q; | |
46 uint8_t *qual = bam1_qual(b); | |
47 for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; | |
48 return q; | |
49 } | |
50 | |
51 static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) | |
52 { | |
53 elem_t *p = kl_pushp(q, queue); | |
54 p->discarded = 0; | |
55 p->endpos = endpos; p->score = score; | |
56 if (p->b == 0) p->b = bam_init1(); | |
57 bam_copy1(p->b, b); | |
58 return p; | |
59 } | |
60 | |
61 static void clear_besthash(besthash_t *h, int32_t pos) | |
62 { | |
63 khint_t k; | |
64 for (k = kh_begin(h); k != kh_end(h); ++k) | |
65 if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) | |
66 kh_del(best, h, k); | |
67 } | |
68 | |
69 static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) | |
70 { | |
71 if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { | |
72 khint_t k; | |
73 while (1) { | |
74 elem_t *q; | |
75 if (queue->head == queue->tail) break; | |
76 q = &kl_val(queue->head); | |
77 if (q->discarded) { | |
78 q->b->data_len = 0; | |
79 kl_shift(q, queue, 0); | |
80 continue; | |
81 } | |
82 if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; | |
83 samwrite(out, q->b); | |
84 q->b->data_len = 0; | |
85 kl_shift(q, queue, 0); | |
86 } | |
87 for (k = kh_begin(h); k != kh_end(h); ++k) { | |
88 if (kh_exist(h, k)) { | |
89 clear_besthash(kh_val(h, k).left, pos); | |
90 clear_besthash(kh_val(h, k).rght, pos); | |
91 } | |
92 } | |
93 } | |
94 } | |
95 | |
96 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) | |
97 { | |
98 bam1_t *b; | |
99 queue_t *queue; | |
100 khint_t k; | |
101 int last_tid = -2; | |
102 khash_t(lib) *aux; | |
103 | |
104 aux = kh_init(lib); | |
105 b = bam_init1(); | |
106 queue = kl_init(q); | |
107 while (samread(in, b) >= 0) { | |
108 bam1_core_t *c = &b->core; | |
109 int endpos = bam_calend(c, bam1_cigar(b)); | |
110 int score = sum_qual(b); | |
111 | |
112 if (last_tid != c->tid) { | |
113 if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); | |
114 last_tid = c->tid; | |
115 } else dump_alignment(out, queue, c->pos, aux); | |
116 if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { | |
117 push_queue(queue, b, endpos, score); | |
118 } else { | |
119 const char *lib; | |
120 lib_aux_t *q; | |
121 besthash_t *h; | |
122 uint32_t key; | |
123 int ret; | |
124 lib = bam_get_library(in->header, b); | |
125 q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); | |
126 ++q->n_checked; | |
127 h = (c->flag&BAM_FREVERSE)? q->rght : q->left; | |
128 key = (c->flag&BAM_FREVERSE)? endpos : c->pos; | |
129 k = kh_put(best, h, key, &ret); | |
130 if (ret == 0) { // in the hash table | |
131 elem_t *p = kh_val(h, k); | |
132 ++q->n_removed; | |
133 if (p->score < score) { | |
134 if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue | |
135 p->discarded = 1; | |
136 kh_val(h, k) = push_queue(queue, b, endpos, score); | |
137 } else { // replace | |
138 p->score = score; p->endpos = endpos; | |
139 bam_copy1(p->b, b); | |
140 } | |
141 } // otherwise, discard the alignment | |
142 } else kh_val(h, k) = push_queue(queue, b, endpos, score); | |
143 } | |
144 } | |
145 dump_alignment(out, queue, MAX_POS, aux); | |
146 | |
147 for (k = kh_begin(aux); k != kh_end(aux); ++k) { | |
148 if (kh_exist(aux, k)) { | |
149 lib_aux_t *q = &kh_val(aux, k); | |
150 fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, | |
151 (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); | |
152 kh_destroy(best, q->left); kh_destroy(best, q->rght); | |
153 free((char*)kh_key(aux, k)); | |
154 } | |
155 } | |
156 kh_destroy(lib, aux); | |
157 bam_destroy1(b); | |
158 kl_destroy(q, queue); | |
159 } |