Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdup.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdlib.h> | |
2 #include <string.h> | |
3 #include <stdio.h> | |
4 #include <zlib.h> | |
5 #include <unistd.h> | |
6 #include "sam.h" | |
7 | |
8 typedef bam1_t *bam1_p; | |
9 | |
10 #include "khash.h" | |
11 KHASH_SET_INIT_STR(name) | |
12 KHASH_MAP_INIT_INT64(pos, bam1_p) | |
13 | |
14 #define BUFFER_SIZE 0x40000 | |
15 | |
16 typedef struct { | |
17 uint64_t n_checked, n_removed; | |
18 khash_t(pos) *best_hash; | |
19 } lib_aux_t; | |
20 KHASH_MAP_INIT_STR(lib, lib_aux_t) | |
21 | |
22 typedef struct { | |
23 int n, max; | |
24 bam1_t **a; | |
25 } tmp_stack_t; | |
26 | |
27 static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) | |
28 { | |
29 if (stack->n == stack->max) { | |
30 stack->max = stack->max? stack->max<<1 : 0x10000; | |
31 stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); | |
32 } | |
33 stack->a[stack->n++] = b; | |
34 } | |
35 | |
36 static inline void dump_best(tmp_stack_t *stack, samfile_t *out) | |
37 { | |
38 int i; | |
39 for (i = 0; i != stack->n; ++i) { | |
40 samwrite(out, stack->a[i]); | |
41 bam_destroy1(stack->a[i]); | |
42 } | |
43 stack->n = 0; | |
44 } | |
45 | |
46 static void clear_del_set(khash_t(name) *del_set) | |
47 { | |
48 khint_t k; | |
49 for (k = kh_begin(del_set); k < kh_end(del_set); ++k) | |
50 if (kh_exist(del_set, k)) | |
51 free((char*)kh_key(del_set, k)); | |
52 kh_clear(name, del_set); | |
53 } | |
54 | |
55 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) | |
56 { | |
57 khint_t k = kh_get(lib, aux, lib); | |
58 if (k == kh_end(aux)) { | |
59 int ret; | |
60 char *p = strdup(lib); | |
61 lib_aux_t *q; | |
62 k = kh_put(lib, aux, p, &ret); | |
63 q = &kh_val(aux, k); | |
64 q->n_checked = q->n_removed = 0; | |
65 q->best_hash = kh_init(pos); | |
66 return q; | |
67 } else return &kh_val(aux, k); | |
68 } | |
69 | |
70 static void clear_best(khash_t(lib) *aux, int max) | |
71 { | |
72 khint_t k; | |
73 for (k = kh_begin(aux); k != kh_end(aux); ++k) { | |
74 if (kh_exist(aux, k)) { | |
75 lib_aux_t *q = &kh_val(aux, k); | |
76 if (kh_size(q->best_hash) >= max) | |
77 kh_clear(pos, q->best_hash); | |
78 } | |
79 } | |
80 } | |
81 | |
82 static inline int sum_qual(const bam1_t *b) | |
83 { | |
84 int i, q; | |
85 uint8_t *qual = bam1_qual(b); | |
86 for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; | |
87 return q; | |
88 } | |
89 | |
90 void bam_rmdup_core(samfile_t *in, samfile_t *out) | |
91 { | |
92 bam1_t *b; | |
93 int last_tid = -1, last_pos = -1; | |
94 tmp_stack_t stack; | |
95 khint_t k; | |
96 khash_t(lib) *aux; | |
97 khash_t(name) *del_set; | |
98 | |
99 aux = kh_init(lib); | |
100 del_set = kh_init(name); | |
101 b = bam_init1(); | |
102 memset(&stack, 0, sizeof(tmp_stack_t)); | |
103 | |
104 kh_resize(name, del_set, 4 * BUFFER_SIZE); | |
105 while (samread(in, b) >= 0) { | |
106 bam1_core_t *c = &b->core; | |
107 if (c->tid != last_tid || last_pos != c->pos) { | |
108 dump_best(&stack, out); // write the result | |
109 clear_best(aux, BUFFER_SIZE); | |
110 if (c->tid != last_tid) { | |
111 clear_best(aux, 0); | |
112 if (kh_size(del_set)) { // check | |
113 fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); | |
114 clear_del_set(del_set); | |
115 } | |
116 if ((int)c->tid == -1) { // append unmapped reads | |
117 samwrite(out, b); | |
118 while (samread(in, b) >= 0) samwrite(out, b); | |
119 break; | |
120 } | |
121 last_tid = c->tid; | |
122 fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); | |
123 } | |
124 } | |
125 if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { | |
126 samwrite(out, b); | |
127 } else if (c->isize > 0) { // paired, head | |
128 uint64_t key = (uint64_t)c->pos<<32 | c->isize; | |
129 const char *lib; | |
130 lib_aux_t *q; | |
131 int ret; | |
132 lib = bam_get_library(in->header, b); | |
133 q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); | |
134 ++q->n_checked; | |
135 k = kh_put(pos, q->best_hash, key, &ret); | |
136 if (ret == 0) { // found in best_hash | |
137 bam1_t *p = kh_val(q->best_hash, k); | |
138 ++q->n_removed; | |
139 if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle | |
140 kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed | |
141 bam_copy1(p, b); // replaced as b | |
142 } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed | |
143 if (ret == 0) | |
144 fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b)); | |
145 } else { // not found in best_hash | |
146 kh_val(q->best_hash, k) = bam_dup1(b); | |
147 stack_insert(&stack, kh_val(q->best_hash, k)); | |
148 } | |
149 } else { // paired, tail | |
150 k = kh_get(name, del_set, bam1_qname(b)); | |
151 if (k != kh_end(del_set)) { | |
152 free((char*)kh_key(del_set, k)); | |
153 kh_del(name, del_set, k); | |
154 } else samwrite(out, b); | |
155 } | |
156 last_pos = c->pos; | |
157 } | |
158 | |
159 for (k = kh_begin(aux); k != kh_end(aux); ++k) { | |
160 if (kh_exist(aux, k)) { | |
161 lib_aux_t *q = &kh_val(aux, k); | |
162 dump_best(&stack, out); | |
163 fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, | |
164 (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); | |
165 kh_destroy(pos, q->best_hash); | |
166 free((char*)kh_key(aux, k)); | |
167 } | |
168 } | |
169 kh_destroy(lib, aux); | |
170 | |
171 clear_del_set(del_set); | |
172 kh_destroy(name, del_set); | |
173 free(stack.a); | |
174 bam_destroy1(b); | |
175 } | |
176 | |
177 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); | |
178 | |
179 int bam_rmdup(int argc, char *argv[]) | |
180 { | |
181 int c, is_se = 0, force_se = 0; | |
182 samfile_t *in, *out; | |
183 while ((c = getopt(argc, argv, "sS")) >= 0) { | |
184 switch (c) { | |
185 case 's': is_se = 1; break; | |
186 case 'S': force_se = is_se = 1; break; | |
187 } | |
188 } | |
189 if (optind + 2 > argc) { | |
190 fprintf(stderr, "\n"); | |
191 fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n"); | |
192 fprintf(stderr, "Option: -s rmdup for SE reads\n"); | |
193 fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); | |
194 return 1; | |
195 } | |
196 in = samopen(argv[optind], "rb", 0); | |
197 out = samopen(argv[optind+1], "wb", in->header); | |
198 if (in == 0 || out == 0) { | |
199 fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); | |
200 return 1; | |
201 } | |
202 if (is_se) bam_rmdupse_core(in, out, force_se); | |
203 else bam_rmdup_core(in, out); | |
204 samclose(in); samclose(out); | |
205 return 0; | |
206 } |