annotate PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdup.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #include <string.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #include <zlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 #include <unistd.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 #include "sam.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 typedef bam1_t *bam1_p;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10 #include "khash.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 KHASH_SET_INIT_STR(name)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12 KHASH_MAP_INIT_INT64(pos, bam1_p)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 #define BUFFER_SIZE 0x40000
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 typedef struct {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 uint64_t n_checked, n_removed;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 khash_t(pos) *best_hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 } lib_aux_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 KHASH_MAP_INIT_STR(lib, lib_aux_t)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 typedef struct {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 int n, max;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 bam1_t **a;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25 } tmp_stack_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27 static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 if (stack->n == stack->max) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 stack->max = stack->max? stack->max<<1 : 0x10000;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33 stack->a[stack->n++] = b;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 int i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 for (i = 0; i != stack->n; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 samwrite(out, stack->a[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41 bam_destroy1(stack->a[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 stack->n = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46 static void clear_del_set(khash_t(name) *del_set)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 if (kh_exist(del_set, k))
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 free((char*)kh_key(del_set, k));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 kh_clear(name, del_set);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57 khint_t k = kh_get(lib, aux, lib);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 if (k == kh_end(aux)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 int ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60 char *p = strdup(lib);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61 lib_aux_t *q;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 k = kh_put(lib, aux, p, &ret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 q = &kh_val(aux, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 q->n_checked = q->n_removed = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 q->best_hash = kh_init(pos);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 return q;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 } else return &kh_val(aux, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 static void clear_best(khash_t(lib) *aux, int max)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73 for (k = kh_begin(aux); k != kh_end(aux); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 if (kh_exist(aux, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 lib_aux_t *q = &kh_val(aux, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 if (kh_size(q->best_hash) >= max)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 kh_clear(pos, q->best_hash);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 static inline int sum_qual(const bam1_t *b)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 int i, q;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85 uint8_t *qual = bam1_qual(b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 return q;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90 void bam_rmdup_core(samfile_t *in, samfile_t *out)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 bam1_t *b;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 int last_tid = -1, last_pos = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 tmp_stack_t stack;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 khash_t(lib) *aux;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 khash_t(name) *del_set;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 aux = kh_init(lib);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 del_set = kh_init(name);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 b = bam_init1();
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 memset(&stack, 0, sizeof(tmp_stack_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 kh_resize(name, del_set, 4 * BUFFER_SIZE);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 while (samread(in, b) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 bam1_core_t *c = &b->core;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107 if (c->tid != last_tid || last_pos != c->pos) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 dump_best(&stack, out); // write the result
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 clear_best(aux, BUFFER_SIZE);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 if (c->tid != last_tid) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 clear_best(aux, 0);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 if (kh_size(del_set)) { // check
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113 fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 clear_del_set(del_set);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 if ((int)c->tid == -1) { // append unmapped reads
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 samwrite(out, b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 while (samread(in, b) >= 0) samwrite(out, b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121 last_tid = c->tid;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 samwrite(out, b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 } else if (c->isize > 0) { // paired, head
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 uint64_t key = (uint64_t)c->pos<<32 | c->isize;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 const char *lib;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 lib_aux_t *q;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 int ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 lib = bam_get_library(in->header, b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 ++q->n_checked;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 k = kh_put(pos, q->best_hash, key, &ret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 if (ret == 0) { // found in best_hash
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 bam1_t *p = kh_val(q->best_hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 ++q->n_removed;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139 if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141 bam_copy1(p, b); // replaced as b
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143 if (ret == 0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145 } else { // not found in best_hash
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146 kh_val(q->best_hash, k) = bam_dup1(b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 stack_insert(&stack, kh_val(q->best_hash, k));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149 } else { // paired, tail
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 k = kh_get(name, del_set, bam1_qname(b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 if (k != kh_end(del_set)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152 free((char*)kh_key(del_set, k));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 kh_del(name, del_set, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154 } else samwrite(out, b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 last_pos = c->pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159 for (k = kh_begin(aux); k != kh_end(aux); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 if (kh_exist(aux, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161 lib_aux_t *q = &kh_val(aux, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162 dump_best(&stack, out);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
163 fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
164 (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
165 kh_destroy(pos, q->best_hash);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
166 free((char*)kh_key(aux, k));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
167 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
168 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
169 kh_destroy(lib, aux);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
170
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
171 clear_del_set(del_set);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
172 kh_destroy(name, del_set);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
173 free(stack.a);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
174 bam_destroy1(b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
175 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
176
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
177 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
178
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
179 int bam_rmdup(int argc, char *argv[])
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
180 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
181 int c, is_se = 0, force_se = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
182 samfile_t *in, *out;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
183 while ((c = getopt(argc, argv, "sS")) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
184 switch (c) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
185 case 's': is_se = 1; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
186 case 'S': force_se = is_se = 1; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
187 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
188 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
189 if (optind + 2 > argc) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
190 fprintf(stderr, "\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
191 fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
192 fprintf(stderr, "Option: -s rmdup for SE reads\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
193 fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
194 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
195 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
196 in = samopen(argv[optind], "rb", 0);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
197 out = samopen(argv[optind+1], "wb", in->header);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
198 if (in == 0 || out == 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
199 fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
200 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
201 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
202 if (is_se) bam_rmdupse_core(in, out, force_se);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
203 else bam_rmdup_core(in, out);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
204 samclose(in); samclose(out);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206 }