annotate SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/samtools-0.1.6/bam_rmdup.c @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
1 #include <stdlib.h>
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
2 #include <string.h>
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
3 #include <stdio.h>
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
4 #include <zlib.h>
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
5 #include "bam.h"
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
6
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
7 typedef bam1_t *bam1_p;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
8 #include "khash.h"
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
9 KHASH_SET_INIT_STR(name)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
10 KHASH_MAP_INIT_INT64(pos, bam1_p)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
11
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
12 #define BUFFER_SIZE 0x40000
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
13
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
14 typedef struct {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
15 int n, max;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
16 bam1_t **a;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
17 } tmp_stack_t;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
18
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
19 static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
20 {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
21 if (stack->n == stack->max) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
22 stack->max = stack->max? stack->max<<1 : 0x10000;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
23 stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
24 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
25 stack->a[stack->n++] = b;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
26 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
27
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
28 static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
29 {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
30 int i;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
31 for (i = 0; i != stack->n; ++i) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
32 bam_write1(out, stack->a[i]);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
33 bam_destroy1(stack->a[i]);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
34 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
35 stack->n = 0;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
36 if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
37 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
38
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
39 static void clear_del_set(khash_t(name) *del_set)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
40 {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
41 khint_t k;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
42 for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
43 if (kh_exist(del_set, k))
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
44 free((char*)kh_key(del_set, k));
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
45 kh_clear(name, del_set);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
46 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
47
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
48 void bam_rmdup_core(bamFile in, bamFile out)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
49 {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
50 bam_header_t *header;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
51 bam1_t *b;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
52 int last_tid = -1, last_pos = -1;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
53 uint64_t n_checked = 0, n_removed = 0;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
54 tmp_stack_t stack;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
55 khint_t k;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
56 khash_t(pos) *best_hash;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
57 khash_t(name) *del_set;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
58
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
59 best_hash = kh_init(pos);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
60 del_set = kh_init(name);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
61 b = bam_init1();
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
62 memset(&stack, 0, sizeof(tmp_stack_t));
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
63 header = bam_header_read(in);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
64 bam_header_write(out, header);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
65
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
66 kh_resize(name, del_set, 4 * BUFFER_SIZE);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
67 kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
68 while (bam_read1(in, b) >= 0) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
69 bam1_core_t *c = &b->core;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
70 if (c->tid != last_tid || last_pos != c->pos) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
71 dump_best(&stack, best_hash, out); // write the result
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
72 if (c->tid != last_tid) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
73 kh_clear(pos, best_hash);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
74 if (kh_size(del_set)) { // check
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
75 fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
76 clear_del_set(del_set);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
77 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
78 if ((int)c->tid == -1) { // append unmapped reads
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
79 bam_write1(out, b);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
80 while (bam_read1(in, b) >= 0) bam_write1(out, b);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
81 break;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
82 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
83 last_tid = c->tid;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
84 fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
85 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
86 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
87 if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
88 bam_write1(out, b);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
89 } else if (c->isize > 0) { // paired, head
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
90 uint64_t key = (uint64_t)c->pos<<32 | c->isize;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
91 int ret;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
92 ++n_checked;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
93 k = kh_put(pos, best_hash, key, &ret);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
94 if (ret == 0) { // found in best_hash
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
95 bam1_t *p = kh_val(best_hash, k);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
96 ++n_removed;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
97 if (p->core.qual < c->qual) { // the current alignment is better
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
98 kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
99 bam_copy1(p, b); // replaced as b
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
100 } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
101 if (ret == 0)
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
102 fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
103 } else { // not found in best_hash
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
104 kh_val(best_hash, k) = bam_dup1(b);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
105 stack_insert(&stack, kh_val(best_hash, k));
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
106 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
107 } else { // paired, tail
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
108 k = kh_get(name, del_set, bam1_qname(b));
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
109 if (k != kh_end(del_set)) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
110 free((char*)kh_key(del_set, k));
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
111 kh_del(name, del_set, k);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
112 } else bam_write1(out, b);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
113 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
114 last_pos = c->pos;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
115 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
116 dump_best(&stack, best_hash, out);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
117
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
118 bam_header_destroy(header);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
119 clear_del_set(del_set);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
120 kh_destroy(name, del_set);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
121 kh_destroy(pos, best_hash);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
122 free(stack.a);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
123 bam_destroy1(b);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
124 fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
125 (double)n_removed/n_checked);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
126 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
127 int bam_rmdup(int argc, char *argv[])
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
128 {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
129 bamFile in, out;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
130 if (argc < 3) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
131 fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n\n");
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
132 fprintf(stderr, "Note: Picard is recommended for this task.\n");
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
133 return 1;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
134 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
135 in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
136 out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
137 if (in == 0 || out == 0) {
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
138 fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
139 return 1;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
140 }
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
141 bam_rmdup_core(in, out);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
142 bam_close(in);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
143 bam_close(out);
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
144 return 0;
74f5ea818cea Uploaded
ryanmorin
parents:
diff changeset
145 }