annotate PsiCLASS-1.0.2/samtools-0.1.19/bamshuf.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 #include <unistd.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #include <string.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 #include <assert.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 #include "sam.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7 #include "ksort.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9 #define DEF_CLEVEL 1
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 static inline unsigned hash_Wang(unsigned key)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13 key += ~(key << 15);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 key ^= (key >> 10);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15 key += (key << 3);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 key ^= (key >> 6);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 key += ~(key << 11);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 key ^= (key >> 16);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 return key;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 static inline unsigned hash_X31_Wang(const char *s)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 unsigned h = *s;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25 if (h) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26 for (++s ; *s; ++s) h = (h << 5) - h + *s;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27 return hash_Wang(h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 } else return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 typedef struct {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 unsigned key;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33 bam1_t *b;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34 } elem_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 static inline int elem_lt(elem_t x, elem_t y)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 if (x.key < y.key) return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 if (x.key == y.key) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 int t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41 t = strcmp(bam_get_qname(x.b), bam_get_qname(y.b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 if (t < 0) return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 return (t == 0 && ((x.b->core.flag>>6&3) < (y.b->core.flag>>6&3)));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 } else return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47 KSORT_INIT(bamshuf, elem_t, elem_lt)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 static void bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 BGZF *fp, *fpw, **fpt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 char **fnt, modew[8];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 bam1_t *b;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54 int i, l;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 bam_hdr_t *h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 int64_t *cnt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 // split
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60 assert(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61 h = bam_hdr_read(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 fnt = (char**)calloc(n_files, sizeof(void*));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 fpt = (BGZF**)calloc(n_files, sizeof(void*));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 cnt = (int64_t*)calloc(n_files, 8);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 l = strlen(pre);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 for (i = 0; i < n_files; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 fnt[i] = (char*)calloc(l + 10, 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68 sprintf(fnt[i], "%s.%.4d.bam", pre, i);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 fpt[i] = bgzf_open(fnt[i], "w1");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 bam_hdr_write(fpt[i], h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 b = bam_init1();
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73 while (bam_read1(fp, b) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 uint32_t x;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 x = hash_X31_Wang(bam_get_qname(b)) % n_files;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 bam_write1(fpt[x], b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 ++cnt[x];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 bam_destroy1(b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81 free(fpt);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 bgzf_close(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 // merge
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85 if (!is_stdout) { // output to a file
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 char *fnw = (char*)calloc(l + 5, 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 sprintf(fnw, "%s.bam", pre);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 fpw = bgzf_open(fnw, modew);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 free(fnw);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90 } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 bam_hdr_write(fpw, h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 bam_hdr_destroy(h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 for (i = 0; i < n_files; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 int64_t j, c = cnt[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 elem_t *a;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 fp = bgzf_open(fnt[i], "r");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 bam_hdr_destroy(bam_hdr_read(fp));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98 a = (elem_t*)calloc(c, sizeof(elem_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 for (j = 0; j < c; ++j) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 a[j].b = bam_init1();
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 assert(bam_read1(fp, a[j].b) >= 0);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 bgzf_close(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 unlink(fnt[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 free(fnt[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107 ks_introsort(bamshuf, c, a);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 for (j = 0; j < c; ++j) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 bam_write1(fpw, a[j].b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 bam_destroy1(a[j].b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 free(a);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 bgzf_close(fpw);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 free(fnt); free(cnt);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 int main_bamshuf(int argc, char *argv[])
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121 while ((c = getopt(argc, argv, "n:l:uO")) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 switch (c) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 case 'n': n_files = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 case 'l': clevel = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 case 'u': is_un = 1; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 case 'O': is_stdout = 1; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 if (is_un) clevel = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 if (optind + 2 > argc) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 fprintf(stderr, "\nUsage: bamshuf [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 fprintf(stderr, "Options: -O output to stdout\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 fprintf(stderr, " -u uncompressed BAM output\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 fprintf(stderr, " -l INT compression level [%d]\n", DEF_CLEVEL);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 fprintf(stderr, " -n INT number of temporary files [%d]\n", n_files);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 fprintf(stderr, "\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139 bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141 }