0
|
1 #include <stdlib.h>
|
|
2 #include <string.h>
|
|
3 #include "sample.h"
|
|
4 #include "khash.h"
|
|
// Instantiate the khash map type "sm": C-string keys mapped to int values.
// Both tables in this file (rg2smid and sm2id) are created with kh_init(sm).
KHASH_MAP_INIT_STR(sm, int)
|
|
6
|
|
7 bam_sample_t *bam_smpl_init(void)
|
|
8 {
|
|
9 bam_sample_t *s;
|
|
10 s = calloc(1, sizeof(bam_sample_t));
|
|
11 s->rg2smid = kh_init(sm);
|
|
12 s->sm2id = kh_init(sm);
|
|
13 return s;
|
|
14 }
|
|
15
|
|
16 void bam_smpl_destroy(bam_sample_t *sm)
|
|
17 {
|
|
18 int i;
|
|
19 khint_t k;
|
|
20 khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
|
|
21 if (sm == 0) return;
|
|
22 for (i = 0; i < sm->n; ++i) free(sm->smpl[i]);
|
|
23 free(sm->smpl);
|
|
24 for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k)
|
|
25 if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k));
|
|
26 kh_destroy(sm, sm->rg2smid);
|
|
27 kh_destroy(sm, sm->sm2id);
|
|
28 free(sm);
|
|
29 }
|
|
30
|
|
31 static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val)
|
|
32 {
|
|
33 khint_t k_rg, k_sm;
|
|
34 int ret;
|
|
35 khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
|
|
36 k_rg = kh_get(sm, rg2smid, key);
|
|
37 if (k_rg != kh_end(rg2smid)) return; // duplicated @RG-ID
|
|
38 k_rg = kh_put(sm, rg2smid, strdup(key), &ret);
|
|
39 k_sm = kh_get(sm, sm2id, val);
|
|
40 if (k_sm == kh_end(sm2id)) { // absent
|
|
41 if (sm->n == sm->m) {
|
|
42 sm->m = sm->m? sm->m<<1 : 1;
|
|
43 sm->smpl = realloc(sm->smpl, sizeof(void*) * sm->m);
|
|
44 }
|
|
45 sm->smpl[sm->n] = strdup(val);
|
|
46 k_sm = kh_put(sm, sm2id, sm->smpl[sm->n], &ret);
|
|
47 kh_val(sm2id, k_sm) = sm->n++;
|
|
48 }
|
|
49 kh_val(rg2smid, k_rg) = kh_val(sm2id, k_sm);
|
|
50 }
|
|
51
|
|
52 int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
|
|
53 {
|
|
54 const char *p = txt, *q, *r;
|
|
55 kstring_t buf, first_sm;
|
|
56 int n = 0;
|
|
57 khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
|
|
58 if (txt == 0) {
|
|
59 add_pair(sm, sm2id, fn, fn);
|
|
60 return 0;
|
|
61 }
|
|
62 memset(&buf, 0, sizeof(kstring_t));
|
|
63 memset(&first_sm, 0, sizeof(kstring_t));
|
|
64 while ((q = strstr(p, "@RG")) != 0) {
|
|
65 p = q + 3;
|
|
66 r = q = 0;
|
|
67 if ((q = strstr(p, "\tID:")) != 0) q += 4;
|
|
68 if ((r = strstr(p, "\tSM:")) != 0) r += 4;
|
|
69 if (r && q) {
|
|
70 char *u, *v;
|
|
71 int oq, or;
|
|
72 for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
|
|
73 for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
|
|
74 oq = *u; or = *v; *u = *v = '\0';
|
|
75 buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
|
|
76 add_pair(sm, sm2id, buf.s, r);
|
|
77 if ( !first_sm.s )
|
|
78 kputs(r,&first_sm);
|
|
79 *u = oq; *v = or;
|
|
80 } else break;
|
|
81 p = q > r? q : r;
|
|
82 ++n;
|
|
83 }
|
|
84 if (n == 0) add_pair(sm, sm2id, fn, fn);
|
|
85 // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
|
|
86 // use the tag instead.
|
|
87 else if ( n==1 && first_sm.s )
|
|
88 add_pair(sm,sm2id,fn,first_sm.s);
|
|
89 if ( first_sm.s )
|
|
90 free(first_sm.s);
|
|
91
|
|
92 // add_pair(sm, sm2id, fn, fn);
|
|
93 free(buf.s);
|
|
94 return 0;
|
|
95 }
|
|
96
|
|
97 int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str)
|
|
98 {
|
|
99 khint_t k;
|
|
100 khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
|
|
101 if (rg) {
|
|
102 str->l = 0;
|
|
103 kputs(fn, str); kputc('/', str); kputs(rg, str);
|
|
104 k = kh_get(sm, rg2smid, str->s);
|
|
105 } else k = kh_get(sm, rg2smid, fn);
|
|
106 return k == kh_end(rg2smid)? -1 : kh_val(rg2smid, k);
|
|
107 }
|