Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/cut_target.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <unistd.h> | |
2 #include <stdlib.h> | |
3 #include <string.h> | |
4 #include "bam.h" | |
5 #include "errmod.h" | |
6 #include "faidx.h" | |
7 | |
8 #define ERR_DEP 0.83f | |
9 | |
10 typedef struct { | |
11 int e[2][3], p[2][2]; | |
12 } score_param_t; | |
13 | |
14 /* Note that although the two matrics have 10 parameters in total, only 4 | |
15 * (probably 3) are free. Changing the scoring matrices in a sort of symmetric | |
16 * way will not change the result. */ | |
17 static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} }; | |
18 | |
19 typedef struct { | |
20 int min_baseQ, tid, max_bases; | |
21 uint16_t *bases; | |
22 bamFile fp; | |
23 bam_header_t *h; | |
24 char *ref; | |
25 faidx_t *fai; | |
26 errmod_t *em; | |
27 } ct_t; | |
28 | |
29 static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) | |
30 { | |
31 int i, j, ret, tmp, k, sum[4], qual; | |
32 float q[16]; | |
33 if (n > g->max_bases) { // enlarge g->bases | |
34 g->max_bases = n; | |
35 kroundup32(g->max_bases); | |
36 g->bases = realloc(g->bases, g->max_bases * 2); | |
37 } | |
38 for (i = k = 0; i < n; ++i) { | |
39 const bam_pileup1_t *p = plp + i; | |
40 uint8_t *seq; | |
41 int q, baseQ, b; | |
42 if (p->is_refskip || p->is_del) continue; | |
43 baseQ = bam1_qual(p->b)[p->qpos]; | |
44 if (baseQ < g->min_baseQ) continue; | |
45 seq = bam1_seq(p->b); | |
46 b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; | |
47 if (b > 3) continue; | |
48 q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; | |
49 if (q < 4) q = 4; | |
50 if (q > 63) q = 63; | |
51 g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b; | |
52 } | |
53 if (k == 0) return 0; | |
54 errmod_cal(g->em, k, 4, g->bases, q); | |
55 for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i; | |
56 for (i = 1; i < 4; ++i) // insertion sort | |
57 for (j = i; j > 0 && sum[j] < sum[j-1]; --j) | |
58 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp; | |
59 qual = (sum[1]>>2) - (sum[0]>>2); | |
60 k = k < 256? k : 255; | |
61 ret = (qual < 63? qual : 63) << 2 | (sum[0]&3); | |
62 return ret<<8|k; | |
63 } | |
64 | |
65 static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns) | |
66 { | |
67 int i, f[2][2], *prev, *curr, *swap_tmp, s; | |
68 uint8_t *b; // backtrack array | |
69 b = calloc(l, 1); | |
70 f[0][0] = f[0][1] = 0; | |
71 prev = f[0]; curr = f[1]; | |
72 // fill the backtrack matrix | |
73 for (i = 0; i < l; ++i) { | |
74 int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2; | |
75 int tmp0, tmp1; | |
76 // compute f[0] | |
77 tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0) | |
78 tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1) | |
79 if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0; | |
80 else curr[0] = tmp1, b[i] = 1; | |
81 // compute f[1] | |
82 tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0) | |
83 tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1) | |
84 if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1; | |
85 else curr[1] = tmp1, b[i] |= 1<<1; | |
86 // swap | |
87 swap_tmp = prev; prev = curr; curr = swap_tmp; | |
88 } | |
89 // backtrack | |
90 s = prev[0] > prev[1]? 0 : 1; | |
91 for (i = l - 1; i > 0; --i) { | |
92 b[i] |= s<<2; | |
93 s = b[i]>>s&1; | |
94 } | |
95 // print | |
96 for (i = 0, s = -1; i <= l; ++i) { | |
97 if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { | |
98 if (s >= 0) { | |
99 int j; | |
100 printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); | |
101 for (j = s; j < i; ++j) { | |
102 int c = cns[j]>>8; | |
103 if (c == 0) putchar('N'); | |
104 else putchar("ACGT"[c&3]); | |
105 } | |
106 putchar('\t'); | |
107 for (j = s; j < i; ++j) | |
108 putchar(33 + (cns[j]>>8>>2)); | |
109 putchar('\n'); | |
110 } | |
111 //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); | |
112 s = -1; | |
113 } else if ((b[i]>>2&3) && s < 0) s = i; | |
114 } | |
115 free(b); | |
116 } | |
117 | |
118 static int read_aln(void *data, bam1_t *b) | |
119 { | |
120 extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag); | |
121 ct_t *g = (ct_t*)data; | |
122 int ret, len; | |
123 ret = bam_read1(g->fp, b); | |
124 if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) { | |
125 if (b->core.tid != g->tid) { // then load the sequence | |
126 free(g->ref); | |
127 g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); | |
128 g->tid = b->core.tid; | |
129 } | |
130 bam_prob_realn_core(b, g->ref, 1<<1|1); | |
131 } | |
132 return ret; | |
133 } | |
134 | |
135 int main_cut_target(int argc, char *argv[]) | |
136 { | |
137 int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l; | |
138 const bam_pileup1_t *p; | |
139 bam_plp_t plp; | |
140 uint16_t *cns; | |
141 ct_t g; | |
142 | |
143 memset(&g, 0, sizeof(ct_t)); | |
144 g.min_baseQ = 13; g.tid = -1; | |
145 while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) { | |
146 switch (c) { | |
147 case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff | |
148 case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY | |
149 case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE | |
150 case '1': g_param.e[1][1] = atoi(optarg); break; | |
151 case '2': g_param.e[1][2] = atoi(optarg); break; | |
152 case 'f': g.fai = fai_load(optarg); | |
153 if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__); | |
154 break; | |
155 } | |
156 } | |
157 if (argc == optind) { | |
158 fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n"); | |
159 return 1; | |
160 } | |
161 l = max_l = 0; cns = 0; | |
162 g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); | |
163 g.h = bam_header_read(g.fp); | |
164 g.em = errmod_init(1 - ERR_DEP); | |
165 plp = bam_plp_init(read_aln, &g); | |
166 while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) { | |
167 if (tid < 0) break; | |
168 if (tid != lasttid) { // change of chromosome | |
169 if (cns) process_cns(g.h, lasttid, l, cns); | |
170 if (max_l < g.h->target_len[tid]) { | |
171 max_l = g.h->target_len[tid]; | |
172 kroundup32(max_l); | |
173 cns = realloc(cns, max_l * 2); | |
174 } | |
175 l = g.h->target_len[tid]; | |
176 memset(cns, 0, max_l * 2); | |
177 lasttid = tid; | |
178 } | |
179 cns[pos] = gencns(&g, n, p); | |
180 lastpos = pos; | |
181 } | |
182 process_cns(g.h, lasttid, l, cns); | |
183 free(cns); | |
184 bam_header_destroy(g.h); | |
185 bam_plp_destroy(plp); | |
186 bam_close(g.fp); | |
187 if (g.fai) { | |
188 fai_destroy(g.fai); free(g.ref); | |
189 } | |
190 errmod_destroy(g.em); | |
191 free(g.bases); | |
192 return 0; | |
193 } |