annotate PsiCLASS-1.0.2/samtools-0.1.19/phase.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 #include <unistd.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #include <stdint.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 #include <math.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 #include <zlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7 #include "bam.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 #include "errmod.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10 #include "kseq.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 KSTREAM_INIT(gzFile, gzread, 16384)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13 #define MAX_VARS 256
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 #define FLIP_PENALTY 2
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15 #define FLIP_THRES 4
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 #define MASK_THRES 3
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 #define FLAG_FIX_CHIMERA 0x1
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 #define FLAG_LIST_EXCL 0x4
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 #define FLAG_DROP_AMBI 0x8
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 typedef struct {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 // configurations, initialized in the main function
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 int flag, k, min_baseQ, min_varLOD, max_depth;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25 // other global variables
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26 int vpos_shift;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27 bamFile fp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 char *pre;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 bamFile out[3];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 // alignment queue
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 int n, m;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 bam1_t **b;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33 } phaseg_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35 typedef struct {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation!
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37 int vpos, beg, end;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 uint32_t in:16, out:16; // in-phase and out-phase
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 } frag_t, *frag_p;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 #define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 #include "khash.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 KHASH_SET_INIT_INT64(set64)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46 KHASH_MAP_INIT_INT64(64, frag_t)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48 typedef khash_t(64) nseq_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 #include "ksort.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 KSORT_INIT(rseq, frag_p, rseq_lt)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 static inline uint64_t X31_hash_string(const char *s)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57 uint64_t h = *s;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 return h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 static void count1(int l, const uint8_t *seq, int *cnt)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 int i, j, n_ambi;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 uint32_t z, x;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68 if (seq[i] == 0) ++n_ambi;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 if (l - n_ambi <= 1) return; // only one SNP
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 for (x = 0; x < 1u<<n_ambi; ++x) { // count
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 for (i = j = 0, z = 0; i < l; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 int c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73 if (seq[i]) c = seq[i] - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 else {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 c = x>>j&1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 ++j;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 z = z<<1 | c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 ++cnt[z];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 static int **count_all(int l, int vpos, nseq_t *hash)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 int i, j, **cnt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 uint8_t *seq;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 seq = calloc(l, 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90 cnt = calloc(vpos, sizeof(void*));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 for (k = 0; k < kh_end(hash); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 if (kh_exist(hash, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 frag_t *f = &kh_val(hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 if (f->vpos >= vpos || f->single) continue; // out of region; or singleton
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 f->single = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 for (j = 1; j < f->vlen; ++j) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 for (i = 0; i < l; ++i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 count1(l, seq, cnt[f->vpos + j]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107 free(seq);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 return cnt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 // phasing
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 static int8_t *dynaprog(int l, int vpos, int **w)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 int *f[2], *curr, *prev, max, i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 int8_t **b, *h = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 f[0] = calloc(z, sizeof(int));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 f[1] = calloc(z, sizeof(int));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 b = calloc(vpos, sizeof(void*));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 prev = f[0]; curr = f[1];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121 // fill the backtrack matrix
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 for (i = 0; i < vpos; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 int *wi = w[i], *tmp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 int8_t *bi;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 bi = b[i] = calloc(z, 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 /* In the following, x is the current state, which is the
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 * lexicographically smaller local haplotype. xc is the complement of
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 * x, or the larger local haplotype; y0 and y1 are the two predecessors
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 * of x. */
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 for (x = 0; x < z; ++x) { // x0 is the smaller
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 uint32_t y0, y1, xc;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 int c0, c1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 xc = ~x&mask; y0 = x>>1; y1 = xc>>1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 c0 = prev[y0] + wi[x] + wi[xc];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 c1 = prev[y1] + wi[x] + wi[xc];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 if (c0 > c1) bi[x] = 0, curr[x] = c0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 else bi[x] = 1, curr[x] = c1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139 tmp = prev; prev = curr; curr = tmp; // swap
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141 { // backtrack
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 uint32_t max_x = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143 int which = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 h = calloc(vpos, 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145 for (x = 0, max = 0, max_x = 0; x < z; ++x)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146 if (prev[x] > max) max = prev[x], max_x = x;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 for (i = vpos - 1, x = max_x; i >= 0; --i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148 h[i] = which? (~x&1) : (x&1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149 which = b[i][x]? !which : which;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 x = b[i][x]? (~x&mask)>>1 : x>>1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 // free
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154 for (i = 0; i < vpos; ++i) free(b[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155 free(f[0]); free(f[1]); free(b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 return h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159 // phase each fragment
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
163 uint64_t *pcnt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
164 uint32_t *left, *rght, max;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
165 left = rght = 0; max = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
166 pcnt = calloc(vpos, 8);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
167 for (k = 0; k < kh_end(hash); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
168 if (kh_exist(hash, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
169 int i, c[2];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
170 frag_t *f = &kh_val(hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
171 if (f->vpos >= vpos) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
172 // get the phase
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
173 c[0] = c[1] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
174 for (i = 0; i < f->vlen; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
175 if (f->seq[i] == 0) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
176 ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
177 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
178 f->phase = c[0] > c[1]? 0 : 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
179 f->in = c[f->phase]; f->out = c[1 - f->phase];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
180 f->phased = f->in == f->out? 0 : 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
181 f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
182 // fix chimera
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
183 f->flip = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
184 if (flip && c[0] >= 3 && c[1] >= 3) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
185 int sum[2], m, mi, md;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
186 if (f->vlen > max) { // enlarge the array
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
187 max = f->vlen;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
188 kroundup32(max);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
189 left = realloc(left, max * 4);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
190 rght = realloc(rght, max * 4);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
191 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
192 for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
193 if (f->seq[i]) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
194 int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
195 ++sum[c == path[f->vpos + i]? 0 : 1];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
196 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
197 left[i] = sum[1]<<16 | sum[0];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
198 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
199 for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
200 if (f->seq[i]) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
201 int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
202 ++sum[c == path[f->vpos + i]? 0 : 1];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
203 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
204 rght[i] = sum[1]<<16 | sum[0];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206 // find the best flip point
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
207 for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
208 int a[2];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
209 a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
210 a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
211 if (a[0] > a[1]) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
212 if (a[0] > m) m = a[0], md = 0, mi = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
213 } else {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
214 if (a[1] > m) m = a[1], md = 1, mi = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
215 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
216 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
217 if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
218 f->flip = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
219 if (md == 0) { // flip the tail
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
220 for (i = mi + 1; i < f->vlen; ++i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
221 if (f->seq[i] == 1) f->seq[i] = 2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
222 else if (f->seq[i] == 2) f->seq[i] = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
223 } else { // flip the head
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
224 for (i = 0; i <= mi; ++i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
225 if (f->seq[i] == 1) f->seq[i] = 2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
226 else if (f->seq[i] == 2) f->seq[i] = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
227 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
228 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
229 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
230 // update pcnt[]
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
231 if (!f->single) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
232 for (i = 0; i < f->vlen; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
233 int c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
234 if (f->seq[i] == 0) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
235 c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
236 if (c == path[f->vpos + i]) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
237 if (f->phase == 0) ++pcnt[f->vpos + i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
238 else pcnt[f->vpos + i] += 1ull<<32;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
239 } else {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
240 if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
241 else pcnt[f->vpos + i] += 1ull<<48;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
242 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
243 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
244 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
245 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
246 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
247 free(left); free(rght);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
248 return pcnt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
249 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
250
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
251 static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
252 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
253 int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
254 uint64_t *list = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
255 for (i = 0; i < vpos; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
256 uint64_t x = pcnt[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
257 int c[4], pre = score, s;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
258 c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
259 s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
260 if (c[3] > c[2]) s += c[3] - c[2];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
261 if (c[1] > c[0]) s += c[1] - c[0];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
262 score += s;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
263 if (score < 0) score = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
264 if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
265 if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
266 if (n == m) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
267 m = m? m<<1 : 4;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
268 list = realloc(list, m * 8);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
269 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
270 list[n++] = (uint64_t)beg<<32 | max_i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
271 i = max_i; // reset i to max_i
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
272 score = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
273 } else if (score > max) max = score, max_i = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
274 if (score == 0) max = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
275 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
276 *_n = n;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
277 return list;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
278 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
279
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
280 // trim heading and tailing ambiguous bases; mark deleted and remove sequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
281 static int clean_seqs(int vpos, nseq_t *hash)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
282 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
283 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
284 int ret = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
285 for (k = 0; k < kh_end(hash); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
286 if (kh_exist(hash, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
287 frag_t *f = &kh_val(hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
288 int beg, end, i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
289 if (f->vpos >= vpos) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
290 ret = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
291 continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
292 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
293 for (i = 0; i < f->vlen; ++i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
294 if (f->seq[i] != 0) break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
295 beg = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
296 for (i = f->vlen - 1; i >= 0; --i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
297 if (f->seq[i] != 0) break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
298 end = i + 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
299 if (end - beg <= 0) kh_del(64, hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
300 else {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
301 if (beg != 0) memmove(f->seq, f->seq + beg, end - beg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
302 f->vpos += beg; f->vlen = end - beg;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
303 f->single = f->vlen == 1? 1 : 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
304 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
305 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
306 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
307 return ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
308 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
309
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
310 static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
311 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
312 int i, is_flip, drop_ambi;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
313 drop_ambi = g->flag & FLAG_DROP_AMBI;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
314 is_flip = (drand48() < 0.5);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
315 for (i = 0; i < g->n; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
316 int end, which;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
317 uint64_t key;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
318 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
319 bam1_t *b = g->b[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
320 key = X31_hash_string(bam1_qname(b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
321 end = bam_calend(&b->core, bam1_cigar(b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
322 if (end > min_pos) break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
323 k = kh_get(64, hash, key);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
324 if (k == kh_end(hash)) which = 3;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
325 else {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
326 frag_t *f = &kh_val(hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
327 if (f->ambig) which = drop_ambi? 2 : 3;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
328 else if (f->phased && f->flip) which = 2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
329 else if (f->phased == 0) which = 3;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
330 else { // phased and not flipped
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
331 char c = 'Y';
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
332 which = f->phase;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
333 bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
334 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
335 if (which < 2 && is_flip) which = 1 - which; // increase the randomness
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
336 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
337 if (which == 3) which = (drand48() < 0.5);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
338 bam_write1(g->out[which], b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
339 bam_destroy1(b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
340 g->b[i] = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
341 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
342 memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
343 g->n -= i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
344 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
345
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
346 static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
347 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
348 int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
349 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
350 frag_t **seqs;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
351 int8_t *path, *sitemask;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
352 uint64_t *pcnt, *regmask;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
353
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
354 if (vpos == 0) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
355 i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
356 min_pos = i? cns[vpos]>>32 : 0x7fffffff;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
357 if (vpos == 1) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
358 printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
359 printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
360 "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
361 for (k = 0; k < kh_end(hash); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
362 if (kh_exist(hash, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
363 frag_t *f = &kh_val(hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
364 if (f->vpos) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
365 f->flip = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
366 if (f->seq[0] == 0) f->phased = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
367 else f->phased = 1, f->phase = f->seq[0] - 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
368 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
369 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
370 dump_aln(g, min_pos, hash);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
371 ++g->vpos_shift;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
372 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
373 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
374 { // phase
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
375 int **cnt;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
376 uint64_t *mask;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
377 printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
378 sitemask = calloc(vpos, 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
379 cnt = count_all(g->k, vpos, hash);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
380 path = dynaprog(g->k, vpos, cnt);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
381 for (i = 0; i < vpos; ++i) free(cnt[i]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
382 free(cnt);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
383 pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
384 mask = genmask(vpos, pcnt, &n_masked);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
385 regmask = calloc(n_masked, 8);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
386 for (i = 0; i < n_masked; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
387 regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
388 for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
389 sitemask[j] = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
390 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
391 free(mask);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
392 if (g->flag & FLAG_FIX_CHIMERA) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
393 free(pcnt);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
394 pcnt = fragphase(vpos, path, hash, 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
395 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
396 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
397 for (i = 0; i < n_masked; ++i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
398 printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
399 for (i = 0; i < vpos; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
400 uint64_t x = pcnt[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
401 int8_t c[2];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
402 c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
403 c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
404 printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
405 i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
406 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
407 free(path); free(pcnt); free(regmask); free(sitemask);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
408 seqs = calloc(n_seqs, sizeof(void*));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
409 for (k = 0, i = 0; k < kh_end(hash); ++k)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
410 if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
411 seqs[i++] = &kh_val(hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
412 n_seqs = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
413 ks_introsort_rseq(n_seqs, seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
414 for (i = 0; i < n_seqs; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
415 frag_t *f = seqs[i];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
416 printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
417 for (j = 0; j < f->vlen; ++j) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
418 uint32_t c = cns[f->vpos + j];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
419 if (f->seq[j] == 0) putchar('N');
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
420 else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
421 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
422 printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
423 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
424 free(seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
425 printf("//\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
426 fflush(stdout);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
427 g->vpos_shift += vpos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
428 dump_aln(g, min_pos, hash);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
429 return vpos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
430 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
431
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
432 static void update_vpos(int vpos, nseq_t *hash)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
433 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
434 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
435 for (k = 0; k < kh_end(hash); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
436 if (kh_exist(hash, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
437 frag_t *f = &kh_val(hash, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
438 if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
439 else f->vpos -= vpos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
440 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
441 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
442 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
443
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
444 static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
445 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
446 return hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
447 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
448
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
449 static int readaln(void *data, bam1_t *b)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
450 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
451 phaseg_t *g = (phaseg_t*)data;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
452 int ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
453 ret = bam_read1(g->fp, b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
454 if (ret < 0) return ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
455 if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
456 if (g->n == g->m) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
457 g->m = g->m? g->m<<1 : 16;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
458 g->b = realloc(g->b, g->m * sizeof(void*));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
459 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
460 g->b[g->n++] = bam_dup1(b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
461 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
462 return ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
463 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
464
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
465 static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
466 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
467 gzFile fp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
468 kstream_t *ks;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
469 int ret, dret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
470 kstring_t *str;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
471 khash_t(set64) *hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
472
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
473 hash = kh_init(set64);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
474 str = calloc(1, sizeof(kstring_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
475 fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
476 ks = ks_init(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
477 while (ks_getuntil(ks, 0, str, &dret) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
478 int tid = bam_get_tid(h, str->s);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
479 if (tid >= 0 && dret != '\n') {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
480 if (ks_getuntil(ks, 0, str, &dret) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
481 uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
482 kh_put(set64, hash, x, &ret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
483 } else break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
484 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
485 if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
486 if (dret < 0) break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
487 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
488 ks_destroy(ks);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
489 gzclose(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
490 free(str->s); free(str);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
491 return hash;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
492 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
493
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
494 static int gl2cns(float q[16])
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
495 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
496 int i, j, min_ij;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
497 float min, min2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
498 min = min2 = 1e30; min_ij = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
499 for (i = 0; i < 4; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
500 for (j = i; j < 4; ++j) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
501 if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
502 else if (q[i<<2|j] < min2) min2 = q[i<<2|j];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
503 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
504 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
505 return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
506 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
507
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
508 int main_phase(int argc, char *argv[])
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
509 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
510 extern void bam_init_header_hash(bam_header_t *header);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
511 int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
512 const bam_pileup1_t *plp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
513 bam_plp_t iter;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
514 bam_header_t *h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
515 nseq_t *seqs;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
516 uint64_t *cns = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
517 phaseg_t g;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
518 char *fn_list = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
519 khash_t(set64) *set = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
520 errmod_t *em;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
521 uint16_t *bases;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
522
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
523 memset(&g, 0, sizeof(phaseg_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
524 g.flag = FLAG_FIX_CHIMERA;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
525 g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
526 while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
527 switch (c) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
528 case 'D': g.max_depth = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
529 case 'q': g.min_varLOD = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
530 case 'Q': g.min_baseQ = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
531 case 'k': g.k = atoi(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
532 case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
533 case 'e': g.flag |= FLAG_LIST_EXCL; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
534 case 'A': g.flag |= FLAG_DROP_AMBI; break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
535 case 'b': g.pre = strdup(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
536 case 'l': fn_list = strdup(optarg); break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
537 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
538 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
539 if (argc == optind) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
540 fprintf(stderr, "\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
541 fprintf(stderr, "Usage: samtools phase [options] <in.bam>\n\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
542 fprintf(stderr, "Options: -k INT block length [%d]\n", g.k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
543 fprintf(stderr, " -b STR prefix of BAMs to output [null]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
544 fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
545 fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
546 fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
547 // fprintf(stderr, " -l FILE list of sites to phase [null]\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
548 fprintf(stderr, " -F do not attempt to fix chimeras\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
549 fprintf(stderr, " -A drop reads with ambiguous phase\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
550 // fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
551 fprintf(stderr, "\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
552 return 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
553 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
554 g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
555 h = bam_header_read(g.fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
556 if (fn_list) { // read the list of sites to phase
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
557 bam_init_header_hash(h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
558 set = loadpos(fn_list, h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
559 free(fn_list);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
560 } else g.flag &= ~FLAG_LIST_EXCL;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
561 if (g.pre) { // open BAMs to write
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
562 char *s = malloc(strlen(g.pre) + 20);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
563 strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
564 strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
565 strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
566 for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
567 free(s);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
568 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
569
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
570 iter = bam_plp_init(readaln, &g);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
571 g.vpos_shift = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
572 seqs = kh_init(64);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
573 em = errmod_init(1. - 0.83);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
574 bases = calloc(g.max_depth, 2);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
575 printf("CC\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
576 printf("CC\tDescriptions:\nCC\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
577 printf("CC\t CC comments\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
578 printf("CC\t PS start of a phase set\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
579 printf("CC\t FL filtered region\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
580 printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
581 printf("CC\t EV supporting reads; SAM format\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
582 printf("CC\t // end of a phase set\nCC\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
583 printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
584 printf("CC\t PS chr phaseSetStart phaseSetEnd\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
585 printf("CC\t FL chr filterStart filterEnd\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
586 printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
587 printf("CC\nCC\n");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
588 fflush(stdout);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
589 while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
590 int i, k, c, tmp, dophase = 1, in_set = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
591 float q[16];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
592 if (tid < 0) break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
593 if (tid != lasttid) { // change of chromosome
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
594 g.vpos_shift = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
595 if (lasttid >= 0) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
596 seqs = shrink_hash(seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
597 phase(&g, h->target_name[lasttid], vpos, cns, seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
598 update_vpos(0x7fffffff, seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
599 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
600 lasttid = tid;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
601 vpos = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
602 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
603 if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
604 if (n > g.max_depth) continue; // do not proceed if the depth is too high
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
605 // fill the bases array and check if there is a variant
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
606 for (i = k = 0; i < n; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
607 const bam_pileup1_t *p = plp + i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
608 uint8_t *seq;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
609 int q, baseQ, b;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
610 if (p->is_del || p->is_refskip) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
611 baseQ = bam1_qual(p->b)[p->qpos];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
612 if (baseQ < g.min_baseQ) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
613 seq = bam1_seq(p->b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
614 b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
615 if (b > 3) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
616 q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
617 if (q < 4) q = 4;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
618 if (q > 63) q = 63;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
619 bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
620 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
621 if (k == 0) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
622 errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
623 c = gl2cns(q); // get the consensus
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
624 // tell if to proceed
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
625 if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
626 if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
627 // add the variant
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
628 if (vpos == max_vpos) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
629 max_vpos = max_vpos? max_vpos<<1 : 128;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
630 cns = realloc(cns, max_vpos * 8);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
631 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
632 cns[vpos] = (uint64_t)pos<<32 | c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
633 for (i = 0; i < n; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
634 const bam_pileup1_t *p = plp + i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
635 uint64_t key;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
636 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
637 uint8_t *seq = bam1_seq(p->b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
638 frag_t *f;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
639 if (p->is_del || p->is_refskip) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
640 if (p->b->core.qual == 0) continue;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
641 // get the base code
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
642 c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
643 if (c == (cns[vpos]&3)) c = 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
644 else if (c == (cns[vpos]>>16&3)) c = 2;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
645 else c = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
646 // write to seqs
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
647 key = X31_hash_string(bam1_qname(p->b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
648 k = kh_put(64, seqs, key, &tmp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
649 f = &kh_val(seqs, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
650 if (tmp == 0) { // present in the hash table
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
651 if (vpos - f->vpos + 1 < MAX_VARS) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
652 f->vlen = vpos - f->vpos + 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
653 f->seq[f->vlen-1] = c;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
654 f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
655 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
656 dophase = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
657 } else { // absent
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
658 memset(f->seq, 0, MAX_VARS);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
659 f->beg = p->b->core.pos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
660 f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
661 f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
662 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
663 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
664 if (dophase) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
665 seqs = shrink_hash(seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
666 phase(&g, h->target_name[tid], vpos, cns, seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
667 update_vpos(vpos, seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
668 cns[0] = cns[vpos];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
669 vpos = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
670 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
671 ++vpos;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
672 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
673 if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
674 bam_header_destroy(h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
675 bam_plp_destroy(iter);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
676 bam_close(g.fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
677 kh_destroy(64, seqs);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
678 kh_destroy(set64, set);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
679 free(cns);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
680 errmod_destroy(em);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
681 free(bases);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
682 if (g.pre) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
683 for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
684 free(g.pre); free(g.b);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
685 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
686 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
687 }