Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/em.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdlib.h> | |
2 #include <string.h> | |
3 #include <math.h> | |
4 #include "bcf.h" | |
5 #include "kmin.h" | |
6 | |
7 static double g_q2p[256]; | |
8 | |
9 #define ITER_MAX 50 | |
10 #define ITER_TRY 10 | |
11 #define EPS 1e-5 | |
12 | |
13 extern double kf_gammaq(double, double); | |
14 | |
15 /* | |
16 Generic routines | |
17 */ | |
18 // get the 3 genotype likelihoods | |
19 static double *get_pdg3(const bcf1_t *b) | |
20 { | |
21 double *pdg; | |
22 const uint8_t *PL = 0; | |
23 int i, PL_len = 0; | |
24 // initialize g_q2p if necessary | |
25 if (g_q2p[0] == 0.) | |
26 for (i = 0; i < 256; ++i) | |
27 g_q2p[i] = pow(10., -i / 10.); | |
28 // set PL and PL_len | |
29 for (i = 0; i < b->n_gi; ++i) { | |
30 if (b->gi[i].fmt == bcf_str2int("PL", 2)) { | |
31 PL = (const uint8_t*)b->gi[i].data; | |
32 PL_len = b->gi[i].len; | |
33 break; | |
34 } | |
35 } | |
36 if (i == b->n_gi) return 0; // no PL | |
37 // fill pdg | |
38 pdg = malloc(3 * b->n_smpl * sizeof(double)); | |
39 for (i = 0; i < b->n_smpl; ++i) { | |
40 const uint8_t *pi = PL + i * PL_len; | |
41 double *p = pdg + i * 3; | |
42 p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; | |
43 } | |
44 return pdg; | |
45 } | |
46 | |
47 // estimate site allele frequency in a very naive and inaccurate way | |
48 static double est_freq(int n, const double *pdg) | |
49 { | |
50 int i, gcnt[3], tmp1; | |
51 // get a rough estimate of the genotype frequency | |
52 gcnt[0] = gcnt[1] = gcnt[2] = 0; | |
53 for (i = 0; i < n; ++i) { | |
54 const double *p = pdg + i * 3; | |
55 if (p[0] != 1. || p[1] != 1. || p[2] != 1.) { | |
56 int which = p[0] > p[1]? 0 : 1; | |
57 which = p[which] > p[2]? which : 2; | |
58 ++gcnt[which]; | |
59 } | |
60 } | |
61 tmp1 = gcnt[0] + gcnt[1] + gcnt[2]; | |
62 return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1; | |
63 } | |
64 | |
65 /* | |
66 Single-locus EM | |
67 */ | |
68 | |
69 typedef struct { | |
70 int beg, end; | |
71 const double *pdg; | |
72 } minaux1_t; | |
73 | |
74 static double prob1(double f, void *data) | |
75 { | |
76 minaux1_t *a = (minaux1_t*)data; | |
77 double p = 1., l = 0., f3[3]; | |
78 int i; | |
79 // printf("brent %lg\n", f); | |
80 if (f < 0 || f > 1) return 1e300; | |
81 f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; | |
82 for (i = a->beg; i < a->end; ++i) { | |
83 const double *pdg = a->pdg + i * 3; | |
84 p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; | |
85 if (p < 1e-200) l -= log(p), p = 1.; | |
86 } | |
87 return l - log(p); | |
88 } | |
89 | |
90 // one EM iteration for allele frequency estimate | |
91 static double freq_iter(double *f, const double *_pdg, int beg, int end) | |
92 { | |
93 double f0 = *f, f3[3], err; | |
94 int i; | |
95 // printf("em %lg\n", *f); | |
96 f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; | |
97 for (i = beg, f0 = 0.; i < end; ++i) { | |
98 const double *pdg = _pdg + i * 3; | |
99 f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) | |
100 / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); | |
101 } | |
102 f0 /= (end - beg) * 2; | |
103 err = fabs(f0 - *f); | |
104 *f = f0; | |
105 return err; | |
106 } | |
107 | |
108 /* The following function combines EM and Brent's method. When the signal from | |
109 * the data is strong, EM is faster but sometimes, EM may converge very slowly. | |
110 * When this happens, we switch to Brent's method. The idea is learned from | |
111 * Rasmus Nielsen. | |
112 */ | |
113 static double freqml(double f0, int beg, int end, const double *pdg) | |
114 { | |
115 int i; | |
116 double f; | |
117 for (i = 0, f = f0; i < ITER_TRY; ++i) | |
118 if (freq_iter(&f, pdg, beg, end) < EPS) break; | |
119 if (i == ITER_TRY) { // haven't converged yet; try Brent's method | |
120 minaux1_t a; | |
121 a.beg = beg; a.end = end; a.pdg = pdg; | |
122 kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f); | |
123 } | |
124 return f; | |
125 } | |
126 | |
127 // one EM iteration for genotype frequency estimate | |
128 static double g3_iter(double g[3], const double *_pdg, int beg, int end) | |
129 { | |
130 double err, gg[3]; | |
131 int i; | |
132 gg[0] = gg[1] = gg[2] = 0.; | |
133 // printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); | |
134 for (i = beg; i < end; ++i) { | |
135 double sum, tmp[3]; | |
136 const double *pdg = _pdg + i * 3; | |
137 tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; | |
138 sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); | |
139 gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; | |
140 } | |
141 err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); | |
142 err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); | |
143 g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; | |
144 return err; | |
145 } | |
146 | |
147 // perform likelihood ratio test | |
148 static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) | |
149 { | |
150 double r; | |
151 int i; | |
152 for (i = 0, r = 1.; i < n1; ++i) { | |
153 const double *p = pdg + i * 3; | |
154 r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2]) | |
155 / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); | |
156 } | |
157 for (; i < n; ++i) { | |
158 const double *p = pdg + i * 3; | |
159 r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2]) | |
160 / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); | |
161 } | |
162 return r; | |
163 } | |
164 | |
165 // x[0]: ref frequency | |
166 // x[1..3]: alt-alt, alt-ref, ref-ref frequenc | |
167 // x[4]: HWE P-value | |
168 // x[5..6]: group1 freq, group2 freq | |
169 // x[7]: 1-degree P-value | |
170 // x[8]: 2-degree P-value | |
171 int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) | |
172 { | |
173 double *pdg; | |
174 int i, n, n2; | |
175 if (b->n_alleles < 2) return -1; // one allele only | |
176 // initialization | |
177 if (n1 < 0 || n1 > b->n_smpl) n1 = 0; | |
178 if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required | |
179 if (flag & 0xf<<1) flag |= 0xf<<1; | |
180 n = b->n_smpl; n2 = n - n1; | |
181 pdg = get_pdg3(b); | |
182 if (pdg == 0) return -1; | |
183 for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative | |
184 { | |
185 if ((x[0] = est_freq(n, pdg)) < 0.) { | |
186 free(pdg); | |
187 return -1; // no data | |
188 } | |
189 x[0] = freqml(x[0], 0, n, pdg); | |
190 } | |
191 if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE | |
192 double *g = x + 1, f3[3], r; | |
193 f3[0] = g[0] = (1 - x[0]) * (1 - x[0]); | |
194 f3[1] = g[1] = 2 * x[0] * (1 - x[0]); | |
195 f3[2] = g[2] = x[0] * x[0]; | |
196 for (i = 0; i < ITER_MAX; ++i) | |
197 if (g3_iter(g, pdg, 0, n) < EPS) break; | |
198 // Hardy-Weinberg equilibrium (HWE) | |
199 for (i = 0, r = 1.; i < n; ++i) { | |
200 double *p = pdg + i * 3; | |
201 r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]); | |
202 } | |
203 x[4] = kf_gammaq(.5, log(r)); | |
204 } | |
205 if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency | |
206 x[5] = freqml(x[0], 0, n1, pdg); | |
207 x[6] = freqml(x[0], n1, n, pdg); | |
208 } | |
209 if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value | |
210 double f[3], f3[3][3], tmp; | |
211 f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; | |
212 for (i = 0; i < 3; ++i) | |
213 f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; | |
214 tmp = log(lk_ratio_test(n, n1, pdg, f3)); | |
215 if (tmp < 0) tmp = 0; | |
216 x[7] = kf_gammaq(.5, tmp); | |
217 } | |
218 if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value | |
219 double g[3][3], tmp; | |
220 for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double)); | |
221 for (i = 0; i < ITER_MAX; ++i) | |
222 if (g3_iter(g[1], pdg, 0, n1) < EPS) break; | |
223 for (i = 0; i < ITER_MAX; ++i) | |
224 if (g3_iter(g[2], pdg, n1, n) < EPS) break; | |
225 tmp = log(lk_ratio_test(n, n1, pdg, g)); | |
226 if (tmp < 0) tmp = 0; | |
227 x[8] = kf_gammaq(1., tmp); | |
228 } | |
229 // free | |
230 free(pdg); | |
231 return 0; | |
232 } | |
233 | |
234 /* | |
235 Two-locus EM (LD) | |
236 */ | |
237 | |
238 #define _G1(h, k) ((h>>1&1) + (k>>1&1)) | |
239 #define _G2(h, k) ((h&1) + (k&1)) | |
240 | |
241 // 0: the previous site; 1: the current site | |
242 static int pair_freq_iter(int n, double *pdg[2], double f[4]) | |
243 { | |
244 double ff[4]; | |
245 int i, k, h; | |
246 // printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); | |
247 memset(ff, 0, 4 * sizeof(double)); | |
248 for (i = 0; i < n; ++i) { | |
249 double *p[2], sum, tmp; | |
250 p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; | |
251 for (k = 0, sum = 0.; k < 4; ++k) | |
252 for (h = 0; h < 4; ++h) | |
253 sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; | |
254 for (k = 0; k < 4; ++k) { | |
255 tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) | |
256 + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) | |
257 + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) | |
258 + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); | |
259 ff[k] += f[k] * tmp / sum; | |
260 } | |
261 } | |
262 for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); | |
263 return 0; | |
264 } | |
265 | |
266 double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) | |
267 { | |
268 const bcf1_t *b[2]; | |
269 int i, j, n_smpl; | |
270 double *pdg[2], flast[4], r, f0[2]; | |
271 // initialize others | |
272 if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples | |
273 n_smpl = b0->n_smpl; | |
274 b[0] = b0; b[1] = b1; | |
275 f[0] = f[1] = f[2] = f[3] = -1.; | |
276 if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only | |
277 pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1); | |
278 if (pdg[0] == 0 || pdg[1] == 0) { | |
279 free(pdg[0]); free(pdg[1]); | |
280 return -1; | |
281 } | |
282 // set the initial value | |
283 f0[0] = est_freq(n_smpl, pdg[0]); | |
284 f0[1] = est_freq(n_smpl, pdg[1]); | |
285 f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1]; | |
286 f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]); | |
287 // iteration | |
288 for (j = 0; j < ITER_MAX; ++j) { | |
289 double eps = 0; | |
290 memcpy(flast, f, 4 * sizeof(double)); | |
291 pair_freq_iter(n_smpl, pdg, f); | |
292 for (i = 0; i < 4; ++i) { | |
293 double x = fabs(f[i] - flast[i]); | |
294 if (x > eps) eps = x; | |
295 } | |
296 if (eps < EPS) break; | |
297 } | |
298 // free | |
299 free(pdg[0]); free(pdg[1]); | |
300 { // calculate r^2 | |
301 double p[2], q[2], D; | |
302 p[0] = f[0] + f[1]; q[0] = 1 - p[0]; | |
303 p[1] = f[0] + f[2]; q[1] = 1 - p[1]; | |
304 D = f[0] * f[3] - f[1] * f[2]; | |
305 r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); | |
306 // printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r); | |
307 if (isnan(r)) r = -1.; | |
308 } | |
309 return r; | |
310 } |