Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/samtools-0.1.16/bcftools/em.c @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #include <stdlib.h> | |
| 2 #include <string.h> | |
| 3 #include <math.h> | |
| 4 #include "bcf.h" | |
| 5 #include "kmin.h" | |
| 6 | |
| 7 static double g_q2p[256]; | |
| 8 | |
| 9 #define ITER_MAX 50 | |
| 10 #define ITER_TRY 10 | |
| 11 #define EPS 1e-5 | |
| 12 | |
| 13 extern double kf_gammaq(double, double); | |
| 14 | |
| 15 /* | |
| 16 Generic routines | |
| 17 */ | |
| 18 // get the 3 genotype likelihoods | |
| 19 static double *get_pdg3(const bcf1_t *b) | |
| 20 { | |
| 21 double *pdg; | |
| 22 const uint8_t *PL = 0; | |
| 23 int i, PL_len = 0; | |
| 24 // initialize g_q2p if necessary | |
| 25 if (g_q2p[0] == 0.) | |
| 26 for (i = 0; i < 256; ++i) | |
| 27 g_q2p[i] = pow(10., -i / 10.); | |
| 28 // set PL and PL_len | |
| 29 for (i = 0; i < b->n_gi; ++i) { | |
| 30 if (b->gi[i].fmt == bcf_str2int("PL", 2)) { | |
| 31 PL = (const uint8_t*)b->gi[i].data; | |
| 32 PL_len = b->gi[i].len; | |
| 33 break; | |
| 34 } | |
| 35 } | |
| 36 if (i == b->n_gi) return 0; // no PL | |
| 37 // fill pdg | |
| 38 pdg = malloc(3 * b->n_smpl * sizeof(double)); | |
| 39 for (i = 0; i < b->n_smpl; ++i) { | |
| 40 const uint8_t *pi = PL + i * PL_len; | |
| 41 double *p = pdg + i * 3; | |
| 42 p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; | |
| 43 } | |
| 44 return pdg; | |
| 45 } | |
| 46 | |
| 47 // estimate site allele frequency in a very naive and inaccurate way | |
| 48 static double est_freq(int n, const double *pdg) | |
| 49 { | |
| 50 int i, gcnt[3], tmp1; | |
| 51 // get a rough estimate of the genotype frequency | |
| 52 gcnt[0] = gcnt[1] = gcnt[2] = 0; | |
| 53 for (i = 0; i < n; ++i) { | |
| 54 const double *p = pdg + i * 3; | |
| 55 if (p[0] != 1. || p[1] != 1. || p[2] != 1.) { | |
| 56 int which = p[0] > p[1]? 0 : 1; | |
| 57 which = p[which] > p[2]? which : 2; | |
| 58 ++gcnt[which]; | |
| 59 } | |
| 60 } | |
| 61 tmp1 = gcnt[0] + gcnt[1] + gcnt[2]; | |
| 62 return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1; | |
| 63 } | |
| 64 | |
| 65 /* | |
| 66 Single-locus EM | |
| 67 */ | |
| 68 | |
| 69 typedef struct { | |
| 70 int beg, end; | |
| 71 const double *pdg; | |
| 72 } minaux1_t; | |
| 73 | |
| 74 static double prob1(double f, void *data) | |
| 75 { | |
| 76 minaux1_t *a = (minaux1_t*)data; | |
| 77 double p = 1., l = 0., f3[3]; | |
| 78 int i; | |
| 79 // printf("brent %lg\n", f); | |
| 80 if (f < 0 || f > 1) return 1e300; | |
| 81 f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; | |
| 82 for (i = a->beg; i < a->end; ++i) { | |
| 83 const double *pdg = a->pdg + i * 3; | |
| 84 p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; | |
| 85 if (p < 1e-200) l -= log(p), p = 1.; | |
| 86 } | |
| 87 return l - log(p); | |
| 88 } | |
| 89 | |
| 90 // one EM iteration for allele frequency estimate | |
| 91 static double freq_iter(double *f, const double *_pdg, int beg, int end) | |
| 92 { | |
| 93 double f0 = *f, f3[3], err; | |
| 94 int i; | |
| 95 // printf("em %lg\n", *f); | |
| 96 f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; | |
| 97 for (i = beg, f0 = 0.; i < end; ++i) { | |
| 98 const double *pdg = _pdg + i * 3; | |
| 99 f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) | |
| 100 / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); | |
| 101 } | |
| 102 f0 /= (end - beg) * 2; | |
| 103 err = fabs(f0 - *f); | |
| 104 *f = f0; | |
| 105 return err; | |
| 106 } | |
| 107 | |
| 108 /* The following function combines EM and Brent's method. When the signal from | |
| 109 * the data is strong, EM is faster but sometimes, EM may converge very slowly. | |
| 110 * When this happens, we switch to Brent's method. The idea is learned from | |
| 111 * Rasmus Nielsen. | |
| 112 */ | |
| 113 static double freqml(double f0, int beg, int end, const double *pdg) | |
| 114 { | |
| 115 int i; | |
| 116 double f; | |
| 117 for (i = 0, f = f0; i < ITER_TRY; ++i) | |
| 118 if (freq_iter(&f, pdg, beg, end) < EPS) break; | |
| 119 if (i == ITER_TRY) { // haven't converged yet; try Brent's method | |
| 120 minaux1_t a; | |
| 121 a.beg = beg; a.end = end; a.pdg = pdg; | |
| 122 kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f); | |
| 123 } | |
| 124 return f; | |
| 125 } | |
| 126 | |
| 127 // one EM iteration for genotype frequency estimate | |
| 128 static double g3_iter(double g[3], const double *_pdg, int beg, int end) | |
| 129 { | |
| 130 double err, gg[3]; | |
| 131 int i; | |
| 132 gg[0] = gg[1] = gg[2] = 0.; | |
| 133 // printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); | |
| 134 for (i = beg; i < end; ++i) { | |
| 135 double sum, tmp[3]; | |
| 136 const double *pdg = _pdg + i * 3; | |
| 137 tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; | |
| 138 sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); | |
| 139 gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; | |
| 140 } | |
| 141 err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); | |
| 142 err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); | |
| 143 g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; | |
| 144 return err; | |
| 145 } | |
| 146 | |
| 147 // perform likelihood ratio test | |
| 148 static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) | |
| 149 { | |
| 150 double r; | |
| 151 int i; | |
| 152 for (i = 0, r = 1.; i < n1; ++i) { | |
| 153 const double *p = pdg + i * 3; | |
| 154 r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2]) | |
| 155 / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); | |
| 156 } | |
| 157 for (; i < n; ++i) { | |
| 158 const double *p = pdg + i * 3; | |
| 159 r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2]) | |
| 160 / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); | |
| 161 } | |
| 162 return r; | |
| 163 } | |
| 164 | |
| 165 // x[0]: ref frequency | |
| 166 // x[1..3]: alt-alt, alt-ref, ref-ref frequenc | |
| 167 // x[4]: HWE P-value | |
| 168 // x[5..6]: group1 freq, group2 freq | |
| 169 // x[7]: 1-degree P-value | |
| 170 // x[8]: 2-degree P-value | |
| 171 int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9]) | |
| 172 { | |
| 173 double *pdg; | |
| 174 int i, n, n2; | |
| 175 if (b->n_alleles < 2) return -1; // one allele only | |
| 176 // initialization | |
| 177 if (n1 < 0 || n1 > b->n_smpl) n1 = 0; | |
| 178 if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required | |
| 179 if (flag & 0xf<<1) flag |= 0xf<<1; | |
| 180 n = b->n_smpl; n2 = n - n1; | |
| 181 pdg = get_pdg3(b); | |
| 182 if (pdg == 0) return -1; | |
| 183 for (i = 0; i < 9; ++i) x[i] = -1.; | |
| 184 { | |
| 185 if ((x[0] = est_freq(n, pdg)) < 0.) { | |
| 186 free(pdg); | |
| 187 return -1; // no data | |
| 188 } | |
| 189 x[0] = freqml(x[0], 0, n, pdg); | |
| 190 } | |
| 191 if (flag & (0xf<<1|1<<8)) { // estimate the genotype frequency and test HWE | |
| 192 double *g = x + 1, f3[3], r; | |
| 193 f3[0] = g[0] = (1 - x[0]) * (1 - x[0]); | |
| 194 f3[1] = g[1] = 2 * x[0] * (1 - x[0]); | |
| 195 f3[2] = g[2] = x[0] * x[0]; | |
| 196 for (i = 0; i < ITER_MAX; ++i) | |
| 197 if (g3_iter(g, pdg, 0, n) < EPS) break; | |
| 198 // Hardy-Weinberg equilibrium (HWE) | |
| 199 for (i = 0, r = 1.; i < n; ++i) { | |
| 200 double *p = pdg + i * 3; | |
| 201 r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]); | |
| 202 } | |
| 203 x[4] = kf_gammaq(.5, log(r)); | |
| 204 } | |
| 205 if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency | |
| 206 x[5] = freqml(x[0], 0, n1, pdg); | |
| 207 x[6] = freqml(x[0], n1, n, pdg); | |
| 208 } | |
| 209 if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value | |
| 210 double f[3], f3[3][3]; | |
| 211 f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; | |
| 212 for (i = 0; i < 3; ++i) | |
| 213 f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; | |
| 214 x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3))); | |
| 215 } | |
| 216 if ((flag & 1<<8) && n1 > 0 && n1 < n) { // 2-degree P-value | |
| 217 double g[3][3]; | |
| 218 for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double)); | |
| 219 for (i = 0; i < ITER_MAX; ++i) | |
| 220 if (g3_iter(g[1], pdg, 0, n1) < EPS) break; | |
| 221 for (i = 0; i < ITER_MAX; ++i) | |
| 222 if (g3_iter(g[2], pdg, n1, n) < EPS) break; | |
| 223 x[8] = kf_gammaq(1., log(lk_ratio_test(n, n1, pdg, g))); | |
| 224 } | |
| 225 // free | |
| 226 free(pdg); | |
| 227 return 0; | |
| 228 } | |
| 229 | |
| 230 /* | |
| 231 Two-locus EM (LD) | |
| 232 */ | |
| 233 | |
| 234 #define _G1(h, k) ((h>>1&1) + (k>>1&1)) | |
| 235 #define _G2(h, k) ((h&1) + (k&1)) | |
| 236 | |
| 237 // 0: the previous site; 1: the current site | |
| 238 static int pair_freq_iter(int n, double *pdg[2], double f[4]) | |
| 239 { | |
| 240 double ff[4]; | |
| 241 int i, k, h; | |
| 242 // printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); | |
| 243 memset(ff, 0, 4 * sizeof(double)); | |
| 244 for (i = 0; i < n; ++i) { | |
| 245 double *p[2], sum, tmp; | |
| 246 p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; | |
| 247 for (k = 0, sum = 0.; k < 4; ++k) | |
| 248 for (h = 0; h < 4; ++h) | |
| 249 sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; | |
| 250 for (k = 0; k < 4; ++k) { | |
| 251 tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) | |
| 252 + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) | |
| 253 + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) | |
| 254 + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); | |
| 255 ff[k] += f[k] * tmp / sum; | |
| 256 } | |
| 257 } | |
| 258 for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); | |
| 259 return 0; | |
| 260 } | |
| 261 | |
| 262 double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) | |
| 263 { | |
| 264 const bcf1_t *b[2]; | |
| 265 int i, j, n_smpl; | |
| 266 double *pdg[2], flast[4], r, f0[2]; | |
| 267 // initialize others | |
| 268 if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples | |
| 269 n_smpl = b0->n_smpl; | |
| 270 b[0] = b0; b[1] = b1; | |
| 271 f[0] = f[1] = f[2] = f[3] = -1.; | |
| 272 if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only | |
| 273 pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1); | |
| 274 if (pdg[0] == 0 || pdg[1] == 0) { | |
| 275 free(pdg[0]); free(pdg[1]); | |
| 276 return -1; | |
| 277 } | |
| 278 // set the initial value | |
| 279 f0[0] = est_freq(n_smpl, pdg[0]); | |
| 280 f0[1] = est_freq(n_smpl, pdg[1]); | |
| 281 f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1]; | |
| 282 f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]); | |
| 283 // iteration | |
| 284 for (j = 0; j < ITER_MAX; ++j) { | |
| 285 double eps = 0; | |
| 286 memcpy(flast, f, 4 * sizeof(double)); | |
| 287 pair_freq_iter(n_smpl, pdg, f); | |
| 288 for (i = 0; i < 4; ++i) { | |
| 289 double x = fabs(f[i] - flast[i]); | |
| 290 if (x > eps) eps = x; | |
| 291 } | |
| 292 if (eps < EPS) break; | |
| 293 } | |
| 294 // free | |
| 295 free(pdg[0]); free(pdg[1]); | |
| 296 { // calculate r^2 | |
| 297 double p[2], q[2], D; | |
| 298 p[0] = f[0] + f[1]; q[0] = 1 - p[0]; | |
| 299 p[1] = f[0] + f[2]; q[1] = 1 - p[1]; | |
| 300 D = f[0] * f[3] - f[1] * f[2]; | |
| 301 r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); | |
| 302 // printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r); | |
| 303 if (isnan(r)) r = -1.; | |
| 304 } | |
| 305 return r; | |
| 306 } |
