| 
0
 | 
     1 #include <stdlib.h>
 | 
| 
 | 
     2 #include <string.h>
 | 
| 
 | 
     3 #include <math.h>
 | 
| 
 | 
     4 #include "bcf.h"
 | 
| 
 | 
     5 #include "kmin.h"
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 static double g_q2p[256];
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 #define ITER_MAX 50
 | 
| 
 | 
    10 #define ITER_TRY 10
 | 
| 
 | 
    11 #define EPS 1e-5
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 extern double kf_gammaq(double, double);
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 /*
 | 
| 
 | 
    16 	Generic routines
 | 
| 
 | 
    17  */
 | 
| 
 | 
    18 // get the 3 genotype likelihoods
 | 
| 
 | 
    19 static double *get_pdg3(const bcf1_t *b)
 | 
| 
 | 
    20 {
 | 
| 
 | 
    21 	double *pdg;
 | 
| 
 | 
    22 	const uint8_t *PL = 0;
 | 
| 
 | 
    23 	int i, PL_len = 0;
 | 
| 
 | 
    24 	// initialize g_q2p if necessary
 | 
| 
 | 
    25 	if (g_q2p[0] == 0.)
 | 
| 
 | 
    26 		for (i = 0; i < 256; ++i)
 | 
| 
 | 
    27 			g_q2p[i] = pow(10., -i / 10.);
 | 
| 
 | 
    28 	// set PL and PL_len
 | 
| 
 | 
    29 	for (i = 0; i < b->n_gi; ++i) {
 | 
| 
 | 
    30 		if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
 | 
| 
 | 
    31 			PL = (const uint8_t*)b->gi[i].data;
 | 
| 
 | 
    32 			PL_len = b->gi[i].len;
 | 
| 
 | 
    33 			break;
 | 
| 
 | 
    34 		}
 | 
| 
 | 
    35 	}
 | 
| 
 | 
    36 	if (i == b->n_gi) return 0; // no PL
 | 
| 
 | 
    37 	// fill pdg
 | 
| 
 | 
    38 	pdg = malloc(3 * b->n_smpl * sizeof(double));
 | 
| 
 | 
    39 	for (i = 0; i < b->n_smpl; ++i) {
 | 
| 
 | 
    40 		const uint8_t *pi = PL + i * PL_len;
 | 
| 
 | 
    41 		double *p = pdg + i * 3;
 | 
| 
 | 
    42 		p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]];
 | 
| 
 | 
    43 	}
 | 
| 
 | 
    44 	return pdg;
 | 
| 
 | 
    45 }
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 // estimate site allele frequency in a very naive and inaccurate way
 | 
| 
 | 
    48 static double est_freq(int n, const double *pdg)
 | 
| 
 | 
    49 {
 | 
| 
 | 
    50 	int i, gcnt[3], tmp1;
 | 
| 
 | 
    51 	// get a rough estimate of the genotype frequency
 | 
| 
 | 
    52 	gcnt[0] = gcnt[1] = gcnt[2] = 0;
 | 
| 
 | 
    53 	for (i = 0; i < n; ++i) {
 | 
| 
 | 
    54 		const double *p = pdg + i * 3;
 | 
| 
 | 
    55 		if (p[0] != 1. || p[1] != 1. || p[2] != 1.) {
 | 
| 
 | 
    56 			int which = p[0] > p[1]? 0 : 1;
 | 
| 
 | 
    57 			which = p[which] > p[2]? which : 2;
 | 
| 
 | 
    58 			++gcnt[which];
 | 
| 
 | 
    59 		}
 | 
| 
 | 
    60 	}
 | 
| 
 | 
    61 	tmp1 = gcnt[0] + gcnt[1] + gcnt[2];
 | 
| 
 | 
    62 	return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1;
 | 
| 
 | 
    63 }
 | 
| 
 | 
    64 
 | 
| 
 | 
    65 /*
 | 
| 
 | 
    66 	Single-locus EM
 | 
| 
 | 
    67  */
 | 
| 
 | 
    68 
 | 
| 
 | 
    69 typedef struct {
 | 
| 
 | 
    70 	int beg, end;
 | 
| 
 | 
    71 	const double *pdg;
 | 
| 
 | 
    72 } minaux1_t;
 | 
| 
 | 
    73 
 | 
| 
 | 
    74 static double prob1(double f, void *data)
 | 
| 
 | 
    75 {
 | 
| 
 | 
    76 	minaux1_t *a = (minaux1_t*)data;
 | 
| 
 | 
    77 	double p = 1., l = 0., f3[3];
 | 
| 
 | 
    78 	int i;
 | 
| 
 | 
    79 //	printf("brent %lg\n", f);
 | 
| 
 | 
    80 	if (f < 0 || f > 1) return 1e300;
 | 
| 
 | 
    81 	f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f;
 | 
| 
 | 
    82 	for (i = a->beg; i < a->end; ++i) {
 | 
| 
 | 
    83 		const double *pdg = a->pdg + i * 3;
 | 
| 
 | 
    84 		p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
 | 
| 
 | 
    85 		if (p < 1e-200) l -= log(p), p = 1.;
 | 
| 
 | 
    86 	}
 | 
| 
 | 
    87 	return l - log(p);
 | 
| 
 | 
    88 }
 | 
| 
 | 
    89 
 | 
| 
 | 
    90 // one EM iteration for allele frequency estimate
 | 
| 
 | 
    91 static double freq_iter(double *f, const double *_pdg, int beg, int end)
 | 
| 
 | 
    92 {
 | 
| 
 | 
    93 	double f0 = *f, f3[3], err;
 | 
| 
 | 
    94 	int i;
 | 
| 
 | 
    95 //	printf("em %lg\n", *f);
 | 
| 
 | 
    96 	f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
 | 
| 
 | 
    97 	for (i = beg, f0 = 0.; i < end; ++i) {
 | 
| 
 | 
    98 		const double *pdg = _pdg + i * 3;
 | 
| 
 | 
    99 		f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
 | 
| 
 | 
   100 			/ (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
 | 
| 
 | 
   101 	}
 | 
| 
 | 
   102 	f0 /= (end - beg) * 2;
 | 
| 
 | 
   103 	err = fabs(f0 - *f);
 | 
| 
 | 
   104 	*f = f0;
 | 
| 
 | 
   105 	return err;
 | 
| 
 | 
   106 }
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 /* The following function combines EM and Brent's method. When the signal from
 | 
| 
 | 
   109  * the data is strong, EM is faster but sometimes, EM may converge very slowly.
 | 
| 
 | 
   110  * When this happens, we switch to Brent's method. The idea is learned from
 | 
| 
 | 
   111  * Rasmus Nielsen.
 | 
| 
 | 
   112  */
 | 
| 
 | 
   113 static double freqml(double f0, int beg, int end, const double *pdg)
 | 
| 
 | 
   114 {
 | 
| 
 | 
   115 	int i;
 | 
| 
 | 
   116 	double f;
 | 
| 
 | 
   117 	for (i = 0, f = f0; i < ITER_TRY; ++i)
 | 
| 
 | 
   118 		if (freq_iter(&f, pdg, beg, end) < EPS) break;
 | 
| 
 | 
   119 	if (i == ITER_TRY) { // haven't converged yet; try Brent's method
 | 
| 
 | 
   120 		minaux1_t a;
 | 
| 
 | 
   121 		a.beg = beg; a.end = end; a.pdg = pdg;
 | 
| 
 | 
   122 		kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f);
 | 
| 
 | 
   123 	}
 | 
| 
 | 
   124 	return f;
 | 
| 
 | 
   125 }
 | 
| 
 | 
   126 
 | 
| 
 | 
   127 // one EM iteration for genotype frequency estimate
 | 
| 
 | 
   128 static double g3_iter(double g[3], const double *_pdg, int beg, int end)
 | 
| 
 | 
   129 {
 | 
| 
 | 
   130 	double err, gg[3];
 | 
| 
 | 
   131 	int i;
 | 
| 
 | 
   132 	gg[0] = gg[1] = gg[2] = 0.;
 | 
| 
 | 
   133 //	printf("%lg,%lg,%lg\n", g[0], g[1], g[2]);
 | 
| 
 | 
   134 	for (i = beg; i < end; ++i) {
 | 
| 
 | 
   135 		double sum, tmp[3];
 | 
| 
 | 
   136 		const double *pdg = _pdg + i * 3;
 | 
| 
 | 
   137 		tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
 | 
| 
 | 
   138 		sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
 | 
| 
 | 
   139 		gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
 | 
| 
 | 
   140 	}
 | 
| 
 | 
   141 	err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
 | 
| 
 | 
   142 	err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
 | 
| 
 | 
   143 	g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
 | 
| 
 | 
   144 	return err;
 | 
| 
 | 
   145 }
 | 
| 
 | 
   146 
 | 
| 
 | 
   147 // perform likelihood ratio test
 | 
| 
 | 
   148 static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
 | 
| 
 | 
   149 {
 | 
| 
 | 
   150 	double r;
 | 
| 
 | 
   151 	int i;
 | 
| 
 | 
   152 	for (i = 0, r = 1.; i < n1; ++i) {
 | 
| 
 | 
   153 		const double *p = pdg + i * 3;
 | 
| 
 | 
   154 		r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2])
 | 
| 
 | 
   155 			/ (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
 | 
| 
 | 
   156 	}
 | 
| 
 | 
   157 	for (; i < n; ++i) {
 | 
| 
 | 
   158 		const double *p = pdg + i * 3;
 | 
| 
 | 
   159 		r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2])
 | 
| 
 | 
   160 			/ (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
 | 
| 
 | 
   161 	}
 | 
| 
 | 
   162 	return r;
 | 
| 
 | 
   163 }
 | 
| 
 | 
   164 
 | 
| 
 | 
   165 // x[0]: ref frequency
 | 
| 
 | 
   166 // x[1..3]: alt-alt, alt-ref, ref-ref frequenc
 | 
| 
 | 
   167 // x[4]: HWE P-value
 | 
| 
 | 
   168 // x[5..6]: group1 freq, group2 freq
 | 
| 
 | 
   169 // x[7]: 1-degree P-value
 | 
| 
 | 
   170 // x[8]: 2-degree P-value
 | 
| 
 | 
   171 int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9])
 | 
| 
 | 
   172 {
 | 
| 
 | 
   173 	double *pdg;
 | 
| 
 | 
   174 	int i, n, n2;
 | 
| 
 | 
   175 	if (b->n_alleles < 2) return -1; // one allele only
 | 
| 
 | 
   176 	// initialization
 | 
| 
 | 
   177 	if (n1 < 0 || n1 > b->n_smpl) n1 = 0;
 | 
| 
 | 
   178 	if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required
 | 
| 
 | 
   179 	if (flag & 0xf<<1) flag |= 0xf<<1;
 | 
| 
 | 
   180 	n = b->n_smpl; n2 = n - n1;
 | 
| 
 | 
   181 	pdg = get_pdg3(b);
 | 
| 
 | 
   182 	if (pdg == 0) return -1;
 | 
| 
 | 
   183 	for (i = 0; i < 9; ++i) x[i] = -1.;
 | 
| 
 | 
   184 	{
 | 
| 
 | 
   185 		if ((x[0] = est_freq(n, pdg)) < 0.) {
 | 
| 
 | 
   186 			free(pdg);
 | 
| 
 | 
   187 			return -1; // no data
 | 
| 
 | 
   188 		}
 | 
| 
 | 
   189 		x[0] = freqml(x[0], 0, n, pdg);
 | 
| 
 | 
   190 	}
 | 
| 
 | 
   191 	if (flag & (0xf<<1|1<<8)) { // estimate the genotype frequency and test HWE
 | 
| 
 | 
   192 		double *g = x + 1, f3[3], r;
 | 
| 
 | 
   193 		f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
 | 
| 
 | 
   194 		f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
 | 
| 
 | 
   195 		f3[2] = g[2] = x[0] * x[0];
 | 
| 
 | 
   196 		for (i = 0; i < ITER_MAX; ++i)
 | 
| 
 | 
   197 			if (g3_iter(g, pdg, 0, n) < EPS) break;
 | 
| 
 | 
   198 		// Hardy-Weinberg equilibrium (HWE)
 | 
| 
 | 
   199 		for (i = 0, r = 1.; i < n; ++i) {
 | 
| 
 | 
   200 			double *p = pdg + i * 3;
 | 
| 
 | 
   201 			r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]);
 | 
| 
 | 
   202 		}
 | 
| 
 | 
   203 		x[4] = kf_gammaq(.5, log(r));
 | 
| 
 | 
   204 	}
 | 
| 
 | 
   205 	if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency
 | 
| 
 | 
   206 		x[5] = freqml(x[0], 0, n1, pdg);
 | 
| 
 | 
   207 		x[6] = freqml(x[0], n1, n, pdg);
 | 
| 
 | 
   208 	}
 | 
| 
 | 
   209 	if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
 | 
| 
 | 
   210 		double f[3], f3[3][3];
 | 
| 
 | 
   211 		f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
 | 
| 
 | 
   212 		for (i = 0; i < 3; ++i)
 | 
| 
 | 
   213 			f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
 | 
| 
 | 
   214 		x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3)));
 | 
| 
 | 
   215 	}
 | 
| 
 | 
   216 	if ((flag & 1<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
 | 
| 
 | 
   217 		double g[3][3];
 | 
| 
 | 
   218 		for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
 | 
| 
 | 
   219 		for (i = 0; i < ITER_MAX; ++i)
 | 
| 
 | 
   220 			if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
 | 
| 
 | 
   221 		for (i = 0; i < ITER_MAX; ++i)
 | 
| 
 | 
   222 			if (g3_iter(g[2], pdg, n1, n) < EPS) break;
 | 
| 
 | 
   223 		x[8] = kf_gammaq(1., log(lk_ratio_test(n, n1, pdg, g)));
 | 
| 
 | 
   224 	}
 | 
| 
 | 
   225 	// free
 | 
| 
 | 
   226 	free(pdg);
 | 
| 
 | 
   227 	return 0;
 | 
| 
 | 
   228 }
 | 
| 
 | 
   229 
 | 
| 
 | 
   230 /*
 | 
| 
 | 
   231 	Two-locus EM (LD)
 | 
| 
 | 
   232  */
 | 
| 
 | 
   233 
 | 
| 
 | 
   234 #define _G1(h, k) ((h>>1&1) + (k>>1&1))
 | 
| 
 | 
   235 #define _G2(h, k) ((h&1) + (k&1))
 | 
| 
 | 
   236 
 | 
| 
 | 
   237 // 0: the previous site; 1: the current site
 | 
| 
 | 
   238 static int pair_freq_iter(int n, double *pdg[2], double f[4])
 | 
| 
 | 
   239 {
 | 
| 
 | 
   240 	double ff[4];
 | 
| 
 | 
   241 	int i, k, h;
 | 
| 
 | 
   242 //	printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
 | 
| 
 | 
   243 	memset(ff, 0, 4 * sizeof(double));
 | 
| 
 | 
   244 	for (i = 0; i < n; ++i) {
 | 
| 
 | 
   245 		double *p[2], sum, tmp;
 | 
| 
 | 
   246 		p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
 | 
| 
 | 
   247 		for (k = 0, sum = 0.; k < 4; ++k)
 | 
| 
 | 
   248 			for (h = 0; h < 4; ++h)
 | 
| 
 | 
   249 				sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
 | 
| 
 | 
   250 		for (k = 0; k < 4; ++k) {
 | 
| 
 | 
   251 			tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
 | 
| 
 | 
   252 				+ f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
 | 
| 
 | 
   253 				+ f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
 | 
| 
 | 
   254 				+ f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
 | 
| 
 | 
   255 			ff[k] += f[k] * tmp / sum;
 | 
| 
 | 
   256 		}
 | 
| 
 | 
   257 	}
 | 
| 
 | 
   258 	for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
 | 
| 
 | 
   259 	return 0;
 | 
| 
 | 
   260 }
 | 
| 
 | 
   261 
 | 
| 
 | 
   262 double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4])
 | 
| 
 | 
   263 {
 | 
| 
 | 
   264 	const bcf1_t *b[2];
 | 
| 
 | 
   265 	int i, j, n_smpl;
 | 
| 
 | 
   266 	double *pdg[2], flast[4], r, f0[2];
 | 
| 
 | 
   267 	// initialize others
 | 
| 
 | 
   268 	if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples
 | 
| 
 | 
   269 	n_smpl = b0->n_smpl;
 | 
| 
 | 
   270 	b[0] = b0; b[1] = b1;
 | 
| 
 | 
   271 	f[0] = f[1] = f[2] = f[3] = -1.;
 | 
| 
 | 
   272 	if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only
 | 
| 
 | 
   273 	pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1);
 | 
| 
 | 
   274 	if (pdg[0] == 0 || pdg[1] == 0) {
 | 
| 
 | 
   275 		free(pdg[0]); free(pdg[1]);
 | 
| 
 | 
   276 		return -1;
 | 
| 
 | 
   277 	}
 | 
| 
 | 
   278 	// set the initial value
 | 
| 
 | 
   279 	f0[0] = est_freq(n_smpl, pdg[0]);
 | 
| 
 | 
   280 	f0[1] = est_freq(n_smpl, pdg[1]);
 | 
| 
 | 
   281 	f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1];
 | 
| 
 | 
   282 	f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]);
 | 
| 
 | 
   283 	// iteration
 | 
| 
 | 
   284 	for (j = 0; j < ITER_MAX; ++j) {
 | 
| 
 | 
   285 		double eps = 0;
 | 
| 
 | 
   286 		memcpy(flast, f, 4 * sizeof(double));
 | 
| 
 | 
   287 		pair_freq_iter(n_smpl, pdg, f);
 | 
| 
 | 
   288 		for (i = 0; i < 4; ++i) {
 | 
| 
 | 
   289 			double x = fabs(f[i] - flast[i]);
 | 
| 
 | 
   290 			if (x > eps) eps = x;
 | 
| 
 | 
   291 		}
 | 
| 
 | 
   292 		if (eps < EPS) break;
 | 
| 
 | 
   293 	}
 | 
| 
 | 
   294 	// free
 | 
| 
 | 
   295 	free(pdg[0]); free(pdg[1]);
 | 
| 
 | 
   296 	{ // calculate r^2
 | 
| 
 | 
   297 		double p[2], q[2], D;
 | 
| 
 | 
   298 		p[0] = f[0] + f[1]; q[0] = 1 - p[0];
 | 
| 
 | 
   299 		p[1] = f[0] + f[2]; q[1] = 1 - p[1];
 | 
| 
 | 
   300 		D = f[0] * f[3] - f[1] * f[2];
 | 
| 
 | 
   301 		r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1]));
 | 
| 
 | 
   302 //		printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r);
 | 
| 
 | 
   303 		if (isnan(r)) r = -1.;
 | 
| 
 | 
   304 	}
 | 
| 
 | 
   305 	return r;
 | 
| 
 | 
   306 }
 |