| 0 | 1 #include <stdlib.h> | 
|  | 2 #include <string.h> | 
|  | 3 #include <math.h> | 
|  | 4 #include "bcf.h" | 
|  | 5 #include "kmin.h" | 
|  | 6 | 
|  | 7 static double g_q2p[256]; | 
|  | 8 | 
|  | 9 #define ITER_MAX 50 | 
|  | 10 #define ITER_TRY 10 | 
|  | 11 #define EPS 1e-5 | 
|  | 12 | 
|  | 13 extern double kf_gammaq(double, double); | 
|  | 14 | 
|  | 15 /* | 
|  | 16 	Generic routines | 
|  | 17  */ | 
|  | 18 // get the 3 genotype likelihoods | 
|  | 19 static double *get_pdg3(const bcf1_t *b) | 
|  | 20 { | 
|  | 21 	double *pdg; | 
|  | 22 	const uint8_t *PL = 0; | 
|  | 23 	int i, PL_len = 0; | 
|  | 24 	// initialize g_q2p if necessary | 
|  | 25 	if (g_q2p[0] == 0.) | 
|  | 26 		for (i = 0; i < 256; ++i) | 
|  | 27 			g_q2p[i] = pow(10., -i / 10.); | 
|  | 28 	// set PL and PL_len | 
|  | 29 	for (i = 0; i < b->n_gi; ++i) { | 
|  | 30 		if (b->gi[i].fmt == bcf_str2int("PL", 2)) { | 
|  | 31 			PL = (const uint8_t*)b->gi[i].data; | 
|  | 32 			PL_len = b->gi[i].len; | 
|  | 33 			break; | 
|  | 34 		} | 
|  | 35 	} | 
|  | 36 	if (i == b->n_gi) return 0; // no PL | 
|  | 37 	// fill pdg | 
|  | 38 	pdg = malloc(3 * b->n_smpl * sizeof(double)); | 
|  | 39 	for (i = 0; i < b->n_smpl; ++i) { | 
|  | 40 		const uint8_t *pi = PL + i * PL_len; | 
|  | 41 		double *p = pdg + i * 3; | 
|  | 42 		p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; | 
|  | 43 	} | 
|  | 44 	return pdg; | 
|  | 45 } | 
|  | 46 | 
|  | 47 // estimate site allele frequency in a very naive and inaccurate way | 
|  | 48 static double est_freq(int n, const double *pdg) | 
|  | 49 { | 
|  | 50 	int i, gcnt[3], tmp1; | 
|  | 51 	// get a rough estimate of the genotype frequency | 
|  | 52 	gcnt[0] = gcnt[1] = gcnt[2] = 0; | 
|  | 53 	for (i = 0; i < n; ++i) { | 
|  | 54 		const double *p = pdg + i * 3; | 
|  | 55 		if (p[0] != 1. || p[1] != 1. || p[2] != 1.) { | 
|  | 56 			int which = p[0] > p[1]? 0 : 1; | 
|  | 57 			which = p[which] > p[2]? which : 2; | 
|  | 58 			++gcnt[which]; | 
|  | 59 		} | 
|  | 60 	} | 
|  | 61 	tmp1 = gcnt[0] + gcnt[1] + gcnt[2]; | 
|  | 62 	return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1; | 
|  | 63 } | 
|  | 64 | 
|  | 65 /* | 
|  | 66 	Single-locus EM | 
|  | 67  */ | 
|  | 68 | 
|  | 69 typedef struct { | 
|  | 70 	int beg, end; | 
|  | 71 	const double *pdg; | 
|  | 72 } minaux1_t; | 
|  | 73 | 
|  | 74 static double prob1(double f, void *data) | 
|  | 75 { | 
|  | 76 	minaux1_t *a = (minaux1_t*)data; | 
|  | 77 	double p = 1., l = 0., f3[3]; | 
|  | 78 	int i; | 
|  | 79 //	printf("brent %lg\n", f); | 
|  | 80 	if (f < 0 || f > 1) return 1e300; | 
|  | 81 	f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; | 
|  | 82 	for (i = a->beg; i < a->end; ++i) { | 
|  | 83 		const double *pdg = a->pdg + i * 3; | 
|  | 84 		p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; | 
|  | 85 		if (p < 1e-200) l -= log(p), p = 1.; | 
|  | 86 	} | 
|  | 87 	return l - log(p); | 
|  | 88 } | 
|  | 89 | 
|  | 90 // one EM iteration for allele frequency estimate | 
|  | 91 static double freq_iter(double *f, const double *_pdg, int beg, int end) | 
|  | 92 { | 
|  | 93 	double f0 = *f, f3[3], err; | 
|  | 94 	int i; | 
|  | 95 //	printf("em %lg\n", *f); | 
|  | 96 	f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; | 
|  | 97 	for (i = beg, f0 = 0.; i < end; ++i) { | 
|  | 98 		const double *pdg = _pdg + i * 3; | 
|  | 99 		f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) | 
|  | 100 			/ (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); | 
|  | 101 	} | 
|  | 102 	f0 /= (end - beg) * 2; | 
|  | 103 	err = fabs(f0 - *f); | 
|  | 104 	*f = f0; | 
|  | 105 	return err; | 
|  | 106 } | 
|  | 107 | 
|  | 108 /* The following function combines EM and Brent's method. When the signal from | 
|  | 109  * the data is strong, EM is faster but sometimes, EM may converge very slowly. | 
|  | 110  * When this happens, we switch to Brent's method. The idea is learned from | 
|  | 111  * Rasmus Nielsen. | 
|  | 112  */ | 
|  | 113 static double freqml(double f0, int beg, int end, const double *pdg) | 
|  | 114 { | 
|  | 115 	int i; | 
|  | 116 	double f; | 
|  | 117 	for (i = 0, f = f0; i < ITER_TRY; ++i) | 
|  | 118 		if (freq_iter(&f, pdg, beg, end) < EPS) break; | 
|  | 119 	if (i == ITER_TRY) { // haven't converged yet; try Brent's method | 
|  | 120 		minaux1_t a; | 
|  | 121 		a.beg = beg; a.end = end; a.pdg = pdg; | 
|  | 122 		kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f); | 
|  | 123 	} | 
|  | 124 	return f; | 
|  | 125 } | 
|  | 126 | 
|  | 127 // one EM iteration for genotype frequency estimate | 
|  | 128 static double g3_iter(double g[3], const double *_pdg, int beg, int end) | 
|  | 129 { | 
|  | 130 	double err, gg[3]; | 
|  | 131 	int i; | 
|  | 132 	gg[0] = gg[1] = gg[2] = 0.; | 
|  | 133 //	printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); | 
|  | 134 	for (i = beg; i < end; ++i) { | 
|  | 135 		double sum, tmp[3]; | 
|  | 136 		const double *pdg = _pdg + i * 3; | 
|  | 137 		tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; | 
|  | 138 		sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); | 
|  | 139 		gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; | 
|  | 140 	} | 
|  | 141 	err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); | 
|  | 142 	err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); | 
|  | 143 	g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; | 
|  | 144 	return err; | 
|  | 145 } | 
|  | 146 | 
|  | 147 // perform likelihood ratio test | 
|  | 148 static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) | 
|  | 149 { | 
|  | 150 	double r; | 
|  | 151 	int i; | 
|  | 152 	for (i = 0, r = 1.; i < n1; ++i) { | 
|  | 153 		const double *p = pdg + i * 3; | 
|  | 154 		r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2]) | 
|  | 155 			/ (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); | 
|  | 156 	} | 
|  | 157 	for (; i < n; ++i) { | 
|  | 158 		const double *p = pdg + i * 3; | 
|  | 159 		r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2]) | 
|  | 160 			/ (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); | 
|  | 161 	} | 
|  | 162 	return r; | 
|  | 163 } | 
|  | 164 | 
|  | 165 // x[0]: ref frequency | 
|  | 166 // x[1..3]: alt-alt, alt-ref, ref-ref frequenc | 
|  | 167 // x[4]: HWE P-value | 
|  | 168 // x[5..6]: group1 freq, group2 freq | 
|  | 169 // x[7]: 1-degree P-value | 
|  | 170 // x[8]: 2-degree P-value | 
|  | 171 int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9]) | 
|  | 172 { | 
|  | 173 	double *pdg; | 
|  | 174 	int i, n, n2; | 
|  | 175 	if (b->n_alleles < 2) return -1; // one allele only | 
|  | 176 	// initialization | 
|  | 177 	if (n1 < 0 || n1 > b->n_smpl) n1 = 0; | 
|  | 178 	if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required | 
|  | 179 	if (flag & 0xf<<1) flag |= 0xf<<1; | 
|  | 180 	n = b->n_smpl; n2 = n - n1; | 
|  | 181 	pdg = get_pdg3(b); | 
|  | 182 	if (pdg == 0) return -1; | 
|  | 183 	for (i = 0; i < 9; ++i) x[i] = -1.; | 
|  | 184 	{ | 
|  | 185 		if ((x[0] = est_freq(n, pdg)) < 0.) { | 
|  | 186 			free(pdg); | 
|  | 187 			return -1; // no data | 
|  | 188 		} | 
|  | 189 		x[0] = freqml(x[0], 0, n, pdg); | 
|  | 190 	} | 
|  | 191 	if (flag & (0xf<<1|1<<8)) { // estimate the genotype frequency and test HWE | 
|  | 192 		double *g = x + 1, f3[3], r; | 
|  | 193 		f3[0] = g[0] = (1 - x[0]) * (1 - x[0]); | 
|  | 194 		f3[1] = g[1] = 2 * x[0] * (1 - x[0]); | 
|  | 195 		f3[2] = g[2] = x[0] * x[0]; | 
|  | 196 		for (i = 0; i < ITER_MAX; ++i) | 
|  | 197 			if (g3_iter(g, pdg, 0, n) < EPS) break; | 
|  | 198 		// Hardy-Weinberg equilibrium (HWE) | 
|  | 199 		for (i = 0, r = 1.; i < n; ++i) { | 
|  | 200 			double *p = pdg + i * 3; | 
|  | 201 			r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]); | 
|  | 202 		} | 
|  | 203 		x[4] = kf_gammaq(.5, log(r)); | 
|  | 204 	} | 
|  | 205 	if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency | 
|  | 206 		x[5] = freqml(x[0], 0, n1, pdg); | 
|  | 207 		x[6] = freqml(x[0], n1, n, pdg); | 
|  | 208 	} | 
|  | 209 	if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value | 
|  | 210 		double f[3], f3[3][3]; | 
|  | 211 		f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; | 
|  | 212 		for (i = 0; i < 3; ++i) | 
|  | 213 			f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; | 
|  | 214 		x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3))); | 
|  | 215 	} | 
|  | 216 	if ((flag & 1<<8) && n1 > 0 && n1 < n) { // 2-degree P-value | 
|  | 217 		double g[3][3]; | 
|  | 218 		for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double)); | 
|  | 219 		for (i = 0; i < ITER_MAX; ++i) | 
|  | 220 			if (g3_iter(g[1], pdg, 0, n1) < EPS) break; | 
|  | 221 		for (i = 0; i < ITER_MAX; ++i) | 
|  | 222 			if (g3_iter(g[2], pdg, n1, n) < EPS) break; | 
|  | 223 		x[8] = kf_gammaq(1., log(lk_ratio_test(n, n1, pdg, g))); | 
|  | 224 	} | 
|  | 225 	// free | 
|  | 226 	free(pdg); | 
|  | 227 	return 0; | 
|  | 228 } | 
|  | 229 | 
|  | 230 /* | 
|  | 231 	Two-locus EM (LD) | 
|  | 232  */ | 
|  | 233 | 
|  | 234 #define _G1(h, k) ((h>>1&1) + (k>>1&1)) | 
|  | 235 #define _G2(h, k) ((h&1) + (k&1)) | 
|  | 236 | 
|  | 237 // 0: the previous site; 1: the current site | 
|  | 238 static int pair_freq_iter(int n, double *pdg[2], double f[4]) | 
|  | 239 { | 
|  | 240 	double ff[4]; | 
|  | 241 	int i, k, h; | 
|  | 242 //	printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); | 
|  | 243 	memset(ff, 0, 4 * sizeof(double)); | 
|  | 244 	for (i = 0; i < n; ++i) { | 
|  | 245 		double *p[2], sum, tmp; | 
|  | 246 		p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; | 
|  | 247 		for (k = 0, sum = 0.; k < 4; ++k) | 
|  | 248 			for (h = 0; h < 4; ++h) | 
|  | 249 				sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; | 
|  | 250 		for (k = 0; k < 4; ++k) { | 
|  | 251 			tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) | 
|  | 252 				+ f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) | 
|  | 253 				+ f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) | 
|  | 254 				+ f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); | 
|  | 255 			ff[k] += f[k] * tmp / sum; | 
|  | 256 		} | 
|  | 257 	} | 
|  | 258 	for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); | 
|  | 259 	return 0; | 
|  | 260 } | 
|  | 261 | 
|  | 262 double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) | 
|  | 263 { | 
|  | 264 	const bcf1_t *b[2]; | 
|  | 265 	int i, j, n_smpl; | 
|  | 266 	double *pdg[2], flast[4], r, f0[2]; | 
|  | 267 	// initialize others | 
|  | 268 	if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples | 
|  | 269 	n_smpl = b0->n_smpl; | 
|  | 270 	b[0] = b0; b[1] = b1; | 
|  | 271 	f[0] = f[1] = f[2] = f[3] = -1.; | 
|  | 272 	if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only | 
|  | 273 	pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1); | 
|  | 274 	if (pdg[0] == 0 || pdg[1] == 0) { | 
|  | 275 		free(pdg[0]); free(pdg[1]); | 
|  | 276 		return -1; | 
|  | 277 	} | 
|  | 278 	// set the initial value | 
|  | 279 	f0[0] = est_freq(n_smpl, pdg[0]); | 
|  | 280 	f0[1] = est_freq(n_smpl, pdg[1]); | 
|  | 281 	f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1]; | 
|  | 282 	f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]); | 
|  | 283 	// iteration | 
|  | 284 	for (j = 0; j < ITER_MAX; ++j) { | 
|  | 285 		double eps = 0; | 
|  | 286 		memcpy(flast, f, 4 * sizeof(double)); | 
|  | 287 		pair_freq_iter(n_smpl, pdg, f); | 
|  | 288 		for (i = 0; i < 4; ++i) { | 
|  | 289 			double x = fabs(f[i] - flast[i]); | 
|  | 290 			if (x > eps) eps = x; | 
|  | 291 		} | 
|  | 292 		if (eps < EPS) break; | 
|  | 293 	} | 
|  | 294 	// free | 
|  | 295 	free(pdg[0]); free(pdg[1]); | 
|  | 296 	{ // calculate r^2 | 
|  | 297 		double p[2], q[2], D; | 
|  | 298 		p[0] = f[0] + f[1]; q[0] = 1 - p[0]; | 
|  | 299 		p[1] = f[0] + f[2]; q[1] = 1 - p[1]; | 
|  | 300 		D = f[0] * f[3] - f[1] * f[2]; | 
|  | 301 		r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); | 
|  | 302 //		printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r); | 
|  | 303 		if (isnan(r)) r = -1.; | 
|  | 304 	} | 
|  | 305 	return r; | 
|  | 306 } |