Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/vcfutils.lua @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #!/usr/bin/env luajit | |
| 2 | |
| 3 ----------------------------------- | |
| 4 -- BEGIN: routines from klib.lua -- | |
| 5 ----------------------------------- | |
| 6 | |
| 7 -- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt() | |
| 8 --[[ Example: | |
| 9 for o, a in os.getopt(arg, 'a:b') do | |
| 10 print(o, a) | |
| 11 end | |
| 12 ]]-- | |
| 13 function os.getopt(args, ostr) | |
| 14 local arg, place = nil, 0; | |
| 15 return function () | |
| 16 if place == 0 then -- update scanning pointer | |
| 17 place = 1 | |
| 18 if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end | |
| 19 if #args[1] >= 2 then | |
| 20 place = place + 1 | |
| 21 if args[1]:sub(2, 2) == '-' then -- found "--" | |
| 22 table.remove(args, 1); | |
| 23 place = 0 | |
| 24 return nil; | |
| 25 end | |
| 26 end | |
| 27 end | |
| 28 local optopt = place <= #args[1] and args[1]:sub(place, place) or nil | |
| 29 place = place + 1; | |
| 30 local oli = optopt and ostr:find(optopt) or nil | |
| 31 if optopt == ':' or oli == nil then -- unknown option | |
| 32 if optopt == '-' then return nil end | |
| 33 if place > #args[1] then | |
| 34 table.remove(args, 1); | |
| 35 place = 0; | |
| 36 end | |
| 37 return '?'; | |
| 38 end | |
| 39 oli = oli + 1; | |
| 40 if ostr:sub(oli, oli) ~= ':' then -- do not need argument | |
| 41 arg = nil; | |
| 42 if place > #args[1] then | |
| 43 table.remove(args, 1); | |
| 44 place = 0; | |
| 45 end | |
| 46 else -- need an argument | |
| 47 if place <= #args[1] then -- no white space | |
| 48 arg = args[1]:sub(place); | |
| 49 else | |
| 50 table.remove(args, 1); | |
| 51 if #args == 0 then -- an option requiring argument is the last one | |
| 52 place = 0; | |
| 53 if ostr:sub(1, 1) == ':' then return ':' end | |
| 54 return '?'; | |
| 55 else arg = args[1] end | |
| 56 end | |
| 57 table.remove(args, 1); | |
| 58 place = 0; | |
| 59 end | |
| 60 return optopt, arg; | |
| 61 end | |
| 62 end | |
| 63 | |
| 64 -- Description: string split | |
| 65 function string:split(sep, n) | |
| 66 local a, start = {}, 1; | |
| 67 sep = sep or "%s+"; | |
| 68 repeat | |
| 69 local b, e = self:find(sep, start); | |
| 70 if b == nil then | |
| 71 table.insert(a, self:sub(start)); | |
| 72 break | |
| 73 end | |
| 74 a[#a+1] = self:sub(start, b - 1); | |
| 75 start = e + 1; | |
| 76 if n and #a == n then | |
| 77 table.insert(a, self:sub(start)); | |
| 78 break | |
| 79 end | |
| 80 until start > #self; | |
| 81 return a; | |
| 82 end | |
| 83 | |
| 84 -- Description: smart file open | |
| 85 function io.xopen(fn, mode) | |
| 86 mode = mode or 'r'; | |
| 87 if fn == nil then return io.stdin; | |
| 88 elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout; | |
| 89 elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w'); | |
| 90 elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w'); | |
| 91 else return io.open(fn, mode) end | |
| 92 end | |
| 93 | |
| 94 -- Description: log gamma function | |
| 95 -- Required by: math.lbinom() | |
| 96 -- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 | |
| 97 function math.lgamma(z) | |
| 98 local x; | |
| 99 x = 0.1659470187408462e-06 / (z+7); | |
| 100 x = x + 0.9934937113930748e-05 / (z+6); | |
| 101 x = x - 0.1385710331296526 / (z+5); | |
| 102 x = x + 12.50734324009056 / (z+4); | |
| 103 x = x - 176.6150291498386 / (z+3); | |
| 104 x = x + 771.3234287757674 / (z+2); | |
| 105 x = x - 1259.139216722289 / (z+1); | |
| 106 x = x + 676.5203681218835 / z; | |
| 107 x = x + 0.9999999999995183; | |
| 108 return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5); | |
| 109 end | |
| 110 | |
| 111 -- Description: regularized incomplete gamma function | |
| 112 -- Dependent on: math.lgamma() | |
| 113 --[[ | |
| 114 Formulas are taken from Wiki, with additional input from Numerical | |
| 115 Recipes in C (for modified Lentz's algorithm) and AS245 | |
| 116 (http://lib.stat.cmu.edu/apstat/245). | |
| 117 | |
| 118 A good online calculator is available at: | |
| 119 | |
| 120 http://www.danielsoper.com/statcalc/calc23.aspx | |
| 121 | |
| 122 It calculates upper incomplete gamma function, which equals | |
| 123 math.igamma(s,z,true)*math.exp(math.lgamma(s)) | |
| 124 ]]-- | |
| 125 function math.igamma(s, z, complement) | |
| 126 | |
| 127 local function _kf_gammap(s, z) | |
| 128 local sum, x = 1, 1; | |
| 129 for k = 1, 100 do | |
| 130 x = x * z / (s + k); | |
| 131 sum = sum + x; | |
| 132 if x / sum < 1e-14 then break end | |
| 133 end | |
| 134 return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum)); | |
| 135 end | |
| 136 | |
| 137 local function _kf_gammaq(s, z) | |
| 138 local C, D, f, TINY; | |
| 139 f = 1. + z - s; C = f; D = 0.; TINY = 1e-290; | |
| 140 -- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2 | |
| 141 for j = 1, 100 do | |
| 142 local d; | |
| 143 local a, b = j * (s - j), j*2 + 1 + z - s; | |
| 144 D = b + a * D; | |
| 145 if D < TINY then D = TINY end | |
| 146 C = b + a / C; | |
| 147 if C < TINY then C = TINY end | |
| 148 D = 1. / D; | |
| 149 d = C * D; | |
| 150 f = f * d; | |
| 151 if math.abs(d - 1) < 1e-14 then break end | |
| 152 end | |
| 153 return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f)); | |
| 154 end | |
| 155 | |
| 156 if complement then | |
| 157 return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z); | |
| 158 else | |
| 159 return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z)); | |
| 160 end | |
| 161 end | |
| 162 | |
| 163 function math.brent(func, a, b, tol) | |
| 164 local gold1, gold2, tiny, max_iter = 1.6180339887, 0.3819660113, 1e-20, 100 | |
| 165 | |
| 166 local fa, fb = func(a, data), func(b, data) | |
| 167 if fb > fa then -- swap, such that f(a) > f(b) | |
| 168 a, b, fa, fb = b, a, fb, fa | |
| 169 end | |
| 170 local c = b + gold1 * (b - a) | |
| 171 local fc = func(c) -- golden section extrapolation | |
| 172 while fb > fc do | |
| 173 local bound = b + 100.0 * (c - b) -- the farthest point where we want to go | |
| 174 local r = (b - a) * (fb - fc) | |
| 175 local q = (b - c) * (fb - fa) | |
| 176 if math.abs(q - r) < tiny then -- avoid 0 denominator | |
| 177 tmp = q > r and tiny or 0.0 - tiny | |
| 178 else tmp = q - r end | |
| 179 u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp) -- u is the parabolic extrapolation point | |
| 180 if (b > u and u > c) or (b < u and u < c) then -- u lies between b and c | |
| 181 fu = func(u) | |
| 182 if fu < fc then -- (b,u,c) bracket the minimum | |
| 183 a, b, fa, fb = b, u, fb, fu | |
| 184 break | |
| 185 elseif fu > fb then -- (a,b,u) bracket the minimum | |
| 186 c, fc = u, fu | |
| 187 break | |
| 188 end | |
| 189 u = c + gold1 * (c - b) | |
| 190 fu = func(u) -- golden section extrapolation | |
| 191 elseif (c > u and u > bound) or (c < u and u < bound) then -- u lies between c and bound | |
| 192 fu = func(u) | |
| 193 if fu < fc then -- fb > fc > fu | |
| 194 b, c, u = c, u, c + gold1 * (c - b) | |
| 195 fb, fc, fu = fc, fu, func(u) | |
| 196 else -- (b,c,u) bracket the minimum | |
| 197 a, b, c = b, c, u | |
| 198 fa, fb, fc = fb, fc, fu | |
| 199 break | |
| 200 end | |
| 201 elseif (u > bound and bound > c) or (u < bound and bound < c) then -- u goes beyond the bound | |
| 202 u = bound | |
| 203 fu = func(u) | |
| 204 else -- u goes the other way around, use golden section extrapolation | |
| 205 u = c + gold1 * (c - b) | |
| 206 fu = func(u) | |
| 207 end | |
| 208 a, b, c = b, c, u | |
| 209 fa, fb, fc = fb, fc, fu | |
| 210 end | |
| 211 if a > c then a, c = c, a end -- swap | |
| 212 | |
| 213 -- now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm | |
| 214 local e, d = 0, 0 | |
| 215 local w, v, fw, fv | |
| 216 w, v = b, b | |
| 217 fw, fv = fb, fb | |
| 218 for iter = 1, max_iter do | |
| 219 local mid = 0.5 * (a + c) | |
| 220 local tol1 = tol * math.abs(b) + tiny | |
| 221 local tol2 = 2.0 * tol1 | |
| 222 if math.abs(b - mid) <= tol2 - 0.5 * (c - a) then return fb, b end -- found | |
| 223 if math.abs(e) > tol1 then | |
| 224 -- related to parabolic interpolation | |
| 225 local r = (b - w) * (fb - fv) | |
| 226 local q = (b - v) * (fb - fw) | |
| 227 local p = (b - v) * q - (b - w) * r | |
| 228 q = 2.0 * (q - r) | |
| 229 if q > 0.0 then p = 0.0 - p | |
| 230 else q = 0.0 - q end | |
| 231 eold, e = e, d | |
| 232 if math.abs(p) >= math.abs(0.5 * q * eold) or p <= q * (a - b) or p >= q * (c - b) then | |
| 233 e = b >= mid and a - b or c - b | |
| 234 d = gold2 * e | |
| 235 else | |
| 236 d, u = p / q, b + d -- actual parabolic interpolation happens here | |
| 237 if u - a < tol2 or c - u < tol2 then | |
| 238 d = mid > b and tol1 or 0.0 - tol1 | |
| 239 end | |
| 240 end | |
| 241 else -- golden section interpolation | |
| 242 e = b >= min and a - b or c - b | |
| 243 d = gold2 * e | |
| 244 end | |
| 245 u = fabs(d) >= tol1 and b + d or b + (d > 0.0 and tol1 or -tol1); | |
| 246 fu = func(u) | |
| 247 if fu <= fb then -- u is the minimum point so far | |
| 248 if u >= b then a = b | |
| 249 else c = b end | |
| 250 v, w, b = w, b, u | |
| 251 fv, fw, fb = fw, fb, fu | |
| 252 else -- adjust (a,c) and (u,v,w) | |
| 253 if u < b then a = u | |
| 254 else c = u end | |
| 255 if fu <= fw or w == b then | |
| 256 v, w = w, u | |
| 257 fv, fw = fw, fu | |
| 258 elseif fu <= fv or v == b or v == w then | |
| 259 v, fv = u, fu; | |
| 260 end | |
| 261 end | |
| 262 end | |
| 263 return fb, b | |
| 264 end | |
| 265 | |
| 266 matrix = {} | |
| 267 | |
| 268 -- Description: chi^2 test for contingency tables | |
| 269 -- Dependent on: math.igamma() | |
| 270 function matrix.chi2(a) | |
| 271 if #a == 2 and #a[1] == 2 then -- 2x2 table | |
| 272 local x, z | |
| 273 x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2]) | |
| 274 if x == 0 then return 0, 1, false end | |
| 275 z = a[1][1] * a[2][2] - a[1][2] * a[2][1] | |
| 276 z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x | |
| 277 return z, math.igamma(.5, .5 * z, true), true | |
| 278 else -- generic table | |
| 279 local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0 | |
| 280 for i = 1, n do rs[i] = 0 end | |
| 281 for j = 1, m do cs[j] = 0 end | |
| 282 for i = 1, n do -- compute column sum and row sum | |
| 283 for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end | |
| 284 end | |
| 285 for i = 1, n do N = N + rs[i] end | |
| 286 for i = 1, n do -- compute the chi^2 statistics | |
| 287 for j = 1, m do | |
| 288 local E = rs[i] * cs[j] / N; | |
| 289 z = z + (a[i][j] - E) * (a[i][j] - E) / E | |
| 290 end | |
| 291 end | |
| 292 return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true; | |
| 293 end | |
| 294 end | |
| 295 | |
| 296 --------------------------------- | |
| 297 -- END: routines from klib.lua -- | |
| 298 --------------------------------- | |
| 299 | |
| 300 | |
| 301 -------------------------- | |
| 302 -- BEGIN: misc routines -- | |
| 303 -------------------------- | |
| 304 | |
| 305 -- precompute an array for PL->probability conversion | |
| 306 -- @param m maximum PL | |
| 307 function algo_init_q2p(m) | |
| 308 local q2p = {} | |
| 309 for i = 0, m do | |
| 310 q2p[i] = math.pow(10, -i / 10) | |
| 311 end | |
| 312 return q2p | |
| 313 end | |
| 314 | |
| 315 -- given the haplotype frequency, compute r^2 | |
| 316 -- @param f 4 haplotype frequencies; f[] is 0-indexed. | |
| 317 -- @return r^2 | |
| 318 function algo_r2(f) | |
| 319 local p = { f[0] + f[1], f[0] + f[2] } | |
| 320 local D = f[0] * f[3] - f[1] * f[2] | |
| 321 return (p[1] == 0 or p[2] == 0 or 1-p[1] == 0 or 1-p[2] == 0) and 0 or D * D / (p[1] * p[2] * (1 - p[1]) * (1 - p[2])) | |
| 322 end | |
| 323 | |
| 324 -- parse a VCF line to get PL | |
| 325 -- @param q2p is computed by algo_init_q2p() | |
| 326 function text_parse_pl(t, q2p, parse_GT) | |
| 327 parse_GT = parse_GT == nil and true or false | |
| 328 local ht, gt, pl = {}, {}, {} | |
| 329 local s, j0 = t[9]:split(':'), 0 | |
| 330 for j = 1, #s do | |
| 331 if s[j] == 'PL' then j0 = j break end | |
| 332 end | |
| 333 local has_GT = (s[1] == 'GT' and parse_GT) and true or false | |
| 334 for i = 10, #t do | |
| 335 if j0 > 0 then | |
| 336 local s = t[i]:split(':') | |
| 337 local a, b = 1, s[j0]:find(',') | |
| 338 pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] | |
| 339 a, b = b + 1, s[j0]:find(',', b + 1) | |
| 340 pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] | |
| 341 a, b = b + 1, s[j0]:find(',', b + 1) | |
| 342 pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, (b and b - 1) or nil))] | |
| 343 end | |
| 344 if has_GT then | |
| 345 if t[i]:sub(1, 1) ~= '.' then | |
| 346 local g = tonumber(t[i]:sub(1, 1)) + tonumber(t[i]:sub(3, 3)); | |
| 347 gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6 | |
| 348 gt[#gt - 2 + g] = 1 | |
| 349 ht[#ht+1] = tonumber(t[i]:sub(1, 1)); ht[#ht+1] = tonumber(t[i]:sub(3, 3)); | |
| 350 else | |
| 351 gt[#gt+1] = 1; gt[#gt+1] = 1; gt[#gt+1] = 1 | |
| 352 ht[#ht+1] = -1; ht[#ht+1] = -1; | |
| 353 end | |
| 354 end | |
| 355 -- print(t[i], pl[#pl-2], pl[#pl-1], pl[#pl], gt[#gt-2], gt[#gt-1], gt[#gt]) | |
| 356 end | |
| 357 if #pl == 0 then pl = nil end | |
| 358 local x = has_GT and { t[1], t[2], ht, gt, pl } or { t[1], t[2], nil, nil, pl } | |
| 359 return x | |
| 360 end | |
| 361 | |
| 362 -- Infer haplotype frequency | |
| 363 -- @param pdg genotype likelihoods P(D|g) generated by text_parse_pl(). pdg[] is 1-indexed. | |
| 364 -- @param eps precision [1e-5] | |
| 365 -- @return 2-locus haplotype frequencies, 0-indexed array | |
| 366 function algo_hapfreq2(pdg, eps) | |
| 367 eps = eps or 1e-5 | |
| 368 local n, f = #pdg[1] / 3, {[0]=0.25, 0.25, 0.25, 0.25} | |
| 369 for iter = 1, 100 do | |
| 370 local F = {[0]=0, 0, 0, 0} | |
| 371 for i = 0, n - 1 do | |
| 372 local p1, p2 = {[0]=pdg[1][i*3+1], pdg[1][i*3+2], pdg[1][i*3+3]}, {[0]=pdg[2][i*3+1], pdg[2][i*3+2], pdg[2][i*3+3]} | |
| 373 local u = { [0]= | |
| 374 f[0] * (f[0] * p1[0] * p2[0] + f[1] * p1[0] * p2[1] + f[2] * p1[1] * p2[0] + f[3] * p1[1] * p2[1]), | |
| 375 f[1] * (f[0] * p1[0] * p2[1] + f[1] * p1[0] * p2[2] + f[2] * p1[1] * p2[1] + f[3] * p1[1] * p2[2]), | |
| 376 f[2] * (f[0] * p1[1] * p2[0] + f[1] * p1[1] * p2[1] + f[2] * p1[2] * p2[0] + f[3] * p1[2] * p2[1]), | |
| 377 f[3] * (f[0] * p1[1] * p2[1] + f[1] * p1[1] * p2[2] + f[2] * p1[2] * p2[1] + f[3] * p1[2] * p2[2]) | |
| 378 } | |
| 379 local s = u[0] + u[1] + u[2] + u[3] | |
| 380 s = 1 / (s * n) | |
| 381 F[0] = F[0] + u[0] * s | |
| 382 F[1] = F[1] + u[1] * s | |
| 383 F[2] = F[2] + u[2] * s | |
| 384 F[3] = F[3] + u[3] * s | |
| 385 end | |
| 386 local e = 0 | |
| 387 for k = 0, 3 do | |
| 388 e = math.abs(f[k] - F[k]) > e and math.abs(f[k] - F[k]) or e | |
| 389 end | |
| 390 for k = 0, 3 do f[k] = F[k] end | |
| 391 if e < eps then break end | |
| 392 -- print(f[0], f[1], f[2], f[3]) | |
| 393 end | |
| 394 return f | |
| 395 end | |
| 396 | |
| 397 ------------------------ | |
| 398 -- END: misc routines -- | |
| 399 ------------------------ | |
| 400 | |
| 401 | |
| 402 --------------------- | |
| 403 -- BEGIN: commands -- | |
| 404 --------------------- | |
| 405 | |
| 406 -- CMD vcf2bgl: convert PL tagged VCF to Beagle input -- | |
| 407 function cmd_vcf2bgl() | |
| 408 if #arg == 0 then | |
| 409 print("\nUsage: vcf2bgl.lua <in.vcf>") | |
| 410 print("\nNB: This command finds PL by matching /(\\d+),(\\d+),(\\d+)/.\n"); | |
| 411 os.exit(1) | |
| 412 end | |
| 413 | |
| 414 local lookup = {} | |
| 415 for i = 0, 10000 do lookup[i] = string.format("%.4f", math.pow(10, -i/10)) end | |
| 416 | |
| 417 local fp = io.xopen(arg[1]) | |
| 418 for l in fp:lines() do | |
| 419 if l:sub(1, 2) == '##' then -- meta lines; do nothing | |
| 420 elseif l:sub(1, 1) == '#' then -- sample lines | |
| 421 local t, s = l:split('\t'), {} | |
| 422 for i = 10, #t do s[#s+1] = t[i]; s[#s+1] = t[i]; s[#s+1] = t[i] end | |
| 423 print('marker', 'alleleA', 'alleleB', table.concat(s, '\t')) | |
| 424 else -- data line | |
| 425 local t = l:split('\t'); | |
| 426 if t[5] ~= '.' and t[5]:find(",") == nil and #t[5] == 1 and #t[4] == 1 then -- biallic SNP | |
| 427 local x, z = -1, {}; | |
| 428 if t[9]:find('PL') then | |
| 429 for i = 10, #t do | |
| 430 local AA, Aa, aa = t[i]:match('(%d+),(%d+),(%d+)') | |
| 431 AA = tonumber(AA); Aa = tonumber(Aa); aa = tonumber(aa); | |
| 432 if AA ~= nil then | |
| 433 z[#z+1] = lookup[AA]; z[#z+1] = lookup[Aa]; z[#z+1] = lookup[aa]; | |
| 434 else z[#z+1] = 1; z[#z+1] = 1; z[#z+1] = 1; end | |
| 435 end | |
| 436 print(t[1]..':'..t[2], t[4], t[5], table.concat(z, '\t')) | |
| 437 elseif t[9]:find('GL') then | |
| 438 print('Error: not implemented') | |
| 439 os.exit(1) | |
| 440 end | |
| 441 end | |
| 442 end | |
| 443 end | |
| 444 fp:close() | |
| 445 end | |
| 446 | |
| 447 -- CMD bgl2vcf: convert Beagle output to VCF | |
| 448 function cmd_bgl2vcf() | |
| 449 if #arg < 2 then | |
| 450 print('Usage: bgl2vcf.lua <in.phased> <in.gprobs>') | |
| 451 os.exit(1) | |
| 452 end | |
| 453 | |
| 454 local fpp = io.xopen(arg[1]); | |
| 455 local fpg = io.xopen(arg[2]); | |
| 456 for lg in fpg:lines() do | |
| 457 local tp, tg, a = fpp:read():split('%s'), lg:split('%s', 4), {} | |
| 458 if tp[1] == 'I' then | |
| 459 for i = 3, #tp, 2 do a[#a+1] = tp[i] end | |
| 460 print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', table.concat(a, '\t')) | |
| 461 else | |
| 462 local chr, pos = tg[1]:match('(%S+):(%d+)$') | |
| 463 a = {chr, pos, '.', tg[2], tg[3], 30, '.', '.', 'GT'} | |
| 464 for i = 3, #tp, 2 do | |
| 465 a[#a+1] = ((tp[i] == tg[2] and 0) or 1) .. '|' .. ((tp[i+1] == tg[2] and 0) or 1) | |
| 466 end | |
| 467 print(table.concat(a, '\t')) | |
| 468 end | |
| 469 end | |
| 470 fpg:close(); fpp:close(); | |
| 471 end | |
| 472 | |
| 473 -- CMD freq: count alleles in each population | |
| 474 function cmd_freq() | |
| 475 -- parse the command line | |
| 476 local site_only = true; -- print site allele frequency or not | |
| 477 for c in os.getopt(arg, 's') do | |
| 478 if c == 's' then site_only = false end | |
| 479 end | |
| 480 if #arg == 0 then | |
| 481 print("\nUsage: vcfutils.lua freq [-s] <in.vcf> [samples.txt]\n") | |
| 482 print("NB: 1) This command only considers biallelic variants.") | |
| 483 print(" 2) Apply '-s' to get the allele frequency spectrum.") | |
| 484 print(" 3) 'samples.txt' is TAB-delimited with each line consisting of sample and population.") | |
| 485 print("") | |
| 486 os.exit(1) | |
| 487 end | |
| 488 | |
| 489 -- read the sample-population pairs | |
| 490 local pop, sample = {}, {} | |
| 491 if #arg > 1 then | |
| 492 local fp = io.xopen(arg[2]); | |
| 493 for l in fp:lines() do | |
| 494 local s, p = l:match("^(%S+)%s+(%S+)"); -- sample, population pair | |
| 495 sample[s] = p; -- FIXME: check duplications | |
| 496 if pop[p] then table.insert(pop[p], s) | |
| 497 else pop[p] = {s} end | |
| 498 end | |
| 499 fp:close(); | |
| 500 end | |
| 501 pop['NA'] = {} | |
| 502 | |
| 503 -- parse VCF | |
| 504 fp = (#arg >= 2 and io.xopen(arg[1])) or io.stdin; | |
| 505 local col, cnt = {}, {}; | |
| 506 for k in pairs(pop) do | |
| 507 col[k], cnt[k] = {}, {[0]=0}; | |
| 508 end | |
| 509 for l in fp:lines() do | |
| 510 if l:sub(1, 2) == '##' then -- meta lines; do nothing | |
| 511 elseif l:sub(1, 1) == '#' then -- the sample line | |
| 512 local t, del_NA = l:split('\t'), true; | |
| 513 for i = 10, #t do | |
| 514 local k = sample[t[i]] | |
| 515 if k == nil then | |
| 516 k, del_NA = 'NA', false | |
| 517 table.insert(pop[k], t[i]) | |
| 518 end | |
| 519 table.insert(col[k], i); | |
| 520 table.insert(cnt[k], 0); | |
| 521 table.insert(cnt[k], 0); | |
| 522 end | |
| 523 if del_NA then pop['NA'], col['NA'], cnt['NA'] = nil, nil, nil end | |
| 524 else -- data lines | |
| 525 local t = l:split('\t'); | |
| 526 if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic | |
| 527 if site_only == true then io.write(t[1], '\t', t[2], '\t', t[4], '\t', t[5]) end | |
| 528 for k, v in pairs(col) do | |
| 529 local ac, an = 0, 0; | |
| 530 for i = 1, #v do | |
| 531 local a1, a2 = t[v[i]]:match("^(%d).(%d)"); | |
| 532 if a1 ~= nil then ac, an = ac + a1 + a2, an + 2 end | |
| 533 end | |
| 534 if site_only == true then io.write('\t', k, ':', an, ':', ac) end | |
| 535 if an == #cnt[k] then cnt[k][ac] = cnt[k][ac] + 1 end | |
| 536 end | |
| 537 if site_only == true then io.write('\n') end | |
| 538 end | |
| 539 end | |
| 540 end | |
| 541 fp:close(); | |
| 542 | |
| 543 -- print | |
| 544 if site_only == false then | |
| 545 for k, v in pairs(cnt) do | |
| 546 io.write(k .. "\t" .. #v); | |
| 547 for i = 0, #v do io.write("\t" .. v[i]) end | |
| 548 io.write('\n'); | |
| 549 end | |
| 550 end | |
| 551 end | |
| 552 | |
| 553 function cmd_vcf2chi2() | |
| 554 if #arg < 3 then | |
| 555 print("Usage: vcfutils.lua vcf2chi2 <in.vcf> <group1.list> <group2.list>"); | |
| 556 os.exit(1) | |
| 557 end | |
| 558 | |
| 559 local g = {}; | |
| 560 | |
| 561 -- read the list of groups | |
| 562 local fp = io.xopen(arg[2]); | |
| 563 for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 1 end -- FIXME: check duplicate | |
| 564 fp:close() | |
| 565 fp = io.xopen(arg[3]); | |
| 566 for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 2 end | |
| 567 fp:close() | |
| 568 | |
| 569 -- process VCF | |
| 570 fp = io.xopen(arg[1]) | |
| 571 local h = {{}, {}} | |
| 572 for l in fp:lines() do | |
| 573 if l:sub(1, 2) == '##' then print(l) -- meta lines; do nothing | |
| 574 elseif l:sub(1, 1) == '#' then -- sample lines | |
| 575 local t = l:split('\t'); | |
| 576 for i = 10, #t do | |
| 577 if g[t[i]] == 1 then table.insert(h[1], i) | |
| 578 elseif g[t[i]] == 2 then table.insert(h[2], i) end | |
| 579 end | |
| 580 while #t > 8 do table.remove(t) end | |
| 581 print(table.concat(t, "\t")) | |
| 582 else -- data line | |
| 583 local t = l:split('\t'); | |
| 584 if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic | |
| 585 local a = {{0, 0}, {0, 0}} | |
| 586 for i = 1, 2 do | |
| 587 for _, k in pairs(h[i]) do | |
| 588 if t[k]:find("^0.0") then a[i][1] = a[i][1] + 2 | |
| 589 elseif t[k]:find("^1.1") then a[i][2] = a[i][2] + 2 | |
| 590 elseif t[k]:find("^0.1") or t[k]:find("^1.0") then | |
| 591 a[i][1], a[i][2] = a[i][1] + 1, a[i][2] + 1 | |
| 592 end | |
| 593 end | |
| 594 end | |
| 595 local chi2, p, succ = matrix.chi2(a); | |
| 596 while #t > 8 do table.remove(t) end | |
| 597 --print(a[1][1], a[1][2], a[2][1], a[2][2], chi2, p); | |
| 598 if succ then print(table.concat(t, "\t") .. ";PCHI2=" .. string.format("%.3g", p) | |
| 599 .. string.format(';AF1=%.4g;AF2=%.4g,%.4g', (a[1][2]+a[2][2]) / (a[1][1]+a[1][2]+a[2][1]+a[2][2]), | |
| 600 a[1][2]/(a[1][1]+a[1][2]), a[2][2]/(a[2][1]+a[2][2]))) | |
| 601 else print(table.concat(t, "\t")) end | |
| 602 end | |
| 603 end | |
| 604 end | |
| 605 fp:close() | |
| 606 end | |
| 607 | |
| 608 -- CMD: compute r^2 | |
| 609 function cmd_r2() | |
| 610 local w, is_ht, is_gt = 1, false, false | |
| 611 for o, a in os.getopt(arg, 'w:hg') do | |
| 612 if o == 'w' then w = tonumber(a) | |
| 613 elseif o == 'h' then is_ht, is_gt = true, true | |
| 614 elseif o == 'g' then is_gt = true | |
| 615 end | |
| 616 end | |
| 617 if #arg == 0 then | |
| 618 print("Usage: vcfutils.lua r2 [-hg] [-w 1] <in.vcf>") | |
| 619 os.exit(1) | |
| 620 end | |
| 621 local stack, fp, q2p = {}, io.xopen(arg[1]), algo_init_q2p(1023) | |
| 622 for l in fp:lines() do | |
| 623 if l:sub(1, 1) ~= '#' then | |
| 624 local t = l:split('\t') | |
| 625 local x = text_parse_pl(t, q2p) | |
| 626 if #t[5] == 1 and t[5] ~= '.' then -- biallelic | |
| 627 local r2 = {} | |
| 628 for k = 1, w do | |
| 629 if is_gt == false then -- use PL | |
| 630 if stack[k] then | |
| 631 local pdg = { stack[k][5], x[5] } | |
| 632 r2[#r2+1] = algo_r2(algo_hapfreq2(pdg)) | |
| 633 else r2[#r2+1] = 0 end | |
| 634 elseif is_ht == false then -- use unphased GT | |
| 635 if stack[k] then | |
| 636 local pdg = { stack[k][4], x[4] } | |
| 637 r2[#r2+1] = algo_r2(algo_hapfreq2(pdg)) | |
| 638 else r2[#r2+1] = 0 end | |
| 639 else -- use phased GT | |
| 640 if stack[k] then | |
| 641 local f, ht = { [0]=0, 0, 0, 0 }, { stack[k][3], x[3] } | |
| 642 for i = 1, #ht[1] do | |
| 643 local j = ht[1][i] * 2 + ht[2][i] | |
| 644 f[j] = f[j] + 1 | |
| 645 end | |
| 646 local sum = f[0] + f[1] + f[2] + f[3] | |
| 647 for k = 0, 3 do f[k] = f[k] / sum end | |
| 648 r2[#r2+1] = algo_r2(f) | |
| 649 else r2[#r2+1] = 0 end | |
| 650 end | |
| 651 end | |
| 652 for k = 1, #r2 do | |
| 653 r2[k] = string.format('%.3f', r2[k]) | |
| 654 end | |
| 655 print(x[1], x[2], table.concat(r2, '\t')) | |
| 656 if #stack == w then table.remove(stack, 1) end | |
| 657 stack[#stack+1] = x | |
| 658 end | |
| 659 end | |
| 660 end | |
| 661 fp:close() | |
| 662 end | |
| 663 | |
| 664 ------------------- | |
| 665 -- END: commands -- | |
| 666 ------------------- | |
| 667 | |
| 668 | |
| 669 ------------------- | |
| 670 -- MAIN FUNCTION -- | |
| 671 ------------------- | |
| 672 | |
| 673 if #arg == 0 then | |
| 674 print("\nUsage: vcfutils.lua <command> <arguments>\n") | |
| 675 print("Command: freq count biallelic alleles in each population") | |
| 676 print(" r2 compute r^2") | |
| 677 print(" vcf2chi2 compute 1-degree chi-square between two groups of samples") | |
| 678 print(" vcf2bgl convert PL annotated VCF to Beagle input") | |
| 679 print(" bgl2vcf convert Beagle input to VCF") | |
| 680 print("") | |
| 681 os.exit(1) | |
| 682 end | |
| 683 | |
| 684 local cmd = arg[1] | |
| 685 table.remove(arg, 1) | |
| 686 if cmd == 'vcf2bgl' then cmd_vcf2bgl() | |
| 687 elseif cmd == 'bgl2vcf' then cmd_bgl2vcf() | |
| 688 elseif cmd == 'freq' then cmd_freq() | |
| 689 elseif cmd == 'r2' then cmd_r2() | |
| 690 elseif cmd == 'vcf2chi2' then cmd_vcf2chi2() | |
| 691 else | |
| 692 print('ERROR: unknown command "' .. cmd .. '"') | |
| 693 os.exit(1) | |
| 694 end |
