comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/wgsim.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4 2011 Heng Li <lh3@live.co.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice shall be
15 included in all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 SOFTWARE.
25 */
26
27 /* This program is separated from maq's read simulator with Colin
28 * Hercus' modification to allow longer indels. */
29
30 #include <stdlib.h>
31 #include <math.h>
32 #include <time.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include <unistd.h>
36 #include <stdint.h>
37 #include <ctype.h>
38 #include <string.h>
39 #include <zlib.h>
40 #include "kseq.h"
41 KSEQ_INIT(gzFile, gzread)
42
43 #define PACKAGE_VERSION "0.3.0"
44
45 const uint8_t nst_nt4_table[256] = {
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
62 };
63
64 /* Simple normal random number generator, copied from genran.c */
65
66 double ran_normal()
67 {
68 static int iset = 0;
69 static double gset;
70 double fac, rsq, v1, v2;
71 if (iset == 0) {
72 do {
73 v1 = 2.0 * drand48() - 1.0;
74 v2 = 2.0 * drand48() - 1.0;
75 rsq = v1 * v1 + v2 * v2;
76 } while (rsq >= 1.0 || rsq == 0.0);
77 fac = sqrt(-2.0 * log(rsq) / rsq);
78 gset = v1 * fac;
79 iset = 1;
80 return v2 * fac;
81 } else {
82 iset = 0;
83 return gset;
84 }
85 }
86
87 /* wgsim */
88
89 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
90 typedef unsigned short mut_t;
91 static mut_t mutmsk = (mut_t)0xf000;
92
93 typedef struct {
94 int l, m; /* length and maximum buffer size */
95 mut_t *s; /* sequence */
96 } mutseq_t;
97
98 static double ERR_RATE = 0.02;
99 static double MUT_RATE = 0.001;
100 static double INDEL_FRAC = 0.15;
101 static double INDEL_EXTEND = 0.3;
102 static double MAX_N_RATIO = 0.1;
103
104 void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
105 {
106 int i, deleting = 0;
107 mutseq_t *ret[2];
108
109 ret[0] = hap1; ret[1] = hap2;
110 ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
111 ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
112 ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
113 ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
114 for (i = 0; i != ks->seq.l; ++i) {
115 int c;
116 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
117 if (deleting) {
118 if (drand48() < INDEL_EXTEND) {
119 if (deleting & 1) ret[0]->s[i] |= DELETE;
120 if (deleting & 2) ret[1]->s[i] |= DELETE;
121 continue;
122 } else deleting = 0;
123 }
124 if (c < 4 && drand48() < MUT_RATE) { // mutation
125 if (drand48() >= INDEL_FRAC) { // substitution
126 double r = drand48();
127 c = (c + (int)(r * 3.0 + 1)) & 3;
128 if (is_hap || drand48() < 0.333333) { // hom
129 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
130 } else { // het
131 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
132 }
133 } else { // indel
134 if (drand48() < 0.5) { // deletion
135 if (is_hap || drand48() < 0.333333) { // hom-del
136 ret[0]->s[i] = ret[1]->s[i] = DELETE;
137 deleting = 3;
138 } else { // het-del
139 deleting = drand48()<0.5?1:2;
140 ret[deleting-1]->s[i] = DELETE;
141 }
142 } else { // insertion
143 int num_ins = 0, ins = 0;
144 do {
145 num_ins++;
146 ins = (ins << 2) | (int)(drand48() * 4.0);
147 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
148
149 if (is_hap || drand48() < 0.333333) { // hom-ins
150 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
151 } else { // het-ins
152 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
153 }
154 }
155 }
156 }
157 }
158 }
159 void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
160 {
161 int i;
162 for (i = 0; i != ks->seq.l; ++i) {
163 int c[3];
164 c[0] = nst_nt4_table[(int)ks->seq.s[i]];
165 c[1] = hap1->s[i]; c[2] = hap2->s[i];
166 if (c[0] >= 4) continue;
167 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
168 printf("%s\t%d\t", name, i+1);
169 if (c[1] == c[2]) { // hom
170 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
171 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
172 } else if ((c[1]&mutmsk) == DELETE) { // del
173 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
174 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
175 printf("-\t");
176 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
177 while (n > 0) {
178 putchar("ACGTN"[ins & 0x3]);
179 ins >>= 2;
180 n--;
181 }
182 printf("\t-\n");
183 } else assert(0);
184 } else { // het
185 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
186 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
187 } else if ((c[1]&mutmsk) == DELETE) {
188 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
189 } else if ((c[2]&mutmsk) == DELETE) {
190 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
191 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
192 printf("-\t");
193 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
194 while (n > 0) {
195 putchar("ACGTN"[ins & 0x3]);
196 ins >>= 2;
197 n--;
198 }
199 printf("\t+\n");
200 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
201 printf("-\t");
202 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
203 while (n > 0) {
204 putchar("ACGTN"[ins & 0x3]);
205 ins >>= 2;
206 n--;
207 }
208 printf("\t+\n");
209 } else assert(0);
210 }
211 }
212 }
213 }
214
215 void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
216 {
217 kseq_t *ks;
218 mutseq_t rseq[2];
219 gzFile fp_fa;
220 uint64_t tot_len, ii;
221 int i, l, n_ref;
222 char *qstr;
223 int size[2], Q, max_size;
224 uint8_t *tmp_seq[2];
225 mut_t *target;
226
227 l = size_l > size_r? size_l : size_r;
228 qstr = (char*)calloc(l+1, 1);
229 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
230 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
231 size[0] = size_l; size[1] = size_r;
232 max_size = size_l > size_r? size_l : size_r;
233
234 Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
235
236 fp_fa = gzopen(fn, "r");
237 ks = kseq_init(fp_fa);
238 tot_len = n_ref = 0;
239 fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
240 while ((l = kseq_read(ks)) >= 0) {
241 tot_len += l;
242 ++n_ref;
243 }
244 fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
245 kseq_destroy(ks);
246 gzclose(fp_fa);
247
248 fp_fa = gzopen(fn, "r");
249 ks = kseq_init(fp_fa);
250 while ((l = kseq_read(ks)) >= 0) {
251 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
252 if (l < dist + 3 * std_dev) {
253 fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
254 continue;
255 }
256
257 // generate mutations and print them out
258 wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
259 wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
260
261 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
262 double ran;
263 int d, pos, s[2], is_flip = 0;
264 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
265 FILE *fpo[2];
266
267 do { // avoid boundary failure
268 ran = ran_normal();
269 ran = ran * std_dev + dist;
270 d = (int)(ran + 0.5);
271 d = d > max_size? d : max_size;
272 pos = (int)((l - d + 1) * drand48());
273 } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
274
275 // flip or not
276 if (drand48() < 0.5) {
277 fpo[0] = fpout1; fpo[1] = fpout2;
278 s[0] = size[0]; s[1] = size[1];
279 } else {
280 fpo[1] = fpout1; fpo[0] = fpout2;
281 s[1] = size[0]; s[0] = size[1];
282 is_flip = 1;
283 }
284
285 // generate the read sequences
286 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
287 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
288
289 #define __gen_read(x, start, iter) do { \
290 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \
291 int c = target[i], mut_type = c & mutmsk; \
292 if (ext_coor[x] < 0) { \
293 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
294 ext_coor[x] = i; \
295 } \
296 if (mut_type == DELETE) ++n_indel[x]; \
297 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
298 tmp_seq[x][k++] = c & 0xf; \
299 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
300 } else { \
301 int n, ins; \
302 ++n_indel[x]; \
303 tmp_seq[x][k++] = c & 0xf; \
304 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
305 tmp_seq[x][k++] = ins & 0x3; \
306 } \
307 } \
308 if (k != s[x]) ext_coor[x] = -10; \
309 } while (0)
310
311 __gen_read(0, pos, ++i);
312 __gen_read(1, pos + d - 1, --i);
313 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
314 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
315 --ii;
316 continue;
317 }
318
319 // generate sequencing errors
320 for (j = 0; j < 2; ++j) {
321 int n_n = 0;
322 for (i = 0; i < s[j]; ++i) {
323 int c = tmp_seq[j][i];
324 if (c >= 4) { // actually c should be never larger than 4 if everything is correct
325 c = 4;
326 ++n_n;
327 } else if (drand48() < ERR_RATE) {
328 // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
329 c = (c + 1) & 3; // recurrent sequencing errors
330 ++n_err[j];
331 }
332 tmp_seq[j][i] = c;
333 }
334 if ((double)n_n / s[j] > MAX_N_RATIO) break;
335 }
336 if (j < 2) { // too many ambiguous bases on one of the reads
337 --ii;
338 continue;
339 }
340
341 // print
342 for (j = 0; j < 2; ++j) {
343 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
344 qstr[i] = 0;
345 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
346 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
347 (long long)ii, j==0? is_flip+1 : 2-is_flip);
348 for (i = 0; i < s[j]; ++i)
349 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
350 fprintf(fpo[j], "\n+\n%s\n", qstr);
351 }
352 }
353 free(rseq[0].s); free(rseq[1].s);
354 }
355 kseq_destroy(ks);
356 gzclose(fp_fa);
357 free(qstr);
358 free(tmp_seq[0]); free(tmp_seq[1]);
359 }
360
361 static int simu_usage()
362 {
363 fprintf(stderr, "\n");
364 fprintf(stderr, "Program: wgsim (short read simulator)\n");
365 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
366 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
367 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
368 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
369 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
370 fprintf(stderr, " -s INT standard deviation [50]\n");
371 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
372 fprintf(stderr, " -1 INT length of the first read [70]\n");
373 fprintf(stderr, " -2 INT length of the second read [70]\n");
374 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
375 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
376 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
377 fprintf(stderr, " -S INT seed for random generator [-1]\n");
378 fprintf(stderr, " -h haplotype mode\n");
379 fprintf(stderr, "\n");
380 return 1;
381 }
382
383 int main(int argc, char *argv[])
384 {
385 int64_t N;
386 int dist, std_dev, c, size_l, size_r, is_hap = 0;
387 FILE *fpout1, *fpout2;
388 int seed = -1;
389
390 N = 1000000; dist = 500; std_dev = 50;
391 size_l = size_r = 70;
392 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) {
393 switch (c) {
394 case 'd': dist = atoi(optarg); break;
395 case 's': std_dev = atoi(optarg); break;
396 case 'N': N = atoi(optarg); break;
397 case '1': size_l = atoi(optarg); break;
398 case '2': size_r = atoi(optarg); break;
399 case 'e': ERR_RATE = atof(optarg); break;
400 case 'r': MUT_RATE = atof(optarg); break;
401 case 'R': INDEL_FRAC = atof(optarg); break;
402 case 'X': INDEL_EXTEND = atof(optarg); break;
403 case 'S': seed = atoi(optarg); break;
404 case 'h': is_hap = 1; break;
405 }
406 }
407 if (argc - optind < 3) return simu_usage();
408 fpout1 = fopen(argv[optind+1], "w");
409 fpout2 = fopen(argv[optind+2], "w");
410 if (!fpout1 || !fpout2) {
411 fprintf(stderr, "[wgsim] file open error\n");
412 return 1;
413 }
414 srand48(seed > 0? seed : time(0));
415 wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
416
417 fclose(fpout1); fclose(fpout2);
418 return 0;
419 }