0
|
1 /* The MIT License
|
|
2
|
|
3 Copyright (c) 2008 Genome Research Ltd (GRL).
|
|
4 2011 Heng Li <lh3@live.co.uk>
|
|
5
|
|
6 Permission is hereby granted, free of charge, to any person obtaining
|
|
7 a copy of this software and associated documentation files (the
|
|
8 "Software"), to deal in the Software without restriction, including
|
|
9 without limitation the rights to use, copy, modify, merge, publish,
|
|
10 distribute, sublicense, and/or sell copies of the Software, and to
|
|
11 permit persons to whom the Software is furnished to do so, subject to
|
|
12 the following conditions:
|
|
13
|
|
14 The above copyright notice and this permission notice shall be
|
|
15 included in all copies or substantial portions of the Software.
|
|
16
|
|
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
24 SOFTWARE.
|
|
25 */
|
|
26
|
|
27 /* This program is separated from maq's read simulator with Colin
|
|
28 * Hercus' modification to allow longer indels. */
|
|
29
|
|
30 #include <stdlib.h>
|
|
31 #include <math.h>
|
|
32 #include <time.h>
|
|
33 #include <assert.h>
|
|
34 #include <stdio.h>
|
|
35 #include <unistd.h>
|
|
36 #include <stdint.h>
|
|
37 #include <ctype.h>
|
|
38 #include <string.h>
|
|
39 #include <zlib.h>
|
|
40 #include "kseq.h"
|
|
41 KSEQ_INIT(gzFile, gzread)
|
|
42
|
|
43 #define PACKAGE_VERSION "0.3.0"
|
|
44
|
|
45 const uint8_t nst_nt4_table[256] = {
|
|
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
|
|
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
50 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
51 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
52 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
53 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
|
62 };
|
|
63
|
|
64 /* Simple normal random number generator, copied from genran.c */
|
|
65
|
|
66 double ran_normal()
|
|
67 {
|
|
68 static int iset = 0;
|
|
69 static double gset;
|
|
70 double fac, rsq, v1, v2;
|
|
71 if (iset == 0) {
|
|
72 do {
|
|
73 v1 = 2.0 * drand48() - 1.0;
|
|
74 v2 = 2.0 * drand48() - 1.0;
|
|
75 rsq = v1 * v1 + v2 * v2;
|
|
76 } while (rsq >= 1.0 || rsq == 0.0);
|
|
77 fac = sqrt(-2.0 * log(rsq) / rsq);
|
|
78 gset = v1 * fac;
|
|
79 iset = 1;
|
|
80 return v2 * fac;
|
|
81 } else {
|
|
82 iset = 0;
|
|
83 return gset;
|
|
84 }
|
|
85 }
|
|
86
|
|
87 /* wgsim */
|
|
88
|
|
89 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
|
|
90 typedef unsigned short mut_t;
|
|
91 static mut_t mutmsk = (mut_t)0xf000;
|
|
92
|
|
93 typedef struct {
|
|
94 int l, m; /* length and maximum buffer size */
|
|
95 mut_t *s; /* sequence */
|
|
96 } mutseq_t;
|
|
97
|
|
98 static double ERR_RATE = 0.02;
|
|
99 static double MUT_RATE = 0.001;
|
|
100 static double INDEL_FRAC = 0.15;
|
|
101 static double INDEL_EXTEND = 0.3;
|
|
102 static double MAX_N_RATIO = 0.1;
|
|
103
|
|
104 void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
|
|
105 {
|
|
106 int i, deleting = 0;
|
|
107 mutseq_t *ret[2];
|
|
108
|
|
109 ret[0] = hap1; ret[1] = hap2;
|
|
110 ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
|
|
111 ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
|
|
112 ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
|
|
113 ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
|
|
114 for (i = 0; i != ks->seq.l; ++i) {
|
|
115 int c;
|
|
116 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
|
|
117 if (deleting) {
|
|
118 if (drand48() < INDEL_EXTEND) {
|
|
119 if (deleting & 1) ret[0]->s[i] |= DELETE;
|
|
120 if (deleting & 2) ret[1]->s[i] |= DELETE;
|
|
121 continue;
|
|
122 } else deleting = 0;
|
|
123 }
|
|
124 if (c < 4 && drand48() < MUT_RATE) { // mutation
|
|
125 if (drand48() >= INDEL_FRAC) { // substitution
|
|
126 double r = drand48();
|
|
127 c = (c + (int)(r * 3.0 + 1)) & 3;
|
|
128 if (is_hap || drand48() < 0.333333) { // hom
|
|
129 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
|
|
130 } else { // het
|
|
131 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
|
|
132 }
|
|
133 } else { // indel
|
|
134 if (drand48() < 0.5) { // deletion
|
|
135 if (is_hap || drand48() < 0.333333) { // hom-del
|
|
136 ret[0]->s[i] = ret[1]->s[i] = DELETE;
|
|
137 deleting = 3;
|
|
138 } else { // het-del
|
|
139 deleting = drand48()<0.5?1:2;
|
|
140 ret[deleting-1]->s[i] = DELETE;
|
|
141 }
|
|
142 } else { // insertion
|
|
143 int num_ins = 0, ins = 0;
|
|
144 do {
|
|
145 num_ins++;
|
|
146 ins = (ins << 2) | (int)(drand48() * 4.0);
|
|
147 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
|
|
148
|
|
149 if (is_hap || drand48() < 0.333333) { // hom-ins
|
|
150 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
|
|
151 } else { // het-ins
|
|
152 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
|
|
153 }
|
|
154 }
|
|
155 }
|
|
156 }
|
|
157 }
|
|
158 }
|
|
159 void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
|
|
160 {
|
|
161 int i;
|
|
162 for (i = 0; i != ks->seq.l; ++i) {
|
|
163 int c[3];
|
|
164 c[0] = nst_nt4_table[(int)ks->seq.s[i]];
|
|
165 c[1] = hap1->s[i]; c[2] = hap2->s[i];
|
|
166 if (c[0] >= 4) continue;
|
|
167 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
|
|
168 printf("%s\t%d\t", name, i+1);
|
|
169 if (c[1] == c[2]) { // hom
|
|
170 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
|
|
171 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
|
|
172 } else if ((c[1]&mutmsk) == DELETE) { // del
|
|
173 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
|
|
174 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
|
|
175 printf("-\t");
|
|
176 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
|
|
177 while (n > 0) {
|
|
178 putchar("ACGTN"[ins & 0x3]);
|
|
179 ins >>= 2;
|
|
180 n--;
|
|
181 }
|
|
182 printf("\t-\n");
|
|
183 } else assert(0);
|
|
184 } else { // het
|
|
185 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
|
|
186 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
|
|
187 } else if ((c[1]&mutmsk) == DELETE) {
|
|
188 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
|
|
189 } else if ((c[2]&mutmsk) == DELETE) {
|
|
190 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
|
|
191 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
|
|
192 printf("-\t");
|
|
193 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
|
|
194 while (n > 0) {
|
|
195 putchar("ACGTN"[ins & 0x3]);
|
|
196 ins >>= 2;
|
|
197 n--;
|
|
198 }
|
|
199 printf("\t+\n");
|
|
200 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
|
|
201 printf("-\t");
|
|
202 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
|
|
203 while (n > 0) {
|
|
204 putchar("ACGTN"[ins & 0x3]);
|
|
205 ins >>= 2;
|
|
206 n--;
|
|
207 }
|
|
208 printf("\t+\n");
|
|
209 } else assert(0);
|
|
210 }
|
|
211 }
|
|
212 }
|
|
213 }
|
|
214
|
|
215 void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
|
|
216 {
|
|
217 kseq_t *ks;
|
|
218 mutseq_t rseq[2];
|
|
219 gzFile fp_fa;
|
|
220 uint64_t tot_len, ii;
|
|
221 int i, l, n_ref;
|
|
222 char *qstr;
|
|
223 int size[2], Q, max_size;
|
|
224 uint8_t *tmp_seq[2];
|
|
225 mut_t *target;
|
|
226
|
|
227 l = size_l > size_r? size_l : size_r;
|
|
228 qstr = (char*)calloc(l+1, 1);
|
|
229 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
|
|
230 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
|
|
231 size[0] = size_l; size[1] = size_r;
|
|
232 max_size = size_l > size_r? size_l : size_r;
|
|
233
|
|
234 Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
|
|
235
|
|
236 fp_fa = gzopen(fn, "r");
|
|
237 ks = kseq_init(fp_fa);
|
|
238 tot_len = n_ref = 0;
|
|
239 fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
|
|
240 while ((l = kseq_read(ks)) >= 0) {
|
|
241 tot_len += l;
|
|
242 ++n_ref;
|
|
243 }
|
|
244 fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
|
|
245 kseq_destroy(ks);
|
|
246 gzclose(fp_fa);
|
|
247
|
|
248 fp_fa = gzopen(fn, "r");
|
|
249 ks = kseq_init(fp_fa);
|
|
250 while ((l = kseq_read(ks)) >= 0) {
|
|
251 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
|
|
252 if (l < dist + 3 * std_dev) {
|
|
253 fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
|
|
254 continue;
|
|
255 }
|
|
256
|
|
257 // generate mutations and print them out
|
|
258 wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
|
|
259 wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
|
|
260
|
|
261 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
|
|
262 double ran;
|
|
263 int d, pos, s[2], is_flip = 0;
|
|
264 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
|
|
265 FILE *fpo[2];
|
|
266
|
|
267 do { // avoid boundary failure
|
|
268 ran = ran_normal();
|
|
269 ran = ran * std_dev + dist;
|
|
270 d = (int)(ran + 0.5);
|
|
271 d = d > max_size? d : max_size;
|
|
272 pos = (int)((l - d + 1) * drand48());
|
|
273 } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
|
|
274
|
|
275 // flip or not
|
|
276 if (drand48() < 0.5) {
|
|
277 fpo[0] = fpout1; fpo[1] = fpout2;
|
|
278 s[0] = size[0]; s[1] = size[1];
|
|
279 } else {
|
|
280 fpo[1] = fpout1; fpo[0] = fpout2;
|
|
281 s[1] = size[0]; s[0] = size[1];
|
|
282 is_flip = 1;
|
|
283 }
|
|
284
|
|
285 // generate the read sequences
|
|
286 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
|
|
287 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
|
|
288
|
|
289 #define __gen_read(x, start, iter) do { \
|
|
290 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \
|
|
291 int c = target[i], mut_type = c & mutmsk; \
|
|
292 if (ext_coor[x] < 0) { \
|
|
293 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
|
|
294 ext_coor[x] = i; \
|
|
295 } \
|
|
296 if (mut_type == DELETE) ++n_indel[x]; \
|
|
297 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
|
|
298 tmp_seq[x][k++] = c & 0xf; \
|
|
299 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
|
|
300 } else { \
|
|
301 int n, ins; \
|
|
302 ++n_indel[x]; \
|
|
303 tmp_seq[x][k++] = c & 0xf; \
|
|
304 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
|
|
305 tmp_seq[x][k++] = ins & 0x3; \
|
|
306 } \
|
|
307 } \
|
|
308 if (k != s[x]) ext_coor[x] = -10; \
|
|
309 } while (0)
|
|
310
|
|
311 __gen_read(0, pos, ++i);
|
|
312 __gen_read(1, pos + d - 1, --i);
|
|
313 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
|
|
314 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
|
|
315 --ii;
|
|
316 continue;
|
|
317 }
|
|
318
|
|
319 // generate sequencing errors
|
|
320 for (j = 0; j < 2; ++j) {
|
|
321 int n_n = 0;
|
|
322 for (i = 0; i < s[j]; ++i) {
|
|
323 int c = tmp_seq[j][i];
|
|
324 if (c >= 4) { // actually c should be never larger than 4 if everything is correct
|
|
325 c = 4;
|
|
326 ++n_n;
|
|
327 } else if (drand48() < ERR_RATE) {
|
|
328 // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
|
|
329 c = (c + 1) & 3; // recurrent sequencing errors
|
|
330 ++n_err[j];
|
|
331 }
|
|
332 tmp_seq[j][i] = c;
|
|
333 }
|
|
334 if ((double)n_n / s[j] > MAX_N_RATIO) break;
|
|
335 }
|
|
336 if (j < 2) { // too many ambiguous bases on one of the reads
|
|
337 --ii;
|
|
338 continue;
|
|
339 }
|
|
340
|
|
341 // print
|
|
342 for (j = 0; j < 2; ++j) {
|
|
343 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
|
|
344 qstr[i] = 0;
|
|
345 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
|
|
346 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
|
|
347 (long long)ii, j==0? is_flip+1 : 2-is_flip);
|
|
348 for (i = 0; i < s[j]; ++i)
|
|
349 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
|
|
350 fprintf(fpo[j], "\n+\n%s\n", qstr);
|
|
351 }
|
|
352 }
|
|
353 free(rseq[0].s); free(rseq[1].s);
|
|
354 }
|
|
355 kseq_destroy(ks);
|
|
356 gzclose(fp_fa);
|
|
357 free(qstr);
|
|
358 free(tmp_seq[0]); free(tmp_seq[1]);
|
|
359 }
|
|
360
|
|
361 static int simu_usage()
|
|
362 {
|
|
363 fprintf(stderr, "\n");
|
|
364 fprintf(stderr, "Program: wgsim (short read simulator)\n");
|
|
365 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
|
|
366 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
|
|
367 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
|
|
368 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
|
|
369 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
|
|
370 fprintf(stderr, " -s INT standard deviation [50]\n");
|
|
371 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
|
|
372 fprintf(stderr, " -1 INT length of the first read [70]\n");
|
|
373 fprintf(stderr, " -2 INT length of the second read [70]\n");
|
|
374 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
|
|
375 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
|
|
376 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
|
|
377 fprintf(stderr, " -S INT seed for random generator [-1]\n");
|
|
378 fprintf(stderr, " -h haplotype mode\n");
|
|
379 fprintf(stderr, "\n");
|
|
380 return 1;
|
|
381 }
|
|
382
|
|
383 int main(int argc, char *argv[])
|
|
384 {
|
|
385 int64_t N;
|
|
386 int dist, std_dev, c, size_l, size_r, is_hap = 0;
|
|
387 FILE *fpout1, *fpout2;
|
|
388 int seed = -1;
|
|
389
|
|
390 N = 1000000; dist = 500; std_dev = 50;
|
|
391 size_l = size_r = 70;
|
|
392 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) {
|
|
393 switch (c) {
|
|
394 case 'd': dist = atoi(optarg); break;
|
|
395 case 's': std_dev = atoi(optarg); break;
|
|
396 case 'N': N = atoi(optarg); break;
|
|
397 case '1': size_l = atoi(optarg); break;
|
|
398 case '2': size_r = atoi(optarg); break;
|
|
399 case 'e': ERR_RATE = atof(optarg); break;
|
|
400 case 'r': MUT_RATE = atof(optarg); break;
|
|
401 case 'R': INDEL_FRAC = atof(optarg); break;
|
|
402 case 'X': INDEL_EXTEND = atof(optarg); break;
|
|
403 case 'S': seed = atoi(optarg); break;
|
|
404 case 'h': is_hap = 1; break;
|
|
405 }
|
|
406 }
|
|
407 if (argc - optind < 3) return simu_usage();
|
|
408 fpout1 = fopen(argv[optind+1], "w");
|
|
409 fpout2 = fopen(argv[optind+2], "w");
|
|
410 if (!fpout1 || !fpout2) {
|
|
411 fprintf(stderr, "[wgsim] file open error\n");
|
|
412 return 1;
|
|
413 }
|
|
414 srand48(seed > 0? seed : time(0));
|
|
415 wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
|
|
416
|
|
417 fclose(fpout1); fclose(fpout2);
|
|
418 return 0;
|
|
419 }
|