comparison pyPRADA_1.2/tools/samtools-0.1.16/misc/wgsim.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 /* This program is separated from maq's read simulator with Colin
29 * Hercus' modification to allow longer indels. Colin is the chief
30 * developer of novoalign. */
31
32 #include <stdlib.h>
33 #include <math.h>
34 #include <time.h>
35 #include <assert.h>
36 #include <stdio.h>
37 #include <unistd.h>
38 #include <stdint.h>
39 #include <ctype.h>
40 #include <string.h>
41
42 #define PACKAGE_VERSION "0.2.3"
43
44 const uint8_t nst_nt4_table[256] = {
45 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
61 };
62
63 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
64
65 /* Simple normal random number generator, copied from genran.c */
66
67 double ran_normal()
68 {
69 static int iset = 0;
70 static double gset;
71 double fac, rsq, v1, v2;
72 if (iset == 0) {
73 do {
74 v1 = 2.0 * drand48() - 1.0;
75 v2 = 2.0 * drand48() - 1.0;
76 rsq = v1 * v1 + v2 * v2;
77 } while (rsq >= 1.0 || rsq == 0.0);
78 fac = sqrt(-2.0 * log(rsq) / rsq);
79 gset = v1 * fac;
80 iset = 1;
81 return v2 * fac;
82 } else {
83 iset = 0;
84 return gset;
85 }
86 }
87
88 /* FASTA parser, copied from seq.c */
89
90 typedef struct {
91 int l, m; /* length and maximum buffer size */
92 unsigned char *s; /* sequence */
93 } seq_t;
94
95 #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
96
97 static int SEQ_BLOCK_SIZE = 512;
98
99 void seq_set_block_size(int size)
100 {
101 SEQ_BLOCK_SIZE = size;
102 }
103
104 int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
105 {
106 int c, l, max;
107 char *p;
108
109 c = 0;
110 while (!feof(fp) && fgetc(fp) != '>');
111 if (feof(fp)) return -1;
112 p = locus;
113 while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
114 if (c != '\r') *p++ = c;
115 *p = '\0';
116 if (comment) {
117 p = comment;
118 if (c != '\n') {
119 while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
120 if (c != '\n') {
121 *p++ = c;
122 while (!feof(fp) && (c = fgetc(fp)) != '\n')
123 if (c != '\r') *p++ = c;
124 }
125 }
126 *p = '\0';
127 } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
128 l = 0; max = seq->m;
129 while (!feof(fp) && (c = fgetc(fp)) != '>') {
130 if (isalpha(c) || c == '-' || c == '.') {
131 if (l + 1 >= max) {
132 max += SEQ_BLOCK_SIZE;
133 seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
134 }
135 seq->s[l++] = (unsigned char)c;
136 }
137 }
138 if (c == '>') ungetc(c,fp);
139 seq->s[l] = 0;
140 seq->m = max; seq->l = l;
141 return l;
142 }
143
144 /* Error-checking open, copied from utils.c */
145
146 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
147
148 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
149 {
150 FILE *fp = 0;
151 if (strcmp(fn, "-") == 0)
152 return (strstr(mode, "r"))? stdin : stdout;
153 if ((fp = fopen(fn, mode)) == 0) {
154 fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
155 abort();
156 }
157 return fp;
158 }
159
160 /* wgsim */
161
162 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
163 typedef unsigned short mut_t;
164 static mut_t mutmsk = (mut_t)0xf000;
165
166 typedef struct {
167 int l, m; /* length and maximum buffer size */
168 mut_t *s; /* sequence */
169 } mutseq_t;
170
171 static double ERR_RATE = 0.02;
172 static double MUT_RATE = 0.001;
173 static double INDEL_FRAC = 0.1;
174 static double INDEL_EXTEND = 0.3;
175 static int IS_SOLID = 0;
176 static int SHOW_MM_INFO = 1;
177
178 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
179 {
180 int i, deleting = 0;
181 mutseq_t *ret[2];
182
183 ret[0] = hap1; ret[1] = hap2;
184 ret[0]->l = seq->l; ret[1]->l = seq->l;
185 ret[0]->m = seq->m; ret[1]->m = seq->m;
186 ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
187 ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
188 for (i = 0; i != seq->l; ++i) {
189 int c;
190 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
191 if (deleting) {
192 if (drand48() < INDEL_EXTEND) {
193 if (deleting & 1) ret[0]->s[i] |= DELETE;
194 if (deleting & 2) ret[1]->s[i] |= DELETE;
195 continue;
196 } else deleting = 0;
197 }
198 if (c < 4 && drand48() < MUT_RATE) { // mutation
199 if (drand48() >= INDEL_FRAC) { // substitution
200 double r = drand48();
201 c = (c + (int)(r * 3.0 + 1)) & 3;
202 if (is_hap || drand48() < 0.333333) { // hom
203 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
204 } else { // het
205 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
206 }
207 } else { // indel
208 if (drand48() < 0.5) { // deletion
209 if (is_hap || drand48() < 0.333333) { // hom-del
210 ret[0]->s[i] = ret[1]->s[i] = DELETE;
211 deleting = 3;
212 } else { // het-del
213 deleting = drand48()<0.5?1:2;
214 ret[deleting-1]->s[i] = DELETE;
215 }
216 } else { // insertion
217 int num_ins = 0, ins = 0;
218 do {
219 num_ins++;
220 ins = (ins << 2) | (int)(drand48() * 4.0);
221 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
222
223 if (is_hap || drand48() < 0.333333) { // hom-ins
224 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
225 } else { // het-ins
226 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
227 }
228 }
229 }
230 }
231 }
232 }
233 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
234 {
235 int i;
236 for (i = 0; i != seq->l; ++i) {
237 int c[3];
238 c[0] = nst_nt4_table[(int)seq->s[i]];
239 c[1] = hap1->s[i]; c[2] = hap2->s[i];
240 if (c[0] >= 4) continue;
241 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
242 printf("%s\t%d\t", name, i+1);
243 if (c[1] == c[2]) { // hom
244 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
245 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
246 } else if ((c[1]&mutmsk) == DELETE) { // del
247 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
248 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
249 printf("-\t");
250 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
251 while(n > 0) {
252 putchar("ACGTN"[ins & 0x3]);
253 n--;
254 }
255 printf("\t-\n");
256 } else assert(0);
257 } else { // het
258 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
259 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
260 } else if ((c[1]&mutmsk) == DELETE) {
261 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
262 } else if ((c[2]&mutmsk) == DELETE) {
263 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
264 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
265 printf("-\t");
266 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
267 while (n > 0) {
268 putchar("ACGTN"[ins & 0x3]);
269 n--;
270 }
271 printf("\t+\n");
272 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
273 printf("-\t");
274 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
275 while (n > 0) {
276 putchar("ACGTN"[ins & 0x3]);
277 ins >>= 2;
278 n--;
279 }
280 printf("\t+\n");
281 } else assert(0);
282 }
283 }
284 }
285 }
286
287 void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
288 {
289 seq_t seq;
290 mutseq_t rseq[2];
291 uint64_t tot_len, ii;
292 int i, l, n_ref;
293 char name[256], *qstr;
294 int size[2], Q;
295 uint8_t *tmp_seq[2];
296 mut_t *target;
297
298 INIT_SEQ(seq);
299 srand48(time(0));
300 seq_set_block_size(0x1000000);
301 l = size_l > size_r? size_l : size_r;
302 qstr = (char*)calloc(l+1, 1);
303 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
304 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
305 size[0] = size_l; size[1] = size_r;
306
307 Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
308
309 tot_len = n_ref = 0;
310 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
311 tot_len += l;
312 ++n_ref;
313 }
314 fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
315 rewind(fp_fa);
316
317 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
318 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
319 if (l < dist + 3 * std_dev) {
320 fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
321 continue;
322 }
323
324 // generate mutations and print them out
325 maq_mut_diref(&seq, is_hap, rseq, rseq+1);
326 maq_print_mutref(name, &seq, rseq, rseq+1);
327
328 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
329 double ran;
330 int d, pos, s[2], is_flip = 0;
331 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
332 FILE *fpo[2];
333
334 do { // avoid boundary failure
335 ran = ran_normal();
336 ran = ran * std_dev + dist;
337 d = (int)(ran + 0.5);
338 pos = (int)((l - d + 1) * drand48());
339 } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
340
341 // flip or not
342 if (drand48() < 0.5) {
343 fpo[0] = fpout1; fpo[1] = fpout2;
344 s[0] = size[0]; s[1] = size[1];
345 } else {
346 fpo[1] = fpout1; fpo[0] = fpout2;
347 s[1] = size[0]; s[0] = size[1];
348 is_flip = 1;
349 }
350
351 // generate the read sequences
352 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
353 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
354
355 #define __gen_read(x, start, iter) do { \
356 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
357 int c = target[i], mut_type = c & mutmsk; \
358 if (ext_coor[x] < 0) { \
359 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
360 ext_coor[x] = i; \
361 } \
362 if (mut_type == DELETE) ++n_indel[x]; \
363 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
364 tmp_seq[x][k++] = c & 0xf; \
365 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
366 } else { \
367 int n, ins; \
368 ++n_indel[x]; \
369 tmp_seq[x][k++] = c & 0xf; \
370 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
371 tmp_seq[x][k++] = ins & 0x3; \
372 } \
373 } \
374 if (k != s[x]) ext_coor[x] = -10; \
375 } while (0)
376
377 if (!IS_SOLID) {
378 __gen_read(0, pos, ++i);
379 __gen_read(1, pos + d - 1, --i);
380 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
381 } else {
382 int c1, c2, c;
383 ++s[0]; ++s[1]; // temporarily increase read length by 1
384 if (is_flip) { // RR pair
385 __gen_read(0, pos + s[0], --i);
386 __gen_read(1, pos + d - 1, --i);
387 } else { // FF pair
388 __gen_read(0, pos, ++i);
389 __gen_read(1, pos + d - 1 - s[1], ++i);
390 ++ext_coor[0]; ++ext_coor[1];
391 }
392 // change to color sequence: (0,1,2,3) -> (A,C,G,T)
393 for (j = 0; j < 2; ++j) {
394 c1 = tmp_seq[j][0];
395 for (i = 1; i < s[j]; ++i) {
396 c2 = tmp_seq[j][i];
397 c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
398 tmp_seq[j][i-1] = c;
399 c1 = c2;
400 }
401 }
402 --s[0]; --s[1]; // change back
403 }
404 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
405 --ii;
406 continue;
407 }
408
409 // generate sequencing errors
410 for (j = 0; j < 2; ++j) {
411 for (i = 0; i < s[j]; ++i) {
412 int c = tmp_seq[j][i];
413 if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
414 else if (drand48() < ERR_RATE) {
415 c = (c + (int)(drand48() * 3.0 + 1)) & 3;
416 ++n_err[j];
417 }
418 tmp_seq[j][i] = c;
419 }
420 }
421
422 // print
423 for (j = 0; j < 2; ++j) {
424 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
425 qstr[i] = 0;
426 if (SHOW_MM_INFO) {
427 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
428 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
429 (long long)ii, j==0? is_flip+1 : 2-is_flip);
430 } else {
431 fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
432 (long long)ii, j==0? is_flip+1 : 2-is_flip,
433 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
434 }
435 for (i = 0; i < s[j]; ++i)
436 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
437 fprintf(fpo[j], "\n+\n%s\n", qstr);
438 }
439 }
440 free(rseq[0].s); free(rseq[1].s);
441 }
442 free(seq.s); free(qstr);
443 free(tmp_seq[0]); free(tmp_seq[1]);
444 }
445
446 static int simu_usage()
447 {
448 fprintf(stderr, "\n");
449 fprintf(stderr, "Program: wgsim (short read simulator)\n");
450 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
451 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
452 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
453 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
454 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
455 fprintf(stderr, " -s INT standard deviation [50]\n");
456 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
457 fprintf(stderr, " -1 INT length of the first read [70]\n");
458 fprintf(stderr, " -2 INT length of the second read [70]\n");
459 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
460 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
461 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
462 fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n");
463 fprintf(stderr, " -C show mismatch info in comment rather than read name\n");
464 fprintf(stderr, " -h haplotype mode\n");
465 fprintf(stderr, "\n");
466 fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
467 return 1;
468 }
469
470 int main(int argc, char *argv[])
471 {
472 int64_t N;
473 int dist, std_dev, c, size_l, size_r, is_hap = 0;
474 FILE *fpout1, *fpout2, *fp_fa;
475
476 N = 1000000; dist = 500; std_dev = 50;
477 size_l = size_r = 70;
478 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
479 switch (c) {
480 case 'd': dist = atoi(optarg); break;
481 case 's': std_dev = atoi(optarg); break;
482 case 'N': N = atoi(optarg); break;
483 case '1': size_l = atoi(optarg); break;
484 case '2': size_r = atoi(optarg); break;
485 case 'e': ERR_RATE = atof(optarg); break;
486 case 'r': MUT_RATE = atof(optarg); break;
487 case 'R': INDEL_FRAC = atof(optarg); break;
488 case 'X': INDEL_EXTEND = atof(optarg); break;
489 case 'c': IS_SOLID = 1; break;
490 case 'C': SHOW_MM_INFO = 0; break;
491 case 'h': is_hap = 1; break;
492 }
493 }
494 if (argc - optind < 3) return simu_usage();
495 fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
496 fpout1 = xopen(argv[optind+1], "w");
497 fpout2 = xopen(argv[optind+2], "w");
498 wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
499
500 fclose(fpout1); fclose(fpout2); fclose(fp_fa);
501 return 0;
502 }