comparison SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/SNVMix2.c @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:74f5ea818cea
1 /* The MIT License
2
3 Copyright (c) 2009, by Sohrab Shah <sshah@bccrc.ca> and Rodrigo Goya <rgoya@bcgsc.ca>
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /*
27 C Implementation of SNVMix2
28 */
29
30 #define VERSION "0.12.1-rc1"
31
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <math.h>
36 #include "SNVMix2.h"
37
38 #include "bam.h"
39
40 #define START_QNUM 1000
41
42 int main(int argc, char **argv) {
43
44 param_struct params;// = {NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, 0};
45
46 initSNVMix(argc, argv, &params);
47
48 if(params.classify || params.filter) {
49 if(params.input_type == Q_PILEUP)
50 snvmixClassify_qualities(&params);
51 else if(params.input_type == M_PILEUP || params.input_type == S_PILEUP)
52 snvmixClassify_pileup(&params);
53 else if(params.input_type == BAM_FILE) {
54 fprintf(stderr,"Output is:\n");
55 fprintf(stderr,"\tchr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxP");
56 fprintf(stderr,"\tref_fwd ref_rev");
57 fprintf(stderr,"\tnref_fwd nref_rev");
58 fprintf(stderr,"\tnref_center nref_edges");
59 fprintf(stderr,"\tindel_pos");
60 fprintf(stderr,"\tref_clean nref_clean");
61 fprintf(stderr,"\tref_bad_pair nref_bad_pair");
62 fprintf(stderr,"\tref_ins nref_ins");
63 fprintf(stderr,"\tref_del nref_del");
64 fprintf(stderr,"\tref_junc nref_junc");
65 fprintf(stderr,"\tnref_in_unique_pos");
66 fprintf(stderr,"\traw[A,C,G,T,N]");
67 fprintf(stderr,"\tthr[A,C,G,T,N]\n");
68 snvmixClassify_bamfile(&params);
69 }
70 } else if(params.train) {
71 if(params.input_type == M_PILEUP || params.input_type == S_PILEUP)
72 snvmixTrain_pileup(&params);
73 else if(params.input_type == Q_PILEUP) {
74 fprintf(stderr,"Sorry, Training with quality-type input is not yet implemented\n");
75 exit(1);
76 }
77 }
78 return(0);
79 }
80
81 /*
82 void snvmixClassify_qualities
83 Arguments:
84 *params : pointer to model parameters and command line options
85 Function:
86 Reads a pilep-style file that has been preprocessed to include the quality of
87 calls being the reference allele and the maping quality both in decimal values.
88
89 For each line it evaluates de model according to the parameters in *params and
90 outputs the corresponding SNV prediction values.
91
92 This function may come in handy when filtering of calls is done by a
93 different program or the base-calling and maping quality information is not
94 in a pileup/phred format.
95
96 The file read is a tab-separated file where the columns are:
97 - target sequence (e.g. chromosome)
98 - sequence position
99 - reference base
100 - non-reference base
101 - comma separated probabilities of the call being the reference
102 - comma separated mapping qualities, one for each base call
103
104 */
105 void snvmixClassify_qualities(param_struct *params) {
106 FILE *fptrIN, *fptrOUT;
107
108 char *line = NULL;
109 size_t line_len = 0, prev_len = 0;
110 int read_len = 0;
111
112 int qual_num = 0, col_num = 0;
113
114 char *col, *qual;
115 char *col_sep = "\t", *col_str, *col_ptr;
116 char *qual_sep = ",", *qual_str, *qual_ptr;
117 char *chr, *pos, *ref, *nref;
118
119 long double *bQ, *mQ, *tmpQ;
120 int pos_int, depth = 0, maxp;
121 long int depth_allocated = 0;
122
123 long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0;
124 int i, k, states;
125
126 states = params->states;
127
128 // Initialize local values of parameters
129 mu = params->mu;
130 pi = params->pi;
131
132 if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate b\n"); exit(1); }
133 if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate notmu\n"); exit(1); }
134 if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate log_pi\n"); exit(1); }
135
136 for(k = 0; k < states; k++) {
137 notmu[k] = 1 - mu[k];
138 log_pi[k] = logl(pi[k]);
139 }
140 fptrIN = params->input;
141 fptrOUT = params->output;
142
143
144 if( (bQ = malloc(START_QNUM * sizeof (long double))) == NULL) {
145 perror("ERROR allocating initial space for bQ"); exit(1);
146 }
147 if( (mQ = malloc(START_QNUM * sizeof (long double))) == NULL) {
148 perror("ERROR allocating initial space for mQ"); exit(1);
149 }
150 depth_allocated = START_QNUM;
151 #if defined __linux__ || defined _GNU_SOURCE
152 while( (read_len = getline(&line,&line_len,fptrIN)) != -1 )
153 #endif
154 #ifdef __APPLE__
155 while( (line = fgetln(fptrIN,&line_len)) != NULL )
156 #endif
157 {
158 line[line_len-1] = '\0';
159 depth = 0;
160 col_ptr = NULL;
161 for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) {
162 col = strtok_r(col_str, "\t", &col_ptr);
163 if(col == NULL) {
164 break;
165 }
166 if(col_num == 0) {
167 chr = col;
168 } else if(col_num == 1) {
169 pos = col;
170 pos_int = strtol(pos, NULL, 10);
171 } else if(col_num == 2) {
172 ref = col;
173 } else if(col_num == 3) {
174 nref = col;
175 } else if(col_num == 4) {
176 for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) {
177 qual = strtok_r(qual_str, ",", &qual_ptr);
178 if(qual == NULL) {
179 break;
180 }
181 if(qual_num >= depth_allocated) {
182 if( (bQ = realloc(bQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) {
183 perror("ERROR allocating bQ"); exit(1);
184 }
185 if( (mQ = realloc(mQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) {
186 perror("ERROR allocating mQ"); exit(1);
187 }
188 depth_allocated = depth_allocated + START_QNUM;
189 }
190 bQ[qual_num] = atof(qual);
191 }
192 depth = qual_num;
193 } else if(col_num == 5) {
194 for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) {
195 qual = strtok_r(qual_str, ",", &qual_ptr);
196 if(qual == NULL) {
197 break;
198 }
199 if(qual_num >= depth_allocated) {
200 fprintf(stderr, "FATAL ERROR: should not have more mapping qualities than base qualities\n");
201 exit(1);
202 }
203 mQ[qual_num] = atof(qual);
204 }
205 if(depth != qual_num) {
206 fprintf(stderr, "FATAL ERROR: should not have more base qualities than mapping qualities\n");
207 exit(1);
208 }
209 }
210 }
211
212
213 for(k = 0; k < states; k++)
214 b[k] = 0;
215
216 //b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2);
217 for(qual_num = 0; qual_num < depth; qual_num++)
218 for(k = 0; k < states; k++)
219 b[k] = b[k] + logl((1- mQ[qual_num])*0.5 + mQ[qual_num] * (bQ[qual_num]*mu[k]+(1-bQ[qual_num])*notmu[k]));
220
221 z = 0;
222 for(k = 0; k < states; k++) {
223 b[k] = expl(b[k] + log_pi[k]);
224 z += b[k];
225 }
226 if(!z) { z = 1; }
227 for(k = 0; k < states; k++)
228 b[k] = b[k] / z;
229 maxp = 0;
230 z = b[0];
231 for(k = 0; k < states; k++)
232 if( b[k] > z) {
233 maxp = k;
234 z = b[k];
235 }
236
237 fprintf(fptrOUT,"%s\t%s\t%s\t%s",chr, pos, ref, nref);
238 for(k = 0; k , states; k++)
239 fprintf(fptrOUT,"%0.10Lf,",b[k]);
240 fprintf(fptrOUT,"%d\n",maxp+1);
241 }
242 fclose(fptrIN);
243 fclose(fptrOUT);
244 free(line);
245
246 free(b); free(notmu); free(log_pi);
247 }
248
249
250 /*
251 void snvmixClassify_pileup
252 Arguments:
253 *params : pointer to model parameters and command line options
254 Function:
255 Reads a MAQ or Samtools pileup format. For required formatting we refer the user
256 to the corresponding websites:
257 http://maq.sourceforge.net/
258 http://samtools.sourceforge.net/
259
260 In general the format of both files can be described as a tab separated file
261 where columns are:
262 - target sequence (e.g. chromosome)
263 - sequence position
264 - reference base
265 - depth
266 - stream of base calls
267 - stream of base call PHRED-scaled qualities
268 - stream of mapping PHRED-scaled qualities
269
270 Note: when handling pileup files, care should be taken when doing copy-paste
271 operations not to transform 'tabs' into streams of spaces.
272
273 For each line it evaluates de model according to the parameters in *params and
274 outputs the corresponding SNV prediction values.
275
276 Predictions can be output either only for positions where when non-reference values
277 are observed or, when run with '-f' flag, for all positions. The -f flag is useful
278 when the resulting calls are being compared or joined accros different datasets
279 and all predictions need to be present.
280 */
281 void snvmixClassify_pileup(param_struct *params) {
282 //MAQ:
283 // 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx
284 //SAM:
285 // 1 554484 C 1752 ,,.,.,, xxx xxxx
286 FILE *fptrIN, *fptrOUT;
287
288 char *line = NULL;
289 size_t line_len = 0, prev_len = 0;
290 int read_len = 0;
291 int col_num = 0;
292 long int line_num = 0;
293
294 char *col, *qual;
295 char *col_sep = "\t", *col_str, *col_ptr;
296 char *qual_sep = ",", *qual_str, *qual_ptr;
297 char *chr, *pos, *ref, nref, call, nref_skip = 'N';
298
299 char *bQ, *mQ;
300 int *calls, *tmpQ;
301 int pos_int, depth = 0, qual_num, maxp;
302 int depth_allocated = 0;
303
304 long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0;
305 int nref_count = 0, ref_count = 0;
306 long double Y, not_Y;
307 long double phred[PHRED_MAX + 1];
308 int i, call_i, k, states;
309
310 //initPhred(phred, PHRED_MAX+1);
311 initPhred(phred, PHRED_MAX);
312
313 states = params->states;
314
315 // Initialize local values of parameters
316 mu = params->mu;
317 pi = params->pi;
318
319 if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); }
320 if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); }
321 if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); }
322
323 for(k = 0; k < states; k++) {
324 notmu[k] = 1 - mu[k];
325 log_pi[k] = logl(pi[k]);
326 }
327 fptrIN = params->input;
328 fptrOUT = params->output;
329 if(params->full)
330 nref_skip = -2;
331
332
333 if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) {
334 perror("ERROR allocating initial space for calls"); exit(1);
335 }
336 depth_allocated = START_QNUM;
337 #if defined __linux__ || defined _GNU_SOURCE
338 while( (read_len = getline(&line,&line_len,fptrIN)) != -1 )
339 #endif
340 #ifdef __APPLE__
341 while( (line = fgetln(fptrIN,&line_len)) != NULL )
342 #endif
343 {
344 line[line_len-1] = '\0';
345 line_num++;
346 depth = 0;
347 nref = 0;
348 col_ptr = NULL;
349 for(col_num = 0, col_str = line; nref != nref_skip ; col_num++, col_str = NULL) {
350 col = strtok_r(col_str, "\t", &col_ptr);
351 if(col == NULL) {
352 break;
353 }
354 if(col_num == 0) {
355 chr = col;
356 } else if(col_num == 1) {
357 pos = col;
358 } else if(col_num == 2) {
359 ref = col;
360 } else if(col_num == 3) {
361 depth = atoi(col);
362 if(depth > depth_allocated) {
363 if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) {
364 perror("ERROR allocating space for calls"); exit(1);
365 }
366 depth_allocated = depth + START_QNUM;
367 }
368 } else if(col_num == 4) {
369 if(params->input_type == M_PILEUP)
370 col = col+sizeof(char);
371 snvmixGetCallString(col, calls, depth, &nref);
372 } else if(col_num == 5) {
373 bQ = col;
374 // If it's a MAQ pileup, we need to skip the @ sign
375 if(params->input_type == M_PILEUP)
376 bQ = bQ + sizeof(char);
377 for(call_i = 0; call_i < depth; call_i++)
378 bQ[call_i] = bQ[call_i]-33;
379 } else if(col_num == 6) {
380 mQ = col;
381 // If it's a MAQ pileup, we need to skip the @ sign
382 if(params->input_type == M_PILEUP)
383 mQ = mQ + sizeof(char);
384 for(call_i = 0; call_i < depth; call_i++)
385 mQ[call_i] = mQ[call_i]-33;
386 }
387 }
388 // If we quit the for because no nref was found, skip this too, next line
389 nref = snvmixFilterCalls(calls,depth,bQ,mQ,params);
390 nref_count = 0;
391 ref_count = 0;
392 for(qual_num = 0; qual_num < depth; qual_num++) {
393 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) )
394 if(calls[qual_num] == -1)
395 continue;
396 if(calls[qual_num] == 1)
397 ref_count++;
398 if(calls[qual_num] == nref)
399 nref_count++;
400 }
401 if(params->filter) {
402 fprintf(fptrOUT,"%s:%s\t%s:%d\t%c:%d\t", chr, pos, ref, ref_count, nref, nref_count);
403 for(qual_num = 0; qual_num < ref_count; qual_num++)
404 fprintf(fptrOUT,",");
405 for(qual_num = 0; qual_num < nref_count; qual_num++)
406 fprintf(fptrOUT,"%c",nref);
407 fprintf(fptrOUT,"\n");
408 } else {
409 if(nref == nref_skip)
410 continue;
411 //nref = snvmixFilterCalls(calls,depth,bQ,mQ,params);
412 //if(nref == nref_skip)
413 // continue;
414 for(k = 0; k < states; k++)
415 b[k] = 0;
416 nref_count = 0;
417 for(qual_num = 0; qual_num < depth; qual_num++) {
418 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) )
419 if(calls[qual_num] == -1)
420 continue;
421 // from matlab:
422 // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2);
423 if(calls[qual_num] == 1) {
424 // If REF then
425 Y = phred[(unsigned char) bQ[qual_num]];
426 not_Y = 1-phred[(unsigned char) bQ[qual_num]];
427 } else {
428 nref_count++;
429 // If NREF then
430 Y = (1-phred[(unsigned char) bQ[qual_num]])/3;
431 not_Y = phred[(unsigned char) bQ[qual_num]] + 2*(1-phred[(unsigned char)bQ[qual_num]])/3;
432 }
433 for(k = 0; k < states; k++)
434 b[k] = b[k] + logl((1-phred[(unsigned char) mQ[qual_num]])*0.5 + phred[(unsigned char) mQ[qual_num]] * (Y * mu[k] + not_Y * notmu[k]));
435 }
436 // Check if any non-references actually passed the filter
437 //if(!nref_count)
438 // continue;
439 z = 0;
440 for(k = 0; k < states; k++) {
441 b[k] = expl(b[k] + log_pi[k]);
442 z += b[k];
443 }
444 if(!z) z = 1;
445 for(k = 0; k < states; k++)
446 b[k] = b[k] / z;
447 maxp = 0;
448 z = b[0];
449 for(k = 0; k < states; k++)
450 if( b[k] > z) {
451 maxp = k;
452 z = b[k];
453 }
454
455
456 fprintf(fptrOUT,"%s:%s\t%s\t%c\t%s:%d,%c:%d,",chr,pos, ref, nref, ref, ref_count, nref, nref_count);
457 for(k = 0; k < states; k++)
458 fprintf(fptrOUT,"%0.10Lf,",b[k]);
459 fprintf(fptrOUT,"%d\n",maxp+1);
460 }
461 }
462 fclose(fptrIN);
463 fclose(fptrOUT);
464 free(line);
465
466 free(b); free(notmu); free(log_pi);
467 }
468
469 int snvmixClassify_bamfile(param_struct *params) {
470 snvmix_pl_t snvmix_pl;
471 int states, i, k, bam_flag_mask;
472
473 snvmix_pl.params = params;
474 snvmix_pl.tid = -1;
475 // TODO: change this to read range
476 snvmix_pl.begin = 0;
477 snvmix_pl.end = 0x7fffffff;
478 snvmix_pl.in = samopen(params->inputfile, "rb", 0);
479 if (snvmix_pl.in == 0) {
480 fprintf(stderr, "ERROR: Fail to open BAM file %s\n", params->bamfile);
481 return 1;
482 }
483 snvmix_pl.fai = fai_load(params->fastafile);
484 if (snvmix_pl.fai == 0) {
485 fprintf(stderr, "Fail to open BAM file %s\n", params->fastafile);
486 return 1;
487 }
488 if(params->listfile) snvmix_pl.hash = load_pos(params->listfile, snvmix_pl.in->header);
489 snvmix_pl.depth_allocated = 0;
490 snvmix_pl.ref = NULL;
491 snvmix_pl.calls = NULL;
492 //snvmix_pl.bQ = NULL;
493 //snvmix_pl.mQ = NULL;
494
495
496 /* This looks ugly... but we don't want to be allocating new memory for EVERY location. */
497 states = params->states;
498 if( ( snvmix_pl.notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.notmu\n"); exit(1); }
499 if( ( snvmix_pl.log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.log_pi\n"); exit(1); }
500 if( ( snvmix_pl.b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.b\n"); exit(1); }
501 for(k = 0; k < states; k++) {
502 snvmix_pl.notmu[k] = 1 - params->mu[k];
503 snvmix_pl.log_pi[k] = logl(params->pi[k]);
504 }
505 initPhred(snvmix_pl.phred, PHRED_MAX);
506
507 /* Init call buffer */
508 snvmix_pl.n_buffer = params->window;
509 if( (snvmix_pl.buffer = malloc(snvmix_pl.n_buffer * sizeof(snvmix_call_t)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate space for buffer\n"); exit(1); }
510 snvmix_pl.b_start = 0;
511 snvmix_pl.b_end = 0;
512 for(i = 0; i < snvmix_pl.n_buffer; i++) {
513 snvmix_pl.buffer[i].tid = 0;
514 snvmix_pl.buffer[i].pos = 0;
515 snvmix_pl.buffer[i].in_use = 0;
516 if( (snvmix_pl.buffer[i].p = malloc(states * sizeof(long double)) ) == NULL) {perror("[snvmixClassify_bamfile] Could not allocate space for buffer.p\n"); exit(1);}
517 }
518
519 bam_flag_mask = (BAM_FUNMAP | BAM_FSECONDARY); // | BAM_FQCFAIL | BAM_FDUP);
520 if(params->filter_chast)
521 bam_flag_mask |= BAM_FQCFAIL;
522 if(params->filter_dup)
523 bam_flag_mask |= BAM_FDUP;
524
525 /* Call pileup with our function and bam_flag_mask */
526 sampileup(snvmix_pl.in, bam_flag_mask, snvmixClassify_pileup_func, &snvmix_pl);
527
528 free(snvmix_pl.notmu);
529 free(snvmix_pl.log_pi);
530 free(snvmix_pl.b);
531 }
532
533 int flush_buffer(uint32_t tid, uint32_t pos, snvmix_pl_t *snvmix_pl) {
534 snvmix_call_t *s;
535 int i, k, cnt, buff_id;
536 cnt = 0;
537 #define _fptrOUT snvmix_pl->params->output
538 //#define _fptrOUT stderr
539 //fprintf(_fptrOUT, "ENTERING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end);
540 for(i = 0, buff_id = snvmix_pl->b_start; i < snvmix_pl->n_buffer; i++, buff_id=(buff_id+1)%snvmix_pl->params->window) {
541 //fprintf(stderr, "ITERATING in flush_buffer, buff_id = %d\n", buff_id);
542 s = snvmix_pl->buffer + buff_id;
543 if(pos+1 >= (s->pos + snvmix_pl->params->window) || tid != s->tid) {
544 //fprintf(stderr, "PASSED if in flush_buffer, buff_id = %d\n", buff_id);
545 if(s->in_use) {
546 //fprintf(_fptrOUT,"%d\t",buff_id);
547 fprintf(_fptrOUT,"%s:%d\t%c\t%c\t%c:%d,%c:%d", snvmix_pl->in->header->target_name[s->tid], s->pos, s->ref, s->nref, s->ref, s->ref_count, s->nref, s->nref_count);
548
549 if(!snvmix_pl->params->filter) {
550 fprintf(_fptrOUT,",");
551 for(k = 0; k < snvmix_pl->params->states; k++)
552 fprintf(_fptrOUT,"%0.10Lf,",s->p[k]);
553 fprintf(_fptrOUT,"%d",s->maxP);
554 }
555
556 fprintf(_fptrOUT,"\t%d %d", s->forward[0], s->reverse[0]);
557 fprintf(_fptrOUT,"\t%d %d", s->forward[1], s->reverse[1]);
558 fprintf(_fptrOUT,"\t%d %d", s->nref_center, s->nref_edges);
559 fprintf(_fptrOUT,"\t%d", s->indel_pos);
560 fprintf(_fptrOUT,"\t%d %d", s->c_clean[0], s->c_clean[1]);
561 fprintf(_fptrOUT,"\t%d %d", s->bad_pair[0], s->bad_pair[1]);
562 fprintf(_fptrOUT,"\t%d %d", s->c_ins[0], s->c_ins[1]);
563 fprintf(_fptrOUT,"\t%d %d", s->c_del[0], s->c_del[1]);
564 fprintf(_fptrOUT,"\t%d %d", s->c_junc[0], s->c_junc[1]);
565 fprintf(_fptrOUT,"\t%d", s->aln_unique_pos);
566 fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->raw_cvg[0],s->raw_cvg[1],s->raw_cvg[2],s->raw_cvg[3],s->raw_cvg[4]);
567 fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->thr_cvg[0],s->thr_cvg[1],s->thr_cvg[2],s->thr_cvg[3],s->thr_cvg[4]);
568 if(snvmix_pl->params->filter) {
569 fprintf(_fptrOUT,"\t");
570 for(k = 0; k < s->ref_count; k++)
571 fprintf(_fptrOUT,",");
572 for(k = 0; k < s->nref_count; k++)
573 fprintf(_fptrOUT,"%c",s->nref);
574 }
575 fprintf(_fptrOUT,"\n");
576 cnt++;
577 s->in_use = 0;
578 snvmix_pl->b_start = (buff_id+1)%snvmix_pl->params->window;
579 }
580 } else break;
581 if(buff_id == snvmix_pl->b_end)
582 break;
583 }
584 //fprintf(_fptrOUT, "EXITING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end);
585 return cnt;
586 }
587
588
589 static int snvmixClassify_pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) {
590
591 #define _bQ(p, x) bam1_qual(p[x].b)[p[x].qpos]
592 #define _mQ(p, x) p[x].b->core.qual
593
594 //FILE *fptrOUT;
595 char nref_skip = 'N';
596 //int *calls, ;
597 int depth = 0;
598 //long double *b, *notmu, *log_pi, *mu, *pi;
599 long double Y, not_Y, z;
600 //int states;
601 int i, k, qual_num, maxp;
602 //long double *phred;
603 int b_end_tmp = 0;
604 snvmix_call_t *s;
605 snvmix_pl_t *snvmix_pl = (snvmix_pl_t *)data;
606 param_struct *params = snvmix_pl->params;
607 // Avoid allocating new variables, use defines to make things readable
608 #define _calls snvmix_pl->calls
609 #define _b snvmix_pl->b
610 #define _notmu snvmix_pl->notmu
611 #define _log_pi snvmix_pl->log_pi
612 #define _mu params->mu
613 #define _pi params->pi
614 #define _phred snvmix_pl->phred
615 #define _states params->states
616 //fprintf(stderr,"DEBUG: snvmix_pl->buffer[snvmix_pl->b_start].in_use = %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use);
617 //if(pos+1 == 148971)
618 // printf("DEBUG: test 148971, start.in_use = %d\tb_start=%d\tb_end=%d\n",snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end);
619 //if(pos == 148971)
620 // printf("DEBUG: test 148972, start.in_use = %d\tb_start=%d\tb_end=%d (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) = %d >= %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end, pos+1, snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window);
621 // If we have gone further than window indicates, or changed tid, then flush buffer
622 if(snvmix_pl->buffer[snvmix_pl->b_start].in_use && (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) || tid != snvmix_pl->buffer[snvmix_pl->b_start].tid ))
623 flush_buffer(tid, pos, snvmix_pl);
624 b_end_tmp = snvmix_pl->b_end;
625 s = snvmix_pl->buffer + snvmix_pl->b_end;
626 snvmix_pl->b_end = (snvmix_pl->b_end+1)%snvmix_pl->params->window;
627
628 if(snvmix_pl->hash) {
629 khint_t k_h = kh_get(64, snvmix_pl->hash, (uint64_t)tid<<32|pos);
630 if (k_h == kh_end(snvmix_pl->hash)) return 0;
631 }
632
633 s->in_use = 1;
634
635
636 if(params->full)
637 nref_skip = -2;
638
639 if (snvmix_pl->fai && (int)tid != snvmix_pl->tid) {
640 free(snvmix_pl->ref);
641 snvmix_pl->ref = fai_fetch(snvmix_pl->fai, snvmix_pl->in->header->target_name[tid], &snvmix_pl->len);
642 snvmix_pl->tid = tid;
643 }
644
645 //chr = snvmix_pl->in->header->target_name[tid],
646 s->tid = tid;
647 s->ref = toupper((snvmix_pl->ref && (int)pos < snvmix_pl->len)? snvmix_pl->ref[pos] : 'N');
648 s->pos = pos + 1;
649 depth = n;
650
651 if(depth > snvmix_pl->depth_allocated) {
652 if( (snvmix_pl->calls = realloc(snvmix_pl->calls, sizeof (int) * (depth + START_QNUM))) == NULL) {
653 perror("ERROR allocating space for snvmix_pl->calls"); exit(1);
654 }
655 //calls = snvmix_pl->calls;
656 snvmix_pl->depth_allocated = depth + START_QNUM;
657 }
658
659 //for(i = 0; i < depth; i++) {
660 // FIXME: watch out for deletions, do they get automatically an "n"? if so, ok
661 //fprintf(stderr, "DEBUG: len = %d\tref = %c\tcalls = ", snvmix_pl->len, ref);
662 s->raw_cvg[0] = s->raw_cvg[1] = s->raw_cvg[2] = s->raw_cvg[3] = s->raw_cvg[4] = 0;
663 s->thr_cvg[0] = s->thr_cvg[1] = s->thr_cvg[2] = s->thr_cvg[3] = s->thr_cvg[4] = 0;
664 for(i = depth; i--; ) {
665 if(pl[i].is_del) {
666 _calls[i] = -1;
667 continue;
668 }
669 _calls[i] = "nACnGnnnTnnnnnnn"[bam1_seqi(bam1_seq(pl[i].b), pl[i].qpos)];
670 //fprintf(stderr, "%c", _calls[i]);
671 switch(_calls[i]) {
672 case 'A': s->raw_cvg[0]++; break;
673 case 'C': s->raw_cvg[1]++; break;
674 case 'G': s->raw_cvg[2]++; break;
675 case 'T': s->raw_cvg[3]++; break;
676 case 'n': s->raw_cvg[4]++; break;
677 default: break;
678 }
679 // Mask calls according to quality
680 if( snvmixSkipCall_alt(params, _calls[i], (char) bam1_qual(pl[i].b)[pl[i].qpos], (char) pl[i].b->core.qual) )
681 _calls[i] = -1;
682 else {
683 switch(_calls[i]) {
684 case 'A': s->thr_cvg[0]++; break;
685 case 'C': s->thr_cvg[1]++; break;
686 case 'G': s->thr_cvg[2]++; break;
687 case 'T': s->thr_cvg[3]++; break;
688 case 'n': s->thr_cvg[4]++; break;
689 default: break;
690 }
691 if(_calls[i] == s->ref)
692 _calls[i] = 1;
693 else if(_calls[i] == 'n')
694 _calls[i] = -1;
695 }
696 }
697
698 s->nref = snvmixFilterCalls(_calls,depth,NULL,NULL,params);
699 s->nref_count = 0;
700 s->ref_count = 0;
701
702 // FIXME: review this section, extra flags
703 for(i = 0; i < 2; i++) {
704 s->forward[i] = 0;
705 s->reverse[i] = 0;
706 s->good_pair[i] = 0;
707 s->bad_pair[i] = 0;
708 s->c_clean[i] = 0;
709 s->c_ins[i] = 0;
710 s->c_del[i] = 0;
711 s->c_junc[i] = 0;
712 }
713 s->nref_edges = 0; // FIXME: fixed 5 bp edges
714 s->nref_center = 0; // FIXME: fixed 5 bp edges
715 s->indel_pos = 0; // How many reads have a deletion at that site
716 s->indel_near = 0; // How many reasd have a deletion overall
717 s->aln_unique_pos = 0;
718 uint8_t cigar_ops; // 0x1 INS, 0x2 DEL , 0x4 JUNCTION
719 //for(qual_num = 0; qual_num < depth; qual_num++) {
720 uint32_t debug_sort = 0x7fffffff;
721 if(s->nref != nref_skip) {
722 k = -1;
723 for(qual_num = depth; qual_num--; ) {
724 if(_calls[qual_num] == -1) {
725 if(pl[qual_num].is_del)
726 s->indel_pos++;
727 continue;
728 }
729 cigar_ops = 0;
730 for(i=0; i<pl[qual_num].b->core.n_cigar; i++) {
731 int op = bam1_cigar(pl[qual_num].b)[i] & BAM_CIGAR_MASK;
732 if(op == BAM_CINS)
733 cigar_ops |= 0x1;
734 if(op == BAM_CDEL)
735 cigar_ops |= 0x2;
736 if(op == BAM_CREF_SKIP)
737 cigar_ops |= 0x4;
738 }
739 if(pl[qual_num].indel) {
740 s->indel_pos++;
741 }
742 if(_calls[qual_num] == 1) {
743 s->ref_count++;
744 k = 0;
745 } else if(_calls[qual_num] == s->nref) {
746 s->nref_count++;
747 k = 1;
748 if(pl[qual_num].qpos > (pl[qual_num].b->core.l_qseq - 5) || pl[qual_num].qpos < 5)
749 s->nref_edges++;
750 else
751 s->nref_center++;
752 if(pl[qual_num].b->core.pos < debug_sort) { s->aln_unique_pos++; debug_sort = pl[qual_num].b->core.pos; }
753 else if(pl[qual_num].b->core.pos > debug_sort) {fprintf(stderr, "ERROR checking sort in read-position, prev = %Ld, new = %Ld\n", debug_sort, pl[qual_num].b->core.pos); exit(1);}
754 }
755
756 if(k != -1) {
757 if(cigar_ops) {
758 if(cigar_ops & 0x1) s->c_ins[k]++;
759 if(cigar_ops & 0x2) s->c_del[k]++;
760 if(cigar_ops & 0x4) s->c_junc[k]++;
761 } else s->c_clean[k]++;
762
763 if(pl[qual_num].b->core.tid != pl[qual_num].b->core.mtid) s->bad_pair[k]++;
764
765 if(pl[qual_num].b->core.flag & 0x10) s->reverse[k]++;
766 else s->forward[k]++;
767 }
768 }
769 }
770 if(!params->filter) {
771 if(s->nref == nref_skip) {
772 snvmix_pl->b_end = b_end_tmp;
773 s->in_use = 0;
774 return 0;
775 }
776 int block = (_states / 3) * 3;
777 //for(k = 0; k < _states; k++)
778 //for(k = _states; k--; )
779 // _b[k] = 0;
780 k = 0;
781 while( k < block ) {
782 _b[k] = 0;
783 _b[k+1] = 0;
784 _b[k+2] = 0;
785 k += 3;
786 }
787 if( k < _states) {
788 switch( _states - k ) {
789 case 2: _b[k] = 0; k++;
790 case 1: _b[k] = 0;
791 }
792 }
793 //for(qual_num = 0; qual_num < depth; qual_num++) {
794 for(qual_num = depth; qual_num--; ) {
795 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) )
796 if(_calls[qual_num] == -1)
797 continue;
798 // from matlab:
799 // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2);
800 if(_calls[qual_num] == 1) {
801 // If REF then
802 Y = _phred[(unsigned char) _bQ(pl, qual_num)];
803 not_Y = 1-_phred[(unsigned char) _bQ(pl, qual_num)];
804 } else {
805 // If NREF then
806 Y = (1-_phred[(unsigned char) _bQ(pl, qual_num)])/3;
807 not_Y = _phred[(unsigned char) _bQ(pl, qual_num)] + 2*(1-_phred[(unsigned char)_bQ(pl, qual_num)])/3;
808 }
809 //for(k = 0; k < _states; k++)
810 //for(k = _states; k--; )
811 // _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k]));
812 k = 0;
813 while( k < block ) {
814 _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k]));
815 _b[k+1] = _b[k+1] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+1] + not_Y * _notmu[k+1]));
816 _b[k+2] = _b[k+2] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+2] + not_Y * _notmu[k+2]));
817 k += 3;
818 }
819 if( k < _states) {
820 switch( _states - k ) {
821 case 2: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); k++;
822 case 1: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k]));
823 }
824 }
825 }
826 z = 0;
827 //for(k = 0; k < _states; k++) {
828 //for(k = _states; k--; ) {
829 // _b[k] = expl(_b[k] + _log_pi[k]);
830 // z += _b[k];
831 //}
832 k = 0;
833 while( k < block ) {
834 _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k];
835 _b[k+1] = expl(_b[k+1] + _log_pi[k+1]); z += _b[k+1];
836 _b[k+2] = expl(_b[k+2] + _log_pi[k+2]); z += _b[k+2];
837 k += 3;
838 }
839 if( k < _states ) {
840 switch( _states - k ) {
841 case 2: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; k++;
842 case 1: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k];
843 }
844 }
845 if(!z) z = 1;
846 //for(k = 0; k < _states; k++)
847 //for(k = _states; k--; )
848 // _b[k] = _b[k] / z;
849 k = 0;
850 while( k < block ) {
851 _b[k] = _b[k] / z;
852 _b[k+1] = _b[k+1] / z;
853 _b[k+2] = _b[k+2] / z;
854 k += 3;
855 }
856 if( k < _states ) {
857 switch( _states - k ) {
858 case 2: _b[k] = _b[k] / z; k++;
859 case 1: _b[k] = _b[k] / z;
860 }
861 }
862 maxp = _states - 1;
863 z = _b[maxp];
864 //for(k = 0; k < _states; k++)
865 for(k = _states; k--; )
866 if( _b[k] > z) {
867 maxp = k;
868 z = _b[k];
869 }
870
871
872 for(k = 0; k < _states; k++)
873 s->p[k] = _b[k];
874 s->maxP = maxp+1;
875 }
876 }
877
878 /*
879 void snvmixGetCallString
880 Arguments:
881 *col : pointer to the current file-line in memory
882 *calls : pointer to array where we will store the final base-calls
883 depth : number of base reads for this position
884 *nref : the observed NREF value will be placed here (-1 if none was found)
885
886 Function:
887 This will parse column 5 of the pileup file, which contains the
888 base calls and will fill the array "calls" with:
889 1 : when a reference call was made
890 -1 : when an invalid value is seen ('N', 'n', '*')
891 [ACTG] : when a non-reference call was made
892
893
894
895 */
896 inline void snvmixGetCallString(char *col, int *calls, int depth, char *nref) {
897 int i;
898 int call_i = 0, call_skip_num = 0;
899 char call_skip_char[10];
900 for(i = 0 ; call_i < depth; i++) {
901 switch(col[i]){
902 case ',':
903 case '.':
904 calls[call_i] = 1;
905 call_i++;
906 break;
907 case 'a': case 'A':
908 case 't': case 'T':
909 case 'g': case 'G':
910 case 'c': case 'C':
911 //calls[call_i] = 0;
912 calls[call_i] = toupper(col[i]);
913 call_i++;
914 //if(*nref == 0)
915 //*nref = toupper(*(col+sizeof(char)*i));
916 if(*nref == 0)
917 *nref = toupper(col[i]);
918 break;
919 case 'N':
920 case 'n':
921 case '*':
922 // Not comparable values, we ignore them, but need to
923 // keep count to compare against the number of mapping qualities
924 calls[call_i] = -1;
925 call_i++;
926 break;
927 case '$':
928 // End of a read, just ignore it
929 break;
930 case '^':
931 // Start of read, ignore it and skip the next value (not base-related)
932 i++;
933 break;
934 case '+':
935 case '-':
936 // Eliminate:
937 // +2AA
938 // -3AAA
939 // Start skiping the sign
940 i++;
941 for(call_skip_num = 0; col[i] <= '9' && col[i] >= '0'; call_skip_num++, i++) {
942 //call_skip_char[call_skip_num] = call;
943 call_skip_char[call_skip_num] = col[i];
944 //i++;
945 }
946 // we have the number string in call_skip_char, lets terminate it
947 call_skip_char[call_skip_num] = '\0';
948 // i has been updated to first letter, just need to jump the rest of them
949 i = i+atoi(call_skip_char)-1;
950 break;
951 default:
952 fprintf(stderr,"ERROR: problem reading pileup file calls (%c)\n",col[i]);
953 exit(1);
954 break;
955 }
956 }
957 // If no nref was found, don't bother parsing the other columns, make the for fail with -1
958 if(!*nref)
959 *nref = -1;
960 }
961
962 /*
963 int snvmixFilterCalls
964 Arguments:
965 *calls : array built by snvmixGetCallString containing
966 1 : when a reference call was made
967 -1 : when an invalid value is seen ('N', 'n', '*')
968 [ACTG] : when a non-reference call was made
969 depth : number of calls for the current position
970 *bQ : base qualities observed for each call
971 *mQ : map quality for each call
972 *params : parameter structure, get thresholding data
973 Function:
974 For each valid call read in the position this function will apply
975 thresholding according to the type selected (-t flag) and the bQ (-q)
976 and mQ (-Q) thresholds provided.
977
978 Any base-call that does not pass thresholds will be switched from it's
979 current value in *calls to -1;
980
981 The most prevalent NREF after filtering will be determined and returned.
982
983 */
984 inline int snvmixFilterCalls(int *calls, int depth, char *bQ, char *mQ, param_struct *params) {
985 int qual_num, nref_counts[5];
986 nref_counts[0] = 0;
987 nref_counts[1] = 0;
988 nref_counts[2] = 0;
989 nref_counts[3] = 0;
990 nref_counts[4] = 0;
991 int max_nref = N;
992
993 char nref = 0;
994
995 //for(qual_num = 0; qual_num < depth; qual_num++) {
996 for(qual_num = depth; qual_num--; ) {
997 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) {
998 if( bQ != NULL && mQ != NULL && snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) {
999 calls[qual_num] = -1;
1000 } else {
1001 nref_counts[0]++;
1002 switch(calls[qual_num]) {
1003 case 'A':
1004 nref_counts[A]++;
1005 break;
1006 case 'C':
1007 nref_counts[C]++;
1008 break;
1009 case 'G':
1010 nref_counts[G]++;
1011 break;
1012 case 'T':
1013 nref_counts[T]++;
1014 break;
1015 case 1:
1016 case -1:
1017 nref_counts[0]--;
1018 break;
1019 default:
1020 fprintf(stderr,"ERROR: unknown call base\n");
1021 exit(1);
1022 }
1023 }
1024 }
1025 if(nref_counts[0]) {
1026 max_nref = A;
1027 if(nref_counts[max_nref] < nref_counts[C])
1028 max_nref = C;
1029 if(nref_counts[max_nref] < nref_counts[G])
1030 max_nref = G;
1031 if(nref_counts[max_nref] < nref_counts[T])
1032 max_nref = T;
1033 } else {
1034 //return -1;
1035 }
1036 //for(qual_num = 0; qual_num < depth; qual_num++) {
1037 for(qual_num = depth; qual_num--; ) {
1038 if(calls[qual_num] == 1)
1039 continue;
1040 if(calls[qual_num] != base_code[max_nref])
1041 calls[qual_num] = -1;
1042 }
1043 return base_code[max_nref];
1044 }
1045
1046 /*
1047 int snvmixSkipCall
1048 Arguments:
1049 *calls : array built by snvmixGetCallString containing
1050 1 : when a reference call was made
1051 -1 : when an invalid value is seen ('N', 'n', '*')
1052 [ACTG] : when a non-reference call was made
1053 qual_num : call number that is being evaluated
1054 *params : parameter structure, get thresholding data
1055 *bQ : base qualities observed for each call
1056 *mQ : map quality for each call
1057 Function:
1058 Evalates quality values in each of the filtering models
1059
1060 Returns 1 if the calls[qual_num] needs to be filtered out
1061 Returns 0 otherwise
1062 */
1063 inline int snvmixSkipCall(int *calls, int qual_num, param_struct *params, char *bQ, char *mQ) {
1064 if(calls[qual_num] == -1)
1065 return(1);
1066 if(params->filter_type == TYPE_mb) {
1067 if(bQ[qual_num] <= params->bQ || mQ[qual_num] <= params->mQ)
1068 return(1);
1069 } else if(params->filter_type == TYPE_b) {
1070 if(bQ[qual_num] <= params->bQ)
1071 return(1);
1072 } else {
1073 if(mQ[qual_num] <= params->mQ)
1074 return(1);
1075 if(params->filter_type == TYPE_m) {
1076 // Use mapping as surrogate
1077 bQ[qual_num] = mQ[qual_num];
1078 // Make mapping one
1079 mQ[qual_num] = (char) PHRED_MAX;
1080 } else if(params->filter_type == TYPE_M) {
1081 // Mapping passed, make it one
1082 mQ[qual_num] = (char) PHRED_MAX;
1083 } else if(params->filter_type == TYPE_Mb) {
1084 // Nothing special here
1085 } else if(params->filter_type == TYPE_MB || params->filter_type == TYPE_SNVMix1) {
1086 if(bQ[qual_num] <= params->bQ)
1087 return(1);
1088 if(params->filter_type == TYPE_SNVMix1) {
1089 bQ[qual_num] = (char) PHRED_MAX;
1090 mQ[qual_num] = (char) PHRED_MAX;
1091 } }
1092 }
1093 return(0);
1094 }
1095
1096 inline int snvmixSkipCall_alt(param_struct *params, int call, char bQ, char mQ) {
1097 if(call == -1)
1098 return(1);
1099 if(params->filter_type != TYPE_b && mQ <= params->mQ)
1100 return(1);
1101 switch(params->filter_type) {
1102 case TYPE_mb:
1103 if(bQ <= params->bQ || mQ <= params->mQ)
1104 return(1);
1105 break;
1106 case TYPE_b:
1107 if(bQ <= params->bQ)
1108 return(1);
1109 break;
1110 case TYPE_m:
1111 // Use mapping as surrogate
1112 bQ = mQ;
1113 // Make mapping one
1114 mQ = (char) PHRED_MAX;
1115 break;
1116 case TYPE_M:
1117 // Mapping passed, make it one
1118 mQ = (char) PHRED_MAX;
1119 break;
1120 case TYPE_Mb:
1121 // Nothing special here
1122 break;
1123 case TYPE_MB:
1124 case TYPE_SNVMix1:
1125 if(bQ <= params->bQ)
1126 return(1);
1127 if(params->filter_type == TYPE_SNVMix1) {
1128 bQ = (char) PHRED_MAX;
1129 mQ = (char) PHRED_MAX;
1130 }
1131 break;
1132 }
1133 return(0);
1134 }
1135
1136
1137 void resetParams(param_struct *params) {
1138 params->input = stdin;
1139 params->output = stdout;
1140 params->inputfile = NULL;
1141 params->outputfile = NULL;
1142 params->modelfile = NULL;
1143 params->fastafile = NULL;
1144 params->listfile = NULL;
1145 params->window = 1;
1146 params->filter_type = 0;
1147 params->filter_dup = 1;
1148 params->filter_chast = 1;
1149 params->train = 0;
1150 params->classify = 0;
1151 params->filter = 0;
1152 params->full = 0;
1153 params->input_type = S_PILEUP; // 0 = processed, 1 = maq pileup, 2 = sam pileup, 3 = bam file
1154 params->mu = NULL;
1155 params->pi = NULL;
1156 params->max_iter = 1000;
1157 params->bQ = 19;
1158 params->mQ = 19;
1159 params->debug = 0;
1160 params->states = 3;
1161 params->trainP.alpha = NULL;
1162 params->trainP.beta = NULL;
1163 params->trainP.delta = NULL;
1164 params->trainP.param_file = NULL;
1165 params->trainP.max_iter = 100;
1166 params->trainP.bQ = NULL;
1167 params->trainP.mQ = NULL;
1168 params->trainP.calls = NULL;
1169 params->window = 1;
1170 }
1171
1172 void initSNVMix(int argc, char **argv, param_struct *params) {
1173 char c;
1174 int i;
1175 resetParams(params);
1176 while ((c = getopt (argc, argv, "hDTCFfi:o:m:t:p:q:Q:a:b:d:M:S:r:l:w:cR")) != -1) {
1177 switch (c)
1178 {
1179 case 'm': params->modelfile = optarg; break;
1180 case 'i': params->inputfile = optarg; break;
1181 case 'o': params->outputfile = optarg; break;
1182 // Bam file opts
1183 case 'r': params->fastafile = optarg; break;
1184 case 'l': params->listfile = optarg; break;
1185 case 'T': params->train = 1; break;
1186 case 'C': params->classify = 1; break;
1187 case 'F': params->filter = 1; break;
1188 case 'f': params->full = 1; break;
1189 case 'p':
1190 if(*optarg == 'q') {
1191 params->input_type = Q_PILEUP;
1192 } else if(*optarg == 'm') {
1193 params->input_type = M_PILEUP;
1194 } else if(*optarg == 's') {
1195 params->input_type = S_PILEUP;
1196 } else if(*optarg == 'b') {
1197 params->input_type = BAM_FILE;
1198 } else {
1199 fprintf(stderr,"ERROR: unknown pileup format %c\n",*optarg);
1200 exit(1);
1201 }
1202 break;
1203 case 't':
1204 if(strcmp("mb",optarg) == 0)
1205 params->filter_type = TYPE_mb;
1206 else if(strcmp("m",optarg) == 0)
1207 params->filter_type = TYPE_m;
1208 else if(strcmp("b",optarg) == 0)
1209 params->filter_type = TYPE_b;
1210 else if(strcmp("M",optarg) == 0)
1211 params->filter_type = TYPE_M;
1212 else if(strcmp("Mb",optarg) == 0)
1213 params->filter_type = TYPE_Mb;
1214 else if(strcmp("MB",optarg) == 0)
1215 params->filter_type = TYPE_MB;
1216 else if(strcmp("SNVMix1",optarg) == 0)
1217 params->filter_type = TYPE_SNVMix1;
1218 else {
1219 fprintf(stderr,"ERROR: filter type '%s' not recognized\n",optarg);
1220 exit(1);
1221 }
1222 break;
1223 case 'q':
1224 params->bQ = atoi(optarg);
1225 if( params->bQ < 0 || params->bQ >= PHRED_MAX ) {
1226 fprintf(stderr,"ERROR: quality threshold value Q%d out of range\n",params->bQ);
1227 exit(1);
1228 }
1229 break;
1230 case 'Q':
1231 params->mQ = atoi(optarg);
1232 if( params->mQ < 0 || params->mQ >= PHRED_MAX ) {
1233 fprintf(stderr,"ERROR: mapping threshold value Q%d out of range\n",params->mQ);
1234 exit(1);
1235 }
1236 break;
1237 case 'h':
1238 usage(argv[0]);
1239 break;
1240 case 'D': params->debug = 1; break;
1241 case 'M': params->trainP.param_file = optarg; break;
1242 case 'S':
1243 params->states = atoi(optarg);
1244 if( params->states < 3 ) {
1245 fprintf(stderr,"ERROR: state minimum is 3\n");
1246 exit(1);
1247 }
1248 break;
1249 case 'w':
1250 params->window = atoi(optarg);
1251 if(params->window < 1)
1252 params->window = 1;
1253 break;
1254 case 'c':
1255 params->filter_chast = 0;
1256 break;
1257 case 'R':
1258 params->filter_dup = 0;
1259 break;
1260 default:
1261 fprintf(stderr,"Unknown option\n");
1262 usage(argv[0]);
1263 break;
1264 }
1265 }
1266 /* Decide if we're going to train or classify */
1267 if(params->filter) {
1268 params->train = 0;
1269 params->classify = 0;
1270 }
1271 if(params->train && params->classify) {
1272 fprintf(stderr,"ERROR: choose either train or classify\n");
1273 exit(1);
1274 } else if(!params->train && !params->classify && !params->filter) {
1275 fprintf(stderr,"Train/Classify/Filter not selected, picking default: Classify\n");
1276 params->classify = 1;
1277 }
1278
1279 /* Set parameters */
1280 allocateParameters(params);
1281 if( params->train ) setTrainingParameters(params);
1282 if( params->classify ) setClassificationParameters(params);
1283
1284 /* Open input and output streams */
1285 if( params->input_type == BAM_FILE ) {
1286 if( params->train ) {
1287 fprintf(stderr, "BAM input for training not yet implemented\n");
1288 exit(1);
1289 }
1290 if( params->inputfile == NULL || strcmp(params->inputfile, "-") == 0) {
1291 fprintf(stderr, "ERROR: if '-p b' is chosen, input file has to be a bam file\n");
1292 exit(1);
1293 }
1294 if( params->fastafile == NULL ) {
1295 fprintf(stderr, "ERROR: if '-p b' is chosen, reference fasta file is required (-r <ref.fa>)\n");
1296 exit(1);
1297 }
1298 } else if( params->inputfile != NULL) {
1299 params->input = fopen(params->inputfile, "r");
1300 if(params->input == NULL) {
1301 perror("ERROR: could not open input file");
1302 exit(1);
1303 }
1304 } else {
1305 params->input = stdin;
1306 }
1307 if( params->outputfile != NULL ) {
1308 params->output = fopen(params->outputfile, "w");
1309 if(params->output == NULL) {
1310 perror("ERROR: could not open output file");
1311 exit(1);
1312 }
1313 } else {
1314 params->output = stdout;
1315 }
1316 }
1317
1318 void allocateParameters(param_struct *params) {
1319
1320 /* Allocate alpha */
1321 if( (params->trainP.alpha = malloc(params->states * sizeof(long double))) == NULL) {
1322 perror("ERROR: could not allocate space for alpha\n"); exit(1);
1323 }
1324 /* Allocate beta */
1325 if( (params->trainP.beta = malloc(params->states * sizeof(long double))) == NULL) {
1326 perror("ERROR: could not allocate space for beta\n"); exit(1);
1327 }
1328 /* Allocate delta*/
1329 if( (params->trainP.delta = malloc(params->states * sizeof(long double))) == NULL) {
1330 perror("ERROR: could not allocate space for delta\n"); exit(1);
1331 }
1332 /* Allocate mu*/
1333 if( (params->mu = malloc(params->states * sizeof(long double))) == NULL) {
1334 perror("ERROR: could not allocate space for mu\n"); exit(1);
1335 }
1336 /* Allocate pi*/
1337 if( (params->pi = malloc(params->states * sizeof(long double))) == NULL) {
1338 perror("ERROR: could not allocate space for pi\n"); exit(1);
1339 }
1340 }
1341
1342 void setTrainingParameters(param_struct *params) {
1343 if( params->trainP.param_file != NULL ) {
1344 readParamFile(params,'t');
1345 } else {
1346 if(params->states != 3) {
1347 fprintf(stderr, "ERROR: if state space larger than 3 requested, parameters must be provided\n");
1348 exit(1);
1349 }
1350 fprintf(stderr,"Training parameter file not given, using defaults\n");
1351 params->trainP.alpha[0] = 1000;
1352 params->trainP.alpha[1] = 5000;
1353 params->trainP.alpha[2] = 1;
1354 params->trainP.beta[0] = 1;
1355 params->trainP.beta[1] = 5000;
1356 params->trainP.beta[2] = 1000;
1357 params->trainP.delta[0] = 10;
1358 params->trainP.delta[1] = 1;
1359 params->trainP.delta[2] = 1;
1360 }
1361 }
1362
1363 void setClassificationParameters(param_struct *params) {
1364 int k;
1365 if( params->modelfile != NULL ) {
1366 readParamFile(params,'c');
1367 } else {
1368 if(params->states != 3) {
1369 fprintf(stderr, "ERROR: if state space larger than 3 requested, model file must be provided\n");
1370 exit(1);
1371 }
1372 params->mu[0] = 0.995905287891696078261816182930;
1373 params->mu[1] = 0.499569401000925172873223800707;
1374 params->mu[2] = 0.001002216846753116409260431219;
1375 params->pi[0] = 0.672519580755555956841362785781;
1376 params->pi[1] = 0.139342327124010650907237618412;
1377 params->pi[2] = 0.188138092120433392251399595807;
1378 fprintf(stderr,"Classification parameter file not given, using for mQ35 bQ10\n");
1379 }
1380 fprintf(stderr,"Mu and pi values:\n");
1381 for(k = 0; k < params->states; k++)
1382 fprintf(stderr,"\tmu[%d] = %Lf\tpi[%d] = %Lf\n", k, params->mu[k], k, params->pi[k]);
1383 fprintf(stderr,"\n");
1384 }
1385
1386 void readParamFile(param_struct *params, char type) {
1387 FILE *fptrPARAM;
1388 char *line = NULL, *param_name, *param, *param_ptr, *filename;
1389 size_t line_len = 0, read_len = 0;
1390 long double *target;
1391 int i;
1392
1393 if(type == 't') {
1394 filename=params->trainP.param_file;
1395 } else if(type == 'c') {
1396 filename=params->modelfile;
1397 }
1398 fptrPARAM=fopen(filename, "r");
1399 if(fptrPARAM == NULL) {
1400 fprintf(stderr, "ERROR: could not open parameter file %s\n", filename);
1401 exit(1);
1402 }
1403
1404 #if defined __linux__ || defined _GNU_SOURCE
1405 while( (read_len = getline(&line,&line_len,fptrPARAM)) != -1 )
1406 #endif
1407 #ifdef __APPLE__
1408 while( (line = fgetln(fptrPARAM,&line_len)) != NULL )
1409 #endif
1410 {
1411 line[line_len-1] = '\0';
1412 target = NULL;
1413 param_name = strtok_r(line, " ", &param_ptr);
1414 if(type == 't') {
1415 if(strcmp("alpha", param_name) == 0) target = params->trainP.alpha;
1416 else if(strcmp("beta", param_name) == 0) target = params->trainP.beta;
1417 else if(strcmp("delta", param_name) == 0) target = params->trainP.delta;
1418 } else if(type == 'c') {
1419 if(strcmp("mu", param_name) == 0) target = params->mu;
1420 else if(strcmp("pi", param_name) == 0) target = params->pi;
1421
1422 }
1423 if(target == NULL) { fprintf(stderr, "Unknown parameter %s, skipping\n", param_name); continue; }
1424
1425 for(i = 0; i < params->states; i++) {
1426 param = strtok_r(NULL, " ", &param_ptr);
1427 if(param == NULL) {
1428 fprintf(stderr,"ERROR: missing value #%d for %s\n", i, param_name);
1429 exit(1);
1430 }
1431 target[i] = atof(param);
1432 fprintf(stderr, "DEBUG: %s[%d] = %Lf\n", param_name, i, target[i]);
1433 }
1434 }
1435 fclose(fptrPARAM);
1436 free(line);
1437 }
1438
1439 void initPhred(long double *phredTable, int elem) {
1440 int i;
1441 for(i = 0; i < elem; i++) {
1442 phredTable[i] = 1-powl(10,(-(long double)i/10));
1443 }
1444 phredTable[i] = 1;
1445 }
1446
1447
1448 void usage(char *selfname) {
1449 param_struct default_params;
1450 resetParams(&default_params);
1451 fprintf(stderr,"Syntax:\n\t%s\t-m <modelfile> [-i <infile>] [-o <outfile>] [-T | -C | -F] [-p < q | m | s >] [-t < mb | m | b | M | Mb | MB | SNVMix1>] [-q <#>] [-Q <#>] [-M <trainP file>] [-h]\n",selfname);
1452 fprintf(stderr,"\tRequired:\n");
1453 fprintf(stderr,"\t-m <modelfile>\tmodel file, a line for mu and a line for pi, three\n");
1454 fprintf(stderr,"\t\t\tspace-separated values each, like:\n");
1455 fprintf(stderr,"\t\t\tmu 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n");
1456 fprintf(stderr,"\t\t\tpi 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n");
1457 fprintf(stderr,"\tOptional:\n");
1458 fprintf(stderr,"\t-i <infile>\tquality pileup, from pileup2matlab.pl script (def: STDIN)\n");
1459 fprintf(stderr,"\t-o <outfile>\twhere to put the results (def: STDOUT)\n");
1460 fprintf(stderr,"\t-T | -C | -F\tTrain, Classify or Filter (def: Classify)\n");
1461 fprintf(stderr,"\t-p q|m|s\tInput pileup format (def: s)\n\t\t\tq = quality\n\t\t\tm = MAQ\n\t\t\ts = SAMtools\n");
1462 fprintf(stderr,"\t-t mb|m|b|M|Mb|MB|SNVMix1\n\t\t\tFilter type (def: mb)\n");
1463 fprintf(stderr,"\t\t\tmb\tLowest between map and base quality\n");
1464 fprintf(stderr,"\t\t\tm\tFilter on map, and use as surrogate for base quality\n");
1465 fprintf(stderr,"\t\t\tb\tFilter on base quality, take map quality as 1\n");
1466 fprintf(stderr,"\t\t\tM\tFilter on map quality but use only base quality\n");
1467 fprintf(stderr,"\t\t\tMb\tFilter on map quality and use both map and base qualities\n");
1468 fprintf(stderr,"\t\t\tMB\tFilter on map quality AND base quality\n");
1469 fprintf(stderr,"\t\t\tSNVMix1\tFilter on base quality and map quality, afterwards consider them perfect\n");
1470 fprintf(stderr,"\t-q #\t\tCutoff Phred value for Base Quality, anything <= this value is ignored (def: Q%d)\n",default_params.bQ);
1471 fprintf(stderr,"\t-Q #\t\tCutoff Phred value for Map Quality, anything <= this value is ignored (def: Q%d)\n",default_params.mQ);
1472 fprintf(stderr,"\n\tTraining parameters:\n");
1473 fprintf(stderr,"\t-M <file>\tProvide a file containing training parameters\n");
1474 fprintf(stderr,"\t\t\tValues are space-separated:\n");
1475 fprintf(stderr,"\t\t\talpha # # #\n");
1476 fprintf(stderr,"\t\t\tbeta # # #\n");
1477 fprintf(stderr,"\t\t\tdelta # # #\n");
1478 fprintf(stderr,"\n\t-h\t\tthis message\n");
1479 fprintf(stderr,"\nImplementation: Rodrigo Goya, 2009. Version %s\n",VERSION);
1480 exit(0);
1481 }
1482
1483 // Based on classify pileup, but reads the entire file to memory for training purposes.
1484 void snvmixGetTrainSet_pileup(param_struct *params) {
1485 //MAQ:
1486 // 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx
1487 //SAM:
1488 // 1 554484 C 1752 ,,.,.,, xxx xxxx
1489
1490 FILE *fptrIN;
1491
1492 char *line = NULL;
1493 size_t line_len = 0, prev_len = 0;
1494 int read_len = 0;
1495 int col_num = 0;
1496 long int line_num = 0;
1497
1498 char *col, *qual;
1499 char *col_sep = "\t", *col_str, *col_ptr;
1500 char *qual_sep = ",", *qual_str, *qual_ptr;
1501 char *chr, *pos, *ref, nref, call;
1502
1503 char *bQ, *mQ;
1504 int *calls, *tmpQ, pos_int;
1505 int depth = 0, qual_num, maxp;
1506 int depth_allocated = 0;
1507
1508 int trainset = 0;
1509 int trainset_allocated = 0;
1510
1511 int nref_count = 0, ref_count = 0;
1512 int i, call_i;
1513
1514 fptrIN = params->input;
1515
1516 if( (params->trainP.bQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) {
1517 perror("ERROR allocating initial space for train.bQ"); exit(1);
1518 }
1519 if( (params->trainP.mQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) {
1520 perror("ERROR allocating initial space for train.mQ"); exit(1);
1521 }
1522 if( (params->trainP.calls = malloc(START_QNUM * sizeof (signed char **))) == NULL) {
1523 perror("ERROR allocating initial space for train.calls"); exit(1);
1524 }
1525 if( (params->trainP.depth = malloc(START_QNUM * sizeof (int *))) == NULL) {
1526 perror("ERROR allocating initial space for train.depth"); exit(1);
1527 }
1528 trainset_allocated = START_QNUM;
1529
1530 if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) {
1531 perror("ERROR allocating initial space for train.depth"); exit(1);
1532 }
1533 depth_allocated = START_QNUM;
1534 #if defined __linux__ || defined _GNU_SOURCE
1535 while( (read_len = getline(&line,&line_len,fptrIN)) != -1 )
1536 #endif
1537 #ifdef __APPLE__
1538 while( (line = fgetln(fptrIN,&line_len)) != NULL )
1539 #endif
1540 {
1541 line[line_len-1] = '\0';
1542 depth = 0;
1543 nref = 0;
1544 col_ptr = NULL;
1545 for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) {
1546 col = strtok_r(col_str, "\t", &col_ptr);
1547 if(col == NULL) {
1548 break;
1549 }
1550 if(col_num == 0) {
1551 chr = col;
1552 } else if(col_num == 1) {
1553 pos = col;
1554 pos_int = strtol(pos, NULL, 10);
1555 } else if(col_num == 2) {
1556 ref = col;
1557 } else if(col_num == 3) {
1558 depth = atoi(col);
1559 if(depth > depth_allocated) {
1560 if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) {
1561 perror("ERROR allocating space for calls"); exit(1);
1562 }
1563 depth_allocated = depth + START_QNUM;
1564 }
1565 } else if(col_num == 4) {
1566 if(params->input_type == M_PILEUP)
1567 col = col+sizeof(char);
1568 snvmixGetCallString(col, calls, depth, &nref);
1569 } else if(col_num == 5) {
1570 bQ = col;
1571 // If it's a MAQ pileup, we need to skip the @ sign
1572 if(params->input_type == M_PILEUP)
1573 bQ = bQ + sizeof(char);
1574 for(call_i = 0; call_i < depth; call_i++)
1575 bQ[call_i] = bQ[call_i]-33;
1576 } else if(col_num == 6) {
1577 mQ = col;
1578 // If it's a MAQ pileup, we need to skip the @ sign
1579 if(params->input_type == M_PILEUP)
1580 mQ = mQ + sizeof(char);
1581 for(call_i = 0; call_i < depth; call_i++)
1582 mQ[call_i] = mQ[call_i]-33;
1583 }
1584 }
1585 if(line_num >= trainset_allocated) {
1586 if( ( params->trainP.bQ = realloc(params->trainP.bQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) {
1587 perror("ERROR reallocating space for trainP.bQ"); exit(1);
1588 }
1589 if( ( params->trainP.mQ = realloc(params->trainP.mQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) {
1590 perror("ERROR reallocating space for trainP.mQ"); exit(1);
1591 }
1592 if( ( params->trainP.calls = realloc(params->trainP.calls, (line_num + START_QNUM) * sizeof (signed char *)) ) == NULL) {
1593 perror("ERROR reallocating space for trainP.calls"); exit(1);
1594 }
1595 if( ( params->trainP.depth = realloc(params->trainP.depth, (line_num + START_QNUM) * sizeof (int)) ) == NULL) {
1596 perror("ERROR reallocating space for trainP.depth"); exit(1);
1597 }
1598 trainset_allocated = line_num + START_QNUM;
1599 }
1600
1601 nref = snvmixFilterCalls(calls,depth,bQ,mQ,params);
1602 nref_count = 0;
1603 ref_count = 0;
1604 for(qual_num = 0; qual_num < depth; qual_num++) {
1605 if(calls[qual_num] == -1)
1606 continue;
1607 if(calls[qual_num] == 1)
1608 ref_count++;
1609 if(calls[qual_num] == nref)
1610 nref_count++;
1611 }
1612 params->trainP.depth[line_num] = ref_count + nref_count;
1613
1614 if( (params->trainP.bQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) {
1615 perror("ERROR allocating space for trainP.bQ"); exit(1);
1616 }
1617 if( (params->trainP.mQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) {
1618 perror("ERROR allocating space for trainP.mQ"); exit(1);
1619 }
1620 if( (params->trainP.calls[line_num] = malloc(sizeof(signed char) * params->trainP.depth[line_num])) == NULL ) {
1621 perror("ERROR allocating space for trainP.calls"); exit(1);
1622 }
1623
1624 call_i = 0;
1625 for(qual_num = 0; qual_num < depth; qual_num++) {
1626 if(calls[qual_num] == -1)
1627 continue;
1628
1629 params->trainP.calls[line_num][call_i] = (signed char) calls[qual_num];
1630 params->trainP.bQ[line_num][call_i] = (unsigned char) bQ[qual_num];
1631 params->trainP.mQ[line_num][call_i] = (unsigned char) mQ[qual_num];
1632 call_i++;
1633 }
1634 if( params->trainP.depth[line_num] != call_i ) {
1635 fprintf(stderr, "ERROR: mismatch between trainP.depth and call_i\n"); exit(1);
1636 }
1637 line_num++;
1638 }
1639 params->trainP.len = line_num;
1640 params->trainP.len = line_num;
1641 fclose(fptrIN);
1642 free(line); free(calls);
1643 }
1644
1645 // Use EM algorithm on a memory-located data set to train the parameters
1646 void snvmixTrain_pileup(param_struct *params) {
1647 int line_num = 0, call_i = 0, k = 0, states;
1648 int iter = 0;
1649 FILE *fptrOUT;
1650
1651 long double phred[PHRED_MAX + 1];
1652 long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0;
1653 long double **pG, **xr;
1654 long double *xrhat, *sum_pG;
1655 long double Y, not_Y, M;
1656 int N_sum;
1657 int strength;
1658
1659 long double ll, prev_ll = 0;
1660 //initPhred(phred, PHRED_MAX+1);
1661 initPhred(phred, PHRED_MAX);
1662
1663 states = params->states;
1664
1665 // Initialize local values of parameters
1666 mu = params->mu;
1667 pi = params->pi;
1668
1669 if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); }
1670 if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); }
1671 if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); }
1672 if( ( xrhat = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate xrhat\n"); exit(1); }
1673 if( ( sum_pG = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate sum_pG\n"); exit(1); }
1674
1675 snvmixGetTrainSet_pileup(params);
1676
1677 if(params->debug) {
1678 for(line_num = 0; line_num < params->trainP.len; line_num++) {
1679 fprintf(stderr, "line_num: %d", line_num);
1680 for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) {
1681 fprintf(stderr, "\t(%d,bQ%d,mQ%d)", params->trainP.calls[line_num][call_i], params->trainP.bQ[line_num][call_i], params->trainP.mQ[line_num][call_i]);
1682 }
1683 fprintf(stderr, "\n\n");
1684 }
1685 }
1686 // Initialize mu and pi
1687
1688 fprintf(stderr, "Initializing mu\n");
1689 for(k = 0; k < states; k++) {
1690 params->mu[k] = (long double) params->trainP.alpha[k] / (params->trainP.alpha[k] + params->trainP.beta[k]);
1691 fprintf(stderr,"\talpha[%d] = %Lf,\tbeta[%d] = %Lf,\tmu[%d] = %Lf\n", k, params->trainP.alpha[k], k, params->trainP.beta[k], k, params->mu[k]);
1692 }
1693
1694 fprintf(stderr, "Initializing pi\n");
1695 z = (long double) params->trainP.delta[0] + params->trainP.delta[1] + params->trainP.delta[2];
1696 if(!z) { z = 1; }
1697 for(k = 0; k < states; k++) {
1698 params->pi[k] = (long double) params->trainP.delta[k] / z;
1699 fprintf(stderr,"\tdelta[%d] = %Lf,\tpi[%d] = %Lf\n", k, params->trainP.delta[k], k, params->pi[k]);
1700 }
1701
1702 strength = params->trainP.len;
1703
1704 // Initialize matrices
1705 if( (pG = malloc(sizeof(long double *) * params->trainP.len)) == NULL) {
1706 perror("ERROR allocating initial space for pG"); exit(1);
1707 }
1708 if( (xr = malloc(sizeof(long double *) * params->trainP.len)) == NULL) {
1709 perror("ERROR allocating initial space for xr"); exit(1);
1710 }
1711 for(line_num = 0; line_num < params->trainP.len; line_num++) {
1712 // [states] cells for each line_num
1713 if( (pG[line_num] = malloc(sizeof(long double) * states)) == NULL) {
1714 perror("ERROR allocating space for pG"); exit(1);
1715 }
1716 if( (xr[line_num] = malloc(sizeof(long double) * states)) == NULL) {
1717 perror("ERROR allocating space for xr"); exit(1);
1718 }
1719 }
1720
1721 for(iter = 0; iter < params->trainP.max_iter; iter++) {
1722 // Reset values for this iteration
1723 for(k = 0; k < states; k++) {
1724 notmu[k] = 1 - mu[k];
1725 log_pi[k] = logl(pi[k]);
1726
1727 sum_pG[k] = 0;
1728 xrhat[k] = 0;
1729
1730 }
1731 N_sum = 0;
1732 ll = 0;
1733
1734 // E step
1735 for(line_num = 0; line_num < params->trainP.len; line_num++) {
1736 if(params->trainP.depth == 0)
1737 continue;
1738 for(k = 0; k < states; k++) {
1739 xr[line_num][k] = 0;
1740 b[k] = 0;
1741 }
1742
1743 for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) {
1744 if(params->trainP.calls[line_num][call_i] == 1) {
1745 Y = phred[params->trainP.bQ[line_num][call_i]];
1746 not_Y = 1-phred[params->trainP.bQ[line_num][call_i]];
1747 } else {
1748 Y = (1-phred[params->trainP.bQ[line_num][call_i]])/3;
1749 not_Y = phred[params->trainP.bQ[line_num][call_i]] + 2*(1-phred[params->trainP.bQ[line_num][call_i]])/3;
1750 }
1751 M = phred[params->trainP.mQ[line_num][call_i]];
1752 for(k = 0; k < states; k++) {
1753 b[k] = b[k] + logl( (1 - M) * 0.5 + M * (Y * mu[k] + not_Y * notmu[k]) );
1754 xr[line_num][k] = xr[line_num][k] + Y;
1755 }
1756 }
1757 z = 0;
1758 for(k = 0; k < states; k++) {
1759 b[k] = expl(b[k] + log_pi[k]);
1760 z = z + b[k];
1761 }
1762 if(!z) { z=1; }
1763 for(k = 0; k < states; k++) {
1764 pG[line_num][k] = b[k] / z;
1765 }
1766
1767 ll = ll + logl(z);
1768
1769 N_sum = N_sum + params->trainP.depth[line_num];
1770
1771 for(k = 0; k < states; k++) {
1772 sum_pG[k] = sum_pG[k] + pG[line_num][k];
1773 xrhat[k] = xrhat[k] + xr[line_num][k] * pG[line_num][k];
1774 }
1775 }
1776
1777 // Check log likelihood
1778 if(iter == 0)
1779 prev_ll = ll;
1780 else if(ll <= prev_ll)
1781 break;
1782 prev_ll = ll;
1783
1784 // M step
1785 z = 0;
1786 for(k = 0; k < states; k++) {
1787 z = z + sum_pG[k] + params->trainP.delta[k];
1788 }
1789 if(!z) { z=1; }
1790 for(k = 0; k < states; k++) {
1791 // Update pi
1792 params->pi[k] = (sum_pG[k] + params->trainP.delta[k]) / z;
1793 // Update mu
1794 params->mu[k] = (xrhat[k] + params->trainP.alpha[k]*strength-1) / (N_sum + params->trainP.alpha[k]*strength + params->trainP.beta[k]*strength-2);
1795 }
1796 }
1797
1798 if(params->modelfile == NULL) {
1799 fptrOUT = stdout;
1800 } else {
1801 fptrOUT = fopen(params->modelfile, "w");
1802 if(fptrOUT == NULL) {
1803 perror("ERROR: could not open modelfile for writing, using STDOUT");
1804 fptrOUT = stdout;
1805 }
1806 }
1807 fprintf(fptrOUT,"mu");
1808 for(k = 0; k < states; k++) {
1809 fprintf(fptrOUT, " %0.30Lf", params->mu[k]);
1810 }
1811 fprintf(fptrOUT,"\n");
1812 fprintf(fptrOUT,"pi");
1813 for(k = 0; k < states; k++) {
1814 fprintf(fptrOUT, " %0.30Lf", params->pi[k]);
1815 }
1816 fprintf(fptrOUT,"\n");
1817
1818 /* free unused memory */
1819 free(b); free(notmu); free(log_pi);
1820 free(xrhat); free(sum_pG);
1821 for(line_num = 0; line_num < params->trainP.len; line_num++) {
1822 // free [states] cells for each line_num
1823 free(pG[line_num]);
1824 free(xr[line_num]);
1825 }
1826 free(pG); free(xr);
1827 }
1828