Mercurial > repos > ryanmorin > nextgen_variant_identification
comparison SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/SNVMix2.c @ 0:74f5ea818cea
Uploaded
author | ryanmorin |
---|---|
date | Wed, 12 Oct 2011 19:50:38 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:74f5ea818cea |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2009, by Sohrab Shah <sshah@bccrc.ca> and Rodrigo Goya <rgoya@bcgsc.ca> | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 /* | |
27 C Implementation of SNVMix2 | |
28 */ | |
29 | |
30 #define VERSION "0.12.1-rc1" | |
31 | |
32 #include <stdio.h> | |
33 #include <stdlib.h> | |
34 #include <string.h> | |
35 #include <math.h> | |
36 #include "SNVMix2.h" | |
37 | |
38 #include "bam.h" | |
39 | |
40 #define START_QNUM 1000 | |
41 | |
42 int main(int argc, char **argv) { | |
43 | |
44 param_struct params;// = {NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, 0}; | |
45 | |
46 initSNVMix(argc, argv, ¶ms); | |
47 | |
48 if(params.classify || params.filter) { | |
49 if(params.input_type == Q_PILEUP) | |
50 snvmixClassify_qualities(¶ms); | |
51 else if(params.input_type == M_PILEUP || params.input_type == S_PILEUP) | |
52 snvmixClassify_pileup(¶ms); | |
53 else if(params.input_type == BAM_FILE) { | |
54 fprintf(stderr,"Output is:\n"); | |
55 fprintf(stderr,"\tchr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxP"); | |
56 fprintf(stderr,"\tref_fwd ref_rev"); | |
57 fprintf(stderr,"\tnref_fwd nref_rev"); | |
58 fprintf(stderr,"\tnref_center nref_edges"); | |
59 fprintf(stderr,"\tindel_pos"); | |
60 fprintf(stderr,"\tref_clean nref_clean"); | |
61 fprintf(stderr,"\tref_bad_pair nref_bad_pair"); | |
62 fprintf(stderr,"\tref_ins nref_ins"); | |
63 fprintf(stderr,"\tref_del nref_del"); | |
64 fprintf(stderr,"\tref_junc nref_junc"); | |
65 fprintf(stderr,"\tnref_in_unique_pos"); | |
66 fprintf(stderr,"\traw[A,C,G,T,N]"); | |
67 fprintf(stderr,"\tthr[A,C,G,T,N]\n"); | |
68 snvmixClassify_bamfile(¶ms); | |
69 } | |
70 } else if(params.train) { | |
71 if(params.input_type == M_PILEUP || params.input_type == S_PILEUP) | |
72 snvmixTrain_pileup(¶ms); | |
73 else if(params.input_type == Q_PILEUP) { | |
74 fprintf(stderr,"Sorry, Training with quality-type input is not yet implemented\n"); | |
75 exit(1); | |
76 } | |
77 } | |
78 return(0); | |
79 } | |
80 | |
81 /* | |
82 void snvmixClassify_qualities | |
83 Arguments: | |
84 *params : pointer to model parameters and command line options | |
85 Function: | |
86 Reads a pilep-style file that has been preprocessed to include the quality of | |
87 calls being the reference allele and the maping quality both in decimal values. | |
88 | |
89 For each line it evaluates de model according to the parameters in *params and | |
90 outputs the corresponding SNV prediction values. | |
91 | |
92 This function may come in handy when filtering of calls is done by a | |
93 different program or the base-calling and maping quality information is not | |
94 in a pileup/phred format. | |
95 | |
96 The file read is a tab-separated file where the columns are: | |
97 - target sequence (e.g. chromosome) | |
98 - sequence position | |
99 - reference base | |
100 - non-reference base | |
101 - comma separated probabilities of the call being the reference | |
102 - comma separated mapping qualities, one for each base call | |
103 | |
104 */ | |
105 void snvmixClassify_qualities(param_struct *params) { | |
106 FILE *fptrIN, *fptrOUT; | |
107 | |
108 char *line = NULL; | |
109 size_t line_len = 0, prev_len = 0; | |
110 int read_len = 0; | |
111 | |
112 int qual_num = 0, col_num = 0; | |
113 | |
114 char *col, *qual; | |
115 char *col_sep = "\t", *col_str, *col_ptr; | |
116 char *qual_sep = ",", *qual_str, *qual_ptr; | |
117 char *chr, *pos, *ref, *nref; | |
118 | |
119 long double *bQ, *mQ, *tmpQ; | |
120 int pos_int, depth = 0, maxp; | |
121 long int depth_allocated = 0; | |
122 | |
123 long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; | |
124 int i, k, states; | |
125 | |
126 states = params->states; | |
127 | |
128 // Initialize local values of parameters | |
129 mu = params->mu; | |
130 pi = params->pi; | |
131 | |
132 if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate b\n"); exit(1); } | |
133 if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate notmu\n"); exit(1); } | |
134 if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate log_pi\n"); exit(1); } | |
135 | |
136 for(k = 0; k < states; k++) { | |
137 notmu[k] = 1 - mu[k]; | |
138 log_pi[k] = logl(pi[k]); | |
139 } | |
140 fptrIN = params->input; | |
141 fptrOUT = params->output; | |
142 | |
143 | |
144 if( (bQ = malloc(START_QNUM * sizeof (long double))) == NULL) { | |
145 perror("ERROR allocating initial space for bQ"); exit(1); | |
146 } | |
147 if( (mQ = malloc(START_QNUM * sizeof (long double))) == NULL) { | |
148 perror("ERROR allocating initial space for mQ"); exit(1); | |
149 } | |
150 depth_allocated = START_QNUM; | |
151 #if defined __linux__ || defined _GNU_SOURCE | |
152 while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) | |
153 #endif | |
154 #ifdef __APPLE__ | |
155 while( (line = fgetln(fptrIN,&line_len)) != NULL ) | |
156 #endif | |
157 { | |
158 line[line_len-1] = '\0'; | |
159 depth = 0; | |
160 col_ptr = NULL; | |
161 for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) { | |
162 col = strtok_r(col_str, "\t", &col_ptr); | |
163 if(col == NULL) { | |
164 break; | |
165 } | |
166 if(col_num == 0) { | |
167 chr = col; | |
168 } else if(col_num == 1) { | |
169 pos = col; | |
170 pos_int = strtol(pos, NULL, 10); | |
171 } else if(col_num == 2) { | |
172 ref = col; | |
173 } else if(col_num == 3) { | |
174 nref = col; | |
175 } else if(col_num == 4) { | |
176 for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) { | |
177 qual = strtok_r(qual_str, ",", &qual_ptr); | |
178 if(qual == NULL) { | |
179 break; | |
180 } | |
181 if(qual_num >= depth_allocated) { | |
182 if( (bQ = realloc(bQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) { | |
183 perror("ERROR allocating bQ"); exit(1); | |
184 } | |
185 if( (mQ = realloc(mQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) { | |
186 perror("ERROR allocating mQ"); exit(1); | |
187 } | |
188 depth_allocated = depth_allocated + START_QNUM; | |
189 } | |
190 bQ[qual_num] = atof(qual); | |
191 } | |
192 depth = qual_num; | |
193 } else if(col_num == 5) { | |
194 for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) { | |
195 qual = strtok_r(qual_str, ",", &qual_ptr); | |
196 if(qual == NULL) { | |
197 break; | |
198 } | |
199 if(qual_num >= depth_allocated) { | |
200 fprintf(stderr, "FATAL ERROR: should not have more mapping qualities than base qualities\n"); | |
201 exit(1); | |
202 } | |
203 mQ[qual_num] = atof(qual); | |
204 } | |
205 if(depth != qual_num) { | |
206 fprintf(stderr, "FATAL ERROR: should not have more base qualities than mapping qualities\n"); | |
207 exit(1); | |
208 } | |
209 } | |
210 } | |
211 | |
212 | |
213 for(k = 0; k < states; k++) | |
214 b[k] = 0; | |
215 | |
216 //b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); | |
217 for(qual_num = 0; qual_num < depth; qual_num++) | |
218 for(k = 0; k < states; k++) | |
219 b[k] = b[k] + logl((1- mQ[qual_num])*0.5 + mQ[qual_num] * (bQ[qual_num]*mu[k]+(1-bQ[qual_num])*notmu[k])); | |
220 | |
221 z = 0; | |
222 for(k = 0; k < states; k++) { | |
223 b[k] = expl(b[k] + log_pi[k]); | |
224 z += b[k]; | |
225 } | |
226 if(!z) { z = 1; } | |
227 for(k = 0; k < states; k++) | |
228 b[k] = b[k] / z; | |
229 maxp = 0; | |
230 z = b[0]; | |
231 for(k = 0; k < states; k++) | |
232 if( b[k] > z) { | |
233 maxp = k; | |
234 z = b[k]; | |
235 } | |
236 | |
237 fprintf(fptrOUT,"%s\t%s\t%s\t%s",chr, pos, ref, nref); | |
238 for(k = 0; k , states; k++) | |
239 fprintf(fptrOUT,"%0.10Lf,",b[k]); | |
240 fprintf(fptrOUT,"%d\n",maxp+1); | |
241 } | |
242 fclose(fptrIN); | |
243 fclose(fptrOUT); | |
244 free(line); | |
245 | |
246 free(b); free(notmu); free(log_pi); | |
247 } | |
248 | |
249 | |
250 /* | |
251 void snvmixClassify_pileup | |
252 Arguments: | |
253 *params : pointer to model parameters and command line options | |
254 Function: | |
255 Reads a MAQ or Samtools pileup format. For required formatting we refer the user | |
256 to the corresponding websites: | |
257 http://maq.sourceforge.net/ | |
258 http://samtools.sourceforge.net/ | |
259 | |
260 In general the format of both files can be described as a tab separated file | |
261 where columns are: | |
262 - target sequence (e.g. chromosome) | |
263 - sequence position | |
264 - reference base | |
265 - depth | |
266 - stream of base calls | |
267 - stream of base call PHRED-scaled qualities | |
268 - stream of mapping PHRED-scaled qualities | |
269 | |
270 Note: when handling pileup files, care should be taken when doing copy-paste | |
271 operations not to transform 'tabs' into streams of spaces. | |
272 | |
273 For each line it evaluates de model according to the parameters in *params and | |
274 outputs the corresponding SNV prediction values. | |
275 | |
276 Predictions can be output either only for positions where when non-reference values | |
277 are observed or, when run with '-f' flag, for all positions. The -f flag is useful | |
278 when the resulting calls are being compared or joined accros different datasets | |
279 and all predictions need to be present. | |
280 */ | |
281 void snvmixClassify_pileup(param_struct *params) { | |
282 //MAQ: | |
283 // 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx | |
284 //SAM: | |
285 // 1 554484 C 1752 ,,.,.,, xxx xxxx | |
286 FILE *fptrIN, *fptrOUT; | |
287 | |
288 char *line = NULL; | |
289 size_t line_len = 0, prev_len = 0; | |
290 int read_len = 0; | |
291 int col_num = 0; | |
292 long int line_num = 0; | |
293 | |
294 char *col, *qual; | |
295 char *col_sep = "\t", *col_str, *col_ptr; | |
296 char *qual_sep = ",", *qual_str, *qual_ptr; | |
297 char *chr, *pos, *ref, nref, call, nref_skip = 'N'; | |
298 | |
299 char *bQ, *mQ; | |
300 int *calls, *tmpQ; | |
301 int pos_int, depth = 0, qual_num, maxp; | |
302 int depth_allocated = 0; | |
303 | |
304 long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; | |
305 int nref_count = 0, ref_count = 0; | |
306 long double Y, not_Y; | |
307 long double phred[PHRED_MAX + 1]; | |
308 int i, call_i, k, states; | |
309 | |
310 //initPhred(phred, PHRED_MAX+1); | |
311 initPhred(phred, PHRED_MAX); | |
312 | |
313 states = params->states; | |
314 | |
315 // Initialize local values of parameters | |
316 mu = params->mu; | |
317 pi = params->pi; | |
318 | |
319 if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); } | |
320 if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); } | |
321 if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); } | |
322 | |
323 for(k = 0; k < states; k++) { | |
324 notmu[k] = 1 - mu[k]; | |
325 log_pi[k] = logl(pi[k]); | |
326 } | |
327 fptrIN = params->input; | |
328 fptrOUT = params->output; | |
329 if(params->full) | |
330 nref_skip = -2; | |
331 | |
332 | |
333 if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) { | |
334 perror("ERROR allocating initial space for calls"); exit(1); | |
335 } | |
336 depth_allocated = START_QNUM; | |
337 #if defined __linux__ || defined _GNU_SOURCE | |
338 while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) | |
339 #endif | |
340 #ifdef __APPLE__ | |
341 while( (line = fgetln(fptrIN,&line_len)) != NULL ) | |
342 #endif | |
343 { | |
344 line[line_len-1] = '\0'; | |
345 line_num++; | |
346 depth = 0; | |
347 nref = 0; | |
348 col_ptr = NULL; | |
349 for(col_num = 0, col_str = line; nref != nref_skip ; col_num++, col_str = NULL) { | |
350 col = strtok_r(col_str, "\t", &col_ptr); | |
351 if(col == NULL) { | |
352 break; | |
353 } | |
354 if(col_num == 0) { | |
355 chr = col; | |
356 } else if(col_num == 1) { | |
357 pos = col; | |
358 } else if(col_num == 2) { | |
359 ref = col; | |
360 } else if(col_num == 3) { | |
361 depth = atoi(col); | |
362 if(depth > depth_allocated) { | |
363 if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) { | |
364 perror("ERROR allocating space for calls"); exit(1); | |
365 } | |
366 depth_allocated = depth + START_QNUM; | |
367 } | |
368 } else if(col_num == 4) { | |
369 if(params->input_type == M_PILEUP) | |
370 col = col+sizeof(char); | |
371 snvmixGetCallString(col, calls, depth, &nref); | |
372 } else if(col_num == 5) { | |
373 bQ = col; | |
374 // If it's a MAQ pileup, we need to skip the @ sign | |
375 if(params->input_type == M_PILEUP) | |
376 bQ = bQ + sizeof(char); | |
377 for(call_i = 0; call_i < depth; call_i++) | |
378 bQ[call_i] = bQ[call_i]-33; | |
379 } else if(col_num == 6) { | |
380 mQ = col; | |
381 // If it's a MAQ pileup, we need to skip the @ sign | |
382 if(params->input_type == M_PILEUP) | |
383 mQ = mQ + sizeof(char); | |
384 for(call_i = 0; call_i < depth; call_i++) | |
385 mQ[call_i] = mQ[call_i]-33; | |
386 } | |
387 } | |
388 // If we quit the for because no nref was found, skip this too, next line | |
389 nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); | |
390 nref_count = 0; | |
391 ref_count = 0; | |
392 for(qual_num = 0; qual_num < depth; qual_num++) { | |
393 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) | |
394 if(calls[qual_num] == -1) | |
395 continue; | |
396 if(calls[qual_num] == 1) | |
397 ref_count++; | |
398 if(calls[qual_num] == nref) | |
399 nref_count++; | |
400 } | |
401 if(params->filter) { | |
402 fprintf(fptrOUT,"%s:%s\t%s:%d\t%c:%d\t", chr, pos, ref, ref_count, nref, nref_count); | |
403 for(qual_num = 0; qual_num < ref_count; qual_num++) | |
404 fprintf(fptrOUT,","); | |
405 for(qual_num = 0; qual_num < nref_count; qual_num++) | |
406 fprintf(fptrOUT,"%c",nref); | |
407 fprintf(fptrOUT,"\n"); | |
408 } else { | |
409 if(nref == nref_skip) | |
410 continue; | |
411 //nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); | |
412 //if(nref == nref_skip) | |
413 // continue; | |
414 for(k = 0; k < states; k++) | |
415 b[k] = 0; | |
416 nref_count = 0; | |
417 for(qual_num = 0; qual_num < depth; qual_num++) { | |
418 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) | |
419 if(calls[qual_num] == -1) | |
420 continue; | |
421 // from matlab: | |
422 // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); | |
423 if(calls[qual_num] == 1) { | |
424 // If REF then | |
425 Y = phred[(unsigned char) bQ[qual_num]]; | |
426 not_Y = 1-phred[(unsigned char) bQ[qual_num]]; | |
427 } else { | |
428 nref_count++; | |
429 // If NREF then | |
430 Y = (1-phred[(unsigned char) bQ[qual_num]])/3; | |
431 not_Y = phred[(unsigned char) bQ[qual_num]] + 2*(1-phred[(unsigned char)bQ[qual_num]])/3; | |
432 } | |
433 for(k = 0; k < states; k++) | |
434 b[k] = b[k] + logl((1-phred[(unsigned char) mQ[qual_num]])*0.5 + phred[(unsigned char) mQ[qual_num]] * (Y * mu[k] + not_Y * notmu[k])); | |
435 } | |
436 // Check if any non-references actually passed the filter | |
437 //if(!nref_count) | |
438 // continue; | |
439 z = 0; | |
440 for(k = 0; k < states; k++) { | |
441 b[k] = expl(b[k] + log_pi[k]); | |
442 z += b[k]; | |
443 } | |
444 if(!z) z = 1; | |
445 for(k = 0; k < states; k++) | |
446 b[k] = b[k] / z; | |
447 maxp = 0; | |
448 z = b[0]; | |
449 for(k = 0; k < states; k++) | |
450 if( b[k] > z) { | |
451 maxp = k; | |
452 z = b[k]; | |
453 } | |
454 | |
455 | |
456 fprintf(fptrOUT,"%s:%s\t%s\t%c\t%s:%d,%c:%d,",chr,pos, ref, nref, ref, ref_count, nref, nref_count); | |
457 for(k = 0; k < states; k++) | |
458 fprintf(fptrOUT,"%0.10Lf,",b[k]); | |
459 fprintf(fptrOUT,"%d\n",maxp+1); | |
460 } | |
461 } | |
462 fclose(fptrIN); | |
463 fclose(fptrOUT); | |
464 free(line); | |
465 | |
466 free(b); free(notmu); free(log_pi); | |
467 } | |
468 | |
469 int snvmixClassify_bamfile(param_struct *params) { | |
470 snvmix_pl_t snvmix_pl; | |
471 int states, i, k, bam_flag_mask; | |
472 | |
473 snvmix_pl.params = params; | |
474 snvmix_pl.tid = -1; | |
475 // TODO: change this to read range | |
476 snvmix_pl.begin = 0; | |
477 snvmix_pl.end = 0x7fffffff; | |
478 snvmix_pl.in = samopen(params->inputfile, "rb", 0); | |
479 if (snvmix_pl.in == 0) { | |
480 fprintf(stderr, "ERROR: Fail to open BAM file %s\n", params->bamfile); | |
481 return 1; | |
482 } | |
483 snvmix_pl.fai = fai_load(params->fastafile); | |
484 if (snvmix_pl.fai == 0) { | |
485 fprintf(stderr, "Fail to open BAM file %s\n", params->fastafile); | |
486 return 1; | |
487 } | |
488 if(params->listfile) snvmix_pl.hash = load_pos(params->listfile, snvmix_pl.in->header); | |
489 snvmix_pl.depth_allocated = 0; | |
490 snvmix_pl.ref = NULL; | |
491 snvmix_pl.calls = NULL; | |
492 //snvmix_pl.bQ = NULL; | |
493 //snvmix_pl.mQ = NULL; | |
494 | |
495 | |
496 /* This looks ugly... but we don't want to be allocating new memory for EVERY location. */ | |
497 states = params->states; | |
498 if( ( snvmix_pl.notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.notmu\n"); exit(1); } | |
499 if( ( snvmix_pl.log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.log_pi\n"); exit(1); } | |
500 if( ( snvmix_pl.b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.b\n"); exit(1); } | |
501 for(k = 0; k < states; k++) { | |
502 snvmix_pl.notmu[k] = 1 - params->mu[k]; | |
503 snvmix_pl.log_pi[k] = logl(params->pi[k]); | |
504 } | |
505 initPhred(snvmix_pl.phred, PHRED_MAX); | |
506 | |
507 /* Init call buffer */ | |
508 snvmix_pl.n_buffer = params->window; | |
509 if( (snvmix_pl.buffer = malloc(snvmix_pl.n_buffer * sizeof(snvmix_call_t)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate space for buffer\n"); exit(1); } | |
510 snvmix_pl.b_start = 0; | |
511 snvmix_pl.b_end = 0; | |
512 for(i = 0; i < snvmix_pl.n_buffer; i++) { | |
513 snvmix_pl.buffer[i].tid = 0; | |
514 snvmix_pl.buffer[i].pos = 0; | |
515 snvmix_pl.buffer[i].in_use = 0; | |
516 if( (snvmix_pl.buffer[i].p = malloc(states * sizeof(long double)) ) == NULL) {perror("[snvmixClassify_bamfile] Could not allocate space for buffer.p\n"); exit(1);} | |
517 } | |
518 | |
519 bam_flag_mask = (BAM_FUNMAP | BAM_FSECONDARY); // | BAM_FQCFAIL | BAM_FDUP); | |
520 if(params->filter_chast) | |
521 bam_flag_mask |= BAM_FQCFAIL; | |
522 if(params->filter_dup) | |
523 bam_flag_mask |= BAM_FDUP; | |
524 | |
525 /* Call pileup with our function and bam_flag_mask */ | |
526 sampileup(snvmix_pl.in, bam_flag_mask, snvmixClassify_pileup_func, &snvmix_pl); | |
527 | |
528 free(snvmix_pl.notmu); | |
529 free(snvmix_pl.log_pi); | |
530 free(snvmix_pl.b); | |
531 } | |
532 | |
533 int flush_buffer(uint32_t tid, uint32_t pos, snvmix_pl_t *snvmix_pl) { | |
534 snvmix_call_t *s; | |
535 int i, k, cnt, buff_id; | |
536 cnt = 0; | |
537 #define _fptrOUT snvmix_pl->params->output | |
538 //#define _fptrOUT stderr | |
539 //fprintf(_fptrOUT, "ENTERING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end); | |
540 for(i = 0, buff_id = snvmix_pl->b_start; i < snvmix_pl->n_buffer; i++, buff_id=(buff_id+1)%snvmix_pl->params->window) { | |
541 //fprintf(stderr, "ITERATING in flush_buffer, buff_id = %d\n", buff_id); | |
542 s = snvmix_pl->buffer + buff_id; | |
543 if(pos+1 >= (s->pos + snvmix_pl->params->window) || tid != s->tid) { | |
544 //fprintf(stderr, "PASSED if in flush_buffer, buff_id = %d\n", buff_id); | |
545 if(s->in_use) { | |
546 //fprintf(_fptrOUT,"%d\t",buff_id); | |
547 fprintf(_fptrOUT,"%s:%d\t%c\t%c\t%c:%d,%c:%d", snvmix_pl->in->header->target_name[s->tid], s->pos, s->ref, s->nref, s->ref, s->ref_count, s->nref, s->nref_count); | |
548 | |
549 if(!snvmix_pl->params->filter) { | |
550 fprintf(_fptrOUT,","); | |
551 for(k = 0; k < snvmix_pl->params->states; k++) | |
552 fprintf(_fptrOUT,"%0.10Lf,",s->p[k]); | |
553 fprintf(_fptrOUT,"%d",s->maxP); | |
554 } | |
555 | |
556 fprintf(_fptrOUT,"\t%d %d", s->forward[0], s->reverse[0]); | |
557 fprintf(_fptrOUT,"\t%d %d", s->forward[1], s->reverse[1]); | |
558 fprintf(_fptrOUT,"\t%d %d", s->nref_center, s->nref_edges); | |
559 fprintf(_fptrOUT,"\t%d", s->indel_pos); | |
560 fprintf(_fptrOUT,"\t%d %d", s->c_clean[0], s->c_clean[1]); | |
561 fprintf(_fptrOUT,"\t%d %d", s->bad_pair[0], s->bad_pair[1]); | |
562 fprintf(_fptrOUT,"\t%d %d", s->c_ins[0], s->c_ins[1]); | |
563 fprintf(_fptrOUT,"\t%d %d", s->c_del[0], s->c_del[1]); | |
564 fprintf(_fptrOUT,"\t%d %d", s->c_junc[0], s->c_junc[1]); | |
565 fprintf(_fptrOUT,"\t%d", s->aln_unique_pos); | |
566 fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->raw_cvg[0],s->raw_cvg[1],s->raw_cvg[2],s->raw_cvg[3],s->raw_cvg[4]); | |
567 fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->thr_cvg[0],s->thr_cvg[1],s->thr_cvg[2],s->thr_cvg[3],s->thr_cvg[4]); | |
568 if(snvmix_pl->params->filter) { | |
569 fprintf(_fptrOUT,"\t"); | |
570 for(k = 0; k < s->ref_count; k++) | |
571 fprintf(_fptrOUT,","); | |
572 for(k = 0; k < s->nref_count; k++) | |
573 fprintf(_fptrOUT,"%c",s->nref); | |
574 } | |
575 fprintf(_fptrOUT,"\n"); | |
576 cnt++; | |
577 s->in_use = 0; | |
578 snvmix_pl->b_start = (buff_id+1)%snvmix_pl->params->window; | |
579 } | |
580 } else break; | |
581 if(buff_id == snvmix_pl->b_end) | |
582 break; | |
583 } | |
584 //fprintf(_fptrOUT, "EXITING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end); | |
585 return cnt; | |
586 } | |
587 | |
588 | |
589 static int snvmixClassify_pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { | |
590 | |
591 #define _bQ(p, x) bam1_qual(p[x].b)[p[x].qpos] | |
592 #define _mQ(p, x) p[x].b->core.qual | |
593 | |
594 //FILE *fptrOUT; | |
595 char nref_skip = 'N'; | |
596 //int *calls, ; | |
597 int depth = 0; | |
598 //long double *b, *notmu, *log_pi, *mu, *pi; | |
599 long double Y, not_Y, z; | |
600 //int states; | |
601 int i, k, qual_num, maxp; | |
602 //long double *phred; | |
603 int b_end_tmp = 0; | |
604 snvmix_call_t *s; | |
605 snvmix_pl_t *snvmix_pl = (snvmix_pl_t *)data; | |
606 param_struct *params = snvmix_pl->params; | |
607 // Avoid allocating new variables, use defines to make things readable | |
608 #define _calls snvmix_pl->calls | |
609 #define _b snvmix_pl->b | |
610 #define _notmu snvmix_pl->notmu | |
611 #define _log_pi snvmix_pl->log_pi | |
612 #define _mu params->mu | |
613 #define _pi params->pi | |
614 #define _phred snvmix_pl->phred | |
615 #define _states params->states | |
616 //fprintf(stderr,"DEBUG: snvmix_pl->buffer[snvmix_pl->b_start].in_use = %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use); | |
617 //if(pos+1 == 148971) | |
618 // printf("DEBUG: test 148971, start.in_use = %d\tb_start=%d\tb_end=%d\n",snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end); | |
619 //if(pos == 148971) | |
620 // printf("DEBUG: test 148972, start.in_use = %d\tb_start=%d\tb_end=%d (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) = %d >= %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end, pos+1, snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window); | |
621 // If we have gone further than window indicates, or changed tid, then flush buffer | |
622 if(snvmix_pl->buffer[snvmix_pl->b_start].in_use && (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) || tid != snvmix_pl->buffer[snvmix_pl->b_start].tid )) | |
623 flush_buffer(tid, pos, snvmix_pl); | |
624 b_end_tmp = snvmix_pl->b_end; | |
625 s = snvmix_pl->buffer + snvmix_pl->b_end; | |
626 snvmix_pl->b_end = (snvmix_pl->b_end+1)%snvmix_pl->params->window; | |
627 | |
628 if(snvmix_pl->hash) { | |
629 khint_t k_h = kh_get(64, snvmix_pl->hash, (uint64_t)tid<<32|pos); | |
630 if (k_h == kh_end(snvmix_pl->hash)) return 0; | |
631 } | |
632 | |
633 s->in_use = 1; | |
634 | |
635 | |
636 if(params->full) | |
637 nref_skip = -2; | |
638 | |
639 if (snvmix_pl->fai && (int)tid != snvmix_pl->tid) { | |
640 free(snvmix_pl->ref); | |
641 snvmix_pl->ref = fai_fetch(snvmix_pl->fai, snvmix_pl->in->header->target_name[tid], &snvmix_pl->len); | |
642 snvmix_pl->tid = tid; | |
643 } | |
644 | |
645 //chr = snvmix_pl->in->header->target_name[tid], | |
646 s->tid = tid; | |
647 s->ref = toupper((snvmix_pl->ref && (int)pos < snvmix_pl->len)? snvmix_pl->ref[pos] : 'N'); | |
648 s->pos = pos + 1; | |
649 depth = n; | |
650 | |
651 if(depth > snvmix_pl->depth_allocated) { | |
652 if( (snvmix_pl->calls = realloc(snvmix_pl->calls, sizeof (int) * (depth + START_QNUM))) == NULL) { | |
653 perror("ERROR allocating space for snvmix_pl->calls"); exit(1); | |
654 } | |
655 //calls = snvmix_pl->calls; | |
656 snvmix_pl->depth_allocated = depth + START_QNUM; | |
657 } | |
658 | |
659 //for(i = 0; i < depth; i++) { | |
660 // FIXME: watch out for deletions, do they get automatically an "n"? if so, ok | |
661 //fprintf(stderr, "DEBUG: len = %d\tref = %c\tcalls = ", snvmix_pl->len, ref); | |
662 s->raw_cvg[0] = s->raw_cvg[1] = s->raw_cvg[2] = s->raw_cvg[3] = s->raw_cvg[4] = 0; | |
663 s->thr_cvg[0] = s->thr_cvg[1] = s->thr_cvg[2] = s->thr_cvg[3] = s->thr_cvg[4] = 0; | |
664 for(i = depth; i--; ) { | |
665 if(pl[i].is_del) { | |
666 _calls[i] = -1; | |
667 continue; | |
668 } | |
669 _calls[i] = "nACnGnnnTnnnnnnn"[bam1_seqi(bam1_seq(pl[i].b), pl[i].qpos)]; | |
670 //fprintf(stderr, "%c", _calls[i]); | |
671 switch(_calls[i]) { | |
672 case 'A': s->raw_cvg[0]++; break; | |
673 case 'C': s->raw_cvg[1]++; break; | |
674 case 'G': s->raw_cvg[2]++; break; | |
675 case 'T': s->raw_cvg[3]++; break; | |
676 case 'n': s->raw_cvg[4]++; break; | |
677 default: break; | |
678 } | |
679 // Mask calls according to quality | |
680 if( snvmixSkipCall_alt(params, _calls[i], (char) bam1_qual(pl[i].b)[pl[i].qpos], (char) pl[i].b->core.qual) ) | |
681 _calls[i] = -1; | |
682 else { | |
683 switch(_calls[i]) { | |
684 case 'A': s->thr_cvg[0]++; break; | |
685 case 'C': s->thr_cvg[1]++; break; | |
686 case 'G': s->thr_cvg[2]++; break; | |
687 case 'T': s->thr_cvg[3]++; break; | |
688 case 'n': s->thr_cvg[4]++; break; | |
689 default: break; | |
690 } | |
691 if(_calls[i] == s->ref) | |
692 _calls[i] = 1; | |
693 else if(_calls[i] == 'n') | |
694 _calls[i] = -1; | |
695 } | |
696 } | |
697 | |
698 s->nref = snvmixFilterCalls(_calls,depth,NULL,NULL,params); | |
699 s->nref_count = 0; | |
700 s->ref_count = 0; | |
701 | |
702 // FIXME: review this section, extra flags | |
703 for(i = 0; i < 2; i++) { | |
704 s->forward[i] = 0; | |
705 s->reverse[i] = 0; | |
706 s->good_pair[i] = 0; | |
707 s->bad_pair[i] = 0; | |
708 s->c_clean[i] = 0; | |
709 s->c_ins[i] = 0; | |
710 s->c_del[i] = 0; | |
711 s->c_junc[i] = 0; | |
712 } | |
713 s->nref_edges = 0; // FIXME: fixed 5 bp edges | |
714 s->nref_center = 0; // FIXME: fixed 5 bp edges | |
715 s->indel_pos = 0; // How many reads have a deletion at that site | |
716 s->indel_near = 0; // How many reasd have a deletion overall | |
717 s->aln_unique_pos = 0; | |
718 uint8_t cigar_ops; // 0x1 INS, 0x2 DEL , 0x4 JUNCTION | |
719 //for(qual_num = 0; qual_num < depth; qual_num++) { | |
720 uint32_t debug_sort = 0x7fffffff; | |
721 if(s->nref != nref_skip) { | |
722 k = -1; | |
723 for(qual_num = depth; qual_num--; ) { | |
724 if(_calls[qual_num] == -1) { | |
725 if(pl[qual_num].is_del) | |
726 s->indel_pos++; | |
727 continue; | |
728 } | |
729 cigar_ops = 0; | |
730 for(i=0; i<pl[qual_num].b->core.n_cigar; i++) { | |
731 int op = bam1_cigar(pl[qual_num].b)[i] & BAM_CIGAR_MASK; | |
732 if(op == BAM_CINS) | |
733 cigar_ops |= 0x1; | |
734 if(op == BAM_CDEL) | |
735 cigar_ops |= 0x2; | |
736 if(op == BAM_CREF_SKIP) | |
737 cigar_ops |= 0x4; | |
738 } | |
739 if(pl[qual_num].indel) { | |
740 s->indel_pos++; | |
741 } | |
742 if(_calls[qual_num] == 1) { | |
743 s->ref_count++; | |
744 k = 0; | |
745 } else if(_calls[qual_num] == s->nref) { | |
746 s->nref_count++; | |
747 k = 1; | |
748 if(pl[qual_num].qpos > (pl[qual_num].b->core.l_qseq - 5) || pl[qual_num].qpos < 5) | |
749 s->nref_edges++; | |
750 else | |
751 s->nref_center++; | |
752 if(pl[qual_num].b->core.pos < debug_sort) { s->aln_unique_pos++; debug_sort = pl[qual_num].b->core.pos; } | |
753 else if(pl[qual_num].b->core.pos > debug_sort) {fprintf(stderr, "ERROR checking sort in read-position, prev = %Ld, new = %Ld\n", debug_sort, pl[qual_num].b->core.pos); exit(1);} | |
754 } | |
755 | |
756 if(k != -1) { | |
757 if(cigar_ops) { | |
758 if(cigar_ops & 0x1) s->c_ins[k]++; | |
759 if(cigar_ops & 0x2) s->c_del[k]++; | |
760 if(cigar_ops & 0x4) s->c_junc[k]++; | |
761 } else s->c_clean[k]++; | |
762 | |
763 if(pl[qual_num].b->core.tid != pl[qual_num].b->core.mtid) s->bad_pair[k]++; | |
764 | |
765 if(pl[qual_num].b->core.flag & 0x10) s->reverse[k]++; | |
766 else s->forward[k]++; | |
767 } | |
768 } | |
769 } | |
770 if(!params->filter) { | |
771 if(s->nref == nref_skip) { | |
772 snvmix_pl->b_end = b_end_tmp; | |
773 s->in_use = 0; | |
774 return 0; | |
775 } | |
776 int block = (_states / 3) * 3; | |
777 //for(k = 0; k < _states; k++) | |
778 //for(k = _states; k--; ) | |
779 // _b[k] = 0; | |
780 k = 0; | |
781 while( k < block ) { | |
782 _b[k] = 0; | |
783 _b[k+1] = 0; | |
784 _b[k+2] = 0; | |
785 k += 3; | |
786 } | |
787 if( k < _states) { | |
788 switch( _states - k ) { | |
789 case 2: _b[k] = 0; k++; | |
790 case 1: _b[k] = 0; | |
791 } | |
792 } | |
793 //for(qual_num = 0; qual_num < depth; qual_num++) { | |
794 for(qual_num = depth; qual_num--; ) { | |
795 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) | |
796 if(_calls[qual_num] == -1) | |
797 continue; | |
798 // from matlab: | |
799 // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); | |
800 if(_calls[qual_num] == 1) { | |
801 // If REF then | |
802 Y = _phred[(unsigned char) _bQ(pl, qual_num)]; | |
803 not_Y = 1-_phred[(unsigned char) _bQ(pl, qual_num)]; | |
804 } else { | |
805 // If NREF then | |
806 Y = (1-_phred[(unsigned char) _bQ(pl, qual_num)])/3; | |
807 not_Y = _phred[(unsigned char) _bQ(pl, qual_num)] + 2*(1-_phred[(unsigned char)_bQ(pl, qual_num)])/3; | |
808 } | |
809 //for(k = 0; k < _states; k++) | |
810 //for(k = _states; k--; ) | |
811 // _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); | |
812 k = 0; | |
813 while( k < block ) { | |
814 _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); | |
815 _b[k+1] = _b[k+1] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+1] + not_Y * _notmu[k+1])); | |
816 _b[k+2] = _b[k+2] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+2] + not_Y * _notmu[k+2])); | |
817 k += 3; | |
818 } | |
819 if( k < _states) { | |
820 switch( _states - k ) { | |
821 case 2: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); k++; | |
822 case 1: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); | |
823 } | |
824 } | |
825 } | |
826 z = 0; | |
827 //for(k = 0; k < _states; k++) { | |
828 //for(k = _states; k--; ) { | |
829 // _b[k] = expl(_b[k] + _log_pi[k]); | |
830 // z += _b[k]; | |
831 //} | |
832 k = 0; | |
833 while( k < block ) { | |
834 _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; | |
835 _b[k+1] = expl(_b[k+1] + _log_pi[k+1]); z += _b[k+1]; | |
836 _b[k+2] = expl(_b[k+2] + _log_pi[k+2]); z += _b[k+2]; | |
837 k += 3; | |
838 } | |
839 if( k < _states ) { | |
840 switch( _states - k ) { | |
841 case 2: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; k++; | |
842 case 1: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; | |
843 } | |
844 } | |
845 if(!z) z = 1; | |
846 //for(k = 0; k < _states; k++) | |
847 //for(k = _states; k--; ) | |
848 // _b[k] = _b[k] / z; | |
849 k = 0; | |
850 while( k < block ) { | |
851 _b[k] = _b[k] / z; | |
852 _b[k+1] = _b[k+1] / z; | |
853 _b[k+2] = _b[k+2] / z; | |
854 k += 3; | |
855 } | |
856 if( k < _states ) { | |
857 switch( _states - k ) { | |
858 case 2: _b[k] = _b[k] / z; k++; | |
859 case 1: _b[k] = _b[k] / z; | |
860 } | |
861 } | |
862 maxp = _states - 1; | |
863 z = _b[maxp]; | |
864 //for(k = 0; k < _states; k++) | |
865 for(k = _states; k--; ) | |
866 if( _b[k] > z) { | |
867 maxp = k; | |
868 z = _b[k]; | |
869 } | |
870 | |
871 | |
872 for(k = 0; k < _states; k++) | |
873 s->p[k] = _b[k]; | |
874 s->maxP = maxp+1; | |
875 } | |
876 } | |
877 | |
878 /* | |
879 void snvmixGetCallString | |
880 Arguments: | |
881 *col : pointer to the current file-line in memory | |
882 *calls : pointer to array where we will store the final base-calls | |
883 depth : number of base reads for this position | |
884 *nref : the observed NREF value will be placed here (-1 if none was found) | |
885 | |
886 Function: | |
887 This will parse column 5 of the pileup file, which contains the | |
888 base calls and will fill the array "calls" with: | |
889 1 : when a reference call was made | |
890 -1 : when an invalid value is seen ('N', 'n', '*') | |
891 [ACTG] : when a non-reference call was made | |
892 | |
893 | |
894 | |
895 */ | |
896 inline void snvmixGetCallString(char *col, int *calls, int depth, char *nref) { | |
897 int i; | |
898 int call_i = 0, call_skip_num = 0; | |
899 char call_skip_char[10]; | |
900 for(i = 0 ; call_i < depth; i++) { | |
901 switch(col[i]){ | |
902 case ',': | |
903 case '.': | |
904 calls[call_i] = 1; | |
905 call_i++; | |
906 break; | |
907 case 'a': case 'A': | |
908 case 't': case 'T': | |
909 case 'g': case 'G': | |
910 case 'c': case 'C': | |
911 //calls[call_i] = 0; | |
912 calls[call_i] = toupper(col[i]); | |
913 call_i++; | |
914 //if(*nref == 0) | |
915 //*nref = toupper(*(col+sizeof(char)*i)); | |
916 if(*nref == 0) | |
917 *nref = toupper(col[i]); | |
918 break; | |
919 case 'N': | |
920 case 'n': | |
921 case '*': | |
922 // Not comparable values, we ignore them, but need to | |
923 // keep count to compare against the number of mapping qualities | |
924 calls[call_i] = -1; | |
925 call_i++; | |
926 break; | |
927 case '$': | |
928 // End of a read, just ignore it | |
929 break; | |
930 case '^': | |
931 // Start of read, ignore it and skip the next value (not base-related) | |
932 i++; | |
933 break; | |
934 case '+': | |
935 case '-': | |
936 // Eliminate: | |
937 // +2AA | |
938 // -3AAA | |
939 // Start skiping the sign | |
940 i++; | |
941 for(call_skip_num = 0; col[i] <= '9' && col[i] >= '0'; call_skip_num++, i++) { | |
942 //call_skip_char[call_skip_num] = call; | |
943 call_skip_char[call_skip_num] = col[i]; | |
944 //i++; | |
945 } | |
946 // we have the number string in call_skip_char, lets terminate it | |
947 call_skip_char[call_skip_num] = '\0'; | |
948 // i has been updated to first letter, just need to jump the rest of them | |
949 i = i+atoi(call_skip_char)-1; | |
950 break; | |
951 default: | |
952 fprintf(stderr,"ERROR: problem reading pileup file calls (%c)\n",col[i]); | |
953 exit(1); | |
954 break; | |
955 } | |
956 } | |
957 // If no nref was found, don't bother parsing the other columns, make the for fail with -1 | |
958 if(!*nref) | |
959 *nref = -1; | |
960 } | |
961 | |
962 /* | |
963 int snvmixFilterCalls | |
964 Arguments: | |
965 *calls : array built by snvmixGetCallString containing | |
966 1 : when a reference call was made | |
967 -1 : when an invalid value is seen ('N', 'n', '*') | |
968 [ACTG] : when a non-reference call was made | |
969 depth : number of calls for the current position | |
970 *bQ : base qualities observed for each call | |
971 *mQ : map quality for each call | |
972 *params : parameter structure, get thresholding data | |
973 Function: | |
974 For each valid call read in the position this function will apply | |
975 thresholding according to the type selected (-t flag) and the bQ (-q) | |
976 and mQ (-Q) thresholds provided. | |
977 | |
978 Any base-call that does not pass thresholds will be switched from it's | |
979 current value in *calls to -1; | |
980 | |
981 The most prevalent NREF after filtering will be determined and returned. | |
982 | |
983 */ | |
984 inline int snvmixFilterCalls(int *calls, int depth, char *bQ, char *mQ, param_struct *params) { | |
985 int qual_num, nref_counts[5]; | |
986 nref_counts[0] = 0; | |
987 nref_counts[1] = 0; | |
988 nref_counts[2] = 0; | |
989 nref_counts[3] = 0; | |
990 nref_counts[4] = 0; | |
991 int max_nref = N; | |
992 | |
993 char nref = 0; | |
994 | |
995 //for(qual_num = 0; qual_num < depth; qual_num++) { | |
996 for(qual_num = depth; qual_num--; ) { | |
997 //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) { | |
998 if( bQ != NULL && mQ != NULL && snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) { | |
999 calls[qual_num] = -1; | |
1000 } else { | |
1001 nref_counts[0]++; | |
1002 switch(calls[qual_num]) { | |
1003 case 'A': | |
1004 nref_counts[A]++; | |
1005 break; | |
1006 case 'C': | |
1007 nref_counts[C]++; | |
1008 break; | |
1009 case 'G': | |
1010 nref_counts[G]++; | |
1011 break; | |
1012 case 'T': | |
1013 nref_counts[T]++; | |
1014 break; | |
1015 case 1: | |
1016 case -1: | |
1017 nref_counts[0]--; | |
1018 break; | |
1019 default: | |
1020 fprintf(stderr,"ERROR: unknown call base\n"); | |
1021 exit(1); | |
1022 } | |
1023 } | |
1024 } | |
1025 if(nref_counts[0]) { | |
1026 max_nref = A; | |
1027 if(nref_counts[max_nref] < nref_counts[C]) | |
1028 max_nref = C; | |
1029 if(nref_counts[max_nref] < nref_counts[G]) | |
1030 max_nref = G; | |
1031 if(nref_counts[max_nref] < nref_counts[T]) | |
1032 max_nref = T; | |
1033 } else { | |
1034 //return -1; | |
1035 } | |
1036 //for(qual_num = 0; qual_num < depth; qual_num++) { | |
1037 for(qual_num = depth; qual_num--; ) { | |
1038 if(calls[qual_num] == 1) | |
1039 continue; | |
1040 if(calls[qual_num] != base_code[max_nref]) | |
1041 calls[qual_num] = -1; | |
1042 } | |
1043 return base_code[max_nref]; | |
1044 } | |
1045 | |
1046 /* | |
1047 int snvmixSkipCall | |
1048 Arguments: | |
1049 *calls : array built by snvmixGetCallString containing | |
1050 1 : when a reference call was made | |
1051 -1 : when an invalid value is seen ('N', 'n', '*') | |
1052 [ACTG] : when a non-reference call was made | |
1053 qual_num : call number that is being evaluated | |
1054 *params : parameter structure, get thresholding data | |
1055 *bQ : base qualities observed for each call | |
1056 *mQ : map quality for each call | |
1057 Function: | |
1058 Evalates quality values in each of the filtering models | |
1059 | |
1060 Returns 1 if the calls[qual_num] needs to be filtered out | |
1061 Returns 0 otherwise | |
1062 */ | |
1063 inline int snvmixSkipCall(int *calls, int qual_num, param_struct *params, char *bQ, char *mQ) { | |
1064 if(calls[qual_num] == -1) | |
1065 return(1); | |
1066 if(params->filter_type == TYPE_mb) { | |
1067 if(bQ[qual_num] <= params->bQ || mQ[qual_num] <= params->mQ) | |
1068 return(1); | |
1069 } else if(params->filter_type == TYPE_b) { | |
1070 if(bQ[qual_num] <= params->bQ) | |
1071 return(1); | |
1072 } else { | |
1073 if(mQ[qual_num] <= params->mQ) | |
1074 return(1); | |
1075 if(params->filter_type == TYPE_m) { | |
1076 // Use mapping as surrogate | |
1077 bQ[qual_num] = mQ[qual_num]; | |
1078 // Make mapping one | |
1079 mQ[qual_num] = (char) PHRED_MAX; | |
1080 } else if(params->filter_type == TYPE_M) { | |
1081 // Mapping passed, make it one | |
1082 mQ[qual_num] = (char) PHRED_MAX; | |
1083 } else if(params->filter_type == TYPE_Mb) { | |
1084 // Nothing special here | |
1085 } else if(params->filter_type == TYPE_MB || params->filter_type == TYPE_SNVMix1) { | |
1086 if(bQ[qual_num] <= params->bQ) | |
1087 return(1); | |
1088 if(params->filter_type == TYPE_SNVMix1) { | |
1089 bQ[qual_num] = (char) PHRED_MAX; | |
1090 mQ[qual_num] = (char) PHRED_MAX; | |
1091 } } | |
1092 } | |
1093 return(0); | |
1094 } | |
1095 | |
1096 inline int snvmixSkipCall_alt(param_struct *params, int call, char bQ, char mQ) { | |
1097 if(call == -1) | |
1098 return(1); | |
1099 if(params->filter_type != TYPE_b && mQ <= params->mQ) | |
1100 return(1); | |
1101 switch(params->filter_type) { | |
1102 case TYPE_mb: | |
1103 if(bQ <= params->bQ || mQ <= params->mQ) | |
1104 return(1); | |
1105 break; | |
1106 case TYPE_b: | |
1107 if(bQ <= params->bQ) | |
1108 return(1); | |
1109 break; | |
1110 case TYPE_m: | |
1111 // Use mapping as surrogate | |
1112 bQ = mQ; | |
1113 // Make mapping one | |
1114 mQ = (char) PHRED_MAX; | |
1115 break; | |
1116 case TYPE_M: | |
1117 // Mapping passed, make it one | |
1118 mQ = (char) PHRED_MAX; | |
1119 break; | |
1120 case TYPE_Mb: | |
1121 // Nothing special here | |
1122 break; | |
1123 case TYPE_MB: | |
1124 case TYPE_SNVMix1: | |
1125 if(bQ <= params->bQ) | |
1126 return(1); | |
1127 if(params->filter_type == TYPE_SNVMix1) { | |
1128 bQ = (char) PHRED_MAX; | |
1129 mQ = (char) PHRED_MAX; | |
1130 } | |
1131 break; | |
1132 } | |
1133 return(0); | |
1134 } | |
1135 | |
1136 | |
1137 void resetParams(param_struct *params) { | |
1138 params->input = stdin; | |
1139 params->output = stdout; | |
1140 params->inputfile = NULL; | |
1141 params->outputfile = NULL; | |
1142 params->modelfile = NULL; | |
1143 params->fastafile = NULL; | |
1144 params->listfile = NULL; | |
1145 params->window = 1; | |
1146 params->filter_type = 0; | |
1147 params->filter_dup = 1; | |
1148 params->filter_chast = 1; | |
1149 params->train = 0; | |
1150 params->classify = 0; | |
1151 params->filter = 0; | |
1152 params->full = 0; | |
1153 params->input_type = S_PILEUP; // 0 = processed, 1 = maq pileup, 2 = sam pileup, 3 = bam file | |
1154 params->mu = NULL; | |
1155 params->pi = NULL; | |
1156 params->max_iter = 1000; | |
1157 params->bQ = 19; | |
1158 params->mQ = 19; | |
1159 params->debug = 0; | |
1160 params->states = 3; | |
1161 params->trainP.alpha = NULL; | |
1162 params->trainP.beta = NULL; | |
1163 params->trainP.delta = NULL; | |
1164 params->trainP.param_file = NULL; | |
1165 params->trainP.max_iter = 100; | |
1166 params->trainP.bQ = NULL; | |
1167 params->trainP.mQ = NULL; | |
1168 params->trainP.calls = NULL; | |
1169 params->window = 1; | |
1170 } | |
1171 | |
1172 void initSNVMix(int argc, char **argv, param_struct *params) { | |
1173 char c; | |
1174 int i; | |
1175 resetParams(params); | |
1176 while ((c = getopt (argc, argv, "hDTCFfi:o:m:t:p:q:Q:a:b:d:M:S:r:l:w:cR")) != -1) { | |
1177 switch (c) | |
1178 { | |
1179 case 'm': params->modelfile = optarg; break; | |
1180 case 'i': params->inputfile = optarg; break; | |
1181 case 'o': params->outputfile = optarg; break; | |
1182 // Bam file opts | |
1183 case 'r': params->fastafile = optarg; break; | |
1184 case 'l': params->listfile = optarg; break; | |
1185 case 'T': params->train = 1; break; | |
1186 case 'C': params->classify = 1; break; | |
1187 case 'F': params->filter = 1; break; | |
1188 case 'f': params->full = 1; break; | |
1189 case 'p': | |
1190 if(*optarg == 'q') { | |
1191 params->input_type = Q_PILEUP; | |
1192 } else if(*optarg == 'm') { | |
1193 params->input_type = M_PILEUP; | |
1194 } else if(*optarg == 's') { | |
1195 params->input_type = S_PILEUP; | |
1196 } else if(*optarg == 'b') { | |
1197 params->input_type = BAM_FILE; | |
1198 } else { | |
1199 fprintf(stderr,"ERROR: unknown pileup format %c\n",*optarg); | |
1200 exit(1); | |
1201 } | |
1202 break; | |
1203 case 't': | |
1204 if(strcmp("mb",optarg) == 0) | |
1205 params->filter_type = TYPE_mb; | |
1206 else if(strcmp("m",optarg) == 0) | |
1207 params->filter_type = TYPE_m; | |
1208 else if(strcmp("b",optarg) == 0) | |
1209 params->filter_type = TYPE_b; | |
1210 else if(strcmp("M",optarg) == 0) | |
1211 params->filter_type = TYPE_M; | |
1212 else if(strcmp("Mb",optarg) == 0) | |
1213 params->filter_type = TYPE_Mb; | |
1214 else if(strcmp("MB",optarg) == 0) | |
1215 params->filter_type = TYPE_MB; | |
1216 else if(strcmp("SNVMix1",optarg) == 0) | |
1217 params->filter_type = TYPE_SNVMix1; | |
1218 else { | |
1219 fprintf(stderr,"ERROR: filter type '%s' not recognized\n",optarg); | |
1220 exit(1); | |
1221 } | |
1222 break; | |
1223 case 'q': | |
1224 params->bQ = atoi(optarg); | |
1225 if( params->bQ < 0 || params->bQ >= PHRED_MAX ) { | |
1226 fprintf(stderr,"ERROR: quality threshold value Q%d out of range\n",params->bQ); | |
1227 exit(1); | |
1228 } | |
1229 break; | |
1230 case 'Q': | |
1231 params->mQ = atoi(optarg); | |
1232 if( params->mQ < 0 || params->mQ >= PHRED_MAX ) { | |
1233 fprintf(stderr,"ERROR: mapping threshold value Q%d out of range\n",params->mQ); | |
1234 exit(1); | |
1235 } | |
1236 break; | |
1237 case 'h': | |
1238 usage(argv[0]); | |
1239 break; | |
1240 case 'D': params->debug = 1; break; | |
1241 case 'M': params->trainP.param_file = optarg; break; | |
1242 case 'S': | |
1243 params->states = atoi(optarg); | |
1244 if( params->states < 3 ) { | |
1245 fprintf(stderr,"ERROR: state minimum is 3\n"); | |
1246 exit(1); | |
1247 } | |
1248 break; | |
1249 case 'w': | |
1250 params->window = atoi(optarg); | |
1251 if(params->window < 1) | |
1252 params->window = 1; | |
1253 break; | |
1254 case 'c': | |
1255 params->filter_chast = 0; | |
1256 break; | |
1257 case 'R': | |
1258 params->filter_dup = 0; | |
1259 break; | |
1260 default: | |
1261 fprintf(stderr,"Unknown option\n"); | |
1262 usage(argv[0]); | |
1263 break; | |
1264 } | |
1265 } | |
1266 /* Decide if we're going to train or classify */ | |
1267 if(params->filter) { | |
1268 params->train = 0; | |
1269 params->classify = 0; | |
1270 } | |
1271 if(params->train && params->classify) { | |
1272 fprintf(stderr,"ERROR: choose either train or classify\n"); | |
1273 exit(1); | |
1274 } else if(!params->train && !params->classify && !params->filter) { | |
1275 fprintf(stderr,"Train/Classify/Filter not selected, picking default: Classify\n"); | |
1276 params->classify = 1; | |
1277 } | |
1278 | |
1279 /* Set parameters */ | |
1280 allocateParameters(params); | |
1281 if( params->train ) setTrainingParameters(params); | |
1282 if( params->classify ) setClassificationParameters(params); | |
1283 | |
1284 /* Open input and output streams */ | |
1285 if( params->input_type == BAM_FILE ) { | |
1286 if( params->train ) { | |
1287 fprintf(stderr, "BAM input for training not yet implemented\n"); | |
1288 exit(1); | |
1289 } | |
1290 if( params->inputfile == NULL || strcmp(params->inputfile, "-") == 0) { | |
1291 fprintf(stderr, "ERROR: if '-p b' is chosen, input file has to be a bam file\n"); | |
1292 exit(1); | |
1293 } | |
1294 if( params->fastafile == NULL ) { | |
1295 fprintf(stderr, "ERROR: if '-p b' is chosen, reference fasta file is required (-r <ref.fa>)\n"); | |
1296 exit(1); | |
1297 } | |
1298 } else if( params->inputfile != NULL) { | |
1299 params->input = fopen(params->inputfile, "r"); | |
1300 if(params->input == NULL) { | |
1301 perror("ERROR: could not open input file"); | |
1302 exit(1); | |
1303 } | |
1304 } else { | |
1305 params->input = stdin; | |
1306 } | |
1307 if( params->outputfile != NULL ) { | |
1308 params->output = fopen(params->outputfile, "w"); | |
1309 if(params->output == NULL) { | |
1310 perror("ERROR: could not open output file"); | |
1311 exit(1); | |
1312 } | |
1313 } else { | |
1314 params->output = stdout; | |
1315 } | |
1316 } | |
1317 | |
1318 void allocateParameters(param_struct *params) { | |
1319 | |
1320 /* Allocate alpha */ | |
1321 if( (params->trainP.alpha = malloc(params->states * sizeof(long double))) == NULL) { | |
1322 perror("ERROR: could not allocate space for alpha\n"); exit(1); | |
1323 } | |
1324 /* Allocate beta */ | |
1325 if( (params->trainP.beta = malloc(params->states * sizeof(long double))) == NULL) { | |
1326 perror("ERROR: could not allocate space for beta\n"); exit(1); | |
1327 } | |
1328 /* Allocate delta*/ | |
1329 if( (params->trainP.delta = malloc(params->states * sizeof(long double))) == NULL) { | |
1330 perror("ERROR: could not allocate space for delta\n"); exit(1); | |
1331 } | |
1332 /* Allocate mu*/ | |
1333 if( (params->mu = malloc(params->states * sizeof(long double))) == NULL) { | |
1334 perror("ERROR: could not allocate space for mu\n"); exit(1); | |
1335 } | |
1336 /* Allocate pi*/ | |
1337 if( (params->pi = malloc(params->states * sizeof(long double))) == NULL) { | |
1338 perror("ERROR: could not allocate space for pi\n"); exit(1); | |
1339 } | |
1340 } | |
1341 | |
1342 void setTrainingParameters(param_struct *params) { | |
1343 if( params->trainP.param_file != NULL ) { | |
1344 readParamFile(params,'t'); | |
1345 } else { | |
1346 if(params->states != 3) { | |
1347 fprintf(stderr, "ERROR: if state space larger than 3 requested, parameters must be provided\n"); | |
1348 exit(1); | |
1349 } | |
1350 fprintf(stderr,"Training parameter file not given, using defaults\n"); | |
1351 params->trainP.alpha[0] = 1000; | |
1352 params->trainP.alpha[1] = 5000; | |
1353 params->trainP.alpha[2] = 1; | |
1354 params->trainP.beta[0] = 1; | |
1355 params->trainP.beta[1] = 5000; | |
1356 params->trainP.beta[2] = 1000; | |
1357 params->trainP.delta[0] = 10; | |
1358 params->trainP.delta[1] = 1; | |
1359 params->trainP.delta[2] = 1; | |
1360 } | |
1361 } | |
1362 | |
1363 void setClassificationParameters(param_struct *params) { | |
1364 int k; | |
1365 if( params->modelfile != NULL ) { | |
1366 readParamFile(params,'c'); | |
1367 } else { | |
1368 if(params->states != 3) { | |
1369 fprintf(stderr, "ERROR: if state space larger than 3 requested, model file must be provided\n"); | |
1370 exit(1); | |
1371 } | |
1372 params->mu[0] = 0.995905287891696078261816182930; | |
1373 params->mu[1] = 0.499569401000925172873223800707; | |
1374 params->mu[2] = 0.001002216846753116409260431219; | |
1375 params->pi[0] = 0.672519580755555956841362785781; | |
1376 params->pi[1] = 0.139342327124010650907237618412; | |
1377 params->pi[2] = 0.188138092120433392251399595807; | |
1378 fprintf(stderr,"Classification parameter file not given, using for mQ35 bQ10\n"); | |
1379 } | |
1380 fprintf(stderr,"Mu and pi values:\n"); | |
1381 for(k = 0; k < params->states; k++) | |
1382 fprintf(stderr,"\tmu[%d] = %Lf\tpi[%d] = %Lf\n", k, params->mu[k], k, params->pi[k]); | |
1383 fprintf(stderr,"\n"); | |
1384 } | |
1385 | |
1386 void readParamFile(param_struct *params, char type) { | |
1387 FILE *fptrPARAM; | |
1388 char *line = NULL, *param_name, *param, *param_ptr, *filename; | |
1389 size_t line_len = 0, read_len = 0; | |
1390 long double *target; | |
1391 int i; | |
1392 | |
1393 if(type == 't') { | |
1394 filename=params->trainP.param_file; | |
1395 } else if(type == 'c') { | |
1396 filename=params->modelfile; | |
1397 } | |
1398 fptrPARAM=fopen(filename, "r"); | |
1399 if(fptrPARAM == NULL) { | |
1400 fprintf(stderr, "ERROR: could not open parameter file %s\n", filename); | |
1401 exit(1); | |
1402 } | |
1403 | |
1404 #if defined __linux__ || defined _GNU_SOURCE | |
1405 while( (read_len = getline(&line,&line_len,fptrPARAM)) != -1 ) | |
1406 #endif | |
1407 #ifdef __APPLE__ | |
1408 while( (line = fgetln(fptrPARAM,&line_len)) != NULL ) | |
1409 #endif | |
1410 { | |
1411 line[line_len-1] = '\0'; | |
1412 target = NULL; | |
1413 param_name = strtok_r(line, " ", ¶m_ptr); | |
1414 if(type == 't') { | |
1415 if(strcmp("alpha", param_name) == 0) target = params->trainP.alpha; | |
1416 else if(strcmp("beta", param_name) == 0) target = params->trainP.beta; | |
1417 else if(strcmp("delta", param_name) == 0) target = params->trainP.delta; | |
1418 } else if(type == 'c') { | |
1419 if(strcmp("mu", param_name) == 0) target = params->mu; | |
1420 else if(strcmp("pi", param_name) == 0) target = params->pi; | |
1421 | |
1422 } | |
1423 if(target == NULL) { fprintf(stderr, "Unknown parameter %s, skipping\n", param_name); continue; } | |
1424 | |
1425 for(i = 0; i < params->states; i++) { | |
1426 param = strtok_r(NULL, " ", ¶m_ptr); | |
1427 if(param == NULL) { | |
1428 fprintf(stderr,"ERROR: missing value #%d for %s\n", i, param_name); | |
1429 exit(1); | |
1430 } | |
1431 target[i] = atof(param); | |
1432 fprintf(stderr, "DEBUG: %s[%d] = %Lf\n", param_name, i, target[i]); | |
1433 } | |
1434 } | |
1435 fclose(fptrPARAM); | |
1436 free(line); | |
1437 } | |
1438 | |
1439 void initPhred(long double *phredTable, int elem) { | |
1440 int i; | |
1441 for(i = 0; i < elem; i++) { | |
1442 phredTable[i] = 1-powl(10,(-(long double)i/10)); | |
1443 } | |
1444 phredTable[i] = 1; | |
1445 } | |
1446 | |
1447 | |
1448 void usage(char *selfname) { | |
1449 param_struct default_params; | |
1450 resetParams(&default_params); | |
1451 fprintf(stderr,"Syntax:\n\t%s\t-m <modelfile> [-i <infile>] [-o <outfile>] [-T | -C | -F] [-p < q | m | s >] [-t < mb | m | b | M | Mb | MB | SNVMix1>] [-q <#>] [-Q <#>] [-M <trainP file>] [-h]\n",selfname); | |
1452 fprintf(stderr,"\tRequired:\n"); | |
1453 fprintf(stderr,"\t-m <modelfile>\tmodel file, a line for mu and a line for pi, three\n"); | |
1454 fprintf(stderr,"\t\t\tspace-separated values each, like:\n"); | |
1455 fprintf(stderr,"\t\t\tmu 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n"); | |
1456 fprintf(stderr,"\t\t\tpi 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n"); | |
1457 fprintf(stderr,"\tOptional:\n"); | |
1458 fprintf(stderr,"\t-i <infile>\tquality pileup, from pileup2matlab.pl script (def: STDIN)\n"); | |
1459 fprintf(stderr,"\t-o <outfile>\twhere to put the results (def: STDOUT)\n"); | |
1460 fprintf(stderr,"\t-T | -C | -F\tTrain, Classify or Filter (def: Classify)\n"); | |
1461 fprintf(stderr,"\t-p q|m|s\tInput pileup format (def: s)\n\t\t\tq = quality\n\t\t\tm = MAQ\n\t\t\ts = SAMtools\n"); | |
1462 fprintf(stderr,"\t-t mb|m|b|M|Mb|MB|SNVMix1\n\t\t\tFilter type (def: mb)\n"); | |
1463 fprintf(stderr,"\t\t\tmb\tLowest between map and base quality\n"); | |
1464 fprintf(stderr,"\t\t\tm\tFilter on map, and use as surrogate for base quality\n"); | |
1465 fprintf(stderr,"\t\t\tb\tFilter on base quality, take map quality as 1\n"); | |
1466 fprintf(stderr,"\t\t\tM\tFilter on map quality but use only base quality\n"); | |
1467 fprintf(stderr,"\t\t\tMb\tFilter on map quality and use both map and base qualities\n"); | |
1468 fprintf(stderr,"\t\t\tMB\tFilter on map quality AND base quality\n"); | |
1469 fprintf(stderr,"\t\t\tSNVMix1\tFilter on base quality and map quality, afterwards consider them perfect\n"); | |
1470 fprintf(stderr,"\t-q #\t\tCutoff Phred value for Base Quality, anything <= this value is ignored (def: Q%d)\n",default_params.bQ); | |
1471 fprintf(stderr,"\t-Q #\t\tCutoff Phred value for Map Quality, anything <= this value is ignored (def: Q%d)\n",default_params.mQ); | |
1472 fprintf(stderr,"\n\tTraining parameters:\n"); | |
1473 fprintf(stderr,"\t-M <file>\tProvide a file containing training parameters\n"); | |
1474 fprintf(stderr,"\t\t\tValues are space-separated:\n"); | |
1475 fprintf(stderr,"\t\t\talpha # # #\n"); | |
1476 fprintf(stderr,"\t\t\tbeta # # #\n"); | |
1477 fprintf(stderr,"\t\t\tdelta # # #\n"); | |
1478 fprintf(stderr,"\n\t-h\t\tthis message\n"); | |
1479 fprintf(stderr,"\nImplementation: Rodrigo Goya, 2009. Version %s\n",VERSION); | |
1480 exit(0); | |
1481 } | |
1482 | |
1483 // Based on classify pileup, but reads the entire file to memory for training purposes. | |
1484 void snvmixGetTrainSet_pileup(param_struct *params) { | |
1485 //MAQ: | |
1486 // 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx | |
1487 //SAM: | |
1488 // 1 554484 C 1752 ,,.,.,, xxx xxxx | |
1489 | |
1490 FILE *fptrIN; | |
1491 | |
1492 char *line = NULL; | |
1493 size_t line_len = 0, prev_len = 0; | |
1494 int read_len = 0; | |
1495 int col_num = 0; | |
1496 long int line_num = 0; | |
1497 | |
1498 char *col, *qual; | |
1499 char *col_sep = "\t", *col_str, *col_ptr; | |
1500 char *qual_sep = ",", *qual_str, *qual_ptr; | |
1501 char *chr, *pos, *ref, nref, call; | |
1502 | |
1503 char *bQ, *mQ; | |
1504 int *calls, *tmpQ, pos_int; | |
1505 int depth = 0, qual_num, maxp; | |
1506 int depth_allocated = 0; | |
1507 | |
1508 int trainset = 0; | |
1509 int trainset_allocated = 0; | |
1510 | |
1511 int nref_count = 0, ref_count = 0; | |
1512 int i, call_i; | |
1513 | |
1514 fptrIN = params->input; | |
1515 | |
1516 if( (params->trainP.bQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) { | |
1517 perror("ERROR allocating initial space for train.bQ"); exit(1); | |
1518 } | |
1519 if( (params->trainP.mQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) { | |
1520 perror("ERROR allocating initial space for train.mQ"); exit(1); | |
1521 } | |
1522 if( (params->trainP.calls = malloc(START_QNUM * sizeof (signed char **))) == NULL) { | |
1523 perror("ERROR allocating initial space for train.calls"); exit(1); | |
1524 } | |
1525 if( (params->trainP.depth = malloc(START_QNUM * sizeof (int *))) == NULL) { | |
1526 perror("ERROR allocating initial space for train.depth"); exit(1); | |
1527 } | |
1528 trainset_allocated = START_QNUM; | |
1529 | |
1530 if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) { | |
1531 perror("ERROR allocating initial space for train.depth"); exit(1); | |
1532 } | |
1533 depth_allocated = START_QNUM; | |
1534 #if defined __linux__ || defined _GNU_SOURCE | |
1535 while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) | |
1536 #endif | |
1537 #ifdef __APPLE__ | |
1538 while( (line = fgetln(fptrIN,&line_len)) != NULL ) | |
1539 #endif | |
1540 { | |
1541 line[line_len-1] = '\0'; | |
1542 depth = 0; | |
1543 nref = 0; | |
1544 col_ptr = NULL; | |
1545 for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) { | |
1546 col = strtok_r(col_str, "\t", &col_ptr); | |
1547 if(col == NULL) { | |
1548 break; | |
1549 } | |
1550 if(col_num == 0) { | |
1551 chr = col; | |
1552 } else if(col_num == 1) { | |
1553 pos = col; | |
1554 pos_int = strtol(pos, NULL, 10); | |
1555 } else if(col_num == 2) { | |
1556 ref = col; | |
1557 } else if(col_num == 3) { | |
1558 depth = atoi(col); | |
1559 if(depth > depth_allocated) { | |
1560 if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) { | |
1561 perror("ERROR allocating space for calls"); exit(1); | |
1562 } | |
1563 depth_allocated = depth + START_QNUM; | |
1564 } | |
1565 } else if(col_num == 4) { | |
1566 if(params->input_type == M_PILEUP) | |
1567 col = col+sizeof(char); | |
1568 snvmixGetCallString(col, calls, depth, &nref); | |
1569 } else if(col_num == 5) { | |
1570 bQ = col; | |
1571 // If it's a MAQ pileup, we need to skip the @ sign | |
1572 if(params->input_type == M_PILEUP) | |
1573 bQ = bQ + sizeof(char); | |
1574 for(call_i = 0; call_i < depth; call_i++) | |
1575 bQ[call_i] = bQ[call_i]-33; | |
1576 } else if(col_num == 6) { | |
1577 mQ = col; | |
1578 // If it's a MAQ pileup, we need to skip the @ sign | |
1579 if(params->input_type == M_PILEUP) | |
1580 mQ = mQ + sizeof(char); | |
1581 for(call_i = 0; call_i < depth; call_i++) | |
1582 mQ[call_i] = mQ[call_i]-33; | |
1583 } | |
1584 } | |
1585 if(line_num >= trainset_allocated) { | |
1586 if( ( params->trainP.bQ = realloc(params->trainP.bQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) { | |
1587 perror("ERROR reallocating space for trainP.bQ"); exit(1); | |
1588 } | |
1589 if( ( params->trainP.mQ = realloc(params->trainP.mQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) { | |
1590 perror("ERROR reallocating space for trainP.mQ"); exit(1); | |
1591 } | |
1592 if( ( params->trainP.calls = realloc(params->trainP.calls, (line_num + START_QNUM) * sizeof (signed char *)) ) == NULL) { | |
1593 perror("ERROR reallocating space for trainP.calls"); exit(1); | |
1594 } | |
1595 if( ( params->trainP.depth = realloc(params->trainP.depth, (line_num + START_QNUM) * sizeof (int)) ) == NULL) { | |
1596 perror("ERROR reallocating space for trainP.depth"); exit(1); | |
1597 } | |
1598 trainset_allocated = line_num + START_QNUM; | |
1599 } | |
1600 | |
1601 nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); | |
1602 nref_count = 0; | |
1603 ref_count = 0; | |
1604 for(qual_num = 0; qual_num < depth; qual_num++) { | |
1605 if(calls[qual_num] == -1) | |
1606 continue; | |
1607 if(calls[qual_num] == 1) | |
1608 ref_count++; | |
1609 if(calls[qual_num] == nref) | |
1610 nref_count++; | |
1611 } | |
1612 params->trainP.depth[line_num] = ref_count + nref_count; | |
1613 | |
1614 if( (params->trainP.bQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) { | |
1615 perror("ERROR allocating space for trainP.bQ"); exit(1); | |
1616 } | |
1617 if( (params->trainP.mQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) { | |
1618 perror("ERROR allocating space for trainP.mQ"); exit(1); | |
1619 } | |
1620 if( (params->trainP.calls[line_num] = malloc(sizeof(signed char) * params->trainP.depth[line_num])) == NULL ) { | |
1621 perror("ERROR allocating space for trainP.calls"); exit(1); | |
1622 } | |
1623 | |
1624 call_i = 0; | |
1625 for(qual_num = 0; qual_num < depth; qual_num++) { | |
1626 if(calls[qual_num] == -1) | |
1627 continue; | |
1628 | |
1629 params->trainP.calls[line_num][call_i] = (signed char) calls[qual_num]; | |
1630 params->trainP.bQ[line_num][call_i] = (unsigned char) bQ[qual_num]; | |
1631 params->trainP.mQ[line_num][call_i] = (unsigned char) mQ[qual_num]; | |
1632 call_i++; | |
1633 } | |
1634 if( params->trainP.depth[line_num] != call_i ) { | |
1635 fprintf(stderr, "ERROR: mismatch between trainP.depth and call_i\n"); exit(1); | |
1636 } | |
1637 line_num++; | |
1638 } | |
1639 params->trainP.len = line_num; | |
1640 params->trainP.len = line_num; | |
1641 fclose(fptrIN); | |
1642 free(line); free(calls); | |
1643 } | |
1644 | |
1645 // Use EM algorithm on a memory-located data set to train the parameters | |
1646 void snvmixTrain_pileup(param_struct *params) { | |
1647 int line_num = 0, call_i = 0, k = 0, states; | |
1648 int iter = 0; | |
1649 FILE *fptrOUT; | |
1650 | |
1651 long double phred[PHRED_MAX + 1]; | |
1652 long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; | |
1653 long double **pG, **xr; | |
1654 long double *xrhat, *sum_pG; | |
1655 long double Y, not_Y, M; | |
1656 int N_sum; | |
1657 int strength; | |
1658 | |
1659 long double ll, prev_ll = 0; | |
1660 //initPhred(phred, PHRED_MAX+1); | |
1661 initPhred(phred, PHRED_MAX); | |
1662 | |
1663 states = params->states; | |
1664 | |
1665 // Initialize local values of parameters | |
1666 mu = params->mu; | |
1667 pi = params->pi; | |
1668 | |
1669 if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); } | |
1670 if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); } | |
1671 if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); } | |
1672 if( ( xrhat = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate xrhat\n"); exit(1); } | |
1673 if( ( sum_pG = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate sum_pG\n"); exit(1); } | |
1674 | |
1675 snvmixGetTrainSet_pileup(params); | |
1676 | |
1677 if(params->debug) { | |
1678 for(line_num = 0; line_num < params->trainP.len; line_num++) { | |
1679 fprintf(stderr, "line_num: %d", line_num); | |
1680 for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) { | |
1681 fprintf(stderr, "\t(%d,bQ%d,mQ%d)", params->trainP.calls[line_num][call_i], params->trainP.bQ[line_num][call_i], params->trainP.mQ[line_num][call_i]); | |
1682 } | |
1683 fprintf(stderr, "\n\n"); | |
1684 } | |
1685 } | |
1686 // Initialize mu and pi | |
1687 | |
1688 fprintf(stderr, "Initializing mu\n"); | |
1689 for(k = 0; k < states; k++) { | |
1690 params->mu[k] = (long double) params->trainP.alpha[k] / (params->trainP.alpha[k] + params->trainP.beta[k]); | |
1691 fprintf(stderr,"\talpha[%d] = %Lf,\tbeta[%d] = %Lf,\tmu[%d] = %Lf\n", k, params->trainP.alpha[k], k, params->trainP.beta[k], k, params->mu[k]); | |
1692 } | |
1693 | |
1694 fprintf(stderr, "Initializing pi\n"); | |
1695 z = (long double) params->trainP.delta[0] + params->trainP.delta[1] + params->trainP.delta[2]; | |
1696 if(!z) { z = 1; } | |
1697 for(k = 0; k < states; k++) { | |
1698 params->pi[k] = (long double) params->trainP.delta[k] / z; | |
1699 fprintf(stderr,"\tdelta[%d] = %Lf,\tpi[%d] = %Lf\n", k, params->trainP.delta[k], k, params->pi[k]); | |
1700 } | |
1701 | |
1702 strength = params->trainP.len; | |
1703 | |
1704 // Initialize matrices | |
1705 if( (pG = malloc(sizeof(long double *) * params->trainP.len)) == NULL) { | |
1706 perror("ERROR allocating initial space for pG"); exit(1); | |
1707 } | |
1708 if( (xr = malloc(sizeof(long double *) * params->trainP.len)) == NULL) { | |
1709 perror("ERROR allocating initial space for xr"); exit(1); | |
1710 } | |
1711 for(line_num = 0; line_num < params->trainP.len; line_num++) { | |
1712 // [states] cells for each line_num | |
1713 if( (pG[line_num] = malloc(sizeof(long double) * states)) == NULL) { | |
1714 perror("ERROR allocating space for pG"); exit(1); | |
1715 } | |
1716 if( (xr[line_num] = malloc(sizeof(long double) * states)) == NULL) { | |
1717 perror("ERROR allocating space for xr"); exit(1); | |
1718 } | |
1719 } | |
1720 | |
1721 for(iter = 0; iter < params->trainP.max_iter; iter++) { | |
1722 // Reset values for this iteration | |
1723 for(k = 0; k < states; k++) { | |
1724 notmu[k] = 1 - mu[k]; | |
1725 log_pi[k] = logl(pi[k]); | |
1726 | |
1727 sum_pG[k] = 0; | |
1728 xrhat[k] = 0; | |
1729 | |
1730 } | |
1731 N_sum = 0; | |
1732 ll = 0; | |
1733 | |
1734 // E step | |
1735 for(line_num = 0; line_num < params->trainP.len; line_num++) { | |
1736 if(params->trainP.depth == 0) | |
1737 continue; | |
1738 for(k = 0; k < states; k++) { | |
1739 xr[line_num][k] = 0; | |
1740 b[k] = 0; | |
1741 } | |
1742 | |
1743 for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) { | |
1744 if(params->trainP.calls[line_num][call_i] == 1) { | |
1745 Y = phred[params->trainP.bQ[line_num][call_i]]; | |
1746 not_Y = 1-phred[params->trainP.bQ[line_num][call_i]]; | |
1747 } else { | |
1748 Y = (1-phred[params->trainP.bQ[line_num][call_i]])/3; | |
1749 not_Y = phred[params->trainP.bQ[line_num][call_i]] + 2*(1-phred[params->trainP.bQ[line_num][call_i]])/3; | |
1750 } | |
1751 M = phred[params->trainP.mQ[line_num][call_i]]; | |
1752 for(k = 0; k < states; k++) { | |
1753 b[k] = b[k] + logl( (1 - M) * 0.5 + M * (Y * mu[k] + not_Y * notmu[k]) ); | |
1754 xr[line_num][k] = xr[line_num][k] + Y; | |
1755 } | |
1756 } | |
1757 z = 0; | |
1758 for(k = 0; k < states; k++) { | |
1759 b[k] = expl(b[k] + log_pi[k]); | |
1760 z = z + b[k]; | |
1761 } | |
1762 if(!z) { z=1; } | |
1763 for(k = 0; k < states; k++) { | |
1764 pG[line_num][k] = b[k] / z; | |
1765 } | |
1766 | |
1767 ll = ll + logl(z); | |
1768 | |
1769 N_sum = N_sum + params->trainP.depth[line_num]; | |
1770 | |
1771 for(k = 0; k < states; k++) { | |
1772 sum_pG[k] = sum_pG[k] + pG[line_num][k]; | |
1773 xrhat[k] = xrhat[k] + xr[line_num][k] * pG[line_num][k]; | |
1774 } | |
1775 } | |
1776 | |
1777 // Check log likelihood | |
1778 if(iter == 0) | |
1779 prev_ll = ll; | |
1780 else if(ll <= prev_ll) | |
1781 break; | |
1782 prev_ll = ll; | |
1783 | |
1784 // M step | |
1785 z = 0; | |
1786 for(k = 0; k < states; k++) { | |
1787 z = z + sum_pG[k] + params->trainP.delta[k]; | |
1788 } | |
1789 if(!z) { z=1; } | |
1790 for(k = 0; k < states; k++) { | |
1791 // Update pi | |
1792 params->pi[k] = (sum_pG[k] + params->trainP.delta[k]) / z; | |
1793 // Update mu | |
1794 params->mu[k] = (xrhat[k] + params->trainP.alpha[k]*strength-1) / (N_sum + params->trainP.alpha[k]*strength + params->trainP.beta[k]*strength-2); | |
1795 } | |
1796 } | |
1797 | |
1798 if(params->modelfile == NULL) { | |
1799 fptrOUT = stdout; | |
1800 } else { | |
1801 fptrOUT = fopen(params->modelfile, "w"); | |
1802 if(fptrOUT == NULL) { | |
1803 perror("ERROR: could not open modelfile for writing, using STDOUT"); | |
1804 fptrOUT = stdout; | |
1805 } | |
1806 } | |
1807 fprintf(fptrOUT,"mu"); | |
1808 for(k = 0; k < states; k++) { | |
1809 fprintf(fptrOUT, " %0.30Lf", params->mu[k]); | |
1810 } | |
1811 fprintf(fptrOUT,"\n"); | |
1812 fprintf(fptrOUT,"pi"); | |
1813 for(k = 0; k < states; k++) { | |
1814 fprintf(fptrOUT, " %0.30Lf", params->pi[k]); | |
1815 } | |
1816 fprintf(fptrOUT,"\n"); | |
1817 | |
1818 /* free unused memory */ | |
1819 free(b); free(notmu); free(log_pi); | |
1820 free(xrhat); free(sum_pG); | |
1821 for(line_num = 0; line_num < params->trainP.len; line_num++) { | |
1822 // free [states] cells for each line_num | |
1823 free(pG[line_num]); | |
1824 free(xr[line_num]); | |
1825 } | |
1826 free(pG); free(xr); | |
1827 } | |
1828 |