Mercurial > repos > ryanmorin > nextgen_variant_identification
diff SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/SNVMix2.c @ 0:74f5ea818cea
Uploaded
author | ryanmorin |
---|---|
date | Wed, 12 Oct 2011 19:50:38 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/SNVMix2.c Wed Oct 12 19:50:38 2011 -0400 @@ -0,0 +1,1828 @@ +/* The MIT License + + Copyright (c) 2009, by Sohrab Shah <sshah@bccrc.ca> and Rodrigo Goya <rgoya@bcgsc.ca> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* +C Implementation of SNVMix2 +*/ + +#define VERSION "0.12.1-rc1" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "SNVMix2.h" + +#include "bam.h" + +#define START_QNUM 1000 + +int main(int argc, char **argv) { + + param_struct params;// = {NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, 0}; + + initSNVMix(argc, argv, ¶ms); + + if(params.classify || params.filter) { + if(params.input_type == Q_PILEUP) + snvmixClassify_qualities(¶ms); + else if(params.input_type == M_PILEUP || params.input_type == S_PILEUP) + snvmixClassify_pileup(¶ms); + else if(params.input_type == BAM_FILE) { + fprintf(stderr,"Output is:\n"); + fprintf(stderr,"\tchr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxP"); + fprintf(stderr,"\tref_fwd ref_rev"); + fprintf(stderr,"\tnref_fwd nref_rev"); + fprintf(stderr,"\tnref_center nref_edges"); + fprintf(stderr,"\tindel_pos"); + fprintf(stderr,"\tref_clean nref_clean"); + fprintf(stderr,"\tref_bad_pair nref_bad_pair"); + fprintf(stderr,"\tref_ins nref_ins"); + fprintf(stderr,"\tref_del nref_del"); + fprintf(stderr,"\tref_junc nref_junc"); + fprintf(stderr,"\tnref_in_unique_pos"); + fprintf(stderr,"\traw[A,C,G,T,N]"); + fprintf(stderr,"\tthr[A,C,G,T,N]\n"); + snvmixClassify_bamfile(¶ms); + } + } else if(params.train) { + if(params.input_type == M_PILEUP || params.input_type == S_PILEUP) + snvmixTrain_pileup(¶ms); + else if(params.input_type == Q_PILEUP) { + fprintf(stderr,"Sorry, Training with quality-type input is not yet implemented\n"); + exit(1); + } + } + return(0); +} + +/* + void snvmixClassify_qualities + Arguments: + *params : pointer to model parameters and command line options + Function: + Reads a pilep-style file that has been preprocessed to include the quality of + calls being the reference allele and the maping quality both in decimal values. + + For each line it evaluates de model according to the parameters in *params and + outputs the corresponding SNV prediction values. + + This function may come in handy when filtering of calls is done by a + different program or the base-calling and maping quality information is not + in a pileup/phred format. + + The file read is a tab-separated file where the columns are: + - target sequence (e.g. chromosome) + - sequence position + - reference base + - non-reference base + - comma separated probabilities of the call being the reference + - comma separated mapping qualities, one for each base call + +*/ +void snvmixClassify_qualities(param_struct *params) { + FILE *fptrIN, *fptrOUT; + + char *line = NULL; + size_t line_len = 0, prev_len = 0; + int read_len = 0; + + int qual_num = 0, col_num = 0; + + char *col, *qual; + char *col_sep = "\t", *col_str, *col_ptr; + char *qual_sep = ",", *qual_str, *qual_ptr; + char *chr, *pos, *ref, *nref; + + long double *bQ, *mQ, *tmpQ; + int pos_int, depth = 0, maxp; + long int depth_allocated = 0; + + long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; + int i, k, states; + + states = params->states; + + // Initialize local values of parameters + mu = params->mu; + pi = params->pi; + + if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate b\n"); exit(1); } + if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate notmu\n"); exit(1); } + if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_qualities] Could not allocate log_pi\n"); exit(1); } + + for(k = 0; k < states; k++) { + notmu[k] = 1 - mu[k]; + log_pi[k] = logl(pi[k]); + } + fptrIN = params->input; + fptrOUT = params->output; + + + if( (bQ = malloc(START_QNUM * sizeof (long double))) == NULL) { + perror("ERROR allocating initial space for bQ"); exit(1); + } + if( (mQ = malloc(START_QNUM * sizeof (long double))) == NULL) { + perror("ERROR allocating initial space for mQ"); exit(1); + } + depth_allocated = START_QNUM; +#if defined __linux__ || defined _GNU_SOURCE + while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) +#endif +#ifdef __APPLE__ + while( (line = fgetln(fptrIN,&line_len)) != NULL ) +#endif + { + line[line_len-1] = '\0'; + depth = 0; + col_ptr = NULL; + for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) { + col = strtok_r(col_str, "\t", &col_ptr); + if(col == NULL) { + break; + } + if(col_num == 0) { + chr = col; + } else if(col_num == 1) { + pos = col; + pos_int = strtol(pos, NULL, 10); + } else if(col_num == 2) { + ref = col; + } else if(col_num == 3) { + nref = col; + } else if(col_num == 4) { + for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) { + qual = strtok_r(qual_str, ",", &qual_ptr); + if(qual == NULL) { + break; + } + if(qual_num >= depth_allocated) { + if( (bQ = realloc(bQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) { + perror("ERROR allocating bQ"); exit(1); + } + if( (mQ = realloc(mQ, sizeof(long double) * (depth_allocated + START_QNUM))) == NULL) { + perror("ERROR allocating mQ"); exit(1); + } + depth_allocated = depth_allocated + START_QNUM; + } + bQ[qual_num] = atof(qual); + } + depth = qual_num; + } else if(col_num == 5) { + for(qual_num = 0, qual_str = col; ; qual_num++, qual_str = NULL) { + qual = strtok_r(qual_str, ",", &qual_ptr); + if(qual == NULL) { + break; + } + if(qual_num >= depth_allocated) { + fprintf(stderr, "FATAL ERROR: should not have more mapping qualities than base qualities\n"); + exit(1); + } + mQ[qual_num] = atof(qual); + } + if(depth != qual_num) { + fprintf(stderr, "FATAL ERROR: should not have more base qualities than mapping qualities\n"); + exit(1); + } + } + } + + + for(k = 0; k < states; k++) + b[k] = 0; + + //b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); + for(qual_num = 0; qual_num < depth; qual_num++) + for(k = 0; k < states; k++) + b[k] = b[k] + logl((1- mQ[qual_num])*0.5 + mQ[qual_num] * (bQ[qual_num]*mu[k]+(1-bQ[qual_num])*notmu[k])); + + z = 0; + for(k = 0; k < states; k++) { + b[k] = expl(b[k] + log_pi[k]); + z += b[k]; + } + if(!z) { z = 1; } + for(k = 0; k < states; k++) + b[k] = b[k] / z; + maxp = 0; + z = b[0]; + for(k = 0; k < states; k++) + if( b[k] > z) { + maxp = k; + z = b[k]; + } + + fprintf(fptrOUT,"%s\t%s\t%s\t%s",chr, pos, ref, nref); + for(k = 0; k , states; k++) + fprintf(fptrOUT,"%0.10Lf,",b[k]); + fprintf(fptrOUT,"%d\n",maxp+1); + } + fclose(fptrIN); + fclose(fptrOUT); + free(line); + + free(b); free(notmu); free(log_pi); +} + + +/* + void snvmixClassify_pileup + Arguments: + *params : pointer to model parameters and command line options + Function: + Reads a MAQ or Samtools pileup format. For required formatting we refer the user + to the corresponding websites: + http://maq.sourceforge.net/ + http://samtools.sourceforge.net/ + + In general the format of both files can be described as a tab separated file + where columns are: + - target sequence (e.g. chromosome) + - sequence position + - reference base + - depth + - stream of base calls + - stream of base call PHRED-scaled qualities + - stream of mapping PHRED-scaled qualities + + Note: when handling pileup files, care should be taken when doing copy-paste + operations not to transform 'tabs' into streams of spaces. + + For each line it evaluates de model according to the parameters in *params and + outputs the corresponding SNV prediction values. + + Predictions can be output either only for positions where when non-reference values + are observed or, when run with '-f' flag, for all positions. The -f flag is useful + when the resulting calls are being compared or joined accros different datasets + and all predictions need to be present. +*/ +void snvmixClassify_pileup(param_struct *params) { +//MAQ: +// 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx +//SAM: +// 1 554484 C 1752 ,,.,.,, xxx xxxx + FILE *fptrIN, *fptrOUT; + + char *line = NULL; + size_t line_len = 0, prev_len = 0; + int read_len = 0; + int col_num = 0; + long int line_num = 0; + + char *col, *qual; + char *col_sep = "\t", *col_str, *col_ptr; + char *qual_sep = ",", *qual_str, *qual_ptr; + char *chr, *pos, *ref, nref, call, nref_skip = 'N'; + + char *bQ, *mQ; + int *calls, *tmpQ; + int pos_int, depth = 0, qual_num, maxp; + int depth_allocated = 0; + + long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; + int nref_count = 0, ref_count = 0; + long double Y, not_Y; + long double phred[PHRED_MAX + 1]; + int i, call_i, k, states; + + //initPhred(phred, PHRED_MAX+1); + initPhred(phred, PHRED_MAX); + + states = params->states; + + // Initialize local values of parameters + mu = params->mu; + pi = params->pi; + + if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); } + if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); } + if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); } + + for(k = 0; k < states; k++) { + notmu[k] = 1 - mu[k]; + log_pi[k] = logl(pi[k]); + } + fptrIN = params->input; + fptrOUT = params->output; + if(params->full) + nref_skip = -2; + + + if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) { + perror("ERROR allocating initial space for calls"); exit(1); + } + depth_allocated = START_QNUM; +#if defined __linux__ || defined _GNU_SOURCE + while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) +#endif +#ifdef __APPLE__ + while( (line = fgetln(fptrIN,&line_len)) != NULL ) +#endif + { + line[line_len-1] = '\0'; + line_num++; + depth = 0; + nref = 0; + col_ptr = NULL; + for(col_num = 0, col_str = line; nref != nref_skip ; col_num++, col_str = NULL) { + col = strtok_r(col_str, "\t", &col_ptr); + if(col == NULL) { + break; + } + if(col_num == 0) { + chr = col; + } else if(col_num == 1) { + pos = col; + } else if(col_num == 2) { + ref = col; + } else if(col_num == 3) { + depth = atoi(col); + if(depth > depth_allocated) { + if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) { + perror("ERROR allocating space for calls"); exit(1); + } + depth_allocated = depth + START_QNUM; + } + } else if(col_num == 4) { + if(params->input_type == M_PILEUP) + col = col+sizeof(char); + snvmixGetCallString(col, calls, depth, &nref); + } else if(col_num == 5) { + bQ = col; + // If it's a MAQ pileup, we need to skip the @ sign + if(params->input_type == M_PILEUP) + bQ = bQ + sizeof(char); + for(call_i = 0; call_i < depth; call_i++) + bQ[call_i] = bQ[call_i]-33; + } else if(col_num == 6) { + mQ = col; + // If it's a MAQ pileup, we need to skip the @ sign + if(params->input_type == M_PILEUP) + mQ = mQ + sizeof(char); + for(call_i = 0; call_i < depth; call_i++) + mQ[call_i] = mQ[call_i]-33; + } + } + // If we quit the for because no nref was found, skip this too, next line + nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); + nref_count = 0; + ref_count = 0; + for(qual_num = 0; qual_num < depth; qual_num++) { + //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) + if(calls[qual_num] == -1) + continue; + if(calls[qual_num] == 1) + ref_count++; + if(calls[qual_num] == nref) + nref_count++; + } +if(params->filter) { + fprintf(fptrOUT,"%s:%s\t%s:%d\t%c:%d\t", chr, pos, ref, ref_count, nref, nref_count); + for(qual_num = 0; qual_num < ref_count; qual_num++) + fprintf(fptrOUT,","); + for(qual_num = 0; qual_num < nref_count; qual_num++) + fprintf(fptrOUT,"%c",nref); + fprintf(fptrOUT,"\n"); +} else { + if(nref == nref_skip) + continue; + //nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); + //if(nref == nref_skip) + // continue; + for(k = 0; k < states; k++) + b[k] = 0; + nref_count = 0; + for(qual_num = 0; qual_num < depth; qual_num++) { + //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) + if(calls[qual_num] == -1) + continue; + // from matlab: + // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); + if(calls[qual_num] == 1) { + // If REF then + Y = phred[(unsigned char) bQ[qual_num]]; + not_Y = 1-phred[(unsigned char) bQ[qual_num]]; + } else { + nref_count++; + // If NREF then + Y = (1-phred[(unsigned char) bQ[qual_num]])/3; + not_Y = phred[(unsigned char) bQ[qual_num]] + 2*(1-phred[(unsigned char)bQ[qual_num]])/3; + } + for(k = 0; k < states; k++) + b[k] = b[k] + logl((1-phred[(unsigned char) mQ[qual_num]])*0.5 + phred[(unsigned char) mQ[qual_num]] * (Y * mu[k] + not_Y * notmu[k])); + } + // Check if any non-references actually passed the filter + //if(!nref_count) + // continue; + z = 0; + for(k = 0; k < states; k++) { + b[k] = expl(b[k] + log_pi[k]); + z += b[k]; + } + if(!z) z = 1; + for(k = 0; k < states; k++) + b[k] = b[k] / z; + maxp = 0; + z = b[0]; + for(k = 0; k < states; k++) + if( b[k] > z) { + maxp = k; + z = b[k]; + } + + + fprintf(fptrOUT,"%s:%s\t%s\t%c\t%s:%d,%c:%d,",chr,pos, ref, nref, ref, ref_count, nref, nref_count); + for(k = 0; k < states; k++) + fprintf(fptrOUT,"%0.10Lf,",b[k]); + fprintf(fptrOUT,"%d\n",maxp+1); +} + } + fclose(fptrIN); + fclose(fptrOUT); + free(line); + + free(b); free(notmu); free(log_pi); +} + +int snvmixClassify_bamfile(param_struct *params) { + snvmix_pl_t snvmix_pl; + int states, i, k, bam_flag_mask; + + snvmix_pl.params = params; + snvmix_pl.tid = -1; + // TODO: change this to read range + snvmix_pl.begin = 0; + snvmix_pl.end = 0x7fffffff; + snvmix_pl.in = samopen(params->inputfile, "rb", 0); + if (snvmix_pl.in == 0) { + fprintf(stderr, "ERROR: Fail to open BAM file %s\n", params->bamfile); + return 1; + } + snvmix_pl.fai = fai_load(params->fastafile); + if (snvmix_pl.fai == 0) { + fprintf(stderr, "Fail to open BAM file %s\n", params->fastafile); + return 1; + } + if(params->listfile) snvmix_pl.hash = load_pos(params->listfile, snvmix_pl.in->header); + snvmix_pl.depth_allocated = 0; + snvmix_pl.ref = NULL; + snvmix_pl.calls = NULL; + //snvmix_pl.bQ = NULL; + //snvmix_pl.mQ = NULL; + + + /* This looks ugly... but we don't want to be allocating new memory for EVERY location. */ + states = params->states; + if( ( snvmix_pl.notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.notmu\n"); exit(1); } + if( ( snvmix_pl.log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.log_pi\n"); exit(1); } + if( ( snvmix_pl.b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate snvmix_pl.b\n"); exit(1); } + for(k = 0; k < states; k++) { + snvmix_pl.notmu[k] = 1 - params->mu[k]; + snvmix_pl.log_pi[k] = logl(params->pi[k]); + } + initPhred(snvmix_pl.phred, PHRED_MAX); + + /* Init call buffer */ + snvmix_pl.n_buffer = params->window; + if( (snvmix_pl.buffer = malloc(snvmix_pl.n_buffer * sizeof(snvmix_call_t)) ) == NULL) { perror("[snvmixClassify_bamfile] Could not allocate space for buffer\n"); exit(1); } + snvmix_pl.b_start = 0; + snvmix_pl.b_end = 0; + for(i = 0; i < snvmix_pl.n_buffer; i++) { + snvmix_pl.buffer[i].tid = 0; + snvmix_pl.buffer[i].pos = 0; + snvmix_pl.buffer[i].in_use = 0; + if( (snvmix_pl.buffer[i].p = malloc(states * sizeof(long double)) ) == NULL) {perror("[snvmixClassify_bamfile] Could not allocate space for buffer.p\n"); exit(1);} + } + + bam_flag_mask = (BAM_FUNMAP | BAM_FSECONDARY); // | BAM_FQCFAIL | BAM_FDUP); + if(params->filter_chast) + bam_flag_mask |= BAM_FQCFAIL; + if(params->filter_dup) + bam_flag_mask |= BAM_FDUP; + + /* Call pileup with our function and bam_flag_mask */ + sampileup(snvmix_pl.in, bam_flag_mask, snvmixClassify_pileup_func, &snvmix_pl); + + free(snvmix_pl.notmu); + free(snvmix_pl.log_pi); + free(snvmix_pl.b); +} + +int flush_buffer(uint32_t tid, uint32_t pos, snvmix_pl_t *snvmix_pl) { + snvmix_call_t *s; + int i, k, cnt, buff_id; + cnt = 0; + #define _fptrOUT snvmix_pl->params->output + //#define _fptrOUT stderr +//fprintf(_fptrOUT, "ENTERING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end); + for(i = 0, buff_id = snvmix_pl->b_start; i < snvmix_pl->n_buffer; i++, buff_id=(buff_id+1)%snvmix_pl->params->window) { +//fprintf(stderr, "ITERATING in flush_buffer, buff_id = %d\n", buff_id); + s = snvmix_pl->buffer + buff_id; + if(pos+1 >= (s->pos + snvmix_pl->params->window) || tid != s->tid) { +//fprintf(stderr, "PASSED if in flush_buffer, buff_id = %d\n", buff_id); + if(s->in_use) { + //fprintf(_fptrOUT,"%d\t",buff_id); + fprintf(_fptrOUT,"%s:%d\t%c\t%c\t%c:%d,%c:%d", snvmix_pl->in->header->target_name[s->tid], s->pos, s->ref, s->nref, s->ref, s->ref_count, s->nref, s->nref_count); + + if(!snvmix_pl->params->filter) { + fprintf(_fptrOUT,","); + for(k = 0; k < snvmix_pl->params->states; k++) + fprintf(_fptrOUT,"%0.10Lf,",s->p[k]); + fprintf(_fptrOUT,"%d",s->maxP); + } + + fprintf(_fptrOUT,"\t%d %d", s->forward[0], s->reverse[0]); + fprintf(_fptrOUT,"\t%d %d", s->forward[1], s->reverse[1]); + fprintf(_fptrOUT,"\t%d %d", s->nref_center, s->nref_edges); + fprintf(_fptrOUT,"\t%d", s->indel_pos); + fprintf(_fptrOUT,"\t%d %d", s->c_clean[0], s->c_clean[1]); + fprintf(_fptrOUT,"\t%d %d", s->bad_pair[0], s->bad_pair[1]); + fprintf(_fptrOUT,"\t%d %d", s->c_ins[0], s->c_ins[1]); + fprintf(_fptrOUT,"\t%d %d", s->c_del[0], s->c_del[1]); + fprintf(_fptrOUT,"\t%d %d", s->c_junc[0], s->c_junc[1]); + fprintf(_fptrOUT,"\t%d", s->aln_unique_pos); + fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->raw_cvg[0],s->raw_cvg[1],s->raw_cvg[2],s->raw_cvg[3],s->raw_cvg[4]); + fprintf(_fptrOUT,"\t[%d,%d,%d,%d,%d]", s->thr_cvg[0],s->thr_cvg[1],s->thr_cvg[2],s->thr_cvg[3],s->thr_cvg[4]); + if(snvmix_pl->params->filter) { + fprintf(_fptrOUT,"\t"); + for(k = 0; k < s->ref_count; k++) + fprintf(_fptrOUT,","); + for(k = 0; k < s->nref_count; k++) + fprintf(_fptrOUT,"%c",s->nref); + } + fprintf(_fptrOUT,"\n"); + cnt++; + s->in_use = 0; + snvmix_pl->b_start = (buff_id+1)%snvmix_pl->params->window; + } + } else break; + if(buff_id == snvmix_pl->b_end) + break; + } +//fprintf(_fptrOUT, "EXITING flush_buffer, b_start = %d\tb_end = %d\n",snvmix_pl->b_start, snvmix_pl->b_end); + return cnt; +} + + +static int snvmixClassify_pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { + +#define _bQ(p, x) bam1_qual(p[x].b)[p[x].qpos] +#define _mQ(p, x) p[x].b->core.qual + + //FILE *fptrOUT; + char nref_skip = 'N'; + //int *calls, ; + int depth = 0; + //long double *b, *notmu, *log_pi, *mu, *pi; + long double Y, not_Y, z; + //int states; + int i, k, qual_num, maxp; + //long double *phred; + int b_end_tmp = 0; + snvmix_call_t *s; + snvmix_pl_t *snvmix_pl = (snvmix_pl_t *)data; + param_struct *params = snvmix_pl->params; + // Avoid allocating new variables, use defines to make things readable +#define _calls snvmix_pl->calls +#define _b snvmix_pl->b +#define _notmu snvmix_pl->notmu +#define _log_pi snvmix_pl->log_pi +#define _mu params->mu +#define _pi params->pi +#define _phred snvmix_pl->phred +#define _states params->states + //fprintf(stderr,"DEBUG: snvmix_pl->buffer[snvmix_pl->b_start].in_use = %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use); + //if(pos+1 == 148971) + // printf("DEBUG: test 148971, start.in_use = %d\tb_start=%d\tb_end=%d\n",snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end); + //if(pos == 148971) + // printf("DEBUG: test 148972, start.in_use = %d\tb_start=%d\tb_end=%d (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) = %d >= %d\n", snvmix_pl->buffer[snvmix_pl->b_start].in_use, snvmix_pl->b_start,snvmix_pl->b_end, pos+1, snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window); + // If we have gone further than window indicates, or changed tid, then flush buffer + if(snvmix_pl->buffer[snvmix_pl->b_start].in_use && (pos+1 >= (snvmix_pl->buffer[snvmix_pl->b_start].pos + snvmix_pl->params->window) || tid != snvmix_pl->buffer[snvmix_pl->b_start].tid )) + flush_buffer(tid, pos, snvmix_pl); + b_end_tmp = snvmix_pl->b_end; + s = snvmix_pl->buffer + snvmix_pl->b_end; + snvmix_pl->b_end = (snvmix_pl->b_end+1)%snvmix_pl->params->window; + + if(snvmix_pl->hash) { + khint_t k_h = kh_get(64, snvmix_pl->hash, (uint64_t)tid<<32|pos); + if (k_h == kh_end(snvmix_pl->hash)) return 0; + } + + s->in_use = 1; + + + if(params->full) + nref_skip = -2; + + if (snvmix_pl->fai && (int)tid != snvmix_pl->tid) { + free(snvmix_pl->ref); + snvmix_pl->ref = fai_fetch(snvmix_pl->fai, snvmix_pl->in->header->target_name[tid], &snvmix_pl->len); + snvmix_pl->tid = tid; + } + + //chr = snvmix_pl->in->header->target_name[tid], + s->tid = tid; + s->ref = toupper((snvmix_pl->ref && (int)pos < snvmix_pl->len)? snvmix_pl->ref[pos] : 'N'); + s->pos = pos + 1; + depth = n; + + if(depth > snvmix_pl->depth_allocated) { + if( (snvmix_pl->calls = realloc(snvmix_pl->calls, sizeof (int) * (depth + START_QNUM))) == NULL) { + perror("ERROR allocating space for snvmix_pl->calls"); exit(1); + } + //calls = snvmix_pl->calls; + snvmix_pl->depth_allocated = depth + START_QNUM; + } + + //for(i = 0; i < depth; i++) { + // FIXME: watch out for deletions, do they get automatically an "n"? if so, ok +//fprintf(stderr, "DEBUG: len = %d\tref = %c\tcalls = ", snvmix_pl->len, ref); + s->raw_cvg[0] = s->raw_cvg[1] = s->raw_cvg[2] = s->raw_cvg[3] = s->raw_cvg[4] = 0; + s->thr_cvg[0] = s->thr_cvg[1] = s->thr_cvg[2] = s->thr_cvg[3] = s->thr_cvg[4] = 0; + for(i = depth; i--; ) { + if(pl[i].is_del) { + _calls[i] = -1; + continue; + } + _calls[i] = "nACnGnnnTnnnnnnn"[bam1_seqi(bam1_seq(pl[i].b), pl[i].qpos)]; +//fprintf(stderr, "%c", _calls[i]); + switch(_calls[i]) { + case 'A': s->raw_cvg[0]++; break; + case 'C': s->raw_cvg[1]++; break; + case 'G': s->raw_cvg[2]++; break; + case 'T': s->raw_cvg[3]++; break; + case 'n': s->raw_cvg[4]++; break; + default: break; + } + // Mask calls according to quality + if( snvmixSkipCall_alt(params, _calls[i], (char) bam1_qual(pl[i].b)[pl[i].qpos], (char) pl[i].b->core.qual) ) + _calls[i] = -1; + else { + switch(_calls[i]) { + case 'A': s->thr_cvg[0]++; break; + case 'C': s->thr_cvg[1]++; break; + case 'G': s->thr_cvg[2]++; break; + case 'T': s->thr_cvg[3]++; break; + case 'n': s->thr_cvg[4]++; break; + default: break; + } + if(_calls[i] == s->ref) + _calls[i] = 1; + else if(_calls[i] == 'n') + _calls[i] = -1; + } + } + + s->nref = snvmixFilterCalls(_calls,depth,NULL,NULL,params); + s->nref_count = 0; + s->ref_count = 0; + + // FIXME: review this section, extra flags + for(i = 0; i < 2; i++) { + s->forward[i] = 0; + s->reverse[i] = 0; + s->good_pair[i] = 0; + s->bad_pair[i] = 0; + s->c_clean[i] = 0; + s->c_ins[i] = 0; + s->c_del[i] = 0; + s->c_junc[i] = 0; + } + s->nref_edges = 0; // FIXME: fixed 5 bp edges + s->nref_center = 0; // FIXME: fixed 5 bp edges + s->indel_pos = 0; // How many reads have a deletion at that site + s->indel_near = 0; // How many reasd have a deletion overall + s->aln_unique_pos = 0; + uint8_t cigar_ops; // 0x1 INS, 0x2 DEL , 0x4 JUNCTION + //for(qual_num = 0; qual_num < depth; qual_num++) { + uint32_t debug_sort = 0x7fffffff; + if(s->nref != nref_skip) { + k = -1; + for(qual_num = depth; qual_num--; ) { + if(_calls[qual_num] == -1) { + if(pl[qual_num].is_del) + s->indel_pos++; + continue; + } + cigar_ops = 0; + for(i=0; i<pl[qual_num].b->core.n_cigar; i++) { + int op = bam1_cigar(pl[qual_num].b)[i] & BAM_CIGAR_MASK; + if(op == BAM_CINS) + cigar_ops |= 0x1; + if(op == BAM_CDEL) + cigar_ops |= 0x2; + if(op == BAM_CREF_SKIP) + cigar_ops |= 0x4; + } + if(pl[qual_num].indel) { + s->indel_pos++; + } + if(_calls[qual_num] == 1) { + s->ref_count++; + k = 0; + } else if(_calls[qual_num] == s->nref) { + s->nref_count++; + k = 1; + if(pl[qual_num].qpos > (pl[qual_num].b->core.l_qseq - 5) || pl[qual_num].qpos < 5) + s->nref_edges++; + else + s->nref_center++; + if(pl[qual_num].b->core.pos < debug_sort) { s->aln_unique_pos++; debug_sort = pl[qual_num].b->core.pos; } + else if(pl[qual_num].b->core.pos > debug_sort) {fprintf(stderr, "ERROR checking sort in read-position, prev = %Ld, new = %Ld\n", debug_sort, pl[qual_num].b->core.pos); exit(1);} + } + + if(k != -1) { + if(cigar_ops) { + if(cigar_ops & 0x1) s->c_ins[k]++; + if(cigar_ops & 0x2) s->c_del[k]++; + if(cigar_ops & 0x4) s->c_junc[k]++; + } else s->c_clean[k]++; + + if(pl[qual_num].b->core.tid != pl[qual_num].b->core.mtid) s->bad_pair[k]++; + + if(pl[qual_num].b->core.flag & 0x10) s->reverse[k]++; + else s->forward[k]++; + } + } + } + if(!params->filter) { + if(s->nref == nref_skip) { + snvmix_pl->b_end = b_end_tmp; + s->in_use = 0; + return 0; + } + int block = (_states / 3) * 3; + //for(k = 0; k < _states; k++) + //for(k = _states; k--; ) + // _b[k] = 0; + k = 0; + while( k < block ) { + _b[k] = 0; + _b[k+1] = 0; + _b[k+2] = 0; + k += 3; + } + if( k < _states) { + switch( _states - k ) { + case 2: _b[k] = 0; k++; + case 1: _b[k] = 0; + } + } + //for(qual_num = 0; qual_num < depth; qual_num++) { + for(qual_num = depth; qual_num--; ) { + //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) + if(_calls[qual_num] == -1) + continue; + // from matlab: + // b = sum(log(notm*0.5 + m.*(yr.*mur+notyr.*notmur)),2); + if(_calls[qual_num] == 1) { + // If REF then + Y = _phred[(unsigned char) _bQ(pl, qual_num)]; + not_Y = 1-_phred[(unsigned char) _bQ(pl, qual_num)]; + } else { + // If NREF then + Y = (1-_phred[(unsigned char) _bQ(pl, qual_num)])/3; + not_Y = _phred[(unsigned char) _bQ(pl, qual_num)] + 2*(1-_phred[(unsigned char)_bQ(pl, qual_num)])/3; + } + //for(k = 0; k < _states; k++) + //for(k = _states; k--; ) + // _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); + k = 0; + while( k < block ) { + _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); + _b[k+1] = _b[k+1] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+1] + not_Y * _notmu[k+1])); + _b[k+2] = _b[k+2] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k+2] + not_Y * _notmu[k+2])); + k += 3; + } + if( k < _states) { + switch( _states - k ) { + case 2: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); k++; + case 1: _b[k] = _b[k] + logl((1-_phred[(unsigned char) _mQ(pl, qual_num)])*0.5 + _phred[(unsigned char) _mQ(pl, qual_num)] * (Y * _mu[k] + not_Y * _notmu[k])); + } + } + } + z = 0; + //for(k = 0; k < _states; k++) { + //for(k = _states; k--; ) { + // _b[k] = expl(_b[k] + _log_pi[k]); + // z += _b[k]; + //} + k = 0; + while( k < block ) { + _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; + _b[k+1] = expl(_b[k+1] + _log_pi[k+1]); z += _b[k+1]; + _b[k+2] = expl(_b[k+2] + _log_pi[k+2]); z += _b[k+2]; + k += 3; + } + if( k < _states ) { + switch( _states - k ) { + case 2: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; k++; + case 1: _b[k] = expl(_b[k] + _log_pi[k]); z += _b[k]; + } + } + if(!z) z = 1; + //for(k = 0; k < _states; k++) + //for(k = _states; k--; ) + // _b[k] = _b[k] / z; + k = 0; + while( k < block ) { + _b[k] = _b[k] / z; + _b[k+1] = _b[k+1] / z; + _b[k+2] = _b[k+2] / z; + k += 3; + } + if( k < _states ) { + switch( _states - k ) { + case 2: _b[k] = _b[k] / z; k++; + case 1: _b[k] = _b[k] / z; + } + } + maxp = _states - 1; + z = _b[maxp]; + //for(k = 0; k < _states; k++) + for(k = _states; k--; ) + if( _b[k] > z) { + maxp = k; + z = _b[k]; + } + + + for(k = 0; k < _states; k++) + s->p[k] = _b[k]; + s->maxP = maxp+1; + } +} + +/* + void snvmixGetCallString + Arguments: + *col : pointer to the current file-line in memory + *calls : pointer to array where we will store the final base-calls + depth : number of base reads for this position + *nref : the observed NREF value will be placed here (-1 if none was found) + + Function: + This will parse column 5 of the pileup file, which contains the + base calls and will fill the array "calls" with: + 1 : when a reference call was made + -1 : when an invalid value is seen ('N', 'n', '*') + [ACTG] : when a non-reference call was made + + + +*/ +inline void snvmixGetCallString(char *col, int *calls, int depth, char *nref) { + int i; + int call_i = 0, call_skip_num = 0; + char call_skip_char[10]; + for(i = 0 ; call_i < depth; i++) { + switch(col[i]){ + case ',': + case '.': + calls[call_i] = 1; + call_i++; + break; + case 'a': case 'A': + case 't': case 'T': + case 'g': case 'G': + case 'c': case 'C': + //calls[call_i] = 0; + calls[call_i] = toupper(col[i]); + call_i++; + //if(*nref == 0) + //*nref = toupper(*(col+sizeof(char)*i)); + if(*nref == 0) + *nref = toupper(col[i]); + break; + case 'N': + case 'n': + case '*': + // Not comparable values, we ignore them, but need to + // keep count to compare against the number of mapping qualities + calls[call_i] = -1; + call_i++; + break; + case '$': + // End of a read, just ignore it + break; + case '^': + // Start of read, ignore it and skip the next value (not base-related) + i++; + break; + case '+': + case '-': + // Eliminate: + // +2AA + // -3AAA + // Start skiping the sign + i++; + for(call_skip_num = 0; col[i] <= '9' && col[i] >= '0'; call_skip_num++, i++) { + //call_skip_char[call_skip_num] = call; + call_skip_char[call_skip_num] = col[i]; + //i++; + } + // we have the number string in call_skip_char, lets terminate it + call_skip_char[call_skip_num] = '\0'; + // i has been updated to first letter, just need to jump the rest of them + i = i+atoi(call_skip_char)-1; + break; + default: + fprintf(stderr,"ERROR: problem reading pileup file calls (%c)\n",col[i]); + exit(1); + break; + } + } + // If no nref was found, don't bother parsing the other columns, make the for fail with -1 + if(!*nref) + *nref = -1; +} + +/* + int snvmixFilterCalls + Arguments: + *calls : array built by snvmixGetCallString containing + 1 : when a reference call was made + -1 : when an invalid value is seen ('N', 'n', '*') + [ACTG] : when a non-reference call was made + depth : number of calls for the current position + *bQ : base qualities observed for each call + *mQ : map quality for each call + *params : parameter structure, get thresholding data + Function: + For each valid call read in the position this function will apply + thresholding according to the type selected (-t flag) and the bQ (-q) + and mQ (-Q) thresholds provided. + + Any base-call that does not pass thresholds will be switched from it's + current value in *calls to -1; + + The most prevalent NREF after filtering will be determined and returned. + +*/ +inline int snvmixFilterCalls(int *calls, int depth, char *bQ, char *mQ, param_struct *params) { + int qual_num, nref_counts[5]; + nref_counts[0] = 0; + nref_counts[1] = 0; + nref_counts[2] = 0; + nref_counts[3] = 0; + nref_counts[4] = 0; + int max_nref = N; + + char nref = 0; + + //for(qual_num = 0; qual_num < depth; qual_num++) { + for(qual_num = depth; qual_num--; ) { + //if( snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) { + if( bQ != NULL && mQ != NULL && snvmixSkipCall(calls,qual_num,params,bQ,mQ) ) { + calls[qual_num] = -1; + } else { + nref_counts[0]++; + switch(calls[qual_num]) { + case 'A': + nref_counts[A]++; + break; + case 'C': + nref_counts[C]++; + break; + case 'G': + nref_counts[G]++; + break; + case 'T': + nref_counts[T]++; + break; + case 1: + case -1: + nref_counts[0]--; + break; + default: + fprintf(stderr,"ERROR: unknown call base\n"); + exit(1); + } + } + } + if(nref_counts[0]) { + max_nref = A; + if(nref_counts[max_nref] < nref_counts[C]) + max_nref = C; + if(nref_counts[max_nref] < nref_counts[G]) + max_nref = G; + if(nref_counts[max_nref] < nref_counts[T]) + max_nref = T; + } else { + //return -1; + } + //for(qual_num = 0; qual_num < depth; qual_num++) { + for(qual_num = depth; qual_num--; ) { + if(calls[qual_num] == 1) + continue; + if(calls[qual_num] != base_code[max_nref]) + calls[qual_num] = -1; + } + return base_code[max_nref]; +} + +/* + int snvmixSkipCall + Arguments: + *calls : array built by snvmixGetCallString containing + 1 : when a reference call was made + -1 : when an invalid value is seen ('N', 'n', '*') + [ACTG] : when a non-reference call was made + qual_num : call number that is being evaluated + *params : parameter structure, get thresholding data + *bQ : base qualities observed for each call + *mQ : map quality for each call + Function: + Evalates quality values in each of the filtering models + + Returns 1 if the calls[qual_num] needs to be filtered out + Returns 0 otherwise +*/ +inline int snvmixSkipCall(int *calls, int qual_num, param_struct *params, char *bQ, char *mQ) { + if(calls[qual_num] == -1) + return(1); + if(params->filter_type == TYPE_mb) { + if(bQ[qual_num] <= params->bQ || mQ[qual_num] <= params->mQ) + return(1); + } else if(params->filter_type == TYPE_b) { + if(bQ[qual_num] <= params->bQ) + return(1); + } else { + if(mQ[qual_num] <= params->mQ) + return(1); + if(params->filter_type == TYPE_m) { + // Use mapping as surrogate + bQ[qual_num] = mQ[qual_num]; + // Make mapping one + mQ[qual_num] = (char) PHRED_MAX; + } else if(params->filter_type == TYPE_M) { + // Mapping passed, make it one + mQ[qual_num] = (char) PHRED_MAX; + } else if(params->filter_type == TYPE_Mb) { + // Nothing special here + } else if(params->filter_type == TYPE_MB || params->filter_type == TYPE_SNVMix1) { + if(bQ[qual_num] <= params->bQ) + return(1); + if(params->filter_type == TYPE_SNVMix1) { + bQ[qual_num] = (char) PHRED_MAX; + mQ[qual_num] = (char) PHRED_MAX; + } } + } + return(0); +} + +inline int snvmixSkipCall_alt(param_struct *params, int call, char bQ, char mQ) { + if(call == -1) + return(1); + if(params->filter_type != TYPE_b && mQ <= params->mQ) + return(1); + switch(params->filter_type) { + case TYPE_mb: + if(bQ <= params->bQ || mQ <= params->mQ) + return(1); + break; + case TYPE_b: + if(bQ <= params->bQ) + return(1); + break; + case TYPE_m: + // Use mapping as surrogate + bQ = mQ; + // Make mapping one + mQ = (char) PHRED_MAX; + break; + case TYPE_M: + // Mapping passed, make it one + mQ = (char) PHRED_MAX; + break; + case TYPE_Mb: + // Nothing special here + break; + case TYPE_MB: + case TYPE_SNVMix1: + if(bQ <= params->bQ) + return(1); + if(params->filter_type == TYPE_SNVMix1) { + bQ = (char) PHRED_MAX; + mQ = (char) PHRED_MAX; + } + break; + } + return(0); +} + + +void resetParams(param_struct *params) { + params->input = stdin; + params->output = stdout; + params->inputfile = NULL; + params->outputfile = NULL; + params->modelfile = NULL; + params->fastafile = NULL; + params->listfile = NULL; + params->window = 1; + params->filter_type = 0; + params->filter_dup = 1; + params->filter_chast = 1; + params->train = 0; + params->classify = 0; + params->filter = 0; + params->full = 0; + params->input_type = S_PILEUP; // 0 = processed, 1 = maq pileup, 2 = sam pileup, 3 = bam file + params->mu = NULL; + params->pi = NULL; + params->max_iter = 1000; + params->bQ = 19; + params->mQ = 19; + params->debug = 0; + params->states = 3; + params->trainP.alpha = NULL; + params->trainP.beta = NULL; + params->trainP.delta = NULL; + params->trainP.param_file = NULL; + params->trainP.max_iter = 100; + params->trainP.bQ = NULL; + params->trainP.mQ = NULL; + params->trainP.calls = NULL; + params->window = 1; +} + +void initSNVMix(int argc, char **argv, param_struct *params) { + char c; + int i; + resetParams(params); + while ((c = getopt (argc, argv, "hDTCFfi:o:m:t:p:q:Q:a:b:d:M:S:r:l:w:cR")) != -1) { + switch (c) + { + case 'm': params->modelfile = optarg; break; + case 'i': params->inputfile = optarg; break; + case 'o': params->outputfile = optarg; break; + // Bam file opts + case 'r': params->fastafile = optarg; break; + case 'l': params->listfile = optarg; break; + case 'T': params->train = 1; break; + case 'C': params->classify = 1; break; + case 'F': params->filter = 1; break; + case 'f': params->full = 1; break; + case 'p': + if(*optarg == 'q') { + params->input_type = Q_PILEUP; + } else if(*optarg == 'm') { + params->input_type = M_PILEUP; + } else if(*optarg == 's') { + params->input_type = S_PILEUP; + } else if(*optarg == 'b') { + params->input_type = BAM_FILE; + } else { + fprintf(stderr,"ERROR: unknown pileup format %c\n",*optarg); + exit(1); + } + break; + case 't': + if(strcmp("mb",optarg) == 0) + params->filter_type = TYPE_mb; + else if(strcmp("m",optarg) == 0) + params->filter_type = TYPE_m; + else if(strcmp("b",optarg) == 0) + params->filter_type = TYPE_b; + else if(strcmp("M",optarg) == 0) + params->filter_type = TYPE_M; + else if(strcmp("Mb",optarg) == 0) + params->filter_type = TYPE_Mb; + else if(strcmp("MB",optarg) == 0) + params->filter_type = TYPE_MB; + else if(strcmp("SNVMix1",optarg) == 0) + params->filter_type = TYPE_SNVMix1; + else { + fprintf(stderr,"ERROR: filter type '%s' not recognized\n",optarg); + exit(1); + } + break; + case 'q': + params->bQ = atoi(optarg); + if( params->bQ < 0 || params->bQ >= PHRED_MAX ) { + fprintf(stderr,"ERROR: quality threshold value Q%d out of range\n",params->bQ); + exit(1); + } + break; + case 'Q': + params->mQ = atoi(optarg); + if( params->mQ < 0 || params->mQ >= PHRED_MAX ) { + fprintf(stderr,"ERROR: mapping threshold value Q%d out of range\n",params->mQ); + exit(1); + } + break; + case 'h': + usage(argv[0]); + break; + case 'D': params->debug = 1; break; + case 'M': params->trainP.param_file = optarg; break; + case 'S': + params->states = atoi(optarg); + if( params->states < 3 ) { + fprintf(stderr,"ERROR: state minimum is 3\n"); + exit(1); + } + break; + case 'w': + params->window = atoi(optarg); + if(params->window < 1) + params->window = 1; + break; + case 'c': + params->filter_chast = 0; + break; + case 'R': + params->filter_dup = 0; + break; + default: + fprintf(stderr,"Unknown option\n"); + usage(argv[0]); + break; + } + } + /* Decide if we're going to train or classify */ + if(params->filter) { + params->train = 0; + params->classify = 0; + } + if(params->train && params->classify) { + fprintf(stderr,"ERROR: choose either train or classify\n"); + exit(1); + } else if(!params->train && !params->classify && !params->filter) { + fprintf(stderr,"Train/Classify/Filter not selected, picking default: Classify\n"); + params->classify = 1; + } + + /* Set parameters */ + allocateParameters(params); + if( params->train ) setTrainingParameters(params); + if( params->classify ) setClassificationParameters(params); + + /* Open input and output streams */ + if( params->input_type == BAM_FILE ) { + if( params->train ) { + fprintf(stderr, "BAM input for training not yet implemented\n"); + exit(1); + } + if( params->inputfile == NULL || strcmp(params->inputfile, "-") == 0) { + fprintf(stderr, "ERROR: if '-p b' is chosen, input file has to be a bam file\n"); + exit(1); + } + if( params->fastafile == NULL ) { + fprintf(stderr, "ERROR: if '-p b' is chosen, reference fasta file is required (-r <ref.fa>)\n"); + exit(1); + } + } else if( params->inputfile != NULL) { + params->input = fopen(params->inputfile, "r"); + if(params->input == NULL) { + perror("ERROR: could not open input file"); + exit(1); + } + } else { + params->input = stdin; + } + if( params->outputfile != NULL ) { + params->output = fopen(params->outputfile, "w"); + if(params->output == NULL) { + perror("ERROR: could not open output file"); + exit(1); + } + } else { + params->output = stdout; + } +} + +void allocateParameters(param_struct *params) { + + /* Allocate alpha */ + if( (params->trainP.alpha = malloc(params->states * sizeof(long double))) == NULL) { + perror("ERROR: could not allocate space for alpha\n"); exit(1); + } + /* Allocate beta */ + if( (params->trainP.beta = malloc(params->states * sizeof(long double))) == NULL) { + perror("ERROR: could not allocate space for beta\n"); exit(1); + } + /* Allocate delta*/ + if( (params->trainP.delta = malloc(params->states * sizeof(long double))) == NULL) { + perror("ERROR: could not allocate space for delta\n"); exit(1); + } + /* Allocate mu*/ + if( (params->mu = malloc(params->states * sizeof(long double))) == NULL) { + perror("ERROR: could not allocate space for mu\n"); exit(1); + } + /* Allocate pi*/ + if( (params->pi = malloc(params->states * sizeof(long double))) == NULL) { + perror("ERROR: could not allocate space for pi\n"); exit(1); + } +} + +void setTrainingParameters(param_struct *params) { + if( params->trainP.param_file != NULL ) { + readParamFile(params,'t'); + } else { + if(params->states != 3) { + fprintf(stderr, "ERROR: if state space larger than 3 requested, parameters must be provided\n"); + exit(1); + } + fprintf(stderr,"Training parameter file not given, using defaults\n"); + params->trainP.alpha[0] = 1000; + params->trainP.alpha[1] = 5000; + params->trainP.alpha[2] = 1; + params->trainP.beta[0] = 1; + params->trainP.beta[1] = 5000; + params->trainP.beta[2] = 1000; + params->trainP.delta[0] = 10; + params->trainP.delta[1] = 1; + params->trainP.delta[2] = 1; + } +} + +void setClassificationParameters(param_struct *params) { + int k; + if( params->modelfile != NULL ) { + readParamFile(params,'c'); + } else { + if(params->states != 3) { + fprintf(stderr, "ERROR: if state space larger than 3 requested, model file must be provided\n"); + exit(1); + } + params->mu[0] = 0.995905287891696078261816182930; + params->mu[1] = 0.499569401000925172873223800707; + params->mu[2] = 0.001002216846753116409260431219; + params->pi[0] = 0.672519580755555956841362785781; + params->pi[1] = 0.139342327124010650907237618412; + params->pi[2] = 0.188138092120433392251399595807; + fprintf(stderr,"Classification parameter file not given, using for mQ35 bQ10\n"); + } + fprintf(stderr,"Mu and pi values:\n"); + for(k = 0; k < params->states; k++) + fprintf(stderr,"\tmu[%d] = %Lf\tpi[%d] = %Lf\n", k, params->mu[k], k, params->pi[k]); + fprintf(stderr,"\n"); +} + +void readParamFile(param_struct *params, char type) { + FILE *fptrPARAM; + char *line = NULL, *param_name, *param, *param_ptr, *filename; + size_t line_len = 0, read_len = 0; + long double *target; + int i; + + if(type == 't') { + filename=params->trainP.param_file; + } else if(type == 'c') { + filename=params->modelfile; + } + fptrPARAM=fopen(filename, "r"); + if(fptrPARAM == NULL) { + fprintf(stderr, "ERROR: could not open parameter file %s\n", filename); + exit(1); + } + +#if defined __linux__ || defined _GNU_SOURCE + while( (read_len = getline(&line,&line_len,fptrPARAM)) != -1 ) +#endif +#ifdef __APPLE__ + while( (line = fgetln(fptrPARAM,&line_len)) != NULL ) +#endif + { + line[line_len-1] = '\0'; + target = NULL; + param_name = strtok_r(line, " ", ¶m_ptr); + if(type == 't') { + if(strcmp("alpha", param_name) == 0) target = params->trainP.alpha; + else if(strcmp("beta", param_name) == 0) target = params->trainP.beta; + else if(strcmp("delta", param_name) == 0) target = params->trainP.delta; + } else if(type == 'c') { + if(strcmp("mu", param_name) == 0) target = params->mu; + else if(strcmp("pi", param_name) == 0) target = params->pi; + + } + if(target == NULL) { fprintf(stderr, "Unknown parameter %s, skipping\n", param_name); continue; } + + for(i = 0; i < params->states; i++) { + param = strtok_r(NULL, " ", ¶m_ptr); + if(param == NULL) { + fprintf(stderr,"ERROR: missing value #%d for %s\n", i, param_name); + exit(1); + } + target[i] = atof(param); +fprintf(stderr, "DEBUG: %s[%d] = %Lf\n", param_name, i, target[i]); + } + } + fclose(fptrPARAM); + free(line); +} + +void initPhred(long double *phredTable, int elem) { + int i; + for(i = 0; i < elem; i++) { + phredTable[i] = 1-powl(10,(-(long double)i/10)); + } + phredTable[i] = 1; +} + + +void usage(char *selfname) { + param_struct default_params; + resetParams(&default_params); + fprintf(stderr,"Syntax:\n\t%s\t-m <modelfile> [-i <infile>] [-o <outfile>] [-T | -C | -F] [-p < q | m | s >] [-t < mb | m | b | M | Mb | MB | SNVMix1>] [-q <#>] [-Q <#>] [-M <trainP file>] [-h]\n",selfname); + fprintf(stderr,"\tRequired:\n"); + fprintf(stderr,"\t-m <modelfile>\tmodel file, a line for mu and a line for pi, three\n"); + fprintf(stderr,"\t\t\tspace-separated values each, like:\n"); + fprintf(stderr,"\t\t\tmu 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n"); + fprintf(stderr,"\t\t\tpi 0.xxxxxxxx 0.xxxxxxxx 0.xxxxxxxx\n"); + fprintf(stderr,"\tOptional:\n"); + fprintf(stderr,"\t-i <infile>\tquality pileup, from pileup2matlab.pl script (def: STDIN)\n"); + fprintf(stderr,"\t-o <outfile>\twhere to put the results (def: STDOUT)\n"); + fprintf(stderr,"\t-T | -C | -F\tTrain, Classify or Filter (def: Classify)\n"); + fprintf(stderr,"\t-p q|m|s\tInput pileup format (def: s)\n\t\t\tq = quality\n\t\t\tm = MAQ\n\t\t\ts = SAMtools\n"); + fprintf(stderr,"\t-t mb|m|b|M|Mb|MB|SNVMix1\n\t\t\tFilter type (def: mb)\n"); + fprintf(stderr,"\t\t\tmb\tLowest between map and base quality\n"); + fprintf(stderr,"\t\t\tm\tFilter on map, and use as surrogate for base quality\n"); + fprintf(stderr,"\t\t\tb\tFilter on base quality, take map quality as 1\n"); + fprintf(stderr,"\t\t\tM\tFilter on map quality but use only base quality\n"); + fprintf(stderr,"\t\t\tMb\tFilter on map quality and use both map and base qualities\n"); + fprintf(stderr,"\t\t\tMB\tFilter on map quality AND base quality\n"); + fprintf(stderr,"\t\t\tSNVMix1\tFilter on base quality and map quality, afterwards consider them perfect\n"); + fprintf(stderr,"\t-q #\t\tCutoff Phred value for Base Quality, anything <= this value is ignored (def: Q%d)\n",default_params.bQ); + fprintf(stderr,"\t-Q #\t\tCutoff Phred value for Map Quality, anything <= this value is ignored (def: Q%d)\n",default_params.mQ); + fprintf(stderr,"\n\tTraining parameters:\n"); + fprintf(stderr,"\t-M <file>\tProvide a file containing training parameters\n"); + fprintf(stderr,"\t\t\tValues are space-separated:\n"); + fprintf(stderr,"\t\t\talpha # # #\n"); + fprintf(stderr,"\t\t\tbeta # # #\n"); + fprintf(stderr,"\t\t\tdelta # # #\n"); + fprintf(stderr,"\n\t-h\t\tthis message\n"); + fprintf(stderr,"\nImplementation: Rodrigo Goya, 2009. Version %s\n",VERSION); + exit(0); +} + +// Based on classify pileup, but reads the entire file to memory for training purposes. +void snvmixGetTrainSet_pileup(param_struct *params) { +//MAQ: +// 1 554484 C 1752 @,,.,.,, @xxx @xxx xxxxx +//SAM: +// 1 554484 C 1752 ,,.,.,, xxx xxxx + + FILE *fptrIN; + + char *line = NULL; + size_t line_len = 0, prev_len = 0; + int read_len = 0; + int col_num = 0; + long int line_num = 0; + + char *col, *qual; + char *col_sep = "\t", *col_str, *col_ptr; + char *qual_sep = ",", *qual_str, *qual_ptr; + char *chr, *pos, *ref, nref, call; + + char *bQ, *mQ; + int *calls, *tmpQ, pos_int; + int depth = 0, qual_num, maxp; + int depth_allocated = 0; + + int trainset = 0; + int trainset_allocated = 0; + + int nref_count = 0, ref_count = 0; + int i, call_i; + + fptrIN = params->input; + + if( (params->trainP.bQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) { + perror("ERROR allocating initial space for train.bQ"); exit(1); + } + if( (params->trainP.mQ = malloc(START_QNUM * sizeof (unsigned char **))) == NULL) { + perror("ERROR allocating initial space for train.mQ"); exit(1); + } + if( (params->trainP.calls = malloc(START_QNUM * sizeof (signed char **))) == NULL) { + perror("ERROR allocating initial space for train.calls"); exit(1); + } + if( (params->trainP.depth = malloc(START_QNUM * sizeof (int *))) == NULL) { + perror("ERROR allocating initial space for train.depth"); exit(1); + } + trainset_allocated = START_QNUM; + + if( (calls = malloc(START_QNUM * sizeof (int))) == NULL) { + perror("ERROR allocating initial space for train.depth"); exit(1); + } + depth_allocated = START_QNUM; +#if defined __linux__ || defined _GNU_SOURCE + while( (read_len = getline(&line,&line_len,fptrIN)) != -1 ) +#endif +#ifdef __APPLE__ + while( (line = fgetln(fptrIN,&line_len)) != NULL ) +#endif + { + line[line_len-1] = '\0'; + depth = 0; + nref = 0; + col_ptr = NULL; + for(col_num = 0, col_str = line; ; col_num++, col_str = NULL) { + col = strtok_r(col_str, "\t", &col_ptr); + if(col == NULL) { + break; + } + if(col_num == 0) { + chr = col; + } else if(col_num == 1) { + pos = col; + pos_int = strtol(pos, NULL, 10); + } else if(col_num == 2) { + ref = col; + } else if(col_num == 3) { + depth = atoi(col); + if(depth > depth_allocated) { + if( (calls = realloc(calls, sizeof (int) * (depth + START_QNUM))) == NULL) { + perror("ERROR allocating space for calls"); exit(1); + } + depth_allocated = depth + START_QNUM; + } + } else if(col_num == 4) { + if(params->input_type == M_PILEUP) + col = col+sizeof(char); + snvmixGetCallString(col, calls, depth, &nref); + } else if(col_num == 5) { + bQ = col; + // If it's a MAQ pileup, we need to skip the @ sign + if(params->input_type == M_PILEUP) + bQ = bQ + sizeof(char); + for(call_i = 0; call_i < depth; call_i++) + bQ[call_i] = bQ[call_i]-33; + } else if(col_num == 6) { + mQ = col; + // If it's a MAQ pileup, we need to skip the @ sign + if(params->input_type == M_PILEUP) + mQ = mQ + sizeof(char); + for(call_i = 0; call_i < depth; call_i++) + mQ[call_i] = mQ[call_i]-33; + } + } + if(line_num >= trainset_allocated) { + if( ( params->trainP.bQ = realloc(params->trainP.bQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) { + perror("ERROR reallocating space for trainP.bQ"); exit(1); + } + if( ( params->trainP.mQ = realloc(params->trainP.mQ, (line_num + START_QNUM) * sizeof (unsigned char *)) ) == NULL ) { + perror("ERROR reallocating space for trainP.mQ"); exit(1); + } + if( ( params->trainP.calls = realloc(params->trainP.calls, (line_num + START_QNUM) * sizeof (signed char *)) ) == NULL) { + perror("ERROR reallocating space for trainP.calls"); exit(1); + } + if( ( params->trainP.depth = realloc(params->trainP.depth, (line_num + START_QNUM) * sizeof (int)) ) == NULL) { + perror("ERROR reallocating space for trainP.depth"); exit(1); + } + trainset_allocated = line_num + START_QNUM; + } + + nref = snvmixFilterCalls(calls,depth,bQ,mQ,params); + nref_count = 0; + ref_count = 0; + for(qual_num = 0; qual_num < depth; qual_num++) { + if(calls[qual_num] == -1) + continue; + if(calls[qual_num] == 1) + ref_count++; + if(calls[qual_num] == nref) + nref_count++; + } + params->trainP.depth[line_num] = ref_count + nref_count; + + if( (params->trainP.bQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) { + perror("ERROR allocating space for trainP.bQ"); exit(1); + } + if( (params->trainP.mQ[line_num] = malloc(sizeof(unsigned char) * params->trainP.depth[line_num])) == NULL ) { + perror("ERROR allocating space for trainP.mQ"); exit(1); + } + if( (params->trainP.calls[line_num] = malloc(sizeof(signed char) * params->trainP.depth[line_num])) == NULL ) { + perror("ERROR allocating space for trainP.calls"); exit(1); + } + + call_i = 0; + for(qual_num = 0; qual_num < depth; qual_num++) { + if(calls[qual_num] == -1) + continue; + + params->trainP.calls[line_num][call_i] = (signed char) calls[qual_num]; + params->trainP.bQ[line_num][call_i] = (unsigned char) bQ[qual_num]; + params->trainP.mQ[line_num][call_i] = (unsigned char) mQ[qual_num]; + call_i++; + } + if( params->trainP.depth[line_num] != call_i ) { + fprintf(stderr, "ERROR: mismatch between trainP.depth and call_i\n"); exit(1); + } + line_num++; + } + params->trainP.len = line_num; + params->trainP.len = line_num; + fclose(fptrIN); + free(line); free(calls); +} + +// Use EM algorithm on a memory-located data set to train the parameters +void snvmixTrain_pileup(param_struct *params) { + int line_num = 0, call_i = 0, k = 0, states; + int iter = 0; + FILE *fptrOUT; + + long double phred[PHRED_MAX + 1]; + long double bsum = 0, msum = 0, *b, *mu, *notmu, *pi, *log_pi, z = 0; + long double **pG, **xr; + long double *xrhat, *sum_pG; + long double Y, not_Y, M; + int N_sum; + int strength; + + long double ll, prev_ll = 0; + //initPhred(phred, PHRED_MAX+1); + initPhred(phred, PHRED_MAX); + + states = params->states; + + // Initialize local values of parameters + mu = params->mu; + pi = params->pi; + + if( ( b = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate b\n"); exit(1); } + if( ( notmu = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate notmu\n"); exit(1); } + if( ( log_pi = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate log_pi\n"); exit(1); } + if( ( xrhat = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate xrhat\n"); exit(1); } + if( ( sum_pG = malloc(states * sizeof(long double)) ) == NULL) { perror("[snvmixClassify_pileup] Could not allocate sum_pG\n"); exit(1); } + + snvmixGetTrainSet_pileup(params); + + if(params->debug) { + for(line_num = 0; line_num < params->trainP.len; line_num++) { + fprintf(stderr, "line_num: %d", line_num); + for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) { + fprintf(stderr, "\t(%d,bQ%d,mQ%d)", params->trainP.calls[line_num][call_i], params->trainP.bQ[line_num][call_i], params->trainP.mQ[line_num][call_i]); + } + fprintf(stderr, "\n\n"); + } + } + // Initialize mu and pi + + fprintf(stderr, "Initializing mu\n"); + for(k = 0; k < states; k++) { + params->mu[k] = (long double) params->trainP.alpha[k] / (params->trainP.alpha[k] + params->trainP.beta[k]); + fprintf(stderr,"\talpha[%d] = %Lf,\tbeta[%d] = %Lf,\tmu[%d] = %Lf\n", k, params->trainP.alpha[k], k, params->trainP.beta[k], k, params->mu[k]); + } + + fprintf(stderr, "Initializing pi\n"); + z = (long double) params->trainP.delta[0] + params->trainP.delta[1] + params->trainP.delta[2]; + if(!z) { z = 1; } + for(k = 0; k < states; k++) { + params->pi[k] = (long double) params->trainP.delta[k] / z; + fprintf(stderr,"\tdelta[%d] = %Lf,\tpi[%d] = %Lf\n", k, params->trainP.delta[k], k, params->pi[k]); + } + + strength = params->trainP.len; + + // Initialize matrices + if( (pG = malloc(sizeof(long double *) * params->trainP.len)) == NULL) { + perror("ERROR allocating initial space for pG"); exit(1); + } + if( (xr = malloc(sizeof(long double *) * params->trainP.len)) == NULL) { + perror("ERROR allocating initial space for xr"); exit(1); + } + for(line_num = 0; line_num < params->trainP.len; line_num++) { + // [states] cells for each line_num + if( (pG[line_num] = malloc(sizeof(long double) * states)) == NULL) { + perror("ERROR allocating space for pG"); exit(1); + } + if( (xr[line_num] = malloc(sizeof(long double) * states)) == NULL) { + perror("ERROR allocating space for xr"); exit(1); + } + } + + for(iter = 0; iter < params->trainP.max_iter; iter++) { + // Reset values for this iteration + for(k = 0; k < states; k++) { + notmu[k] = 1 - mu[k]; + log_pi[k] = logl(pi[k]); + + sum_pG[k] = 0; + xrhat[k] = 0; + + } + N_sum = 0; + ll = 0; + + // E step + for(line_num = 0; line_num < params->trainP.len; line_num++) { + if(params->trainP.depth == 0) + continue; + for(k = 0; k < states; k++) { + xr[line_num][k] = 0; + b[k] = 0; + } + + for(call_i = 0; call_i < params->trainP.depth[line_num]; call_i++) { + if(params->trainP.calls[line_num][call_i] == 1) { + Y = phred[params->trainP.bQ[line_num][call_i]]; + not_Y = 1-phred[params->trainP.bQ[line_num][call_i]]; + } else { + Y = (1-phred[params->trainP.bQ[line_num][call_i]])/3; + not_Y = phred[params->trainP.bQ[line_num][call_i]] + 2*(1-phred[params->trainP.bQ[line_num][call_i]])/3; + } + M = phred[params->trainP.mQ[line_num][call_i]]; + for(k = 0; k < states; k++) { + b[k] = b[k] + logl( (1 - M) * 0.5 + M * (Y * mu[k] + not_Y * notmu[k]) ); + xr[line_num][k] = xr[line_num][k] + Y; + } + } + z = 0; + for(k = 0; k < states; k++) { + b[k] = expl(b[k] + log_pi[k]); + z = z + b[k]; + } + if(!z) { z=1; } + for(k = 0; k < states; k++) { + pG[line_num][k] = b[k] / z; + } + + ll = ll + logl(z); + + N_sum = N_sum + params->trainP.depth[line_num]; + + for(k = 0; k < states; k++) { + sum_pG[k] = sum_pG[k] + pG[line_num][k]; + xrhat[k] = xrhat[k] + xr[line_num][k] * pG[line_num][k]; + } + } + + // Check log likelihood + if(iter == 0) + prev_ll = ll; + else if(ll <= prev_ll) + break; + prev_ll = ll; + + // M step + z = 0; + for(k = 0; k < states; k++) { + z = z + sum_pG[k] + params->trainP.delta[k]; + } + if(!z) { z=1; } + for(k = 0; k < states; k++) { + // Update pi + params->pi[k] = (sum_pG[k] + params->trainP.delta[k]) / z; + // Update mu + params->mu[k] = (xrhat[k] + params->trainP.alpha[k]*strength-1) / (N_sum + params->trainP.alpha[k]*strength + params->trainP.beta[k]*strength-2); + } + } + + if(params->modelfile == NULL) { + fptrOUT = stdout; + } else { + fptrOUT = fopen(params->modelfile, "w"); + if(fptrOUT == NULL) { + perror("ERROR: could not open modelfile for writing, using STDOUT"); + fptrOUT = stdout; + } + } + fprintf(fptrOUT,"mu"); + for(k = 0; k < states; k++) { + fprintf(fptrOUT, " %0.30Lf", params->mu[k]); + } + fprintf(fptrOUT,"\n"); + fprintf(fptrOUT,"pi"); + for(k = 0; k < states; k++) { + fprintf(fptrOUT, " %0.30Lf", params->pi[k]); + } + fprintf(fptrOUT,"\n"); + + /* free unused memory */ + free(b); free(notmu); free(log_pi); + free(xrhat); free(sum_pG); + for(line_num = 0; line_num < params->trainP.len; line_num++) { + // free [states] cells for each line_num + free(pG[line_num]); + free(xr[line_num]); + } + free(pG); free(xr); +} +