view SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/SNVMix2.h @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children
line wrap: on
line source

/* The MIT License

   Copyright (c) 2009, by Sohrab Shah <sshah@bccrc.ca> and Rodrigo Goya <rgoya@bcgsc.ca>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <stdbool.h>
#include "sam.h"
#include "faidx.h"
#include "khash.h"

#define TYPE_mb 0
#define TYPE_m 1
#define TYPE_b 2
#define TYPE_M 3
#define TYPE_Mb 4
#define TYPE_MB 5
#define TYPE_SNVMix1 6

#define Q_PILEUP 0
#define M_PILEUP 1
#define S_PILEUP 2
#define BAM_FILE 3

#define PHRED_MAX 200

#define N 0
#define A 1
#define G 2
#define C 3
#define T 4

char base_code[] = {'N','A','G','C','T'};

struct params_train {
	long double *alpha;
	long double *beta;
	long double *delta;
	char *param_file;
	int max_iter;
	unsigned char **bQ;
	unsigned char **mQ;
	signed char **calls;
	char *ref;
	int *pos;
	int *depth;
	int len;
};

typedef struct {
	FILE *input;
	FILE *output;
	char *inputfile;
	char *outputfile;
	char *modelfile;
	char *bamfile;
	char *fastafile;
	char *listfile;
	int filter_type;
	int filter_chast;
	int filter_dup;
	int train;
	int classify;
	int filter;
	int full;
	int input_type; // 0 = processed, 1 = maq pileup, 2 = sam pileup
	long double *mu;
	long double *pi;
	int max_iter;
	int bQ;
	int mQ;
	int debug;
	//struct {
	//	int alpha[3];
	//	int beta[3];
	//} train;
	struct params_train trainP;
	int states;
	int window;
} param_struct;

typedef int *indel_list_t;
KHASH_MAP_INIT_INT64(64, indel_list_t)

typedef struct{
	bool in_use;
	uint32_t tid; // get with snvmix_pl->in->header->target_name[tid],
	uint32_t pos;
	char ref;
	char nref;
	int ref_count;
	int nref_count;
	long double *p;
	int maxP;
	// 0 = REF    ,     1 = NREF
	int forward[2];
	int reverse[2];
	int good_pair[2];
	int bad_pair[2];
	int c_clean[2];
	int c_ins[2];
	int c_del[2];
	int c_junc[2];
	int nref_edges;    // FIXME: fixed 5 bp edges
	int nref_center;    //
	int indel_pos;     // How many reads have a deletion at that site
	int indel_near;    // How many reasd have a deletion overall
	int aln_unique_pos;
	int full_depth;    // Full depth at this position
	int raw_cvg[5];
	int thr_cvg[5];
} snvmix_call_t;

typedef struct {
	param_struct *params;
	int begin, end;
	samfile_t *in;
	faidx_t *fai;
	int tid, len;
	char *ref;
	/* might want to move this elsewhere */
	khash_t(64) *hash;
	long double *notmu, *log_pi, *b;
	long double phred[PHRED_MAX + 1];
	int *calls, depth_allocated;
	snvmix_call_t *buffer;
	int n_buffer, b_start, b_end;
	// Extra flags
} snvmix_pl_t;


void updatePhred(long double *phredTable);
void initPhred(long double *phredTable, int elem);

void resetParams(param_struct *params);
void initSNVMix(int argc , char **argv, param_struct *params);
void usage(char *selfname);

void allocateParameters(param_struct *params);
void setTrainingParameters(param_struct *params);
void setClassificationParameters(param_struct *params);
void readParamFile(param_struct *params, char type);


void snvmixClassify_qualities(param_struct *params);
void snvmixClassify_pileup(param_struct *params);

int snvmixClassify_bamfile(param_struct *params);
static int snvmixClassify_pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);

inline void snvmixGetCallString(char *col, int *calls, int depth, char *nref);

inline int snvmixFilterCalls(int *calls, int depth, char *bQ, char *mQ, param_struct *params);
inline int snvmixSkipCall(int *calls, int qual_num, param_struct *params, char *bQ, char *mQ);
inline int snvmixSkipCall_alt(param_struct *params, int call, char bQ, char mQ);

void snvmixTrain_qualities(param_struct *params);
void snvmixGetTrainSet_pileup(param_struct *params);
void snvmixTrain_pileup(param_struct *params);

long double normalise(long double *values, int len);

char **__bam_get_lines(const char *fn, int *_n);
static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
{
	char **list;
	int i, j, n, *fields, max_fields;
	khash_t(64) *hash;
	bam_init_header_hash(h);
	list = __bam_get_lines(fn, &n);
	fprintf(stderr,"got %d lines\n", n);
	hash = kh_init(64);
	max_fields = 0; fields = 0;
	for (i = 0; i < n; ++i) {
		char *str = list[i];
		int chr, n_fields, ret;
		khint_t k;
		uint64_t x;
		n_fields = ksplit_core(str, 0, &max_fields, &fields);
		if (n_fields < 2) continue;
		chr = bam_get_tid(h, str + fields[0]);
		if (chr < 0) {
			fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]);
			continue;
		}
		x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1);
		k = kh_put(64, hash, x, &ret);
		if (ret == 0) {
			fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]);
			continue;
		}
		kh_val(hash, k) = 0;
		if (n_fields > 2) {
			// count
			for (j = 2; j < n_fields; ++j) {
				char *s = str + fields[j];
				if ((*s != '+' && *s != '-') || !isdigit(s[1])) break;
 			}
			if (j > 2) { // update kh_val()
				int *q, y, z;
				q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));
				q[0] = j - 2; z = j; y = 1;
				for (j = 2; j < z; ++j)
					q[y++] = atoi(str + fields[j]);
			}
		}
		free(str);
	}
	free(list); free(fields);
	return hash;
}