Mercurial > repos > calkan > mrcanavar
changeset 0:86522a0b5f59 default tip
Uploaded source code for mrCaNaVaR
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:44:13 -0500 |
parents | |
children | |
files | mrcanavar-0.34/BUGS mrcanavar-0.34/Changelog mrcanavar-0.34/Makefile mrcanavar-0.34/TODO mrcanavar-0.34/callcnv.c mrcanavar-0.34/callcnv.h mrcanavar-0.34/gcnorm.c mrcanavar-0.34/gcnorm.h mrcanavar-0.34/globals.h mrcanavar-0.34/mrcanavar.c mrcanavar-0.34/mrcanavar.h mrcanavar-0.34/prep.c mrcanavar-0.34/prep.h mrcanavar-0.34/sam.c mrcanavar-0.34/sam.h mrcanavar-0.34/utils.c mrcanavar-0.34/utils.h |
diffstat | 16 files changed, 2857 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/Changelog Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,53 @@ +December 7, 2011 + version 0.34: + * Segmentation fault with some kernels & gcc due to an unallocated string is fixed. + +October 13, 2011 + version 0.33: + * Compilation errors in MacOSX are fixed. + * Compilation error in x86_64 systems with gcc version > 4.33 is fixed. + +July 22, 2011 + version 0.32: + * Bug in parsing chromosome names that end with whitespace is fixed. + +June 22, 2011 + version 0.31: + * Bug causing infinite loop in control region selection is fixed. + +April 16, 2011 + version 0.3: + * Bug in GC normalization fixed + * Bug in CN estimation in chrX and chrY for male mammals is fixed. + +March 3, 2011 + GC counting bug in LW when an assembly gap is 1 character is fixed. (version 0.22) + +March 2, 2011 + Bug causing segmentation fault when CW_SIZE is set to a small value is fixed. (version 0.21) + +February 25, 2011 + A bug in GC normalization is fixed. (version 0.2) + +January 23, 2011 + SW normalization implemented (copy paste from LW ;-) ) + +December 27, 2010 + LW normalization implemented + +December 21, 2010 + a bug in handling file names is fixed + +December 14, 2010 + fixes to control selection: normalization is done AFTER controls are selected + Multiplicative GC scaling option (--multgc) + +November 18, 2010 + added removing low coverage windows from controls + +November 14, 2010 + version 0.1-rc + added iteration until min >= mean - 3std ; and 2std for chrX + +November 11, 2010 + version 0.1-alpha
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/Makefile Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,21 @@ +CC=gcc +CFLAGS = -c -O2 -g +LDFLAGS = -lz -lm +SOURCES = mrcanavar.c utils.c prep.c sam.c callcnv.c gcnorm.c +OBJECTS = $(SOURCES:.c=.o) +EXECUTABLE = mrcanavar + +all: $(SOURCES) $(EXECUTABLE) + rm -rf *.o + +$(EXECUTABLE): $(OBJECTS) + $(CC) $(OBJECTS) -o $@ $(LDFLAGS) + +.c.o: + $(CC) $(CFLAGS) $< -o $@ + +clean: + rm -f $(EXECUTABLE) *.o *~ + +install: + cp mrcanavar ~/bin
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/TODO Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,5 @@ +* Pseudoautosomal region support +* Coverage stats; number of reads, number of bp; divide by genome length - gaps +* CNV intervals for duplications and deletions. +* Read SAM files separately into DEPTH files, and merge DEPTH files. For eaay MPI implementation. +* Multiple library support.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/callcnv.c Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,286 @@ +#include "callcnv.h" + +void call_cnv(char *depthFile, char *out_prefix){ + + int i, j; + + float *gclookup; + float *gclookup_x; + + char *fname; + char logname[2 * MAX_STR]; + FILE *log; + + if (GENOME_CONF == NULL) + print_error("Select genome configuration file (input) through the -conf parameter.\n"); + if (depthFile == NULL) + print_error("Select read depth output file through the -depth parameter.\n"); + if (out_prefix == NULL) + print_error("Select output file prefix through the -o parameter.\n"); + + + + loadRefConfig(GENOME_CONF); + + readDepth(depthFile); + + + sprintf(logname, "%s.log", out_prefix); + log = my_fopen(logname, "w", 0); + + /* trivial control removal */ + + + fprintf(stdout, "Control region cleanup..."); + fflush(stdout); + + /* add stdev calculation here */ + + + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->lw_cnt; j++){ + if (chromosomes[i]->lw[j].depth > (float) LW_MEAN * 2.0 || chromosomes[i]->lw[j].depth < (float) LW_MEAN / 10.0) + chromosomes[i]->lw[j].isControl = 0; + if (strstr(chromosomes[i]->name, "random") || strstr(chromosomes[i]->name, "chrY") || strstr(chromosomes[i]->name, "hap") || strstr(chromosomes[i]->name, "chrUn")) + chromosomes[i]->lw[j].isControl = 0; + } + for (j=0; j<chromosomes[i]->sw_cnt; j++){ + if (chromosomes[i]->sw[j].depth > (float) SW_MEAN * 2.0 || chromosomes[i]->sw[j].depth < (float) SW_MEAN / 10.0) + chromosomes[i]->sw[j].isControl = 0; + if (strstr(chromosomes[i]->name, "random") || strstr(chromosomes[i]->name, "chrY") || strstr(chromosomes[i]->name, "hap") || strstr(chromosomes[i]->name, "chrUn")) + chromosomes[i]->sw[j].isControl = 0; + } + + for (j=0; j<chromosomes[i]->cw_cnt; j++){ + if (chromosomes[i]->cw[j].depth > (float) CW_MEAN * 2.0 || chromosomes[i]->cw[j].depth < (float) CW_MEAN / 10.0) + chromosomes[i]->cw[j].isControl = 0; + if (strstr(chromosomes[i]->name, "random") || strstr(chromosomes[i]->name, "chrY") || strstr(chromosomes[i]->name, "hap") || strstr(chromosomes[i]->name, "chrUn")) + chromosomes[i]->cw[j].isControl = 0; + } + } + + + + fprintf(stdout, "\n"); + + gclookup = (float *) malloc(sizeof(float) * GC_BIN); + gclookup_x = (float *) malloc(sizeof(float) * GC_BIN); + + fprintf (log, "\nmrCaNaVaR version %s\nLast update: %s\n\n", VERSION, LAST_UPDATE); + fprintf (log, "Calculating library %s\n", out_prefix); + fprintf (log, "GC correction mode: %s\n", MULTGC == 1 ? "MULTIPLICATIVE" : "ADDITIVE"); + + norm_until_converges(CW, gclookup, gclookup_x); + + fprintf (log, "\nAfter GC Correction:\n--------------------\n"); + fprintf (log, "Sample Gender: %s.\n", GENDER == MALE ? "Male" : "Female"); + fprintf (log, "CW Average Read Depth: %f, Standard Deviation: %f\n", CW_MEAN, CW_STD); + + + if (GENDER == MALE) + fprintf (log, "CW Average chrX Read Depth: %f, Standard Deviation: %f\n", CW_MEAN_X, CW_STD_X); + + norm_until_converges(LW, gclookup, gclookup_x); + fprintf (log, "LW Average Read Depth: %f, Standard Deviation: %f\n", LW_MEAN, LW_STD); + if (GENDER == MALE) + fprintf (log, "LW Average chrX Read Depth: %f, Standard Deviation: %f\n", LW_MEAN_X, LW_STD_X); + + norm_until_converges(SW, gclookup, gclookup_x); + fprintf (log, "SW Average Read Depth: %f, Standard Deviation: %f\n", SW_MEAN, SW_STD); + if (GENDER == MALE) + fprintf (log, "SW Average chrX Read Depth: %f, Standard Deviation: %f\n", SW_MEAN_X, SW_STD_X); + + + fprintf (stdout, "Writing normalized CW depth to: %s.cw_norm.bed.\n", out_prefix); + + fname = (char *) malloc(sizeof (char) * (strlen(out_prefix) + strlen(".cw_norm.bed") + 1)); + + sprintf (fname, "%s.cw_norm.bed", out_prefix); + dump_text_windows(fname, CW); + + fprintf (stdout, "Writing normalized LW depth to: %s.lw_norm.bed.\n", out_prefix); + + sprintf (fname, "%s.lw_norm.bed", out_prefix); + dump_text_windows(fname, LW); + + fprintf (stdout, "Writing normalized SW depth to: %s.sw_norm.bed.\n", out_prefix); + + sprintf (fname, "%s.sw_norm.bed", out_prefix); + dump_text_windows(fname, SW); + + sprintf (fname, "%s.copynumber.bed", out_prefix); + fprintf (stdout, "Writing copy numbers to: %s.copynumber.bed. \n", out_prefix); + print_copy_numbers(fname); + + free(fname); + + free(gclookup); + free(gclookup_x); + fclose(log); + +} + + + +void readDepth(char *depthFile){ + + FILE *binDepth; + int i, j; + int retVal; + + float lw_total; + float sw_total; + float cw_total; + + int lw_cnt; + int sw_cnt; + int cw_cnt; + + int isMagicNum; + + + binDepth = my_fopen(depthFile, "r", 0); + + retVal = fread(&isMagicNum, sizeof(isMagicNum), 1, binDepth); + + if (isMagicNum != magicNum) + print_error("Read depth file seems to be invalid or corrupt.\n"); + + + lw_total = 0.0; + sw_total = 0.0; + cw_total = 0.0; + + lw_cnt = 0; + sw_cnt = 0; + cw_cnt = 0; + + /* read LW */ + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->lw_cnt; j++){ + retVal = fread(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth); + lw_total += chromosomes[i]->lw[j].depth; + chromosomes[i]->lw[j].isControl = 1; + lw_cnt++; + } + } + + /* read SW */ + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->sw_cnt; j++){ + retVal = fread(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->sw[j].depth), 1, binDepth); + sw_total += chromosomes[i]->sw[j].depth; + chromosomes[i]->sw[j].isControl = 1; + sw_cnt++; + } + } + + /* read CW */ + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->cw_cnt; j++){ + retVal = fread(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth); + cw_total += chromosomes[i]->cw[j].depth; + chromosomes[i]->cw[j].isControl = 1; + cw_cnt++; + } + } + + LW_MEAN = lw_total / lw_cnt; + SW_MEAN = sw_total / sw_cnt; + CW_MEAN = cw_total / cw_cnt; + + fprintf(stdout, "[OK] depth file %s is loaded.\n", depthFile); + if (VERBOSE) + fprintf(stdout, "LW_MEAN: %f\tSW_MEAN: %f\tCW_MEAN:%f\n", LW_MEAN, SW_MEAN, CW_MEAN); + + fclose(binDepth); +} + + +void dump_text_windows(char *fname, enum WINDOWTYPE wt){ + + FILE *txtDepth; + int i, j; + + txtDepth = my_fopen(fname, "w", 0); + + fprintf(txtDepth, "#%s\t%s\t%s\t%s\t%s\t%s\n\n", "CHROM", "START", "END", "GC\%", "READ_DEPTH", "IS_CONTROL"); + + switch (wt){ + case LW: + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->lw_cnt; j++) + if (chromosomes[i]->lw[j].isControl == 1) + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%f\tY\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, chromosomes[i]->lw[j].depth); + else + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%f\tN\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, chromosomes[i]->lw[j].depth); + } + break; + + case SW: + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->sw_cnt; j++) + if (chromosomes[i]->sw[j].isControl == 1) + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%f\tY\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end, chromosomes[i]->sw[j].gc, chromosomes[i]->sw[j].depth); + else + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%f\tN\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end, chromosomes[i]->sw[j].gc, chromosomes[i]->sw[j].depth); + } + break; + + case CW: + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->cw_cnt; j++) + if (chromosomes[i]->cw[j].isControl == 1) + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%f\tY\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, chromosomes[i]->cw[j].depth); + else + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%f\tN\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, chromosomes[i]->cw[j].depth); + } + break; + } + + fclose(txtDepth); + +} + + + +void print_copy_numbers(char *fname){ + /* UNFINISHED */ + + FILE *txtDepth; + int i, j; + float copy_num; + + txtDepth = my_fopen(fname, "w", 0); + + fprintf(txtDepth, "#%s\t%s\t%s\t%s\t%s\n\n", "CHROM", "START", "END", "GC\%", "COPYNUMBER"); + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->cw_cnt; j++){ + if (GENDER == FEMALE) + copy_num = (chromosomes[i]->cw[j].depth / CW_MEAN) * 2; + else{ + if (!strstr(chromosomes[i]->name, "chrX") && !strstr(chromosomes[i]->name, "chrY")) + copy_num = (chromosomes[i]->cw[j].depth / CW_MEAN) * 2; + else + copy_num = chromosomes[i]->cw[j].depth / CW_MEAN_X; + } + + if (GENDER == FEMALE && (strstr(chromosomes[i]->name, "chrY"))) + continue; + + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%f\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, copy_num); + } + } + + + fclose(txtDepth); + + +} + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/callcnv.h Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,14 @@ +#ifndef __CALLCNV +#define __CALLCNV + +#include <stdio.h> +#include "globals.h" +#include "utils.h" +#include "gcnorm.h" + +void call_cnv(char *, char *); +void readDepth(char *); +void print_copy_numbers(char *); +void dump_text_windows(char *, enum WINDOWTYPE); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/gcnorm.c Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,901 @@ +#include "gcnorm.h" + +int norm_until_converges (enum WINDOWTYPE wt, float *gclookup, float *gclookup_x){ + float max; + float max_x; + float min; + float min_x; + + float p_max; + float p_max_x; + float p_min; + float p_min_x; + + int i, j; + float mean; + float mean_x; + float stdev; + float stdev_x; + int iter; + + float maxcut; + float maxcut_x; + float mincut; + float mincut_x; + + iter = 1; + calc_stat(wt, gclookup, gclookup_x, 0, &max, &max_x, &min, &min_x); + + switch(wt){ + case LW: + mean = LW_MEAN; + mean_x = LW_MEAN_X; + break; + case SW: + mean = SW_MEAN; + mean_x = SW_MEAN_X; + break; + case CW: + mean = CW_MEAN; + mean_x = CW_MEAN_X; + break; + } + + + if (GENDER == AUTODETECT){ + if (VERBOSE) + fprintf(stdout, "MEAN: %f\tMEAN_X: %f\n", mean, mean_x); + + + if (mean_x / mean < 0.75){ // magical ratio + GENDER = MALE; + + if (VERBOSE) + fprintf(stdout, "Autodetect Gender: Male\n"); + + i = findChrom("chrX", "", -1); + + + switch(wt){ + + case CW: + for (j=0; j<chromosomes[i]->cw_cnt; j++) + if (chromosomes[i]->cw[j].depth > (float) CW_MEAN_X * 2 || chromosomes[i]->cw[j].depth < (float) CW_MEAN_X / 10.0) + chromosomes[i]->cw[j].isControl = 0; + break; + + case LW: + for (j=0; j<chromosomes[i]->lw_cnt; j++) + if (chromosomes[i]->lw[j].depth > (float) LW_MEAN_X * 2 || chromosomes[i]->lw[j].depth < (float) LW_MEAN_X / 10.0) + chromosomes[i]->lw[j].isControl = 0; + break; + + case SW: + for (j=0; j<chromosomes[i]->sw_cnt; j++) + if (chromosomes[i]->sw[j].depth > (float) SW_MEAN_X * 2 || chromosomes[i]->sw[j].depth < (float) SW_MEAN_X / 10.0) + chromosomes[i]->sw[j].isControl = 0; + break; + + } + + } + else{ + GENDER = FEMALE; + if (VERBOSE) + fprintf(stdout, "Autodetect Gender: Female\n"); + } + } + + normalize(wt, gclookup, gclookup_x, &max, &max_x, &min, &min_x); + + + do{ + + if (VERBOSE){ + fprintf(stdout, "Control regions %s iteration %d ", (wt == LW ? "LW" : (wt == SW ? "SW" : "CW")), iter); + fflush(stdout); + } + + + + p_min = min; + p_max = max; + p_min_x = min_x; + p_max_x = max_x; + + calc_stat(wt, gclookup, gclookup_x, 1, &max, &max_x, &min, &min_x); + + + switch(wt){ + case LW: + mean = LW_MEAN; + stdev = LW_STD; + mean_x = LW_MEAN_X; + stdev_x = LW_STD_X; + break; + case SW: + mean = SW_MEAN; + stdev = SW_STD; + mean_x = SW_MEAN_X; + stdev_x = SW_STD_X; + break; + case CW: + mean = CW_MEAN; + stdev = CW_STD; + mean_x = CW_MEAN_X; + stdev_x = CW_STD_X; + break; + } + + + if (GENDER == FEMALE){ + max_x = max; + mean_x = mean; + stdev_x = stdev; + min_x = min; + } + + + iter++; + + + maxcut = mean + 2.5 * stdev; + mincut = mean - 2.5 * stdev; + maxcut_x = mean_x + 2.5 * stdev_x; + mincut_x = mean_x - 2.5 * stdev_x; + + if (GENDER != FEMALE){ + maxcut_x = mean_x + 2 * stdev_x; + mincut_x = mean_x - 2 * stdev_x; + } + + if (mincut < 0.0) mincut = mean / 10.0; + if (mincut_x < 0.0) mincut_x = mean_x / 10.0; + + if (maxcut - mean > mean - mincut) + maxcut = mean + (mean - mincut); + + if (GENDER != FEMALE){ + if (maxcut_x - mean_x > mean_x - mincut_x) + maxcut_x = mean_x + (mean_x - mincut_x); + } + + + if (VERBOSE){ + fprintf(stdout, "mean: %f\tstdev: %f\tmax: %f (cut: %f)\tmin: %f (cut: %f) \n", mean, stdev, max, maxcut, min, mincut); + if (GENDER != FEMALE) + fprintf(stdout, "mean_x: %f\tstdev_x: %f\tmax_x: %f (cut: %f)\tmin_x: %f (cut: %f)\n", mean_x, stdev_x, max_x, maxcut_x, min_x, mincut_x); + } + + if (p_min == min && p_max == max && p_min_x == min_x && p_max_x == max_x) + break; + + } while (max >= maxcut || max_x >= maxcut_x || min <= mincut || min_x <= mincut_x); + + + fprintf(stdout, "%s Normalization completed.\n", (wt == LW ? "LW" : (wt == SW ? "SW" : "CW"))); + + return 1; +} + + +void normalize (enum WINDOWTYPE wt, float *gclookup, float *gclookup_x, float *max, float *max_x, float *min, float *min_x){ + int gc_index; + float new_depth; + int i, j; + + float total, total_x; + int win_cnt, win_cnt_x; + + total = 0.0; + win_cnt = 0; + total_x = 0.0; + win_cnt_x = 0; + + + switch(wt){ + case LW: + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->lw_cnt; j++){ + gc_index = chromosomes[i]->lw[j].gc * (float) GC_BIN; + if ((!strstr(chromosomes[i]->name, "chrX") && !strstr(chromosomes[i]->name, "chrY")) || GENDER == FEMALE){ + if (MULTGC) + new_depth = gclookup[i] * chromosomes[i]->lw[j].depth; + else + new_depth = chromosomes[i]->lw[j].depth - (gclookup[gc_index] - LW_MEAN); + } + else if ((strstr(chromosomes[i]->name, "chrX") || strstr(chromosomes[i]->name, "chrY")) && GENDER == MALE){ + if (MULTGC) + new_depth = gclookup_x[i] * chromosomes[i]->lw[j].depth; + else + new_depth = chromosomes[i]->lw[j].depth - (gclookup_x[gc_index] - LW_MEAN_X); + } + + if (new_depth < 0) new_depth = 0; + + chromosomes[i]->lw[j].depth = new_depth; + } + } + break; + + case SW: + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->sw_cnt; j++){ + gc_index = chromosomes[i]->sw[j].gc * (float) GC_BIN; + if ((!strstr(chromosomes[i]->name, "chrX") && !strstr(chromosomes[i]->name, "chrY")) || GENDER == FEMALE){ + if (MULTGC) + new_depth = gclookup[i] * chromosomes[i]->sw[j].depth; + else + new_depth = chromosomes[i]->sw[j].depth - (gclookup[gc_index] - SW_MEAN); + } + else if ((strstr(chromosomes[i]->name, "chrX") || strstr(chromosomes[i]->name, "chrY")) && GENDER == MALE){ + if (MULTGC) + new_depth = gclookup_x[i] * chromosomes[i]->sw[j].depth; + else + new_depth = chromosomes[i]->sw[j].depth - (gclookup_x[gc_index] - SW_MEAN_X); + } + + if (new_depth < 0) new_depth = 0; + chromosomes[i]->sw[j].depth = new_depth; + } + } + break; + + case CW: + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->cw_cnt; j++){ + gc_index = chromosomes[i]->cw[j].gc * (float) GC_BIN; + if ((!strstr(chromosomes[i]->name, "chrX") && !strstr(chromosomes[i]->name, "chrY")) || GENDER == FEMALE){ + if (MULTGC) + new_depth = gclookup[i] * chromosomes[i]->cw[j].depth; + else + new_depth = chromosomes[i]->cw[j].depth - (gclookup[gc_index] - CW_MEAN); + } + else if ((strstr(chromosomes[i]->name, "chrX") || strstr(chromosomes[i]->name, "chrY")) && GENDER == MALE){ + if (MULTGC) + new_depth = gclookup_x[i] * chromosomes[i]->cw[j].depth; + else + new_depth = chromosomes[i]->cw[j].depth - (gclookup_x[gc_index] - CW_MEAN_X); + } + + if (new_depth < 0) new_depth = 0; + + chromosomes[i]->cw[j].depth = new_depth; + } + } + break; + } + + + + calc_stat(wt, gclookup, gclookup_x, 0, max, max_x, min, min_x); + +} + + +void calc_stat(enum WINDOWTYPE wt, float *gclookup, float *gclookup_x, char doClean, float *_max, float *_max_x, float *_min, float *_min_x){ + int i, j, k; + + float lw_var; + float sw_var; + float cw_var; + + float lw_total; + float sw_total; + float cw_total; + + float lw_var_x; + float sw_var_x; + float cw_var_x; + + float lw_total_x; + float sw_total_x; + float cw_total_x; + + int win_cnt; + float max; + + int win_cnt_x; + float max_x; + + float min; + float min_x; + + int gc_index; + + float gc_total[GC_BIN]; // total depth by GC + int gc_wincount[GC_BIN]; // count of windows with the given GC content + + float gc_total_x[GC_BIN]; // total depth by GC + int gc_wincount_x[GC_BIN]; // count of windows with the given GC content + + float MEAN; + float MEAN_X; + + + float maxcut, mincut; + float maxcut_x, mincut_x; + + lw_var = 0.0; + sw_var = 0.0; + cw_var = 0.0; + + lw_total = 0.0; + sw_total = 0.0; + cw_total = 0.0; + + win_cnt = 0; + + lw_var_x = 0.0; + sw_var_x = 0.0; + cw_var_x = 0.0; + + lw_total_x = 0.0; + sw_total_x = 0.0; + cw_total_x = 0.0; + + win_cnt_x = 0; + + for (i=0; i<GC_BIN; i++){ + gc_total[i] = 0.0; + gclookup[i] = 0.0; + gc_wincount[i] = 0; + + gc_total_x[i] = 0.0; + gclookup_x[i] = 0.0; + gc_wincount_x[i] = 0; + } + + max = 0.0; + max_x = 0.0; + + min = FLT_MAX; + min_x = FLT_MAX; + + /* + + NOTE: + + As of November 11, 2010, only CW implementation is reliable. + + + */ + + switch (wt){ + case LW: + + if (doClean){ + + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->lw_cnt; j++){ + + if (chromosomes[i]->lw[j].isControl == 1){ + + /* AUTOSOMES -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + if (!strstr(chromosomes[i]->name, "chrX") || GENDER == FEMALE){ + + maxcut = LW_MEAN + 2.5 * LW_STD; + mincut = LW_MEAN - 2.5 * LW_STD; + if (mincut < LW_MEAN / 10.0) mincut = LW_MEAN / 10.0; + + if (maxcut - LW_MEAN > LW_MEAN - mincut) + maxcut = LW_MEAN + (LW_MEAN - mincut); + + if (chromosomes[i]->lw[j].depth > maxcut || chromosomes[i]->lw[j].depth < mincut){ + + /* Remove this window and its neighbors from controls */ + chromosomes[i]->lw[j].isControl = 0; + + k = j; + + while (k > 0 && chromosomes[i]->lw[k-1].end >= chromosomes[i]->lw[j].start){ + chromosomes[i]->lw[--k].isControl = 0; + } + + k = j; + + while (k < chromosomes[i]->lw_cnt-1 && chromosomes[i]->lw[k+1].start <= chromosomes[i]->lw[j].end){ + chromosomes[i]->lw[++k].isControl = 0; + } + + } + + else{ + lw_total += chromosomes[i]->lw[j].depth; + win_cnt++; + } + } // if AUTOSOME + + + /* chrX -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + else if (strstr(chromosomes[i]->name, "chrX") && GENDER != FEMALE){ + + maxcut_x = LW_MEAN_X + 2 * LW_STD_X; + mincut_x = LW_MEAN_X - 2 * LW_STD_X; + if (mincut_x < LW_MEAN_X / 10.0) mincut_x = LW_MEAN_X / 10.0; + + if (chromosomes[i]->lw[j].depth > maxcut_x || chromosomes[i]->lw[j].depth < mincut_x){ + + /* Remove this window and its neighbors from controls */ + chromosomes[i]->lw[j].isControl = 0; + + k = j; + + while (k > 0 && chromosomes[i]->lw[k-1].end >= chromosomes[i]->lw[j].start){ + chromosomes[i]->lw[--k].isControl = 0; + } + + k = j; + + while (k < chromosomes[i]->lw_cnt-1 && chromosomes[i]->lw[k+1].start <= chromosomes[i]->lw[j].end){ + chromosomes[i]->lw[++k].isControl = 0; + } + } + + else{ + lw_total_x += chromosomes[i]->lw[j].depth; + win_cnt_x++; + } + } // if chrX + + } + + } + } + + LW_MEAN_X = lw_total_x / win_cnt_x; + LW_MEAN = lw_total / win_cnt; + + } // do clean + + lw_total = 0.0; + win_cnt = 0; + lw_total_x = 0.0; + win_cnt_x = 0; + + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->lw_cnt; j++){ + + if (chromosomes[i]->lw[j].isControl == 1){ + + /* AUTOSOMES -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + if (!strstr(chromosomes[i]->name, "chrX") || GENDER==FEMALE){ + + lw_var += (LW_MEAN - chromosomes[i]->lw[j].depth) * (LW_MEAN - chromosomes[i]->lw[j].depth); + win_cnt++; + + lw_total += chromosomes[i]->lw[j].depth; + gc_index = chromosomes[i]->lw[j].gc * GC_BIN; + gc_total[gc_index] += chromosomes[i]->lw[j].depth; + gc_wincount[gc_index]++; + + if (chromosomes[i]->lw[j].depth > max) + max = chromosomes[i]->lw[j].depth; + if (chromosomes[i]->lw[j].depth < min) + min = chromosomes[i]->lw[j].depth; + + } // AUTOSOMES + + /* chrX -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + else if (strstr(chromosomes[i]->name, "chrX") && GENDER != FEMALE){ + + lw_var_x += (LW_MEAN_X - chromosomes[i]->lw[j].depth) * (LW_MEAN_X - chromosomes[i]->lw[j].depth); + win_cnt_x++; + + lw_total_x += chromosomes[i]->lw[j].depth; + gc_index = chromosomes[i]->lw[j].gc * GC_BIN; + gc_total_x[gc_index] += chromosomes[i]->lw[j].depth; + gc_wincount_x[gc_index]++; + + if (chromosomes[i]->lw[j].depth > max_x) + max_x = chromosomes[i]->lw[j].depth; + if (chromosomes[i]->lw[j].depth < min_x) + min_x = chromosomes[i]->lw[j].depth; + + } // chrX + + + } // if control + } // outer for + } // if (!doClean) + + LW_MEAN_X = lw_total_x / win_cnt_x; + LW_STD_X = sqrt(lw_var_x / win_cnt_x); + + LW_MEAN = lw_total / win_cnt; + LW_STD = sqrt(lw_var / win_cnt); + + MEAN = LW_MEAN; + MEAN_X = LW_MEAN_X; + + break; + + case SW: + + + if (doClean){ + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->sw_cnt; j++){ + + if (chromosomes[i]->sw[j].isControl == 1){ + + /* AUTOSOMES -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + if (!strstr(chromosomes[i]->name, "chrX") || GENDER == FEMALE){ + + maxcut = SW_MEAN + 2.5 * SW_STD; + mincut = SW_MEAN - 2.5 * SW_STD; + if (mincut < SW_MEAN / 10.0) mincut = SW_MEAN / 10.0; + + if (maxcut - SW_MEAN > SW_MEAN - mincut) + maxcut = SW_MEAN + (SW_MEAN - mincut); + + if (chromosomes[i]->sw[j].depth > maxcut || chromosomes[i]->sw[j].depth < mincut){ + + /* Remove this window and its neighbors from controls */ + chromosomes[i]->sw[j].isControl = 0; + + k = j; + + while (k > 0 && chromosomes[i]->sw[k-1].end >= chromosomes[i]->sw[j].start){ + chromosomes[i]->sw[--k].isControl = 0; + } + + k = j; + + while (k < chromosomes[i]->sw_cnt-1 && chromosomes[i]->sw[k+1].start <= chromosomes[i]->sw[j].end){ + chromosomes[i]->sw[++k].isControl = 0; + } + + } + + else{ + sw_total += chromosomes[i]->sw[j].depth; + win_cnt++; + } + } // if AUTOSOME + + + /* chrX -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + else if (strstr(chromosomes[i]->name, "chrX") && GENDER != FEMALE){ + + maxcut_x = SW_MEAN_X + 2 * SW_STD_X; + mincut_x = SW_MEAN_X - 2 * SW_STD_X; + if (mincut_x < SW_MEAN_X / 10.0) mincut_x = SW_MEAN_X / 10.0; + + if (chromosomes[i]->sw[j].depth > maxcut_x || chromosomes[i]->sw[j].depth < mincut_x){ + + /* Remove this window and its neighbors from controls */ + chromosomes[i]->sw[j].isControl = 0; + + k = j; + + while (k > 0 && chromosomes[i]->sw[k-1].end >= chromosomes[i]->sw[j].start){ + chromosomes[i]->sw[--k].isControl = 0; + } + + k = j; + + while (k < chromosomes[i]->sw_cnt-1 && chromosomes[i]->sw[k+1].start <= chromosomes[i]->sw[j].end){ + chromosomes[i]->sw[++k].isControl = 0; + } + } + + else{ + sw_total_x += chromosomes[i]->sw[j].depth; + win_cnt_x++; + } + } // if chrX + + } + + } + } + + SW_MEAN_X = sw_total_x / win_cnt_x; + SW_MEAN = sw_total / win_cnt; + + } // do clean + + sw_total = 0.0; + win_cnt = 0; + sw_total_x = 0.0; + win_cnt_x = 0; + + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->sw_cnt; j++){ + + if (chromosomes[i]->sw[j].isControl == 1){ + + /* AUTOSOMES -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + if (!strstr(chromosomes[i]->name, "chrX") || GENDER==FEMALE){ + + sw_var += (SW_MEAN - chromosomes[i]->sw[j].depth) * (SW_MEAN - chromosomes[i]->sw[j].depth); + win_cnt++; + + sw_total += chromosomes[i]->sw[j].depth; + gc_index = chromosomes[i]->sw[j].gc * GC_BIN; + gc_total[gc_index] += chromosomes[i]->sw[j].depth; + gc_wincount[gc_index]++; + + if (chromosomes[i]->sw[j].depth > max) + max = chromosomes[i]->sw[j].depth; + if (chromosomes[i]->sw[j].depth < min) + min = chromosomes[i]->sw[j].depth; + + } // AUTOSOMES + + /* chrX -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + else if (strstr(chromosomes[i]->name, "chrX") && GENDER != FEMALE){ + + sw_var_x += (SW_MEAN_X - chromosomes[i]->sw[j].depth) * (SW_MEAN_X - chromosomes[i]->sw[j].depth); + win_cnt_x++; + + sw_total_x += chromosomes[i]->sw[j].depth; + gc_index = chromosomes[i]->sw[j].gc * GC_BIN; + gc_total_x[gc_index] += chromosomes[i]->sw[j].depth; + gc_wincount_x[gc_index]++; + + if (chromosomes[i]->sw[j].depth > max_x) + max_x = chromosomes[i]->sw[j].depth; + if (chromosomes[i]->sw[j].depth < min_x) + min_x = chromosomes[i]->sw[j].depth; + + } // chrX + + + } // if control + } // outer for + } // if (!doClean) + + SW_MEAN_X = sw_total_x / win_cnt_x; + SW_STD_X = sqrt(sw_var_x / win_cnt_x); + + SW_MEAN = sw_total / win_cnt; + SW_STD = sqrt(sw_var / win_cnt); + + MEAN = SW_MEAN; + MEAN_X = SW_MEAN_X; + + + break; + + case CW: + + if (doClean){ + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->cw_cnt; j++){ + + + if (chromosomes[i]->cw[j].isControl == 1){ + + /* AUTOSOMES -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + if (!strstr(chromosomes[i]->name, "chrX") || GENDER == FEMALE){ + + maxcut = CW_MEAN + 2.5 * CW_STD; + mincut = CW_MEAN - 2.5 * CW_STD; + if (mincut < CW_MEAN / 10.0) mincut = CW_MEAN / 10.0; + + if (maxcut - CW_MEAN > CW_MEAN - mincut) + maxcut = CW_MEAN + (CW_MEAN - mincut); + + if (chromosomes[i]->cw[j].depth > maxcut || chromosomes[i]->cw[j].depth < mincut){ + + /* Remove this window and its neighbors from controls */ + chromosomes[i]->cw[j].isControl = 0; + + if (j > 0) + chromosomes[i]->cw[j-1].isControl = 0; + if (j < chromosomes[i]->cw_cnt-1) + chromosomes[i]->cw[j+1].isControl = 0; + } + else{ + cw_total += chromosomes[i]->cw[j].depth; + win_cnt++; + } + } // if AUTOSOME + + + /* chrX -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + else if (strstr(chromosomes[i]->name, "chrX") && GENDER != FEMALE){ + + maxcut_x = CW_MEAN_X + 2 * CW_STD_X; + mincut_x = CW_MEAN_X - 2 * CW_STD_X; + if (mincut_x < CW_MEAN_X / 10.0) mincut_x = CW_MEAN_X / 10.0; + + if (chromosomes[i]->cw[j].depth > maxcut_x || chromosomes[i]->cw[j].depth < mincut_x){ + + /* Remove this window and its neighbors from controls */ + chromosomes[i]->cw[j].isControl = 0; + + if (j > 0) + chromosomes[i]->cw[j-1].isControl = 0; + if (j < chromosomes[i]->cw_cnt-1) + chromosomes[i]->cw[j+1].isControl = 0; + } + else{ + cw_total_x += chromosomes[i]->cw[j].depth; + win_cnt_x++; + } + } // if chrX + + } + + } + } + + + CW_MEAN_X = cw_total_x / win_cnt_x; + + CW_MEAN = cw_total / win_cnt; + + } // do clean + + cw_total = 0.0; + win_cnt = 0; + cw_total_x = 0.0; + win_cnt_x = 0; + + for (i=0; i<num_chrom; i++){ + for (j=0; j<chromosomes[i]->cw_cnt; j++){ + + if (chromosomes[i]->cw[j].isControl == 1){ + + /* AUTOSOMES -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + if (!strstr(chromosomes[i]->name, "chrX") || GENDER==FEMALE){ + + cw_var += (CW_MEAN - chromosomes[i]->cw[j].depth) * (CW_MEAN - chromosomes[i]->cw[j].depth); + win_cnt++; + + cw_total += chromosomes[i]->cw[j].depth; + gc_index = chromosomes[i]->cw[j].gc * GC_BIN; + gc_total[gc_index] += chromosomes[i]->cw[j].depth; + gc_wincount[gc_index]++; + + if (chromosomes[i]->cw[j].depth > max) + max = chromosomes[i]->cw[j].depth; + if (chromosomes[i]->cw[j].depth < min) + min = chromosomes[i]->cw[j].depth; + + } // AUTOSOMES + + /* chrX -- NOTE: chrY is NOT a control; it is prefiltered, no need to check for it here */ + else if (strstr(chromosomes[i]->name, "chrX") && GENDER != FEMALE){ + + cw_var_x += (CW_MEAN_X - chromosomes[i]->cw[j].depth) * (CW_MEAN_X - chromosomes[i]->cw[j].depth); + win_cnt_x++; + + cw_total_x += chromosomes[i]->cw[j].depth; + gc_index = chromosomes[i]->cw[j].gc * GC_BIN; + gc_total_x[gc_index] += chromosomes[i]->cw[j].depth; + gc_wincount_x[gc_index]++; + + if (chromosomes[i]->cw[j].depth > max_x) + max_x = chromosomes[i]->cw[j].depth; + if (chromosomes[i]->cw[j].depth < min_x) + min_x = chromosomes[i]->cw[j].depth; + + } // chrX + + + } // if control + } // outer for + } // if (!doClean) + + + CW_MEAN_X = cw_total_x / win_cnt_x; + CW_STD_X = sqrt(cw_var_x / win_cnt_x); + + CW_MEAN = cw_total / win_cnt; + CW_STD = sqrt(cw_var / win_cnt); + + MEAN = CW_MEAN; + MEAN_X = CW_MEAN_X; + + break; + } + + + /* calculate the gclookup table */ + + for (i=0; i<GC_BIN; i++){ + + /* AUTOSOMES */ + + if (gc_wincount[i] == 0 || gc_total[i] == 0){ + j=i-1; + + while (j >= 0 && gc_total[j] == 0) + j--; + + if (gc_total[j] == 0 || gc_wincount[j] == 0){ + j=i+1; + while (j < GC_BIN && gc_total[j] == 0) j++; + } + + gc_total[i] = gc_total[j]; + gc_wincount[i] = gc_wincount[j]; + } + + if (gc_total[i] != 0.0){ + gclookup[i] = gc_total[i] / (float) gc_wincount[i]; + + if (MULTGC){ + + /* + if (VERBOSE && i > 200) + fprintf(stdout, "GC\%: %f\tlookup: %f\tnow: ", ((float)i / 10.0), gclookup[i]); + */ + + gclookup[i] = MEAN / gclookup[i]; + + /* + if (VERBOSE && i > 200) + fprintf(stdout, "%f\n", gclookup[i]); + */ + + if (gclookup[i] > MAX_GC_CORR) + gclookup[i] = MAX_GC_CORR; + else if (gclookup[i] < MIN_GC_CORR) + gclookup[i] = MIN_GC_CORR; + } + + /* + if (!MULTGC && VERBOSE && i > 200) + fprintf(stdout, "GC\%: %f\tlookup: %f\n", ((float)i / 10.0), gclookup[i]); + */ + + } + + + + /* chrX */ + + if (GENDER != FEMALE){ + + if (gc_wincount_x[i] == 0 || gc_total_x[i] == 0){ + j=i-1; + + while (j >= 0 && gc_total_x[j] == 0) + j--; + + if (gc_total_x[j] == 0 || gc_wincount_x[j] == 0){ + j=i+1; + while (j < GC_BIN && gc_total_x[j] == 0) j++; + } + + gc_total_x[i] = gc_total_x[j]; + gc_wincount_x[i] = gc_wincount_x[j]; + } + + if (gc_total_x[i] != 0.0){ + gclookup_x[i] = gc_total_x[i] / (float) gc_wincount_x[i]; + + if (MULTGC){ + gclookup_x[i] = MEAN_X / gclookup_x[i]; + if (gclookup_x[i] > MAX_GC_CORR) + gclookup_x[i] = MAX_GC_CORR; + else if (gclookup_x[i] < MIN_GC_CORR) + gclookup_x[i] = MIN_GC_CORR; + } + + } + + } + + + } + + + *_max = max; + *_max_x = max_x; + + *_min = min; + *_min_x = min_x; + + +} + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/gcnorm.h Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,23 @@ +#ifndef __GCNORM +#define __GCNORM + +#include <stdio.h> +#include <string.h> + +#include "globals.h" +#include "sam.h" +#include <math.h> +#include <float.h> + +#define MAXCOPY 2.5 +#define MAXCOPY_X 1.5 +#define MINCOPY 1.5 +#define MINCOPY_X 0.5 + + +int norm_until_converges(enum WINDOWTYPE, float *, float *); +void normalize(enum WINDOWTYPE, float *, float *, float *, float *, float *, float *); +void calc_stat(enum WINDOWTYPE, float *, float *, char, float *, float *, float *, float *); +float clean_outlier(enum WINDOWTYPE, float *, float *); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/globals.h Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,81 @@ + +#ifndef __GLOBALS +#define __GLOBALS + + +#define MAX_STR 256 +#define GC_BIN 1000 + +#define VERSION "0.34" +#define LAST_UPDATE "December 7, 2011" + + +enum MODETYPE {NONE, PREP, READSAM, CALL}; + +enum WINDOWTYPE {LW, SW, CW}; + +enum GENDERTYPE {AUTODETECT, MALE, FEMALE}; + +enum GENDERTYPE GENDER; + +enum MODETYPE RUNMODE; + +int num_chrom; +int MULTGC; +float MAX_GC_CORR; +float MIN_GC_CORR; +int VERBOSE; +int CHECKSAM; + +static const int magicNum = 3111696; + +char *GENOME_FASTA; +char *GENOME_GAPS; +char *GENOME_CONF; + +int LW_SIZE; +int SW_SIZE; +int CW_SIZE; +int LW_SLIDE; +int SW_SLIDE; + +float LW_MEAN; +float LW_STD; +float LW_MEAN_X; +float LW_STD_X; + +float SW_MEAN; +float SW_STD; +float SW_MEAN_X; +float SW_STD_X; + +float CW_MEAN; +float CW_STD; +float CW_MEAN_X; +float CW_STD_X; + +int CONT_WINDOW; +int CUT_WINDOW; + +typedef struct window{ + int start; + int end; + float gc; + float depth; + char isControl; +}_window; + +typedef struct chrom{ + char *name; + int length; + int lw_cnt; + int sw_cnt; + int cw_cnt; + struct window *sw; + struct window *lw; + struct window *cw; +}_chrom; + +struct chrom **chromosomes; + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/mrcanavar.c Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,185 @@ +#include "mrcanavar.h" + + + +int main(int argc, char **argv){ + + int i; + + char gzSAM; + + char *indirSAM; + char *depthFile; + char *out_prefix; + + + init_globals(); + + gzSAM = 0; + indirSAM = NULL; + depthFile = NULL; + out_prefix = NULL; + + + for (i=1; i<argc; i++){ + + + /* modes */ + if (!strcmp(argv[i], "--prep")) + set_runmode(PREP); + else if (!strcmp(argv[i], "--read")) + set_runmode(READSAM); + else if (!strcmp(argv[i], "--call")) + set_runmode(CALL); + + + /* compulsory parameters for all modes */ + else if (!strcmp(argv[i], "-conf")) + set_str(&GENOME_CONF, argv[i+1]); + + /* compulsory parameters for the PREP mode */ + else if (!strcmp(argv[i], "-fasta")) + set_str(&GENOME_FASTA, argv[i+1]); + else if (!strcmp(argv[i], "-gaps")) + set_str(&GENOME_GAPS, argv[i+1]); + + /* optional parameters for the PREP mode */ + else if (!strcmp(argv[i], "-lw_size")) + LW_SIZE = atoi(argv[i+1]); + else if (!strcmp(argv[i], "-sw_size")) + SW_SIZE = atoi(argv[i+1]); + else if (!strcmp(argv[i], "-cw_size")) + CW_SIZE = atoi(argv[i+1]); + else if (!strcmp(argv[i], "-lw_slide")) + LW_SLIDE = atoi(argv[i+1]); + else if (!strcmp(argv[i], "-sw_slide")) + SW_SLIDE = atoi(argv[i+1]); + + + /* compulsory parameters for both READ and CALL modes */ + else if (!strcmp(argv[i], "-depth")) + set_str(&depthFile, argv[i+1]); + + /* compulsory parameters for the READ mode */ + + else if (!strcmp(argv[i], "-samdir")) + set_str(&indirSAM, argv[i+1]); + + + /* optional parameters for the SAM mode */ + else if (!strcmp(argv[i], "--gz")) + gzSAM = 1; + + + /* compulsory parameters for the CALL mode */ + else if (!strcmp(argv[i], "-o")) + set_str(&out_prefix, argv[i+1]); + + /* optional parameters for the CALL mode */ + else if (!strcmp(argv[i], "-cont_win")) + CONT_WINDOW = atoi(argv[i+1]); + else if (!strcmp(argv[i], "-cut_win")) + CUT_WINDOW = atoi(argv[i+1]); + else if (!strcmp(argv[i], "--xx")) + set_gender(FEMALE); + else if (!strcmp(argv[i], "--xy")) + set_gender(MALE); + else if (!strcmp(argv[i], "--multgc")) + MULTGC = 1; + + + + + /* generic stuff */ + else if (!strcmp(argv[i], "-v")){ + fprintf (stdout, "\nmrCaNaVaR version %s.\nLast update: %s.\n\n", VERSION, LAST_UPDATE); + return 0; + } + else if (!strcmp(argv[i], "-h")){ + printHelp(argv[0]); + return 0; + } + else if (!strcmp(argv[i], "--verbose")) + VERBOSE = 1; + else if (!strcmp(argv[i], "--disable-sam-check")) + CHECKSAM = 0; + } + + + switch(RUNMODE){ + case NONE: + print_error("Select mode: --prep, ---read, or --call\n"); + break; + case PREP: + fprintf(stdout, "Mode: Prepare genome...\n"); + prep_genome(); + break; + case READSAM: + fprintf(stdout, "Mode: Read SAM ...\n"); + read_depth(indirSAM, depthFile, gzSAM); + break; + case CALL: + fprintf(stdout, "Mode: Call copy numbers ...\n"); + call_cnv(depthFile, out_prefix); + break; + default: + break; + } + + return 0; + +} + + +void printHelp(char *binfile){ + + fprintf (stdout, "\nmrCaNaVaR version %s\nLast update: %s\n\n", VERSION, LAST_UPDATE); + fprintf (stdout, "%s --<mode> [options] \n\n", binfile); + + fprintf (stdout, "======== RUN MODES ========\n\n"); + + fprintf (stdout, "\t--prep : Prepare reference genome configuration file.\n"); + fprintf (stdout, "\t--read : Read mapping information from SAM files.\n"); + fprintf (stdout, "\t--call : Call CNVs and predict copy numbers.\n\n"); + + fprintf (stdout, "======== PREP MODE COMPULSORY PARAMETERS ========\n\n"); + fprintf (stdout, "\t-fasta <fasta_file> : FASTA file for the reference genome.\n"); + fprintf (stdout, "\t-gaps <gaps_file> : Gap coordinates of the reference genome in BED format.\n"); + fprintf (stdout, "\t-conf <config_file> : Reference configuration file (output).\n\n"); + fprintf (stdout, "======== PREP MODE OPTIONAL PARAMETERS ========\n\n"); + fprintf (stdout, "\t-lw_size <lw_size> : Long window span size. Default is 5000.\n"); + fprintf (stdout, "\t-lw_slide <lw_slide> : Long window slide size. Default is 1000.\n"); + fprintf (stdout, "\t-sw_size <sw_size> : Short window span size. Default is 1000.\n"); + fprintf (stdout, "\t-sw_slide <sw_slide> : Short window slide size. Default is 1000.\n"); + fprintf (stdout, "\t-cw_size <cw_size> : Copy number window size. Default is 1000.\n\n"); + + fprintf (stdout, "======== READ MODE COMPULSORY PARAMETERS ========\n\n"); + fprintf (stdout, "\t-conf <config_file> : Reference configuration file (input).\n"); + fprintf (stdout, "\t-samdir <sam_dir> : Directory that contains SAM files for mapping information.\n"); + fprintf (stdout, "\t-depth <depth_file> : Read depth file (output).\n\n"); + fprintf (stdout, "======== READ MODE OPTIONAL PARAMETERS ========\n\n"); + fprintf (stdout, "\t--gz : Indicates that the SAM files are compressed in gzip format.\n\n"); + + fprintf (stdout, "======== CALL MODE COMPULSORY PARAMETERS ========\n\n"); + fprintf (stdout, "\t-conf <config_file> : Reference configuration file (input).\n"); + fprintf (stdout, "\t-depth <depth_file> : Read depth file (input).\n"); + fprintf (stdout, "\t-o <out_prefix> : Prefix for the output file names.\n\n"); + + + fprintf (stdout, "======== CALL MODE OPTIONAL PARAMETERS ========\n\n"); + fprintf (stdout, "\t--xx : Set gender of the sequenced sample as female. Mammalian genomes only. Default is autodetect.\n"); + fprintf (stdout, "\t--xy : Set gender of the sequenced sample as male. Mammalian genomes only. Default is autodetect.\n"); + fprintf (stdout, "\t--multgc : Perform multiplicative GC correction. Default is additive.\n"); + fprintf (stdout, "\t--verbose : Verbose output.\n"); + + + /* + + fprintf (stdout, "======== NOT IMPLEMENTED YET ========\n\n"); + fprintf (stdout, "\t-cont_win <cwin_number> : Contiguous window number to look for high/low depth windows. Default is 7.\n"); + fprintf (stdout, "\t-cut_win <cwin_cutoff> : Window number cutoff. Default is 6.\n\n"); + + + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/mrcanavar.h Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,19 @@ +#ifndef __MRCANAVAR +#define __MRCANAVAR + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <zlib.h> + +#include "callcnv.h" +#include "globals.h" +#include "utils.h" +#include "prep.h" +#include "sam.h" + + + +void printHelp(char *); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/prep.c Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,381 @@ +#include "prep.h" + +void prep_genome(void){ + FILE *fasta; + FILE *gaps; + FILE *config; + + int num_gaps; + + char chrom[MAX_STR]; + int start, end; + int max_len; + + int i; + + if (GENOME_FASTA == NULL) + print_error("Select genome fasta file (input) through -fasta parameter.\n"); + if (GENOME_GAPS == NULL) + print_error("Select genome gaps file (input) through -gaps parameter.\n"); + if (GENOME_CONF == NULL) + print_error("Select genome fasta file (output) through -conf parameter.\n"); + + + if (LW_SIZE <= 0) + print_error ("Large window size (-lw_size) must be a poitive integer.\n"); + if (SW_SIZE <= 0) + print_error ("Small window size (-sw_size) must be a poitive integer.\n"); + if (CW_SIZE <= 0) + print_error ("Copy number window size (-cw_size) must be a poitive integer.\n"); + if (LW_SLIDE <= 0) + print_error ("Large window slide (-lw_slide) must be a poitive integer.\n"); + if (SW_SLIDE <= 0) + print_error ("Small window slide (-sw_slide) must be a poitive integer.\n"); + + if (SW_SIZE >= LW_SIZE) + print_error ("Small window size (-sw_size) should be smaller than large window size (-lw_size).\n"); + if (SW_SLIDE > LW_SIZE) + print_error ("Small window slide (-sw_slide) should be smaller than or equal to large window slide (-lw_slide).\n"); + + + fasta = my_fopen(GENOME_FASTA, "r", 0); + gaps = my_fopen(GENOME_GAPS, "r", 0); + + num_gaps = 0; + + + /* initialize gap table */ + while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0) + num_gaps ++; + + + gaptable = (struct gapcell *) malloc(sizeof(struct gapcell) * num_gaps); + + rewind(gaps); + + + /* read gap table */ + + i = 0; + while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0){ + trimspace(chrom); + gaptable[i].chrom = NULL; + set_str(&(gaptable[i].chrom), chrom); + gaptable[i].start = start; + gaptable[i].end = end; + i++; + } + fclose(gaps); + + fprintf (stdout, "Scanning the reference genome.\n"); + /* count basic numbers from the reference genome */ + max_len = 0; + num_chrom = count_chrom(fasta, &max_len); + rewind(fasta); + + chromosomes = (struct chrom **) malloc (sizeof (struct chrom *) * num_chrom); + + for (i=0; i<num_chrom; i++) + chromosomes[i] = (struct chrom *) malloc (sizeof (struct chrom) * num_chrom); + + fprintf(stdout, "Total of %d chromosomes in the reference genome, with %d gaps.\nLongest chromosome is %d bp\n", num_chrom, num_gaps, max_len); + + fprintf (stdout, "Reading the reference genome.\n"); + + read_ref(fasta, max_len, num_gaps); + + fprintf (stdout, "Saving the reference configuration.\n"); + saveRefConfig(GENOME_CONF); + +} + + + +int count_chrom(FILE *fasta, int *max_len){ + int length=0; int maxlength=0; + char ch; + int cnt = 0; + char skip[MAX_STR]; + char *retStr; + + //while (fscanf(fasta, "%c", &ch) > 0){ + while (1){ + + ch = fgetc(fasta); + if (feof(fasta)) break; + + if (ch == '>') { + cnt ++; + retStr = fgets(skip, MAX_STR, fasta); + fprintf(stdout, "."); + fflush(stdout); + if (length > maxlength) maxlength = length; + length = 0; + } + else if (!isspace(ch)) length++; + } + fprintf(stdout, "\n"); + + if (length > maxlength) + maxlength = length; + + *max_len = maxlength; + return cnt; +} + + + +void read_ref(FILE *fasta, int max_len, int num_gaps){ + + char ch; + int cnt = 0; + char chrom_name[MAX_STR]; + int i; + int length; + char *chrom_seq; + char *retStr; + + chrom_seq = (char *) malloc (sizeof(char) * (max_len+1)); + cnt = -1; + i = 0; + length = 0; + + // while (fscanf(fasta, "%c", &ch) > 0){ + while (1){ + + ch = fgetc(fasta); + if (feof(fasta)) break; + + if (ch == '>') { + + if (cnt != -1){ + /* insert the previous chromosome */ + chrom_seq[length]=0; + insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps); + } + + cnt ++; + retStr = fgets(chrom_name, MAX_STR, fasta); + chrom_name[strlen(chrom_name)-1] = 0; + trimspace(chrom_name); + length = 0; + } + else if (!isspace(ch)){ + chrom_seq[length++] = ch; + } + } + + /* insert the last chromosome */ + chrom_seq[length]=0; + trimspace(chrom_name); + insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps); + + free(chrom_seq); +} + + +void insert_chrom(int cnt, char *chrom_name, int length, char *chrom_seq, int num_gaps){ + + fprintf(stdout, "Chromosome %s (%d bp). Counting windows ... ", chrom_name, length); + fflush(stdout); + + chromosomes[cnt]->name = NULL; + set_str(&(chromosomes[cnt]->name), chrom_name); + chromosomes[cnt]->length = length; + + windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 0); + fprintf(stdout, "\t\tRecalculating windows ... "); + fflush(stdout); + windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 1); + + fprintf(stdout, "\n"); +} + + +void windowmaker(int num_gaps, int chrom_id, char *chrom_name, char *chrom_seq, int length, int flag){ + int i; + int j; + int nchar; + int s, e; + + int gc; + float gc_p; + + int lw_cnt = 0; // large window (5Kb) + int sw_cnt = 0; // small window (1Kb) + int cw_cnt = 0; // copy number window (1kb non-ovp); + + + int win_cnt; + + /* if flag = 0; don't record it */ + + if (!flag){ + for (i=0;i<num_gaps;i++){ + if (!strcmp(gaptable[i].chrom, chrom_name)){ + if (gaptable[i].start == gaptable[i].end) + chrom_seq[gaptable[i].start] = 'X'; + else{ + for (j=gaptable[i].start; (j<gaptable[i].end && j<length); j++) + chrom_seq[j] = 'X'; + } + } + } + } + + + /* count cw_cnt */ + + nchar = 0; + s = 0 ; e = 0; + gc = 0; + + for (i=0; i<length; i++){ + + if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') + nchar++; + if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C') + gc++; + + if ((nchar == CW_SIZE || chrom_seq[i] == 'X') && nchar != 0){ + e = i; + if (chrom_seq[i] == 'X'){ + nchar = 0; + gc = 0; + } + else if (flag == 1 && nchar == CW_SIZE){ + /* save */ + chromosomes[chrom_id]->cw[cw_cnt].start = s; + chromosomes[chrom_id]->cw[cw_cnt].end = e+1; + chromosomes[chrom_id]->cw[cw_cnt].gc = (float)gc / (float)nchar; + chromosomes[chrom_id]->cw[cw_cnt].depth = 0; // for readability/completeness + cw_cnt++; + gc = 0; + } + else if (flag == 0 && nchar == CW_SIZE){ + cw_cnt ++; + } + + s = i + 1; + nchar = 0; + } + + if (chrom_seq[i] == 'X'){ + s = i + 1; + nchar = 0; + gc = 0; + } + + } + + + /* count sw_cnt */ + + i = 0; + nchar = 0; + s = 0 ; e = 0; + gc = 0; + + while (i < length){ + + if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') + nchar++; + if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C') + gc++; + + if ((nchar == SW_SIZE || chrom_seq[i] == 'X') && nchar != 0){ + e = i; + if (chrom_seq[i] == 'X'){ + nchar = 0; + gc = 0; + } + else if (flag == 1 && nchar == SW_SIZE){ + /* save */ + chromosomes[chrom_id]->sw[sw_cnt].start = s; + chromosomes[chrom_id]->sw[sw_cnt].end = e+1; + chromosomes[chrom_id]->sw[sw_cnt].gc = (float)gc / (float)nchar; + chromosomes[chrom_id]->sw[sw_cnt].depth = 0; // for readability/completeness + sw_cnt++; + gc = 0; + } + else if (flag == 0 && nchar == SW_SIZE) + sw_cnt++; + + s = s + SW_SLIDE; + i = s - 1; + nchar = 0; + } + if (chrom_seq[i]=='X'){ + s = i + 1; + i = s - 1; + gc = 0; + } + + i++; + } + + + nchar = 0; + i = 0; + s = 0 ; e = 0; + gc = 0; + /* count lw_cnt */ + + while (i<length){ + + if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') + nchar++; + if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C') + gc++; + + if ((nchar == LW_SIZE || chrom_seq[i] == 'X') && nchar != 0){ + e = i; + if (chrom_seq[i] == 'X'){ + nchar = 0; + gc = 0; + } + else if (flag == 1 && nchar == LW_SIZE){ + /* save */ + chromosomes[chrom_id]->lw[lw_cnt].start = s; + chromosomes[chrom_id]->lw[lw_cnt].end = e+1; + chromosomes[chrom_id]->lw[lw_cnt].gc = (float)gc / (float)nchar; + chromosomes[chrom_id]->lw[lw_cnt].depth = 0; // for readability/completeness + lw_cnt++; + gc = 0; + } + else if (flag == 0 && nchar == LW_SIZE) + lw_cnt++; + + s = s + LW_SLIDE; + i = s - 1; + nchar = 0; + } + + if (chrom_seq[i] == 'X') { + s = i + 1; + i = s - 1; + gc = 0; + } + + i++; + } + + + + if (flag == 0){ + + printf("\n\t\tLW: %d\tSW: %d\tCW: %d\n", lw_cnt, sw_cnt, cw_cnt); + + chromosomes[chrom_id]->lw_cnt = lw_cnt; + chromosomes[chrom_id]->sw_cnt = sw_cnt; + chromosomes[chrom_id]->cw_cnt = cw_cnt; + + chromosomes[chrom_id]->cw = (struct window *) malloc (sizeof(struct window) * cw_cnt); + chromosomes[chrom_id]->lw = (struct window *) malloc (sizeof(struct window) * lw_cnt); + chromosomes[chrom_id]->sw = (struct window *) malloc (sizeof(struct window) * sw_cnt); + + + } + + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/prep.h Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,29 @@ +#ifndef __PREP +#define __PREP + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <zlib.h> + +#include "globals.h" +#include "utils.h" + +typedef struct gapcell{ + char *chrom; + int start; + int end; +}_gapcell; + + +struct gapcell *gaptable; + +void prep_genome(void); +int count_chrom(FILE *, int *); +void read_ref(FILE *, int, int); +void insert_chrom(int, char *, int, char *, int); +void windowmaker(int, int, char *, char *, int, int); + + + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/sam.c Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,479 @@ +#include "sam.h" + +void read_depth(char *indirSAM, char *depthFile, char gzSAM){ + + + DIR *dp; + int fileCnt; + int totalFile; + + struct dirent *ep; + + int i; + + if (GENOME_CONF == NULL) + print_error("Select genom configuration file (input) through the -conf parameter.\n"); + if (indirSAM == NULL) + print_error("Select input directory that contains the SAM files through the -samdir parameter.\n"); + if (depthFile == NULL) + print_error("Select read depth output file through the -depth parameter.\n"); + + + loadRefConfig(GENOME_CONF); + + fprintf(stdout, "Scanning the SAM directory: %s\n", indirSAM); + + dp = opendir(indirSAM); + + if (dp == NULL) + print_error("SAM directory not found.\n"); + + fileCnt = 0; + totalFile = 0; + + while((ep=readdir(dp))){ + if (ep->d_name[0] == '.') + continue; + if (ep->d_type == DT_DIR) + continue; + + if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz")) + continue; + + /* + if (!strstr(ep->d_name, "sam")) + continue; + */ + + i = strlen(ep->d_name)-1; + + if ((ep->d_name[i] == 'z' && ep->d_name[i-1] == 'g' && ep->d_name[i-2] == '.') && gzSAM == 0){ + print_error("File name ends with .gz yet --gz option is not selected. Are you sure?\nIf the files are indeed uncompressed, rename the files.\n"); + } + totalFile++; + } + + rewinddir(dp); + + while((ep=readdir(dp))){ + if (ep->d_name[0] == '.') + continue; + if (ep->d_type == DT_DIR) + continue; + + if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz")) + continue; + + + /* + if (!strstr(ep->d_name, "sam")) + continue; + */ + + fprintf(stdout, "\r \rLoading file %d of %d: %s...", (fileCnt+1), totalFile, ep->d_name); + fflush(stdout); + + readSAM(indirSAM, ep->d_name, gzSAM); + + fileCnt++; + } + + closedir(dp); + + if (fileCnt == 0) + print_error("SAM directory does not contain any files with extensions \".sam\" or \".sam.gz\".\nIf you do have the correct files, please rename them to have either \".sam\" or \".sam.gz\" extensions.\n"); + else + fprintf(stdout, "\n\n%d file%s loaded.\n", fileCnt, fileCnt>1 ? "s" : ""); + + saveDepth(depthFile); + +} + + +void readSAM(char *indirSAM, char *fname, char gzSAM){ + FILE *sam; + + char *samfile; + + char samLine[4 * MAX_STR]; + char prevChrom[MAX_STR]; + char chrom[MAX_STR]; + char *token; + + + int pos; + int chrom_id; + int prev_chrom_id = -1; + int lineCnt; + + samfile = (char *) malloc (sizeof(char) * (strlen(indirSAM) + strlen(fname) + 2)); + + sprintf(samfile, "%s/%s", indirSAM, fname); + + sam = my_fopen(samfile, "r", gzSAM); + + prevChrom[0] = 0; + + while (1){ + + /* read entire line from the SAM file */ + if (gzSAM){ + gzgets(sam, samLine, (4 * MAX_STR)); + if (gzeof(sam)) + break; + } + else{ + fgets(samLine, (4 * MAX_STR), sam); + if (feof(sam)) + break; + } + + /* parse chrom */ + token = NULL; + + token = strtok(samLine, "\t"); // read name + token = strtok(NULL, "\t"); // flag + token = strtok(NULL, "\t"); //chrom + + strcpy(chrom, token); + trimspace(chrom); + + token = strtok(NULL, "\t"); // pos + + pos = atoi(token) - 1; // SAM file is 1-based; our format is 0-based + + if (pos < 0) + continue; + + /* + debug if needed + fprintf(stdout, "%s\t%d\n", chrom, pos); + */ + + + + chrom_id = insert_read_lw(chrom, pos, prevChrom, prev_chrom_id); + + /* + if (VERBOSE) + fprintf(stdout, "[READSAM]\t%s\t%d\tchrom_id: %d\n", chrom, pos, chrom_id); + */ + + if (chrom_id == -1) // this chromosome is not in the config file + continue; + + chrom_id = insert_read_sw(chrom, pos, prevChrom, chrom_id); + + if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this + continue; + + chrom_id = insert_read_cw(chrom, pos, prevChrom, chrom_id); + + + + if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this + continue; + + strcpy(prevChrom, chrom); + prev_chrom_id = chrom_id; + + + } + + if (gzSAM) + gzclose(sam); + else + fclose(sam); + + free(samfile); +} + + + + + +int insert_read_lw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ + int chrom_id; + int window_id; + + int flank; + + chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); + + if (chrom_id != -1){ + + if (pos >= chromosomes[chrom_id]->length){ + return chrom_id; + } + + window_id = windowSearch(chromosomes[chrom_id]->lw, chromosomes[chrom_id]->lw_cnt, pos); + + + if (window_id != -1){ + + chromosomes[chrom_id]->lw[window_id].depth += 1; + + /* iterate left */ + flank = window_id - 1; + + while (flank >= 0 && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end)) + chromosomes[chrom_id]->lw[flank--].depth += 1; + + + /* iterate right */ + flank = window_id + 1; + + while (flank < chromosomes[chrom_id]->lw_cnt && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end)) + chromosomes[chrom_id]->lw[flank++].depth += 1; + + + } + + } + + return chrom_id; +} + + + +int insert_read_sw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ + int chrom_id; + int window_id; + + int flank; + + chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); + + if (chrom_id != -1){ + + if (pos >= chromosomes[chrom_id]->length) + return chrom_id; + + window_id = windowSearch(chromosomes[chrom_id]->sw, chromosomes[chrom_id]->sw_cnt, pos); + + if (window_id != -1){ + + chromosomes[chrom_id]->sw[window_id].depth += 1; + + /* iterate left */ + flank = window_id - 1; + + while (flank >= 0 && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end)) + chromosomes[chrom_id]->sw[flank--].depth += 1; + + + /* iterate right */ + flank = window_id + 1; + + while (flank < chromosomes[chrom_id]->sw_cnt && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end)) + chromosomes[chrom_id]->sw[flank++].depth += 1; + + + } + + } + + return chrom_id; +} + + +int insert_read_cw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ + int chrom_id; + int window_id; + + int flank; + + chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); + + if (chrom_id != -1){ + + if (pos >= chromosomes[chrom_id]->length) + return chrom_id; + + window_id = windowSearch(chromosomes[chrom_id]->cw, chromosomes[chrom_id]->cw_cnt, pos); + + if (window_id != -1){ + + chromosomes[chrom_id]->cw[window_id].depth += 1; + + } + + } + return chrom_id; +} + + +int findChrom(char *chrom, char *prevChrom, int prev_chrom_id){ + int chrom_id; + + + if (!strcmp(chrom, prevChrom)){ + chrom_id = prev_chrom_id; + } + + else if (prev_chrom_id == -1 && !strcmp(chrom, chromosomes[0]->name)){ // the first entry in the file + chrom_id = 0; + } + else if (prev_chrom_id != -1 && prev_chrom_id < num_chrom -1 && !strcmp(chrom, chromosomes[prev_chrom_id+1]->name)){ + chrom_id = prev_chrom_id + 1; + } + else{ + chrom_id = binSearch(chrom); + } + + return chrom_id; + +} + +int binSearch(char *chrom){ + int start; + int end; + int med; + int strtest; + + start = 0; + + end = num_chrom - 1; + + med = (start + end) / 2; + + + while (1){ + + if (start > end) + return -1; + + if (start == med || end == med){ + if (!strcmp(chrom, chromosomes[start]->name)) + return start; + else if (!strcmp(chrom, chromosomes[end]->name)) + return end; + else{ + return -1; + } + } + + strtest = strcmp(chrom, chromosomes[med]->name); + + if(strtest == 0) + return med; + + else if (strtest < 0){ + end = med; + med = (start + end) / 2; + } + + else { + start = med; + med = (start + end) / 2; + } + + } +} + + + +int windowSearch(struct window *searchWin, int winCnt, int pos){ + int start; + int end; + int med; + + start = 0; + end = winCnt - 1; + + med = (start + end) / 2; + + while (1){ + + if (start > end) + return -1; + + if (start == med || end == med){ + + if (pos >= searchWin[start].start && pos < searchWin[start].end) + return start; + + else if (pos >= searchWin[end].start && pos < searchWin[end].end) + return end; + + else return -1; + + } + + if (pos >= searchWin[med].start && pos < searchWin[med].end) + return med; + + else if (pos < searchWin[med].start){ + end = med; + med = (start + end) / 2; + } + + else { + start = med; + med = (start + end) / 2; + } + } +} + + + +void saveDepth(char *depthFile){ + int i; + int j; + int retVal; + + char *fname; + FILE *txtDepth; + FILE *binDepth; + + fname = (char *) malloc (sizeof(char) * (strlen(depthFile) + 10)); + + sprintf(fname, "%s.lw.txt", depthFile); + + txtDepth = my_fopen(fname, "w", 0); + binDepth = my_fopen(depthFile, "w", 0); + + /* start with the magicNum, I will use this as a format check when loading */ + retVal = fwrite(&magicNum, sizeof(magicNum), 1, binDepth); + + fprintf(stdout, "Saving depth file %s\n", fname); + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->lw_cnt; j++){ + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, (int) (chromosomes[i]->lw[j].depth)); + retVal = fwrite(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth); + } + } + + fclose(txtDepth); + + sprintf(fname, "%s.sw.txt", depthFile); + txtDepth = my_fopen(fname, "w", 0); + fprintf(stdout, "Saving depth file %s\n", fname); + + for (i = 0; i < num_chrom; i++){ + + for (j = 0; j < chromosomes[i]->sw_cnt; j++){ + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end, chromosomes[i]->sw[j].gc, (int) (chromosomes[i]->sw[j].depth)); + retVal = fwrite(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth); + } + } + + fclose(txtDepth); + + sprintf(fname, "%s.cw.txt", depthFile); + txtDepth = my_fopen(fname, "w", 0); + fprintf(stdout, "Saving depth file %s\n", fname); + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->cw_cnt; j++){ + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, (int) (chromosomes[i]->cw[j].depth)); + retVal = fwrite(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth); + } + } + + fclose(txtDepth); + fclose(binDepth); + + free(fname); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/sam.h Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,28 @@ +#ifndef __SAM +#define __SAM + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <zlib.h> +#include <dirent.h> + +#include "globals.h" +#include "utils.h" + +void read_depth(char *, char *, char); +void readSAM(char *, char *, char); + +int insert_read_lw(char *, int, char *, int); +int insert_read_sw(char *, int, char *, int); +int insert_read_cw(char *, int, char *, int); + +int findChrom(char *, char *, int); + +int binSearch(char *); + +int windowSearch(struct window *, int, int); + +void saveDepth(char *); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/utils.c Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,319 @@ + +#include "utils.h" + + + +void print_error(char *msg){ + fprintf(stderr, "\nmrCaNaVaR version %s.\nLast update: %s.\n", VERSION, LAST_UPDATE); + fprintf(stderr, "\n%s\n", msg); + fprintf(stderr, "Invoke parameter -h for help.\n"); + exit (0); +} + +void set_runmode(enum MODETYPE NEWMODE){ + if (RUNMODE != NONE && RUNMODE != NEWMODE){ + print_error("Select one run mode only.\n"); + } + RUNMODE = NEWMODE; +} + +void set_gender(enum GENDERTYPE SET){ + if (GENDER != AUTODETECT && RUNMODE != SET){ + print_error("Select only one of --xx or --xy.\n"); + } + GENDER= SET; +} + +void set_str(char **target, char *source){ + + if (*target != NULL) free((*target)); + + (*target) = (char *) malloc(sizeof(char) * (strlen(source)+1)); + + strncpy((*target), source, (strlen(source)+1)); + +} + + +void init_globals(void){ + GENOME_FASTA = NULL; + GENOME_CONF = NULL; + GENOME_GAPS = NULL; + RUNMODE = NONE; + GENDER = AUTODETECT; + MULTGC = 0; + MAX_GC_CORR = 20.0; + MIN_GC_CORR = 0.05; + VERBOSE = 0; + + num_chrom = 0; + + CHECKSAM = 1; + + /* set window size defaults */ + + LW_SIZE = 5000; + SW_SIZE = 1000; + CW_SIZE = 1000; + LW_SLIDE = 1000; + SW_SLIDE = 1000; + + LW_MEAN = 0.0; + SW_MEAN = 0.0; + CW_MEAN = 0.0; + + LW_STD = 0.0; + SW_STD = 0.0; + CW_STD = 0.0; + + CONT_WINDOW = 7; + CUT_WINDOW = 6; + +} + + +FILE *my_fopen(char *fname, char *mode, char GZ){ + FILE *fp; + + + if (!GZ) + fp = fopen(fname, mode); + else + fp = gzopen(fname, mode); + + if (fp == NULL){ + fprintf(stderr, "Unable to open file %s in %s mode.", fname, mode[0]=='w' ? "write" : "read"); + exit (0); + } + + return fp; +} + + +void saveRefConfig(char *configFile){ + + int i; + int wcnt; + char chrom_name_len; + int retVal; + FILE *config; + + config = my_fopen(configFile, "w", 0); + + /* sort the chromosomes pointer array */ + + qsort(chromosomes, num_chrom, sizeof (struct chrom *), compChr); + + + /* start with the magicNum, I will use this as a format check when loading */ + retVal = fwrite(&magicNum, sizeof(magicNum), 1, config); + + /* window sizes / slides */ + + retVal = fwrite(&LW_SIZE, sizeof(LW_SIZE), 1, config); + retVal = fwrite(&SW_SIZE, sizeof(SW_SIZE), 1, config); + retVal = fwrite(&CW_SIZE, sizeof(CW_SIZE), 1, config); + retVal = fwrite(&LW_SLIDE, sizeof(LW_SLIDE), 1, config); + retVal = fwrite(&SW_SLIDE, sizeof(SW_SLIDE), 1, config); + + + /* reference genome numbers */ + + /* number of chromosomes */ + retVal = fwrite(&num_chrom, sizeof(num_chrom), 1, config); + + /* iterate through chromosomes, write their names, and window counts */ + + for (i=0; i<num_chrom; i++){ + chrom_name_len = (char) strlen(chromosomes[i]->name); + retVal = fwrite(&chrom_name_len, sizeof(chrom_name_len), 1, config); + retVal = fwrite(chromosomes[i]->name, chrom_name_len * sizeof(char), 1, config); + + retVal = fwrite(&(chromosomes[i]->length), sizeof(chromosomes[i]->length), 1, config); + retVal = fwrite(&(chromosomes[i]->lw_cnt), sizeof(chromosomes[i]->lw_cnt), 1, config); + retVal = fwrite(&(chromosomes[i]->sw_cnt), sizeof(chromosomes[i]->sw_cnt), 1, config); + retVal = fwrite(&(chromosomes[i]->cw_cnt), sizeof(chromosomes[i]->cw_cnt), 1, config); + + /* long windows */ + for (wcnt=0; wcnt<chromosomes[i]->lw_cnt; wcnt++){ + retVal = fwrite(&(chromosomes[i]->lw[wcnt].start), sizeof(chromosomes[i]->lw[wcnt].start), 1, config); + retVal = fwrite(&(chromosomes[i]->lw[wcnt].end), sizeof(chromosomes[i]->lw[wcnt].end), 1, config); + retVal = fwrite(&(chromosomes[i]->lw[wcnt].gc), sizeof(chromosomes[i]->lw[wcnt].gc), 1, config); + } + + /* short windows */ + for (wcnt=0; wcnt<chromosomes[i]->sw_cnt; wcnt++){ + retVal = fwrite(&(chromosomes[i]->sw[wcnt].start), sizeof(chromosomes[i]->sw[wcnt].start), 1, config); + retVal = fwrite(&(chromosomes[i]->sw[wcnt].end), sizeof(chromosomes[i]->sw[wcnt].end), 1, config); + retVal = fwrite(&(chromosomes[i]->sw[wcnt].gc), sizeof(chromosomes[i]->sw[wcnt].gc), 1, config); + } + + /* copy windows */ + for (wcnt=0; wcnt<chromosomes[i]->cw_cnt; wcnt++){ + retVal = fwrite(&(chromosomes[i]->cw[wcnt].start), sizeof(chromosomes[i]->cw[wcnt].start), 1, config); + retVal = fwrite(&(chromosomes[i]->cw[wcnt].end), sizeof(chromosomes[i]->cw[wcnt].end), 1, config); + retVal = fwrite(&(chromosomes[i]->cw[wcnt].gc), sizeof(chromosomes[i]->cw[wcnt].gc), 1, config); + } + + } + + fclose(config); + +} + + + + +void loadRefConfig(char *configFile){ + + int i; + int wcnt; + char chrom_name_len; + char readString[MAX_STR]; + int retVal; + + int isMagicNum; + + FILE *config; + + config = my_fopen(configFile, "r", 0); + + /* start with the magicNum, use this as a format check when loading */ + retVal = fread(&isMagicNum, sizeof(isMagicNum), 1, config); + + if (isMagicNum != magicNum) + print_error("Reference configuration file seems to be invalid or corrupt.\n"); + + + fprintf(stdout, "Loading reference configuration, hold on ... "); + fflush(stdout); + + /* window sizes / slides */ + + retVal = fread(&LW_SIZE, sizeof(LW_SIZE), 1, config); + retVal = fread(&SW_SIZE, sizeof(SW_SIZE), 1, config); + retVal = fread(&CW_SIZE, sizeof(CW_SIZE), 1, config); + retVal = fread(&LW_SLIDE, sizeof(LW_SLIDE), 1, config); + retVal = fread(&SW_SLIDE, sizeof(SW_SLIDE), 1, config); + + + /* reference genome numbers */ + + /* number of chromosomes */ + retVal = fread(&num_chrom, sizeof(num_chrom), 1, config); + + /* create chromosomes data structure */ + + chromosomes = (struct chrom **) malloc(sizeof(struct chrom *) * num_chrom); + for (i=0; i<num_chrom; i++) + chromosomes[i] = (struct chrom *) malloc(sizeof(struct chrom) * num_chrom); + + /* iterate through chromosomes, write their names, and window counts */ + + for (i=0; i<num_chrom; i++){ + + retVal = fread(&chrom_name_len, sizeof(chrom_name_len), 1, config); + + retVal = fread(readString, chrom_name_len * sizeof(char), 1, config); + readString[chrom_name_len] = 0; + trimspace(readString); + + set_str(&(chromosomes[i]->name), readString); + + retVal = fread(&(chromosomes[i]->length), sizeof(chromosomes[i]->length), 1, config); + retVal = fread(&(chromosomes[i]->lw_cnt), sizeof(chromosomes[i]->lw_cnt), 1, config); + retVal = fread(&(chromosomes[i]->sw_cnt), sizeof(chromosomes[i]->sw_cnt), 1, config); + retVal = fread(&(chromosomes[i]->cw_cnt), sizeof(chromosomes[i]->cw_cnt), 1, config); + + /* create windows structures */ + + + chromosomes[i]->lw = (struct window *) malloc (sizeof(struct window) * chromosomes[i]->lw_cnt); + chromosomes[i]->sw = (struct window *) malloc (sizeof(struct window) * chromosomes[i]->sw_cnt); + chromosomes[i]->cw = (struct window *) malloc (sizeof(struct window) * chromosomes[i]->cw_cnt); + + + + /* long windows */ + for (wcnt=0; wcnt<chromosomes[i]->lw_cnt; wcnt++){ + retVal = fread(&(chromosomes[i]->lw[wcnt].start), sizeof(chromosomes[i]->lw[wcnt].start), 1, config); + retVal = fread(&(chromosomes[i]->lw[wcnt].end), sizeof(chromosomes[i]->lw[wcnt].end), 1, config); + retVal = fread(&(chromosomes[i]->lw[wcnt].gc), sizeof(chromosomes[i]->lw[wcnt].gc), 1, config); + // this is the default, auto-control-picker will fix this + chromosomes[i]->lw[wcnt].isControl = 1; + chromosomes[i]->lw[wcnt].depth = 0.0; + } + + /* short windows */ + for (wcnt=0; wcnt<chromosomes[i]->sw_cnt; wcnt++){ + retVal = fread(&(chromosomes[i]->sw[wcnt].start), sizeof(chromosomes[i]->sw[wcnt].start), 1, config); + retVal = fread(&(chromosomes[i]->sw[wcnt].end), sizeof(chromosomes[i]->sw[wcnt].end), 1, config); + retVal = fread(&(chromosomes[i]->sw[wcnt].gc), sizeof(chromosomes[i]->sw[wcnt].gc), 1, config); + chromosomes[i]->sw[wcnt].isControl = 1; + chromosomes[i]->sw[wcnt].depth = 0.0; + } + + /* copy windows */ + for (wcnt=0; wcnt<chromosomes[i]->cw_cnt; wcnt++){ + retVal = fread(&(chromosomes[i]->cw[wcnt].start), sizeof(chromosomes[i]->cw[wcnt].start), 1, config); + retVal = fread(&(chromosomes[i]->cw[wcnt].end), sizeof(chromosomes[i]->cw[wcnt].end), 1, config); + retVal = fread(&(chromosomes[i]->cw[wcnt].gc), sizeof(chromosomes[i]->cw[wcnt].gc), 1, config); + chromosomes[i]->cw[wcnt].isControl = 1; + chromosomes[i]->cw[wcnt].depth = 0.0; + } + + /* fill in the rest of the chromosomes structure */ + + } + + fprintf(stdout, "[OK]. %d chromosomes loaded.\n", num_chrom); + + fclose(config); +} + + + +static int compChr(const void *p1, const void *p2){ + /* compare function to sort the chromosome pointer array */ + struct chrom *a, *b; + + a = *((struct chrom **)p1); + b = *((struct chrom **)p2); + + + return (strcmp ( a->name, b->name) ); + +} + + +int endswith(char *src, char *end){ + + + int endlen = strlen(end); + int srclen = strlen(src); + int copyindex; + + + if (endlen > srclen) + return 0; + + copyindex = srclen - endlen; + + if (memcmp(src+copyindex, end, endlen)) + return 0; + else + return 1; + + +} + +void trimspace(char *str){ + int len; + len = strlen(str) - 1; + while(1){ + if (isspace(str[len])) + str[len--]=0; + else + break; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/utils.h Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,33 @@ +#ifndef __UTILS +#define __UTILS + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <zlib.h> + +#include "globals.h" + + + +void print_error(char *); + +void set_runmode(enum MODETYPE); +void set_gender(enum GENDERTYPE SET); + +void set_str(char **, char *); + +void init_globals(void); + +FILE *my_fopen(char *, char *, char); + +void saveRefConfig(char *); +void loadRefConfig(char *); + +int endswith(char *, char *); +void trimspace(char *); + +static int compChr(const void *, const void *); + +#endif