Mercurial > repos > calkan > mrcanavar
diff mrcanavar-0.34/sam.c @ 0:86522a0b5f59 default tip
Uploaded source code for mrCaNaVaR
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:44:13 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrcanavar-0.34/sam.c Tue Feb 21 10:44:13 2012 -0500 @@ -0,0 +1,479 @@ +#include "sam.h" + +void read_depth(char *indirSAM, char *depthFile, char gzSAM){ + + + DIR *dp; + int fileCnt; + int totalFile; + + struct dirent *ep; + + int i; + + if (GENOME_CONF == NULL) + print_error("Select genom configuration file (input) through the -conf parameter.\n"); + if (indirSAM == NULL) + print_error("Select input directory that contains the SAM files through the -samdir parameter.\n"); + if (depthFile == NULL) + print_error("Select read depth output file through the -depth parameter.\n"); + + + loadRefConfig(GENOME_CONF); + + fprintf(stdout, "Scanning the SAM directory: %s\n", indirSAM); + + dp = opendir(indirSAM); + + if (dp == NULL) + print_error("SAM directory not found.\n"); + + fileCnt = 0; + totalFile = 0; + + while((ep=readdir(dp))){ + if (ep->d_name[0] == '.') + continue; + if (ep->d_type == DT_DIR) + continue; + + if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz")) + continue; + + /* + if (!strstr(ep->d_name, "sam")) + continue; + */ + + i = strlen(ep->d_name)-1; + + if ((ep->d_name[i] == 'z' && ep->d_name[i-1] == 'g' && ep->d_name[i-2] == '.') && gzSAM == 0){ + print_error("File name ends with .gz yet --gz option is not selected. Are you sure?\nIf the files are indeed uncompressed, rename the files.\n"); + } + totalFile++; + } + + rewinddir(dp); + + while((ep=readdir(dp))){ + if (ep->d_name[0] == '.') + continue; + if (ep->d_type == DT_DIR) + continue; + + if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz")) + continue; + + + /* + if (!strstr(ep->d_name, "sam")) + continue; + */ + + fprintf(stdout, "\r \rLoading file %d of %d: %s...", (fileCnt+1), totalFile, ep->d_name); + fflush(stdout); + + readSAM(indirSAM, ep->d_name, gzSAM); + + fileCnt++; + } + + closedir(dp); + + if (fileCnt == 0) + print_error("SAM directory does not contain any files with extensions \".sam\" or \".sam.gz\".\nIf you do have the correct files, please rename them to have either \".sam\" or \".sam.gz\" extensions.\n"); + else + fprintf(stdout, "\n\n%d file%s loaded.\n", fileCnt, fileCnt>1 ? "s" : ""); + + saveDepth(depthFile); + +} + + +void readSAM(char *indirSAM, char *fname, char gzSAM){ + FILE *sam; + + char *samfile; + + char samLine[4 * MAX_STR]; + char prevChrom[MAX_STR]; + char chrom[MAX_STR]; + char *token; + + + int pos; + int chrom_id; + int prev_chrom_id = -1; + int lineCnt; + + samfile = (char *) malloc (sizeof(char) * (strlen(indirSAM) + strlen(fname) + 2)); + + sprintf(samfile, "%s/%s", indirSAM, fname); + + sam = my_fopen(samfile, "r", gzSAM); + + prevChrom[0] = 0; + + while (1){ + + /* read entire line from the SAM file */ + if (gzSAM){ + gzgets(sam, samLine, (4 * MAX_STR)); + if (gzeof(sam)) + break; + } + else{ + fgets(samLine, (4 * MAX_STR), sam); + if (feof(sam)) + break; + } + + /* parse chrom */ + token = NULL; + + token = strtok(samLine, "\t"); // read name + token = strtok(NULL, "\t"); // flag + token = strtok(NULL, "\t"); //chrom + + strcpy(chrom, token); + trimspace(chrom); + + token = strtok(NULL, "\t"); // pos + + pos = atoi(token) - 1; // SAM file is 1-based; our format is 0-based + + if (pos < 0) + continue; + + /* + debug if needed + fprintf(stdout, "%s\t%d\n", chrom, pos); + */ + + + + chrom_id = insert_read_lw(chrom, pos, prevChrom, prev_chrom_id); + + /* + if (VERBOSE) + fprintf(stdout, "[READSAM]\t%s\t%d\tchrom_id: %d\n", chrom, pos, chrom_id); + */ + + if (chrom_id == -1) // this chromosome is not in the config file + continue; + + chrom_id = insert_read_sw(chrom, pos, prevChrom, chrom_id); + + if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this + continue; + + chrom_id = insert_read_cw(chrom, pos, prevChrom, chrom_id); + + + + if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this + continue; + + strcpy(prevChrom, chrom); + prev_chrom_id = chrom_id; + + + } + + if (gzSAM) + gzclose(sam); + else + fclose(sam); + + free(samfile); +} + + + + + +int insert_read_lw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ + int chrom_id; + int window_id; + + int flank; + + chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); + + if (chrom_id != -1){ + + if (pos >= chromosomes[chrom_id]->length){ + return chrom_id; + } + + window_id = windowSearch(chromosomes[chrom_id]->lw, chromosomes[chrom_id]->lw_cnt, pos); + + + if (window_id != -1){ + + chromosomes[chrom_id]->lw[window_id].depth += 1; + + /* iterate left */ + flank = window_id - 1; + + while (flank >= 0 && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end)) + chromosomes[chrom_id]->lw[flank--].depth += 1; + + + /* iterate right */ + flank = window_id + 1; + + while (flank < chromosomes[chrom_id]->lw_cnt && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end)) + chromosomes[chrom_id]->lw[flank++].depth += 1; + + + } + + } + + return chrom_id; +} + + + +int insert_read_sw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ + int chrom_id; + int window_id; + + int flank; + + chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); + + if (chrom_id != -1){ + + if (pos >= chromosomes[chrom_id]->length) + return chrom_id; + + window_id = windowSearch(chromosomes[chrom_id]->sw, chromosomes[chrom_id]->sw_cnt, pos); + + if (window_id != -1){ + + chromosomes[chrom_id]->sw[window_id].depth += 1; + + /* iterate left */ + flank = window_id - 1; + + while (flank >= 0 && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end)) + chromosomes[chrom_id]->sw[flank--].depth += 1; + + + /* iterate right */ + flank = window_id + 1; + + while (flank < chromosomes[chrom_id]->sw_cnt && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end)) + chromosomes[chrom_id]->sw[flank++].depth += 1; + + + } + + } + + return chrom_id; +} + + +int insert_read_cw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ + int chrom_id; + int window_id; + + int flank; + + chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); + + if (chrom_id != -1){ + + if (pos >= chromosomes[chrom_id]->length) + return chrom_id; + + window_id = windowSearch(chromosomes[chrom_id]->cw, chromosomes[chrom_id]->cw_cnt, pos); + + if (window_id != -1){ + + chromosomes[chrom_id]->cw[window_id].depth += 1; + + } + + } + return chrom_id; +} + + +int findChrom(char *chrom, char *prevChrom, int prev_chrom_id){ + int chrom_id; + + + if (!strcmp(chrom, prevChrom)){ + chrom_id = prev_chrom_id; + } + + else if (prev_chrom_id == -1 && !strcmp(chrom, chromosomes[0]->name)){ // the first entry in the file + chrom_id = 0; + } + else if (prev_chrom_id != -1 && prev_chrom_id < num_chrom -1 && !strcmp(chrom, chromosomes[prev_chrom_id+1]->name)){ + chrom_id = prev_chrom_id + 1; + } + else{ + chrom_id = binSearch(chrom); + } + + return chrom_id; + +} + +int binSearch(char *chrom){ + int start; + int end; + int med; + int strtest; + + start = 0; + + end = num_chrom - 1; + + med = (start + end) / 2; + + + while (1){ + + if (start > end) + return -1; + + if (start == med || end == med){ + if (!strcmp(chrom, chromosomes[start]->name)) + return start; + else if (!strcmp(chrom, chromosomes[end]->name)) + return end; + else{ + return -1; + } + } + + strtest = strcmp(chrom, chromosomes[med]->name); + + if(strtest == 0) + return med; + + else if (strtest < 0){ + end = med; + med = (start + end) / 2; + } + + else { + start = med; + med = (start + end) / 2; + } + + } +} + + + +int windowSearch(struct window *searchWin, int winCnt, int pos){ + int start; + int end; + int med; + + start = 0; + end = winCnt - 1; + + med = (start + end) / 2; + + while (1){ + + if (start > end) + return -1; + + if (start == med || end == med){ + + if (pos >= searchWin[start].start && pos < searchWin[start].end) + return start; + + else if (pos >= searchWin[end].start && pos < searchWin[end].end) + return end; + + else return -1; + + } + + if (pos >= searchWin[med].start && pos < searchWin[med].end) + return med; + + else if (pos < searchWin[med].start){ + end = med; + med = (start + end) / 2; + } + + else { + start = med; + med = (start + end) / 2; + } + } +} + + + +void saveDepth(char *depthFile){ + int i; + int j; + int retVal; + + char *fname; + FILE *txtDepth; + FILE *binDepth; + + fname = (char *) malloc (sizeof(char) * (strlen(depthFile) + 10)); + + sprintf(fname, "%s.lw.txt", depthFile); + + txtDepth = my_fopen(fname, "w", 0); + binDepth = my_fopen(depthFile, "w", 0); + + /* start with the magicNum, I will use this as a format check when loading */ + retVal = fwrite(&magicNum, sizeof(magicNum), 1, binDepth); + + fprintf(stdout, "Saving depth file %s\n", fname); + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->lw_cnt; j++){ + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, (int) (chromosomes[i]->lw[j].depth)); + retVal = fwrite(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth); + } + } + + fclose(txtDepth); + + sprintf(fname, "%s.sw.txt", depthFile); + txtDepth = my_fopen(fname, "w", 0); + fprintf(stdout, "Saving depth file %s\n", fname); + + for (i = 0; i < num_chrom; i++){ + + for (j = 0; j < chromosomes[i]->sw_cnt; j++){ + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end, chromosomes[i]->sw[j].gc, (int) (chromosomes[i]->sw[j].depth)); + retVal = fwrite(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth); + } + } + + fclose(txtDepth); + + sprintf(fname, "%s.cw.txt", depthFile); + txtDepth = my_fopen(fname, "w", 0); + fprintf(stdout, "Saving depth file %s\n", fname); + + for (i = 0; i < num_chrom; i++){ + for (j = 0; j < chromosomes[i]->cw_cnt; j++){ + fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, (int) (chromosomes[i]->cw[j].depth)); + retVal = fwrite(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth); + } + } + + fclose(txtDepth); + fclose(binDepth); + + free(fname); +}