diff mrcanavar-0.34/sam.c @ 0:86522a0b5f59 default tip

Uploaded source code for mrCaNaVaR
author calkan
date Tue, 21 Feb 2012 10:44:13 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mrcanavar-0.34/sam.c	Tue Feb 21 10:44:13 2012 -0500
@@ -0,0 +1,479 @@
+#include "sam.h"
+
+void read_depth(char *indirSAM, char *depthFile, char gzSAM){
+
+  
+  DIR *dp;
+  int fileCnt;
+  int totalFile;
+
+  struct dirent *ep;
+
+  int i;
+
+  if (GENOME_CONF == NULL)
+    print_error("Select genom configuration file (input) through the -conf parameter.\n");
+  if (indirSAM == NULL)
+    print_error("Select input directory that contains the SAM files through the -samdir parameter.\n");
+  if (depthFile == NULL)
+    print_error("Select read depth output file through the -depth parameter.\n");
+
+
+  loadRefConfig(GENOME_CONF);
+
+  fprintf(stdout, "Scanning the SAM directory: %s\n", indirSAM);
+
+  dp = opendir(indirSAM);
+
+  if (dp == NULL)
+    print_error("SAM directory not found.\n");
+
+  fileCnt = 0;
+  totalFile = 0;
+
+  while((ep=readdir(dp))){
+    if (ep->d_name[0] == '.')
+      continue;
+    if (ep->d_type == DT_DIR)
+      continue;
+
+    if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
+      continue;
+    
+    /*
+    if (!strstr(ep->d_name, "sam"))
+      continue;
+    */
+
+    i = strlen(ep->d_name)-1;
+
+    if ((ep->d_name[i] == 'z' && ep->d_name[i-1] == 'g' && ep->d_name[i-2] == '.') && gzSAM == 0){
+      print_error("File name ends with .gz yet --gz option is not selected. Are you sure?\nIf the files are indeed uncompressed, rename the files.\n");
+    }
+    totalFile++;
+  }
+  
+  rewinddir(dp);
+
+  while((ep=readdir(dp))){
+    if (ep->d_name[0] == '.')
+      continue;
+    if (ep->d_type == DT_DIR)
+      continue;
+
+    if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
+      continue;
+
+    
+    /*
+    if (!strstr(ep->d_name, "sam"))
+      continue;
+    */
+    
+    fprintf(stdout, "\r                                                      \rLoading file %d of %d: %s...", (fileCnt+1), totalFile, ep->d_name);
+    fflush(stdout);
+
+    readSAM(indirSAM, ep->d_name, gzSAM);
+
+    fileCnt++;
+  }
+  
+  closedir(dp);
+
+  if (fileCnt == 0)
+    print_error("SAM directory does not contain any files with extensions \".sam\" or \".sam.gz\".\nIf you do have the correct files, please rename them to have either \".sam\" or \".sam.gz\" extensions.\n");
+  else
+    fprintf(stdout, "\n\n%d file%s loaded.\n", fileCnt, fileCnt>1 ? "s" : "");
+
+  saveDepth(depthFile);
+
+}
+
+
+void readSAM(char *indirSAM, char *fname, char gzSAM){
+  FILE *sam;
+
+  char *samfile;
+  
+  char samLine[4 * MAX_STR];
+  char prevChrom[MAX_STR];
+  char chrom[MAX_STR];
+  char *token;
+  
+
+  int pos;
+  int chrom_id;
+  int prev_chrom_id = -1;
+  int lineCnt;
+
+  samfile = (char *) malloc (sizeof(char) * (strlen(indirSAM) + strlen(fname) + 2));
+  
+  sprintf(samfile, "%s/%s", indirSAM, fname);
+
+  sam = my_fopen(samfile, "r", gzSAM);
+
+  prevChrom[0] = 0;
+
+  while (1){
+    
+    /* read entire line from the SAM file */
+    if (gzSAM){
+      gzgets(sam, samLine, (4 * MAX_STR));
+      if (gzeof(sam))
+	break;
+    }
+    else{
+      fgets(samLine, (4 * MAX_STR), sam);
+      if (feof(sam))
+	break;
+    }
+    
+    /* parse chrom */
+    token = NULL;
+
+    token = strtok(samLine, "\t"); // read name
+    token = strtok(NULL,    "\t"); // flag
+    token = strtok(NULL,    "\t"); //chrom
+    
+    strcpy(chrom, token);
+    trimspace(chrom);
+
+    token = strtok(NULL,    "\t"); // pos
+
+    pos = atoi(token) - 1;  // SAM file is 1-based; our format is 0-based
+
+    if (pos < 0)
+      continue;
+
+   /* 
+       debug if needed 
+       fprintf(stdout, "%s\t%d\n", chrom, pos);
+    */
+
+
+
+    chrom_id = insert_read_lw(chrom, pos, prevChrom, prev_chrom_id);
+  
+    /*
+    if (VERBOSE)
+      fprintf(stdout, "[READSAM]\t%s\t%d\tchrom_id: %d\n", chrom, pos, chrom_id);
+    */
+
+    if (chrom_id == -1) // this chromosome is not in the config file
+      continue;
+    
+    chrom_id = insert_read_sw(chrom, pos, prevChrom, chrom_id);
+
+    if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
+      continue;
+
+    chrom_id = insert_read_cw(chrom, pos, prevChrom, chrom_id);
+    
+
+
+    if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
+      continue;
+
+    strcpy(prevChrom, chrom);
+    prev_chrom_id = chrom_id;
+
+
+  }
+    
+  if (gzSAM)
+    gzclose(sam);
+  else
+    fclose(sam);
+      
+  free(samfile);
+}
+
+
+
+
+
+int insert_read_lw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
+  int chrom_id;
+  int window_id;
+
+  int flank;
+  
+  chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
+
+  if (chrom_id != -1){
+
+    if (pos >= chromosomes[chrom_id]->length){
+      return chrom_id;
+    }
+
+    window_id = windowSearch(chromosomes[chrom_id]->lw, chromosomes[chrom_id]->lw_cnt, pos);
+
+    
+    if (window_id != -1){
+
+      chromosomes[chrom_id]->lw[window_id].depth += 1;
+      
+      /* iterate left */
+      flank = window_id - 1;
+
+      while (flank >= 0 && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
+	chromosomes[chrom_id]->lw[flank--].depth += 1;
+
+
+     /* iterate right */
+      flank = window_id + 1;
+
+      while (flank < chromosomes[chrom_id]->lw_cnt && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
+	chromosomes[chrom_id]->lw[flank++].depth += 1;
+      
+
+    }
+    
+  }
+  
+  return chrom_id;
+}
+
+
+
+int insert_read_sw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
+  int chrom_id;
+  int window_id;
+
+  int flank;
+  
+  chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
+  
+  if (chrom_id != -1){
+
+    if (pos >= chromosomes[chrom_id]->length)
+      return chrom_id;
+
+    window_id = windowSearch(chromosomes[chrom_id]->sw, chromosomes[chrom_id]->sw_cnt, pos);
+    
+    if (window_id != -1){
+
+      chromosomes[chrom_id]->sw[window_id].depth += 1;
+      
+      /* iterate left */
+      flank = window_id - 1;
+
+      while (flank >= 0 && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
+	chromosomes[chrom_id]->sw[flank--].depth += 1;
+      
+
+     /* iterate right */
+      flank = window_id + 1;
+
+      while (flank < chromosomes[chrom_id]->sw_cnt && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
+	chromosomes[chrom_id]->sw[flank++].depth += 1;
+      
+
+    }
+    
+  }
+  
+  return chrom_id;
+}
+
+
+int insert_read_cw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
+  int chrom_id;
+  int window_id;
+
+  int flank;
+  
+  chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
+  
+  if (chrom_id != -1){
+
+    if (pos >= chromosomes[chrom_id]->length)
+      return chrom_id;
+
+    window_id = windowSearch(chromosomes[chrom_id]->cw, chromosomes[chrom_id]->cw_cnt, pos);
+    
+    if (window_id != -1){
+
+      chromosomes[chrom_id]->cw[window_id].depth += 1;
+      
+    }
+    
+  } 
+  return chrom_id;
+}
+
+
+int findChrom(char *chrom, char *prevChrom, int prev_chrom_id){
+  int chrom_id;
+
+
+  if (!strcmp(chrom, prevChrom)){
+    chrom_id = prev_chrom_id;
+  }
+  
+  else if (prev_chrom_id == -1 && !strcmp(chrom, chromosomes[0]->name)){ // the first entry in the file
+    chrom_id = 0;
+  }
+  else if (prev_chrom_id != -1 && prev_chrom_id < num_chrom -1 && !strcmp(chrom, chromosomes[prev_chrom_id+1]->name)){
+    chrom_id = prev_chrom_id + 1;
+  }
+  else{
+    chrom_id = binSearch(chrom);
+  }
+   
+  return chrom_id;
+
+}
+
+int binSearch(char *chrom){
+  int start;
+  int end;
+  int med;
+  int strtest;
+
+  start = 0;
+
+  end = num_chrom - 1;
+
+  med = (start + end) / 2;
+  
+  
+  while (1){
+
+    if (start > end)
+      return -1;
+    
+    if (start == med || end == med){
+      if (!strcmp(chrom, chromosomes[start]->name))
+	return start;
+      else if (!strcmp(chrom, chromosomes[end]->name))
+	return end;
+      else{
+	return -1;
+      }
+    }
+
+    strtest = strcmp(chrom, chromosomes[med]->name);
+
+    if(strtest == 0)
+      return med;
+    
+    else if (strtest < 0){
+      end = med;
+      med = (start + end) / 2;
+    }
+    
+    else {
+      start = med;
+      med = (start + end) / 2;
+    }
+    
+  }
+}
+
+
+
+int windowSearch(struct window *searchWin, int winCnt, int pos){
+  int start;
+  int end;
+  int med;
+
+  start = 0;
+  end = winCnt - 1;
+
+  med = (start + end) / 2;
+  
+  while (1){
+    
+    if (start > end)
+      return -1;
+
+    if (start == med || end == med){
+
+      if (pos >= searchWin[start].start && pos < searchWin[start].end)
+	return start;
+
+      else if (pos >= searchWin[end].start && pos < searchWin[end].end)
+	return end;
+
+      else return -1;
+
+    }
+
+    if (pos >= searchWin[med].start && pos < searchWin[med].end)
+      return med;
+  
+    else if (pos < searchWin[med].start){
+      end = med;
+      med = (start + end) / 2;
+    }
+    
+    else {
+      start = med;
+      med = (start + end) / 2;
+    }    
+  }
+}
+
+
+
+void saveDepth(char *depthFile){
+  int i;
+  int j;
+  int retVal;
+
+  char *fname;
+  FILE *txtDepth;
+  FILE *binDepth;
+
+  fname = (char *) malloc (sizeof(char) * (strlen(depthFile) + 10));
+
+  sprintf(fname, "%s.lw.txt", depthFile);
+
+  txtDepth = my_fopen(fname, "w", 0);
+  binDepth = my_fopen(depthFile, "w", 0);
+
+  /* start with the magicNum, I will use this as a format check when loading */
+  retVal = fwrite(&magicNum, sizeof(magicNum), 1, binDepth);
+
+  fprintf(stdout, "Saving depth file %s\n", fname);
+
+  for (i = 0; i < num_chrom; i++){    
+    for (j = 0; j < chromosomes[i]->lw_cnt; j++){
+      fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, (int) (chromosomes[i]->lw[j].depth));
+      retVal = fwrite(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth);
+    }
+  }
+   
+  fclose(txtDepth);
+
+  sprintf(fname, "%s.sw.txt", depthFile);
+  txtDepth = my_fopen(fname, "w", 0);
+  fprintf(stdout, "Saving depth file %s\n", fname);
+
+  for (i = 0; i < num_chrom; i++){    
+  
+    for (j = 0; j < chromosomes[i]->sw_cnt; j++){
+      fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end,  chromosomes[i]->sw[j].gc, (int) (chromosomes[i]->sw[j].depth));
+      retVal = fwrite(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
+    }
+  }
+
+  fclose(txtDepth);
+
+  sprintf(fname, "%s.cw.txt", depthFile);
+  txtDepth = my_fopen(fname, "w", 0);
+  fprintf(stdout, "Saving depth file %s\n", fname);
+  
+  for (i = 0; i < num_chrom; i++){    
+    for (j = 0; j < chromosomes[i]->cw_cnt; j++){
+      fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end,  chromosomes[i]->cw[j].gc, (int) (chromosomes[i]->cw[j].depth));
+      retVal = fwrite(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
+    }
+  }
+
+  fclose(txtDepth);
+  fclose(binDepth);
+
+  free(fname);
+}