view mrcanavar-0.34/sam.c @ 0:86522a0b5f59 default tip

Uploaded source code for mrCaNaVaR
author calkan
date Tue, 21 Feb 2012 10:44:13 -0500
parents
children
line wrap: on
line source

#include "sam.h"

void read_depth(char *indirSAM, char *depthFile, char gzSAM){

  
  DIR *dp;
  int fileCnt;
  int totalFile;

  struct dirent *ep;

  int i;

  if (GENOME_CONF == NULL)
    print_error("Select genom configuration file (input) through the -conf parameter.\n");
  if (indirSAM == NULL)
    print_error("Select input directory that contains the SAM files through the -samdir parameter.\n");
  if (depthFile == NULL)
    print_error("Select read depth output file through the -depth parameter.\n");


  loadRefConfig(GENOME_CONF);

  fprintf(stdout, "Scanning the SAM directory: %s\n", indirSAM);

  dp = opendir(indirSAM);

  if (dp == NULL)
    print_error("SAM directory not found.\n");

  fileCnt = 0;
  totalFile = 0;

  while((ep=readdir(dp))){
    if (ep->d_name[0] == '.')
      continue;
    if (ep->d_type == DT_DIR)
      continue;

    if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
      continue;
    
    /*
    if (!strstr(ep->d_name, "sam"))
      continue;
    */

    i = strlen(ep->d_name)-1;

    if ((ep->d_name[i] == 'z' && ep->d_name[i-1] == 'g' && ep->d_name[i-2] == '.') && gzSAM == 0){
      print_error("File name ends with .gz yet --gz option is not selected. Are you sure?\nIf the files are indeed uncompressed, rename the files.\n");
    }
    totalFile++;
  }
  
  rewinddir(dp);

  while((ep=readdir(dp))){
    if (ep->d_name[0] == '.')
      continue;
    if (ep->d_type == DT_DIR)
      continue;

    if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
      continue;

    
    /*
    if (!strstr(ep->d_name, "sam"))
      continue;
    */
    
    fprintf(stdout, "\r                                                      \rLoading file %d of %d: %s...", (fileCnt+1), totalFile, ep->d_name);
    fflush(stdout);

    readSAM(indirSAM, ep->d_name, gzSAM);

    fileCnt++;
  }
  
  closedir(dp);

  if (fileCnt == 0)
    print_error("SAM directory does not contain any files with extensions \".sam\" or \".sam.gz\".\nIf you do have the correct files, please rename them to have either \".sam\" or \".sam.gz\" extensions.\n");
  else
    fprintf(stdout, "\n\n%d file%s loaded.\n", fileCnt, fileCnt>1 ? "s" : "");

  saveDepth(depthFile);

}


void readSAM(char *indirSAM, char *fname, char gzSAM){
  FILE *sam;

  char *samfile;
  
  char samLine[4 * MAX_STR];
  char prevChrom[MAX_STR];
  char chrom[MAX_STR];
  char *token;
  

  int pos;
  int chrom_id;
  int prev_chrom_id = -1;
  int lineCnt;

  samfile = (char *) malloc (sizeof(char) * (strlen(indirSAM) + strlen(fname) + 2));
  
  sprintf(samfile, "%s/%s", indirSAM, fname);

  sam = my_fopen(samfile, "r", gzSAM);

  prevChrom[0] = 0;

  while (1){
    
    /* read entire line from the SAM file */
    if (gzSAM){
      gzgets(sam, samLine, (4 * MAX_STR));
      if (gzeof(sam))
	break;
    }
    else{
      fgets(samLine, (4 * MAX_STR), sam);
      if (feof(sam))
	break;
    }
    
    /* parse chrom */
    token = NULL;

    token = strtok(samLine, "\t"); // read name
    token = strtok(NULL,    "\t"); // flag
    token = strtok(NULL,    "\t"); //chrom
    
    strcpy(chrom, token);
    trimspace(chrom);

    token = strtok(NULL,    "\t"); // pos

    pos = atoi(token) - 1;  // SAM file is 1-based; our format is 0-based

    if (pos < 0)
      continue;

   /* 
       debug if needed 
       fprintf(stdout, "%s\t%d\n", chrom, pos);
    */



    chrom_id = insert_read_lw(chrom, pos, prevChrom, prev_chrom_id);
  
    /*
    if (VERBOSE)
      fprintf(stdout, "[READSAM]\t%s\t%d\tchrom_id: %d\n", chrom, pos, chrom_id);
    */

    if (chrom_id == -1) // this chromosome is not in the config file
      continue;
    
    chrom_id = insert_read_sw(chrom, pos, prevChrom, chrom_id);

    if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
      continue;

    chrom_id = insert_read_cw(chrom, pos, prevChrom, chrom_id);
    


    if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
      continue;

    strcpy(prevChrom, chrom);
    prev_chrom_id = chrom_id;


  }
    
  if (gzSAM)
    gzclose(sam);
  else
    fclose(sam);
      
  free(samfile);
}





int insert_read_lw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
  int chrom_id;
  int window_id;

  int flank;
  
  chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);

  if (chrom_id != -1){

    if (pos >= chromosomes[chrom_id]->length){
      return chrom_id;
    }

    window_id = windowSearch(chromosomes[chrom_id]->lw, chromosomes[chrom_id]->lw_cnt, pos);

    
    if (window_id != -1){

      chromosomes[chrom_id]->lw[window_id].depth += 1;
      
      /* iterate left */
      flank = window_id - 1;

      while (flank >= 0 && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
	chromosomes[chrom_id]->lw[flank--].depth += 1;


     /* iterate right */
      flank = window_id + 1;

      while (flank < chromosomes[chrom_id]->lw_cnt && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
	chromosomes[chrom_id]->lw[flank++].depth += 1;
      

    }
    
  }
  
  return chrom_id;
}



int insert_read_sw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
  int chrom_id;
  int window_id;

  int flank;
  
  chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
  
  if (chrom_id != -1){

    if (pos >= chromosomes[chrom_id]->length)
      return chrom_id;

    window_id = windowSearch(chromosomes[chrom_id]->sw, chromosomes[chrom_id]->sw_cnt, pos);
    
    if (window_id != -1){

      chromosomes[chrom_id]->sw[window_id].depth += 1;
      
      /* iterate left */
      flank = window_id - 1;

      while (flank >= 0 && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
	chromosomes[chrom_id]->sw[flank--].depth += 1;
      

     /* iterate right */
      flank = window_id + 1;

      while (flank < chromosomes[chrom_id]->sw_cnt && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
	chromosomes[chrom_id]->sw[flank++].depth += 1;
      

    }
    
  }
  
  return chrom_id;
}


int insert_read_cw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
  int chrom_id;
  int window_id;

  int flank;
  
  chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
  
  if (chrom_id != -1){

    if (pos >= chromosomes[chrom_id]->length)
      return chrom_id;

    window_id = windowSearch(chromosomes[chrom_id]->cw, chromosomes[chrom_id]->cw_cnt, pos);
    
    if (window_id != -1){

      chromosomes[chrom_id]->cw[window_id].depth += 1;
      
    }
    
  } 
  return chrom_id;
}


int findChrom(char *chrom, char *prevChrom, int prev_chrom_id){
  int chrom_id;


  if (!strcmp(chrom, prevChrom)){
    chrom_id = prev_chrom_id;
  }
  
  else if (prev_chrom_id == -1 && !strcmp(chrom, chromosomes[0]->name)){ // the first entry in the file
    chrom_id = 0;
  }
  else if (prev_chrom_id != -1 && prev_chrom_id < num_chrom -1 && !strcmp(chrom, chromosomes[prev_chrom_id+1]->name)){
    chrom_id = prev_chrom_id + 1;
  }
  else{
    chrom_id = binSearch(chrom);
  }
   
  return chrom_id;

}

int binSearch(char *chrom){
  int start;
  int end;
  int med;
  int strtest;

  start = 0;

  end = num_chrom - 1;

  med = (start + end) / 2;
  
  
  while (1){

    if (start > end)
      return -1;
    
    if (start == med || end == med){
      if (!strcmp(chrom, chromosomes[start]->name))
	return start;
      else if (!strcmp(chrom, chromosomes[end]->name))
	return end;
      else{
	return -1;
      }
    }

    strtest = strcmp(chrom, chromosomes[med]->name);

    if(strtest == 0)
      return med;
    
    else if (strtest < 0){
      end = med;
      med = (start + end) / 2;
    }
    
    else {
      start = med;
      med = (start + end) / 2;
    }
    
  }
}



int windowSearch(struct window *searchWin, int winCnt, int pos){
  int start;
  int end;
  int med;

  start = 0;
  end = winCnt - 1;

  med = (start + end) / 2;
  
  while (1){
    
    if (start > end)
      return -1;

    if (start == med || end == med){

      if (pos >= searchWin[start].start && pos < searchWin[start].end)
	return start;

      else if (pos >= searchWin[end].start && pos < searchWin[end].end)
	return end;

      else return -1;

    }

    if (pos >= searchWin[med].start && pos < searchWin[med].end)
      return med;
  
    else if (pos < searchWin[med].start){
      end = med;
      med = (start + end) / 2;
    }
    
    else {
      start = med;
      med = (start + end) / 2;
    }    
  }
}



void saveDepth(char *depthFile){
  int i;
  int j;
  int retVal;

  char *fname;
  FILE *txtDepth;
  FILE *binDepth;

  fname = (char *) malloc (sizeof(char) * (strlen(depthFile) + 10));

  sprintf(fname, "%s.lw.txt", depthFile);

  txtDepth = my_fopen(fname, "w", 0);
  binDepth = my_fopen(depthFile, "w", 0);

  /* start with the magicNum, I will use this as a format check when loading */
  retVal = fwrite(&magicNum, sizeof(magicNum), 1, binDepth);

  fprintf(stdout, "Saving depth file %s\n", fname);

  for (i = 0; i < num_chrom; i++){    
    for (j = 0; j < chromosomes[i]->lw_cnt; j++){
      fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, (int) (chromosomes[i]->lw[j].depth));
      retVal = fwrite(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth);
    }
  }
   
  fclose(txtDepth);

  sprintf(fname, "%s.sw.txt", depthFile);
  txtDepth = my_fopen(fname, "w", 0);
  fprintf(stdout, "Saving depth file %s\n", fname);

  for (i = 0; i < num_chrom; i++){    
  
    for (j = 0; j < chromosomes[i]->sw_cnt; j++){
      fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end,  chromosomes[i]->sw[j].gc, (int) (chromosomes[i]->sw[j].depth));
      retVal = fwrite(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
    }
  }

  fclose(txtDepth);

  sprintf(fname, "%s.cw.txt", depthFile);
  txtDepth = my_fopen(fname, "w", 0);
  fprintf(stdout, "Saving depth file %s\n", fname);
  
  for (i = 0; i < num_chrom; i++){    
    for (j = 0; j < chromosomes[i]->cw_cnt; j++){
      fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end,  chromosomes[i]->cw[j].gc, (int) (chromosomes[i]->cw[j].depth));
      retVal = fwrite(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
    }
  }

  fclose(txtDepth);
  fclose(binDepth);

  free(fname);
}