view mrcanavar-0.34/prep.c @ 0:86522a0b5f59 default tip

Uploaded source code for mrCaNaVaR
author calkan
date Tue, 21 Feb 2012 10:44:13 -0500
parents
children
line wrap: on
line source

#include "prep.h"

void prep_genome(void){
  FILE *fasta;
  FILE *gaps;
  FILE *config;
  
  int num_gaps;
  
  char chrom[MAX_STR];
  int start, end;
  int max_len;
  
  int i;

  if (GENOME_FASTA == NULL)
    print_error("Select genome fasta file (input) through -fasta parameter.\n");
  if (GENOME_GAPS == NULL)
    print_error("Select genome gaps file (input) through -gaps parameter.\n"); 
  if (GENOME_CONF == NULL)
    print_error("Select genome fasta file (output) through -conf parameter.\n");


  if (LW_SIZE <= 0)
    print_error ("Large window size (-lw_size) must be a poitive integer.\n");
  if (SW_SIZE <= 0)
    print_error ("Small window size (-sw_size) must be a poitive integer.\n");
  if (CW_SIZE <= 0)
    print_error ("Copy number window size (-cw_size) must be a poitive integer.\n");
  if (LW_SLIDE <= 0)
    print_error ("Large window slide (-lw_slide) must be a poitive integer.\n");
  if (SW_SLIDE <= 0)
    print_error ("Small window slide (-sw_slide) must be a poitive integer.\n");

  if (SW_SIZE >= LW_SIZE)
    print_error ("Small window size (-sw_size) should be smaller than large window size (-lw_size).\n");
  if (SW_SLIDE > LW_SIZE)
    print_error ("Small window slide (-sw_slide) should be smaller than or equal to large window slide (-lw_slide).\n");


  fasta = my_fopen(GENOME_FASTA, "r", 0);
  gaps = my_fopen(GENOME_GAPS, "r", 0);

  num_gaps = 0;


  /* initialize gap table */
  while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0)
    num_gaps ++;


  gaptable = (struct gapcell *) malloc(sizeof(struct gapcell) * num_gaps);

  rewind(gaps);
  

  /* read gap table */

  i = 0;
  while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0){
    trimspace(chrom);
    gaptable[i].chrom = NULL;
    set_str(&(gaptable[i].chrom), chrom);
    gaptable[i].start = start;
    gaptable[i].end = end;
    i++;
  }
  fclose(gaps);

  fprintf (stdout, "Scanning the reference genome.\n");
  /* count basic numbers from the reference genome */
  max_len = 0;
  num_chrom = count_chrom(fasta, &max_len);
  rewind(fasta);

  chromosomes = (struct chrom **) malloc (sizeof (struct chrom *) * num_chrom);

  for (i=0; i<num_chrom; i++)
    chromosomes[i] = (struct chrom *) malloc (sizeof (struct chrom) * num_chrom);

  fprintf(stdout, "Total of %d chromosomes in the reference genome, with %d gaps.\nLongest chromosome is %d bp\n", num_chrom, num_gaps, max_len);

  fprintf (stdout, "Reading the reference genome.\n");

  read_ref(fasta, max_len, num_gaps);

  fprintf (stdout, "Saving the reference configuration.\n");
  saveRefConfig(GENOME_CONF);

}



int count_chrom(FILE *fasta, int *max_len){
  int length=0; int maxlength=0;
  char ch;
  int cnt = 0;
  char skip[MAX_STR];
  char *retStr;

  //while (fscanf(fasta, "%c", &ch) > 0){
  while (1){

    ch = fgetc(fasta);
    if (feof(fasta)) break;

    if (ch == '>') {
      cnt ++;
      retStr = fgets(skip, MAX_STR, fasta);
      fprintf(stdout, ".");
      fflush(stdout);
      if (length > maxlength) maxlength = length;
      length = 0;
    }
    else if (!isspace(ch)) length++;
  }
  fprintf(stdout, "\n");

  if (length > maxlength)
    maxlength = length;

  *max_len = maxlength;
  return cnt;
}



void read_ref(FILE *fasta, int max_len, int num_gaps){

  char ch;
  int cnt = 0;
  char chrom_name[MAX_STR];
  int i;
  int length;
  char *chrom_seq;
  char *retStr;

  chrom_seq = (char *) malloc (sizeof(char) * (max_len+1));
  cnt = -1;
  i = 0;
  length = 0;

  //  while (fscanf(fasta, "%c", &ch) > 0){
  while (1){
    
    ch = fgetc(fasta);
    if (feof(fasta)) break;
    
    if (ch == '>') {

      if (cnt != -1){
	/* insert the previous chromosome */
	chrom_seq[length]=0;
	insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);
      }

      cnt ++;
      retStr = fgets(chrom_name, MAX_STR, fasta);
      chrom_name[strlen(chrom_name)-1] = 0;
      trimspace(chrom_name);
      length = 0;      
    }
    else if (!isspace(ch)){
      chrom_seq[length++] = ch;
    }
  }

  /* insert the last chromosome */
  chrom_seq[length]=0;
  trimspace(chrom_name);
  insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);

  free(chrom_seq);
}


void insert_chrom(int cnt, char *chrom_name, int length, char *chrom_seq, int num_gaps){

  fprintf(stdout, "Chromosome %s (%d bp). Counting windows ... ", chrom_name, length);
  fflush(stdout);

  chromosomes[cnt]->name = NULL;
  set_str(&(chromosomes[cnt]->name), chrom_name);
  chromosomes[cnt]->length = length;  

  windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 0);
  fprintf(stdout, "\t\tRecalculating windows ... ");
  fflush(stdout);
  windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 1);

  fprintf(stdout, "\n");
}


void windowmaker(int num_gaps, int chrom_id, char *chrom_name, char *chrom_seq, int length, int flag){
  int i;
  int j;
  int nchar;
  int s, e;

  int gc;
  float gc_p;

  int lw_cnt = 0; // large window (5Kb)
  int sw_cnt = 0; // small window (1Kb)
  int cw_cnt = 0; // copy number window (1kb non-ovp);


  int win_cnt;

  /* if flag = 0; don't record it */

  if (!flag){
    for (i=0;i<num_gaps;i++){
      if (!strcmp(gaptable[i].chrom, chrom_name)){
	if (gaptable[i].start == gaptable[i].end)
	  chrom_seq[gaptable[i].start] = 'X';
	else{
	  for (j=gaptable[i].start; (j<gaptable[i].end && j<length); j++)
	    chrom_seq[j] = 'X';
	}
      }
    }
  }


  /* count cw_cnt */

  nchar = 0;
  s = 0 ; e = 0;
  gc = 0;
  
  for (i=0; i<length; i++){
    
    if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
      nchar++;
    if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
      gc++;
  
    if ((nchar == CW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
      e = i;
      if (chrom_seq[i] == 'X'){
	nchar = 0;
	gc = 0;
      }
      else if (flag == 1 && nchar == CW_SIZE){
	/* save */
	chromosomes[chrom_id]->cw[cw_cnt].start = s;
	chromosomes[chrom_id]->cw[cw_cnt].end = e+1;
	chromosomes[chrom_id]->cw[cw_cnt].gc = (float)gc / (float)nchar;
	chromosomes[chrom_id]->cw[cw_cnt].depth = 0; // for readability/completeness
	cw_cnt++;
	gc = 0;
      }
      else if (flag == 0 && nchar == CW_SIZE){
	cw_cnt ++;
      }

      s = i + 1;
      nchar = 0;
    }

    if (chrom_seq[i] == 'X'){
      s = i + 1;
      nchar = 0;
      gc = 0;
    }
    
  }


  /* count sw_cnt */

  i = 0;
  nchar = 0;
  s = 0 ; e = 0;
  gc = 0;

  while (i < length){

    if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') 
      nchar++;
    if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
      gc++;

    if ((nchar == SW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
      e = i;
      if (chrom_seq[i] == 'X'){
        nchar = 0;
	gc = 0;
      }
      else  if (flag == 1 && nchar == SW_SIZE){
	/* save */
	chromosomes[chrom_id]->sw[sw_cnt].start = s;
	chromosomes[chrom_id]->sw[sw_cnt].end = e+1;
	chromosomes[chrom_id]->sw[sw_cnt].gc = (float)gc / (float)nchar;
	chromosomes[chrom_id]->sw[sw_cnt].depth = 0; // for readability/completeness
	sw_cnt++;
	gc = 0;
      }
      else if (flag == 0 && nchar == SW_SIZE)
	sw_cnt++;

      s = s + SW_SLIDE;
      i = s - 1;
      nchar = 0;
    }
    if (chrom_seq[i]=='X'){ 
      s = i + 1; 
      i = s - 1;
      gc = 0;
    }
     
    i++;
  }


  nchar = 0;
  i = 0;
  s = 0 ; e = 0;
  gc = 0;
  /* count lw_cnt */

  while (i<length){

    if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') 
      nchar++;
    if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
      gc++;

    if ((nchar == LW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
      e = i;
      if (chrom_seq[i] == 'X'){
        nchar = 0;
	gc = 0;
      }
      else if (flag == 1 && nchar == LW_SIZE){
	/* save */
	chromosomes[chrom_id]->lw[lw_cnt].start = s;
	chromosomes[chrom_id]->lw[lw_cnt].end = e+1;
	chromosomes[chrom_id]->lw[lw_cnt].gc = (float)gc / (float)nchar;
	chromosomes[chrom_id]->lw[lw_cnt].depth = 0; // for readability/completeness
	lw_cnt++;
	gc = 0;
      }
      else if (flag == 0 && nchar == LW_SIZE)
	lw_cnt++;

      s = s + LW_SLIDE;
      i = s - 1;
      nchar = 0;
    }

    if (chrom_seq[i] == 'X') {
      s = i + 1; 
      i = s - 1;
      gc = 0;
    }

    i++;
  }



  if (flag == 0){

    printf("\n\t\tLW: %d\tSW: %d\tCW: %d\n", lw_cnt, sw_cnt, cw_cnt);

    chromosomes[chrom_id]->lw_cnt = lw_cnt;
    chromosomes[chrom_id]->sw_cnt = sw_cnt;
    chromosomes[chrom_id]->cw_cnt = cw_cnt;

    chromosomes[chrom_id]->cw = (struct window *) malloc (sizeof(struct window) * cw_cnt);
    chromosomes[chrom_id]->lw = (struct window *) malloc (sizeof(struct window) * lw_cnt);
    chromosomes[chrom_id]->sw = (struct window *) malloc (sizeof(struct window) * sw_cnt);
    

  }


}