diff mrcanavar-0.34/prep.c @ 0:86522a0b5f59 default tip

Uploaded source code for mrCaNaVaR
author calkan
date Tue, 21 Feb 2012 10:44:13 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mrcanavar-0.34/prep.c	Tue Feb 21 10:44:13 2012 -0500
@@ -0,0 +1,381 @@
+#include "prep.h"
+
+void prep_genome(void){
+  FILE *fasta;
+  FILE *gaps;
+  FILE *config;
+  
+  int num_gaps;
+  
+  char chrom[MAX_STR];
+  int start, end;
+  int max_len;
+  
+  int i;
+
+  if (GENOME_FASTA == NULL)
+    print_error("Select genome fasta file (input) through -fasta parameter.\n");
+  if (GENOME_GAPS == NULL)
+    print_error("Select genome gaps file (input) through -gaps parameter.\n"); 
+  if (GENOME_CONF == NULL)
+    print_error("Select genome fasta file (output) through -conf parameter.\n");
+
+
+  if (LW_SIZE <= 0)
+    print_error ("Large window size (-lw_size) must be a poitive integer.\n");
+  if (SW_SIZE <= 0)
+    print_error ("Small window size (-sw_size) must be a poitive integer.\n");
+  if (CW_SIZE <= 0)
+    print_error ("Copy number window size (-cw_size) must be a poitive integer.\n");
+  if (LW_SLIDE <= 0)
+    print_error ("Large window slide (-lw_slide) must be a poitive integer.\n");
+  if (SW_SLIDE <= 0)
+    print_error ("Small window slide (-sw_slide) must be a poitive integer.\n");
+
+  if (SW_SIZE >= LW_SIZE)
+    print_error ("Small window size (-sw_size) should be smaller than large window size (-lw_size).\n");
+  if (SW_SLIDE > LW_SIZE)
+    print_error ("Small window slide (-sw_slide) should be smaller than or equal to large window slide (-lw_slide).\n");
+
+
+  fasta = my_fopen(GENOME_FASTA, "r", 0);
+  gaps = my_fopen(GENOME_GAPS, "r", 0);
+
+  num_gaps = 0;
+
+
+  /* initialize gap table */
+  while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0)
+    num_gaps ++;
+
+
+  gaptable = (struct gapcell *) malloc(sizeof(struct gapcell) * num_gaps);
+
+  rewind(gaps);
+  
+
+  /* read gap table */
+
+  i = 0;
+  while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0){
+    trimspace(chrom);
+    gaptable[i].chrom = NULL;
+    set_str(&(gaptable[i].chrom), chrom);
+    gaptable[i].start = start;
+    gaptable[i].end = end;
+    i++;
+  }
+  fclose(gaps);
+
+  fprintf (stdout, "Scanning the reference genome.\n");
+  /* count basic numbers from the reference genome */
+  max_len = 0;
+  num_chrom = count_chrom(fasta, &max_len);
+  rewind(fasta);
+
+  chromosomes = (struct chrom **) malloc (sizeof (struct chrom *) * num_chrom);
+
+  for (i=0; i<num_chrom; i++)
+    chromosomes[i] = (struct chrom *) malloc (sizeof (struct chrom) * num_chrom);
+
+  fprintf(stdout, "Total of %d chromosomes in the reference genome, with %d gaps.\nLongest chromosome is %d bp\n", num_chrom, num_gaps, max_len);
+
+  fprintf (stdout, "Reading the reference genome.\n");
+
+  read_ref(fasta, max_len, num_gaps);
+
+  fprintf (stdout, "Saving the reference configuration.\n");
+  saveRefConfig(GENOME_CONF);
+
+}
+
+
+
+int count_chrom(FILE *fasta, int *max_len){
+  int length=0; int maxlength=0;
+  char ch;
+  int cnt = 0;
+  char skip[MAX_STR];
+  char *retStr;
+
+  //while (fscanf(fasta, "%c", &ch) > 0){
+  while (1){
+
+    ch = fgetc(fasta);
+    if (feof(fasta)) break;
+
+    if (ch == '>') {
+      cnt ++;
+      retStr = fgets(skip, MAX_STR, fasta);
+      fprintf(stdout, ".");
+      fflush(stdout);
+      if (length > maxlength) maxlength = length;
+      length = 0;
+    }
+    else if (!isspace(ch)) length++;
+  }
+  fprintf(stdout, "\n");
+
+  if (length > maxlength)
+    maxlength = length;
+
+  *max_len = maxlength;
+  return cnt;
+}
+
+
+
+void read_ref(FILE *fasta, int max_len, int num_gaps){
+
+  char ch;
+  int cnt = 0;
+  char chrom_name[MAX_STR];
+  int i;
+  int length;
+  char *chrom_seq;
+  char *retStr;
+
+  chrom_seq = (char *) malloc (sizeof(char) * (max_len+1));
+  cnt = -1;
+  i = 0;
+  length = 0;
+
+  //  while (fscanf(fasta, "%c", &ch) > 0){
+  while (1){
+    
+    ch = fgetc(fasta);
+    if (feof(fasta)) break;
+    
+    if (ch == '>') {
+
+      if (cnt != -1){
+	/* insert the previous chromosome */
+	chrom_seq[length]=0;
+	insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);
+      }
+
+      cnt ++;
+      retStr = fgets(chrom_name, MAX_STR, fasta);
+      chrom_name[strlen(chrom_name)-1] = 0;
+      trimspace(chrom_name);
+      length = 0;      
+    }
+    else if (!isspace(ch)){
+      chrom_seq[length++] = ch;
+    }
+  }
+
+  /* insert the last chromosome */
+  chrom_seq[length]=0;
+  trimspace(chrom_name);
+  insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);
+
+  free(chrom_seq);
+}
+
+
+void insert_chrom(int cnt, char *chrom_name, int length, char *chrom_seq, int num_gaps){
+
+  fprintf(stdout, "Chromosome %s (%d bp). Counting windows ... ", chrom_name, length);
+  fflush(stdout);
+
+  chromosomes[cnt]->name = NULL;
+  set_str(&(chromosomes[cnt]->name), chrom_name);
+  chromosomes[cnt]->length = length;  
+
+  windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 0);
+  fprintf(stdout, "\t\tRecalculating windows ... ");
+  fflush(stdout);
+  windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 1);
+
+  fprintf(stdout, "\n");
+}
+
+
+void windowmaker(int num_gaps, int chrom_id, char *chrom_name, char *chrom_seq, int length, int flag){
+  int i;
+  int j;
+  int nchar;
+  int s, e;
+
+  int gc;
+  float gc_p;
+
+  int lw_cnt = 0; // large window (5Kb)
+  int sw_cnt = 0; // small window (1Kb)
+  int cw_cnt = 0; // copy number window (1kb non-ovp);
+
+
+  int win_cnt;
+
+  /* if flag = 0; don't record it */
+
+  if (!flag){
+    for (i=0;i<num_gaps;i++){
+      if (!strcmp(gaptable[i].chrom, chrom_name)){
+	if (gaptable[i].start == gaptable[i].end)
+	  chrom_seq[gaptable[i].start] = 'X';
+	else{
+	  for (j=gaptable[i].start; (j<gaptable[i].end && j<length); j++)
+	    chrom_seq[j] = 'X';
+	}
+      }
+    }
+  }
+
+
+  /* count cw_cnt */
+
+  nchar = 0;
+  s = 0 ; e = 0;
+  gc = 0;
+  
+  for (i=0; i<length; i++){
+    
+    if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
+      nchar++;
+    if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
+      gc++;
+  
+    if ((nchar == CW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
+      e = i;
+      if (chrom_seq[i] == 'X'){
+	nchar = 0;
+	gc = 0;
+      }
+      else if (flag == 1 && nchar == CW_SIZE){
+	/* save */
+	chromosomes[chrom_id]->cw[cw_cnt].start = s;
+	chromosomes[chrom_id]->cw[cw_cnt].end = e+1;
+	chromosomes[chrom_id]->cw[cw_cnt].gc = (float)gc / (float)nchar;
+	chromosomes[chrom_id]->cw[cw_cnt].depth = 0; // for readability/completeness
+	cw_cnt++;
+	gc = 0;
+      }
+      else if (flag == 0 && nchar == CW_SIZE){
+	cw_cnt ++;
+      }
+
+      s = i + 1;
+      nchar = 0;
+    }
+
+    if (chrom_seq[i] == 'X'){
+      s = i + 1;
+      nchar = 0;
+      gc = 0;
+    }
+    
+  }
+
+
+  /* count sw_cnt */
+
+  i = 0;
+  nchar = 0;
+  s = 0 ; e = 0;
+  gc = 0;
+
+  while (i < length){
+
+    if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') 
+      nchar++;
+    if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
+      gc++;
+
+    if ((nchar == SW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
+      e = i;
+      if (chrom_seq[i] == 'X'){
+        nchar = 0;
+	gc = 0;
+      }
+      else  if (flag == 1 && nchar == SW_SIZE){
+	/* save */
+	chromosomes[chrom_id]->sw[sw_cnt].start = s;
+	chromosomes[chrom_id]->sw[sw_cnt].end = e+1;
+	chromosomes[chrom_id]->sw[sw_cnt].gc = (float)gc / (float)nchar;
+	chromosomes[chrom_id]->sw[sw_cnt].depth = 0; // for readability/completeness
+	sw_cnt++;
+	gc = 0;
+      }
+      else if (flag == 0 && nchar == SW_SIZE)
+	sw_cnt++;
+
+      s = s + SW_SLIDE;
+      i = s - 1;
+      nchar = 0;
+    }
+    if (chrom_seq[i]=='X'){ 
+      s = i + 1; 
+      i = s - 1;
+      gc = 0;
+    }
+     
+    i++;
+  }
+
+
+  nchar = 0;
+  i = 0;
+  s = 0 ; e = 0;
+  gc = 0;
+  /* count lw_cnt */
+
+  while (i<length){
+
+    if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') 
+      nchar++;
+    if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
+      gc++;
+
+    if ((nchar == LW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
+      e = i;
+      if (chrom_seq[i] == 'X'){
+        nchar = 0;
+	gc = 0;
+      }
+      else if (flag == 1 && nchar == LW_SIZE){
+	/* save */
+	chromosomes[chrom_id]->lw[lw_cnt].start = s;
+	chromosomes[chrom_id]->lw[lw_cnt].end = e+1;
+	chromosomes[chrom_id]->lw[lw_cnt].gc = (float)gc / (float)nchar;
+	chromosomes[chrom_id]->lw[lw_cnt].depth = 0; // for readability/completeness
+	lw_cnt++;
+	gc = 0;
+      }
+      else if (flag == 0 && nchar == LW_SIZE)
+	lw_cnt++;
+
+      s = s + LW_SLIDE;
+      i = s - 1;
+      nchar = 0;
+    }
+
+    if (chrom_seq[i] == 'X') {
+      s = i + 1; 
+      i = s - 1;
+      gc = 0;
+    }
+
+    i++;
+  }
+
+
+
+  if (flag == 0){
+
+    printf("\n\t\tLW: %d\tSW: %d\tCW: %d\n", lw_cnt, sw_cnt, cw_cnt);
+
+    chromosomes[chrom_id]->lw_cnt = lw_cnt;
+    chromosomes[chrom_id]->sw_cnt = sw_cnt;
+    chromosomes[chrom_id]->cw_cnt = cw_cnt;
+
+    chromosomes[chrom_id]->cw = (struct window *) malloc (sizeof(struct window) * cw_cnt);
+    chromosomes[chrom_id]->lw = (struct window *) malloc (sizeof(struct window) * lw_cnt);
+    chromosomes[chrom_id]->sw = (struct window *) malloc (sizeof(struct window) * sw_cnt);
+    
+
+  }
+
+
+}