Mercurial > repos > mmaiensc > ember
view GALAXY_FILES/tools/EMBER/Ember_Galaxy.c @ 3:037c3edda16e
Uploaded
author | mmaiensc |
---|---|
date | Thu, 22 Mar 2012 13:49:52 -0400 |
parents | 003f802d4c7d |
children |
line wrap: on
line source
/* * ember.c : Expectation Maximization of Binding and Expression pRofiles * ---------------------------------------------------------------- * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994). * See Eqs. 3-13, plus "Implementation of MM" section for detailed description * changes are as follows: * background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts * background model is determined by full list of genes (second input); z-values of genes in first input are subtracted * update of background can be turned off by commenting out 8th line of z_update() * in this case, also comment out index_genes at end of initialize(), as it will save some time * sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities * a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm, * only adds a restriction on which genes are chosen as targets in choose_targets() * general notes: * the common dimensions are: * numpeaks - number of peaks, size of peaklist, indexed by m * peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n * numclasses - number of comparisons made (i.e. motif width), indexed by j * 5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k * 2 - number of models (motif=0 and background=1), indexed by l * nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased) * col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions) * * things to add/consider: * - best measure of motif quality score? * - want to ensure that the same sequence can't get assigned to more than one motif? */ #include "all.h" typedef struct { char id[50]; char name[500]; char chrom[50]; long start; long end; int sense; double z; double erase; double score; int index; // will refer to index of this id in bklist int used; // set to 0 if a non-classified gene int assigned; // set to 1 if assigned int *values; } gene; typedef struct { char info[5000]; char chrom[50]; long pkposn; int numgenes; int numused; int assgngene; // set to # targets picked for this gene gene *genelist; } peak; // FREE PARAMETERS TO CHOOSE IN COMMAND LINE double mu; // scale background histogram by mu for psuedocounts double tscale; // scales the threshold (to make choices more or less stringent) int zoops; // set to 1 to enforce ZOOPS model (Zero Or One Per Strand) int printall; // set to 1 to print all genes, not just targets int mots; // number of motifs to search for int repeat; // number of times to wean columns and repeat search double rsca; // percent of average to set relative entropy cutoff // END PARAMETERS TO CHOOSE char *input, *background, *pref, *togs; char *target_out, *list_out, *model_out, *annot_out; long iseedval; int numpeaks, numclasses, usedclasses, numbk, bkused, nmots; peak *peaklist; gene *bklist; double ***hist, ***oldhist, lambda, t, **beta; double *Beta; int *col_tog, col_togl, set_togs, *col_tog_cpy; int verbose; void read_cmd_line( int argc, char **argv ); void print_usage(); void read_data( char *filename ); void read_bkgrnd( char *filename ); void initialize( long *seed ); void reinitialize( long *seed ); void beta_update(); void index_genes(); void hist_update(); void bkhist_update(); void lambda_update(); void toggle_update(); void toggle_reset(); void z_update(); void erase_update(); double lprob( int m, int n, int l ); double prob( int m, int n, int l ); void copy_hist(); double hist_diff(); double score( int m, int n ); void t_update(); double ElogL(); double ratio_score(); double rel_entropy(); double col_rel_entropy( int j ); double chisq(); int motif_count(); int assign_gene( int m, long *seed ); void score_genes(); void choose_targets(); void print_peak( FILE *fp, int i ); void print_gene( FILE *fp, int i, int j ); void print_state( FILE *fp, char *filename ); //void print_targets( FILE *fp, char *filename ); void print_model( FILE *fp, char *filename ); //void print_annotpeak( FILE *fp, char *filename, int motif ); char *nice_d( double val ); int main(int argc, char *argv[]){ int i,iter,r; int pfreq; double diff, conv = 1e-6; long seed; FILE *fout; if(argc == 1 ){ print_usage(); } /* * read command line params */ read_cmd_line( argc, argv ); /* * allocate output names */ // target_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); // list_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); // model_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); // annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); /* * read in data, copy values from global to local variables */ if( verbose != 0 ) printf("\nReading data and allocating\n"); read_data( input ); read_bkgrnd( background ); nmots = mots; seed = iseedval; /* * allocate memory for other arrays */ hist = (double ***) alloc3d(sizeof(double),2,numclasses,5); oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5); beta = (double **) alloc2d(sizeof(double),numclasses,5); Beta = (double *) malloc(numclasses*sizeof(double)); /* * find nmots motif models */ for(i=1; i<= nmots; i++){ /* * initialize, fill initial parameter values from z-guesses */ if( verbose != 0 ) printf("Expression Pattern %i,\n Initializing\n", i); if( i==1 ) initialize( &seed ); else reinitialize( &seed ); hist_update(); lambda_update(); /* * now sample for awhile, repeating repeat times * each repeat starts at the previously refined search (?) */ for(r=0; r<= repeat; r++){ if( verbose != 0 ) printf(" Sampling\n"); if( verbose != 0 && repeat> 0 ) printf(" Repeat %i\n", r ); diff = 1; pfreq = 10; iter=1; while( diff > conv ){ z_update(); // E step copy_hist(); hist_update(); // M step bkhist_update(); // M step lambda_update(); // M step diff = hist_diff(); if( iter%pfreq == 0 && verbose != 0 ) printf(" iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff); iter++; } if( repeat > 0 && r< repeat ) toggle_update(); } /* * score and choose final targets */ t_update(); if( verbose != 0 ) printf(" Done, scoring and choosing targets on threshold %.3f\n\n", t); score_genes(); choose_targets(); /* * print final targets */ // sprintf( target_out,"%s-%i.targets", pref, i); // sprintf( list_out,"%s-%i.list", pref, i); // sprintf( model_out,"%s-%i.model", pref, i); // sprintf( annot_out,"%s-%i.xls", pref, i); print_state(fout, target_out); print_model(fout, model_out); // print_targets(fout, list_out); // print_annotpeak( fout, annot_out, i); /* * erase found motifs (only if we're going around again) */ if( i< nmots ) erase_update(); if( repeat > 0 ) toggle_reset(); } } /*==================== FUNCTIONS ======================*/ /* * read command line inputs and options */ void read_cmd_line( int argc, char **argv ){ int i, j; int found_reqs=0; // default values mots = 1; zoops = 0; printall = 0; mu = 0.1; tscale = 1.0; iseedval = make_seed(); set_togs = 0; verbose = 1; repeat = 0; rsca = 1.0; // parse options for(i=1; i< argc; i++){ if( argv[i][0] != '-' ) print_usage(); i++; if( i >= argc || argv[i][0] == '-' ) print_usage(); switch( argv[i-1][1] ){ case 'i': input = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); strcpy(input, argv[i]); found_reqs++; break; case 'b': background = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); strcpy(background, argv[i]); found_reqs++; break; case 'o': target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); strcpy(target_out, argv[i]); found_reqs++; break; case 'p': model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); strcpy(model_out, argv[i]); found_reqs++; break; case 'n': mots = atoi(argv[i]); if( mots <= 0 ){ printf("\nError: choose number of expression patterns to be at least 1\n"); print_usage(); } break; case 'z': if( argv[i][0] == 'y' ) zoops = 1; else if( argv[i][0] == 'n' ) zoops = 0; else{ printf("\nError: if using -z option, choose y or n\n"); print_usage(); } break; case 'v': if( argv[i][0] == 'y' ) verbose = 1; else if( argv[i][0] == 'n' ) verbose = 0; else{ printf("\nError: if using -v option, choose y or n\n"); print_usage(); } break; case 'm': mu = atof(argv[i]); if( mu<= 0 ){ printf("\nError: choose mu to be positive\n"); print_usage(); } break; case 't': tscale = atof(argv[i]); if( tscale<= 0 ){ printf("\nError: choose threshold to be positive\n"); print_usage(); } break; case 's': iseedval = -1*atol(argv[i]); if( iseedval>= 0 ){ printf("\nError: choose seed to be a large positive integer\n"); print_usage(); } break; case 'c': set_togs = 1; togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); strcpy( togs, argv[i] ); break; case 'a': if( argv[i][0] == 'y' ) printall = 1; else if( argv[i][0] == 'n' ) printall = 0; else{ printf("\nError: if using -a option, choose y or n\n"); print_usage(); } break; case 'r': repeat = atoi(argv[i]); if( repeat< 0 ){ printf("\nError: choose repeat to be a non-negative integer\n"); print_usage(); } break; case 'R': rsca = atof(argv[i]); if( rsca <= 0 ){ printf("\nError: choose repeat scalar to be positive\n"); print_usage(); } break; default: printf("\nError: unkown option: -%c\n", argv[i-1][1]); print_usage(); } } // double check that the correct number of required inputs has been read in if( found_reqs != 4 ){ printf("\nError: improper number of required inputs found\n"); print_usage(); } } /* * usage info */ void print_usage(){ printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n"); printf("Options:\n"); printf(" -n number of expression patterns to find (positive integer, default 1)\n"); printf(" -z enforce ZOOPS model (y or n, default n)\n"); printf(" -m scalar for psuedocounts (positive real number, default 0.1)\n"); printf(" -t scalar for threshold (positive real number, default 1.0)\n"); printf(" -s seed (positive long integer, default chosen from date and time)\n"); printf(" -c column toggle file, used to ignore certain classifications (default: all on)\n"); printf(" -v verboseness, turn on/off printing to stdout (y or n, default: y)\n"); printf(" -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n"); printf(" -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n"); printf(" than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n"); printf(" -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n"); printf(" by this value to set the threshold for turning off columns (positive real number, default 1.0)\n"); printf("\n"); exit(0); } /* * function: read data from filename into the peaklist data structure * notes on assumed format of data: * peak lines start with "PEAK:" field * gene lines start with "GENE:" field * gene lines with only 1 classification are ignored * assume all fields are <5000 characters */ void read_data( char *filename ){ int j,m,n; char val[5000], other[5000]; FILE *fp; /* * first count number of peaks, and allocate */ fp = fopen(filename,"rt"); if( fp == NULL ){ printf("Error: can't open file %s\n", filename); exit(0); } numpeaks = 0; while( fscanf(fp, "%s", &val ) != EOF ){ if( contains( val, "PEAK" ) == 1 ){ numpeaks++; } } fclose(fp); peaklist = (peak *) malloc(numpeaks*sizeof(peak)); /* * now count number of genes w/>1 classification, and allocate * also count number of classifications, and allocate that at the end * n tracks number of genes, j field; subtract genes that don't have full classifications * m tracks peak index; record peak info */ fp = fopen(filename,"rt"); n=-1; m=-1; j=0; sprintf(other,"NA"); numclasses=0; while( fscanf(fp, "%s", &val ) != EOF ){ if( contains( val, "PEAK" ) == 1 ){ if( j==7 ) ; else{ if( j-6 > numclasses ) numclasses = j-6; } if( m>=0 ){ peaklist[m].numgenes = n; peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); } n=0; j=0; if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other); m++; sprintf(other,"NA"); } else if( contains( val, "GENE" ) == 1 ){ n++; j=0; } else if( n==0 ){ if( j==0 ) strcpy(peaklist[m].chrom, val); if( j==1 ) peaklist[m].pkposn = atol(val); if( j> 1 ){ if( strcmp(other, "NA") == 0 ) strcpy(other, val); else sprintf(other,"%s %s", other, val); } j++; } else{ j++; } } peaklist[m].numgenes = n; peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); strcpy( peaklist[m].info, other); fclose(fp); for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int)); } } /* * now read in gene info */ fp = fopen(filename,"rt"); m=-1; j=0; while( fscanf(fp, "%s", &val ) != EOF ){ if( contains( val, "PEAK" ) == 1 ){ if( j==7 ) peaklist[m].genelist[n].used = 0; if( j==6+numclasses) peaklist[m].genelist[n].used = 1; n=-1; j=0; m++; } else if( contains( val, "GENE" ) == 1 ){ if( j==7 ) peaklist[m].genelist[n].used = 0; if( j==6+numclasses ) peaklist[m].genelist[n].used = 1; n++; j=0; } else if( n==-1 ){ } else { if( j==0 ) strcpy( peaklist[m].genelist[n].id, val ); if( j==1 ) strcpy( peaklist[m].genelist[n].name, val ); if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val ); if( j==3 ) peaklist[m].genelist[n].start = atol(val); if( j==4 ) peaklist[m].genelist[n].end = atol(val); if( j==5 ) peaklist[m].genelist[n].sense = atoi(val); if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val); j++; } } if( j==7 ) peaklist[m].genelist[n].used = 0; if( j==6+numclasses) peaklist[m].genelist[n].used = 1; fclose(fp); /* * now count how many genes are to be used for each */ for(m=0; m< numpeaks; m++){ peaklist[m].numused = 0; for(n=0; n< peaklist[m].numgenes; n++){ peaklist[m].numused+= peaklist[m].genelist[n].used; } } } void read_bkgrnd( char *filename ){ int j,n; char val[500]; FILE *fp; /* * first count number of genes, and allocate */ numbk = count_lines( fp, filename );; bklist = (gene *) malloc(numbk*sizeof(gene)); for(n=0; n< numbk; n++){ bklist[n].values = (int *) malloc(numclasses*sizeof(int)); } /* * now read in gene info */ fp = fopen(filename,"rt"); n=-1; j=0; while( fscanf(fp, "%s", &val ) != EOF ){ if( contains( val, "_a" ) == 1 ){ if( j==7 ) bklist[n].used = 0; if( j==6+numclasses ) bklist[n].used = 1; j=1; n++; strcpy( bklist[n].id, val ); } else{ if( j==1 ) strcpy( bklist[n].name, val ); if( j==2 ) strcpy( bklist[n].chrom, val ); if( j==3 ) bklist[n].start = atol(val); if( j==4 ) bklist[n].end = atol(val); if( j==5 ) bklist[n].sense = atoi(val); if( j>=6 ) bklist[n].values[j-6] = atoi(val); j++; } } if( j==7 ) bklist[n].used = 0; if( j==6+numclasses) bklist[n].used = 1; fclose(fp); /* * now count how many genes are to be used for each */ bkused = 0; for(n=0; n< numbk; n++){ bkused+= bklist[n].used; } } /* * make initial guesses of z-values, set erase to 1 * read in column toggle file if provided * also histogram background and scale into beta */ void initialize( long *seed ){ int j,m,n; int val; FILE *fp; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ peaklist[m].genelist[n].erase = 1.0; peaklist[m].genelist[n].z = 0.0; } } if( peaklist[m].numused > 0 ){ n = assign_gene( m, seed ); peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model } } col_tog = (int *) malloc(numclasses*sizeof(int)); col_tog_cpy = (int *) malloc(numclasses*sizeof(int)); if( set_togs == 0 ){ for(j=0; j< numclasses; j++){ col_tog[j] = 1; col_tog_cpy[j] = 1; } usedclasses = numclasses; } else{ // first count number of fields col_togl = 0; fp = fopen(togs,"rt"); if( fp == NULL ){ printf("Error: can't open file %s\n", togs); exit(0); } j=0; while( fscanf(fp,"%i", &val) != EOF ){ col_togl++; } fclose(fp); if( col_togl != numclasses ){ printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses); exit(0); } // now read in values fp = fopen(togs,"rt"); j=0; usedclasses = 0; while( fscanf(fp,"%i", &val) != EOF ){ col_tog[j] = val; col_tog_cpy[j] = val; j++; if( val == 1 ) usedclasses++; if( val != 1 && val != 0 ){ printf("Error: choose the toggle values to be 0 or 1\n"); exit(0); } } fclose(fp); } for(n=0; n< numbk; n++){ bklist[n].z = 1.0; } beta_update(); if( verbose != 0 ) printf(" Indexing genes in first pass\n"); index_genes(); } /* * make initial guesses of z-values for a second, etc motif * recount number of used genes per peak to those with erase > 0.5 */ void reinitialize( long *seed ){ int m,n; for(m=0; m< numpeaks; m++){ peaklist[m].numused = 0; for(n=0; n< peaklist[m].numgenes; n++){ peaklist[m].genelist[n].z = 0.0; if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){ peaklist[m].numused++; } } if( peaklist[m].numused > 0 ){ n = assign_gene( m, seed ); peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model } } for(n=0; n< numbk; n++){ bklist[n].z = 1.0; } } /* * histogram background into beta */ void beta_update(){ int j,k,n; double tot=0.0; bkhist_update(); for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ Beta[j] = 0.0; for(k=0; k< 5; k++){ beta[j][k] = hist[1][j][k]*mu; Beta[j]+= beta[j][k]; } // } } } /* * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist */ void index_genes(){ int m,n,nu; int found; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ found = 0; nu = 0; while(nu< numbk && found == 0){ if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){ peaklist[m].genelist[n].index = nu; found = 1; } nu++; } } } } } /* * update motif model histogram */ void hist_update(){ int j,k,m,n; double tot=0; for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ for(k=0; k< 5; k++){ hist[0][j][k] = 0.0; } // } } for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ k = peaklist[m].genelist[n].values[j]+2; hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; // } } } } } for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ for(k=0; k< 5; k++){ hist[0][j][k]+= beta[j][k]; hist[0][j][k]/= (tot+Beta[j]);; } // } } } /* * update background hist using z values from bklist */ void bkhist_update(){ int j,k,n; double tot=0.0; for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ for(k=0; k< 5; k++){ hist[1][j][k] = 0; } // } } for(n=0; n< numbk; n++){ if( bklist[n].used == 1 ){ tot+=bklist[n].z; for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ k = bklist[n].values[j]+2; hist[1][j][k]+= bklist[n].z; // } } } } for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ for(k=0; k< 5; k++){ hist[1][j][k]/= tot; } // } } } /* * update lambda value */ void lambda_update(){ int m,n; double tot=0.0; lambda=0; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ lambda+= peaklist[m].genelist[n].z; tot++; } } } lambda/=tot; } /* * update the toggle columns based on which columns have better than average relative entropy */ void toggle_update(){ int j; double avg=0.0; avg = rsca*rel_entropy(); usedclasses=0; for(j=0; j< numclasses; j++){ if( col_tog[j] == 1 ){ if( col_rel_entropy(j) < avg ) col_tog[j] = 0; if( col_tog[j] == 1 ) usedclasses++; } } } /* * reset col_tog values with their original ones */ void toggle_reset(){ int j; usedclasses=0; for(j=0; j< numclasses; j++){ col_tog[j] = col_tog_cpy[j]; if( col_tog[j] == 1 ) usedclasses++; } } /* * update z */ void z_update(){ int j,m,n; double a, c, denom; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ // new z value for motif model a = lprob(m,n,0)+log(lambda); c = lprob(m,n,1)+log(1-lambda); if( a> c ){ denom = a + log(1+exp(c-a)); } else{ denom = c + log(1+exp(a-c)); } peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom ); // peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) ); // new z value for corresponding gene in background model bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z; } } } } /* * update erase parameters */ void erase_update(){ int j,m,n; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z); } } } } /* * calculate (log) (multinomial) probability of sequence m,n in model l */ double lprob( int m, int n, int l ){ int j,k; double val=0.0; for(j=0; j< numclasses; j++){ if( col_tog[j] == 1 ){ k = peaklist[m].genelist[n].values[j]+2; val+= log(hist[l][j][k]); } } return val; } double prob( int m, int n, int l ){ int j,k; double val=1.0; for(j=0; j< numclasses; j++){ if( col_tog[j] == 1 ){ k = peaklist[m].genelist[n].values[j]+2; val*= hist[l][j][k]; } } return val; } /* * copy hist into oldhist before updating */ void copy_hist(){ int j,k,l; for(l=0; l< 2; l++){ for(j=0; j< numclasses; j++){ // if( col_tog[j] == 1 ){ for(k=0; k< 5; k++){ oldhist[l][j][k] = hist[l][j][k]; } // } } } } /* * compute change in hist */ double hist_diff(){ int j,k,l; double val=0.0; for(l=0; l< 2; l++){ for(j=0; j< numclasses; j++){ if( col_tog[j] == 1 ){ for(k=0; k< 5; k++){ val+= fabs( oldhist[l][j][k]-hist[l][j][k] ); } } } } return val; } /* * calculate the score of a sequence */ double score( int m, int n ){ int j,k; double val=0.0; for(j=0; j< numclasses; j++){ if( col_tog[j] == 1 ){ k = peaklist[m].genelist[n].values[j]+2; val+= log( hist[0][j][k]/hist[1][j][k] ); } } return val; } /* * calculate value of t (threshold) */ void t_update(){ t = tscale*log( (1-lambda)/lambda ); } /* * quality scores of a motif, averaging over only those sequences picked as targets * ElogL: log-likelihood of the combination of models, averaged by # genes * ratio_score: log-likelihood ratio of motif vs background models * rel_entropy: relative entropy of motif to background * col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols) * chisq: chi-square statistic */ double ElogL(){ int m,n; double val=0.0, tot=0.0; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ val+= peaklist[m].genelist[n].z*lprob(m, n, 0 ) \ + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\ + peaklist[m].genelist[n].z*log(lambda) \ + (1-peaklist[m].genelist[n].z)*log(1-lambda); tot++; } } } return val/tot; } double ratio_score(){ int m,n; double val=0.0; double tot=0.0; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].assigned == 1 ){ val+= lprob(m,n,0)-lprob(m,n,1); tot++; } } } return val/tot; } double rel_entropy(){ int j,k; double val=0.0; for(j=0; j< numclasses; j++){ if( col_tog[j] == 1 ){ val+= col_rel_entropy( j ); } } return val/usedclasses; } double col_rel_entropy( int j ){ int k; double val=0.0; for(k=0; k< 5; k++){ if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){ // do in log base 2 for bits val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2); } } return val; } double chisq(){ int j, k; double val=0.0; for(j=0; j< numclasses; j++){ if( col_tog[j] == 1 ){ for(k=0; k< 5; k++){ val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k]; } } } return val; } /* * number of members of a motif model */ int motif_count(){ int m,n; int tot=0; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].assigned == 1 ){ tot++; } } } return tot; } /* * assign a random used gene: one that is used and hasn't been erased */ int assign_gene( int m, long *seed ){ int n; n = floor( peaklist[m].numgenes*ran1(seed) ); while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){ n = floor( peaklist[m].numgenes*ran1(seed) ); } return n; } /* * score all genes using the models */ void score_genes(){ int m,n; for(m=0; m< numpeaks; m++){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ peaklist[m].genelist[n].score= score( m, n ); } } } } /* * choose targets using the scores, and ZOOPS setting * zoops = 1 -> 0 or 1 target/strand * else -> all above threshold */ void choose_targets(){ int m,n; int best; for(m=0; m< numpeaks; m++){ peaklist[m].assgngene = 0; best=-1; for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 ){ if( peaklist[m].genelist[n].score > t ){ peaklist[m].genelist[n].assigned = 1; peaklist[m].assgngene++; } else peaklist[m].genelist[n].assigned = 0; if( best == -1 ) best = n; else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n; } } if( zoops == 1 ){ for(n=0; n< peaklist[m].numgenes; n++){ if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0; } if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1; else peaklist[m].assgngene = 0; } } } /* * functions to print to output (fp must already be opened for top two) */ void print_peak( FILE *fp, int m ){ int n; fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info); for(n=0; n< peaklist[m].numgenes; n++){ if( printall == 0 ){ if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ print_gene( fp, m, n ); } } if( printall == 1 ){ if( peaklist[m].genelist[n].used == 1 ){ print_gene( fp, m, n ); } } } } void print_gene( FILE *fp, int m, int n ){ int j; if( peaklist[m].genelist[n].used == 1 ){ fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense); for(j=0; j< numclasses; j++){ if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]); else fprintf(fp," %i", peaklist[m].genelist[n].values[j]); } fprintf(fp,"\n"); } } void print_state(FILE *fp, char *filename){ int m; fp = fopen(filename,"w"); for(m=0; m< numpeaks; m++){ print_peak(fp, m); } fclose(fp); } /* * this prints a list of <region id> <name> */ /**************** void print_targets( FILE *fp, char *filename ){ int m,n,nu; fp = fopen(filename,"w"); for(m=0; m< numpeaks; m++){ if( peaklist[m].assgngene > 0 ){ fprintf(fp,"%s ", peaklist[m].regid); n=0; nu=0; while( n< peaklist[m].numgenes ){ if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name); else fprintf(fp,"%s\n", peaklist[m].genelist[n].name); nu++; } n++; } } } fclose(fp); } *************/ /* * print the motif model and its score */ void print_model( FILE *fp, char *filename ){ int j,k; fp = fopen(filename,"w"); fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() ); fprintf(fp," ++ + 0 - --\n"); for(j=0; j< numclasses; j++){ if( j< 10 ) fprintf(fp," "); if( col_tog[j] == 0 ) fprintf(fp,"#"); else fprintf(fp," "); fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) ); for(k=4; k>=0; k--){ if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) ); else fprintf(fp," NA"); } fprintf(fp,"\n"); } fclose(fp); } /* * prints a double with 3 digits, aligned by spaces * longest would be -999.000, so 8 characters * shortest would be 1.000, 5 characters * up to 3 spaces */ char *nice_d( double val ){ char *returnval; char *dval; char *spaces; int i, numspaces=0; returnval = (char *) malloc(10*sizeof(char)); dval = (char *) malloc(10*sizeof(char)); spaces = (char *) malloc(10*sizeof(char)); sprintf(dval,"%.3f", val); if( val >= 0 ) numspaces++; if( fabs(val) < 100 ){ if( fabs(val) < 10 ) numspaces+=2; else numspaces++; } if( numspaces == 0 ) sprintf(spaces,""); if( numspaces == 1 ) sprintf(spaces," "); if( numspaces == 2 ) sprintf(spaces," "); if( numspaces == 3 ) sprintf(spaces," "); sprintf(returnval,"%s%s", spaces, dval); return returnval; } /* * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc */ /************** void print_annotpeak( FILE *fp, char *filename, int motif ){ int m; fp = fopen(filename,"w"); for(m=0; m< numpeaks; m++){ if( peaklist[m].assgngene > 0 ){ fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end ); fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height ); fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n"); } } close(fp); } *************/