Mercurial > repos > mmaiensc > ember
diff GALAXY_FILES/tools/EMBER/Ember_Galaxy.c @ 3:037c3edda16e
Uploaded
author | mmaiensc |
---|---|
date | Thu, 22 Mar 2012 13:49:52 -0400 |
parents | 003f802d4c7d |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GALAXY_FILES/tools/EMBER/Ember_Galaxy.c Thu Mar 22 13:49:52 2012 -0400 @@ -0,0 +1,1227 @@ +/* + * ember.c : Expectation Maximization of Binding and Expression pRofiles + * ---------------------------------------------------------------- + * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994). + * See Eqs. 3-13, plus "Implementation of MM" section for detailed description + * changes are as follows: + * background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts + * background model is determined by full list of genes (second input); z-values of genes in first input are subtracted + * update of background can be turned off by commenting out 8th line of z_update() + * in this case, also comment out index_genes at end of initialize(), as it will save some time + * sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities + * a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm, + * only adds a restriction on which genes are chosen as targets in choose_targets() + * general notes: + * the common dimensions are: + * numpeaks - number of peaks, size of peaklist, indexed by m + * peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n + * numclasses - number of comparisons made (i.e. motif width), indexed by j + * 5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k + * 2 - number of models (motif=0 and background=1), indexed by l + * nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased) + * col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions) + * + * things to add/consider: + * - best measure of motif quality score? + * - want to ensure that the same sequence can't get assigned to more than one motif? + */ + +#include "all.h" + +typedef struct { + char id[50]; + char name[500]; + char chrom[50]; + long start; + long end; + int sense; + double z; + double erase; + double score; + int index; // will refer to index of this id in bklist + int used; // set to 0 if a non-classified gene + int assigned; // set to 1 if assigned + int *values; +} gene; + +typedef struct { + char info[5000]; + char chrom[50]; + long pkposn; + int numgenes; + int numused; + int assgngene; // set to # targets picked for this gene + gene *genelist; +} peak; + +// FREE PARAMETERS TO CHOOSE IN COMMAND LINE +double mu; // scale background histogram by mu for psuedocounts +double tscale; // scales the threshold (to make choices more or less stringent) +int zoops; // set to 1 to enforce ZOOPS model (Zero Or One Per Strand) +int printall; // set to 1 to print all genes, not just targets +int mots; // number of motifs to search for +int repeat; // number of times to wean columns and repeat search +double rsca; // percent of average to set relative entropy cutoff +// END PARAMETERS TO CHOOSE + +char *input, *background, *pref, *togs; +char *target_out, *list_out, *model_out, *annot_out; +long iseedval; +int numpeaks, numclasses, usedclasses, numbk, bkused, nmots; +peak *peaklist; +gene *bklist; +double ***hist, ***oldhist, lambda, t, **beta; +double *Beta; +int *col_tog, col_togl, set_togs, *col_tog_cpy; +int verbose; + +void read_cmd_line( int argc, char **argv ); +void print_usage(); +void read_data( char *filename ); +void read_bkgrnd( char *filename ); +void initialize( long *seed ); +void reinitialize( long *seed ); +void beta_update(); +void index_genes(); + +void hist_update(); +void bkhist_update(); +void lambda_update(); +void toggle_update(); +void toggle_reset(); +void z_update(); +void erase_update(); +double lprob( int m, int n, int l ); +double prob( int m, int n, int l ); +void copy_hist(); +double hist_diff(); +double score( int m, int n ); +void t_update(); +double ElogL(); +double ratio_score(); +double rel_entropy(); +double col_rel_entropy( int j ); +double chisq(); +int motif_count(); + +int assign_gene( int m, long *seed ); +void score_genes(); +void choose_targets(); + +void print_peak( FILE *fp, int i ); +void print_gene( FILE *fp, int i, int j ); +void print_state( FILE *fp, char *filename ); +//void print_targets( FILE *fp, char *filename ); +void print_model( FILE *fp, char *filename ); +//void print_annotpeak( FILE *fp, char *filename, int motif ); +char *nice_d( double val ); + +int main(int argc, char *argv[]){ + int i,iter,r; + int pfreq; + double diff, conv = 1e-6; + long seed; + FILE *fout; + + if(argc == 1 ){ + print_usage(); + } + + /* + * read command line params + */ + read_cmd_line( argc, argv ); + + /* + * allocate output names + */ +// target_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); +// list_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); +// model_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); +// annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); + + /* + * read in data, copy values from global to local variables + */ + if( verbose != 0 ) printf("\nReading data and allocating\n"); + read_data( input ); + read_bkgrnd( background ); + nmots = mots; + seed = iseedval; + + /* + * allocate memory for other arrays + */ + hist = (double ***) alloc3d(sizeof(double),2,numclasses,5); + oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5); + beta = (double **) alloc2d(sizeof(double),numclasses,5); + Beta = (double *) malloc(numclasses*sizeof(double)); + + /* + * find nmots motif models + */ + for(i=1; i<= nmots; i++){ + /* + * initialize, fill initial parameter values from z-guesses + */ + if( verbose != 0 ) printf("Expression Pattern %i,\n Initializing\n", i); + if( i==1 ) initialize( &seed ); + else reinitialize( &seed ); + hist_update(); + lambda_update(); + + /* + * now sample for awhile, repeating repeat times + * each repeat starts at the previously refined search (?) + */ + for(r=0; r<= repeat; r++){ + if( verbose != 0 ) printf(" Sampling\n"); + if( verbose != 0 && repeat> 0 ) printf(" Repeat %i\n", r ); + diff = 1; + pfreq = 10; + iter=1; + while( diff > conv ){ + z_update(); // E step + copy_hist(); + hist_update(); // M step + bkhist_update(); // M step + lambda_update(); // M step + diff = hist_diff(); + if( iter%pfreq == 0 && verbose != 0 ) printf(" iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff); + iter++; + } + if( repeat > 0 && r< repeat ) toggle_update(); + } + + /* + * score and choose final targets + */ + t_update(); + if( verbose != 0 ) printf(" Done, scoring and choosing targets on threshold %.3f\n\n", t); + score_genes(); + choose_targets(); + + /* + * print final targets + */ +// sprintf( target_out,"%s-%i.targets", pref, i); +// sprintf( list_out,"%s-%i.list", pref, i); +// sprintf( model_out,"%s-%i.model", pref, i); +// sprintf( annot_out,"%s-%i.xls", pref, i); + print_state(fout, target_out); + print_model(fout, model_out); +// print_targets(fout, list_out); +// print_annotpeak( fout, annot_out, i); + + /* + * erase found motifs (only if we're going around again) + */ + if( i< nmots ) erase_update(); + if( repeat > 0 ) toggle_reset(); + } + +} + +/*==================== FUNCTIONS ======================*/ + +/* + * read command line inputs and options + */ +void read_cmd_line( int argc, char **argv ){ + int i, j; + int found_reqs=0; + + // default values + mots = 1; + zoops = 0; + printall = 0; + mu = 0.1; + tscale = 1.0; + iseedval = make_seed(); + set_togs = 0; + verbose = 1; + repeat = 0; + rsca = 1.0; + + // parse options + for(i=1; i< argc; i++){ + if( argv[i][0] != '-' ) print_usage(); + i++; + if( i >= argc || argv[i][0] == '-' ) print_usage(); + switch( argv[i-1][1] ){ + case 'i': + input = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); + strcpy(input, argv[i]); + found_reqs++; + break; + case 'b': + background = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); + strcpy(background, argv[i]); + found_reqs++; + break; + case 'o': + target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); + strcpy(target_out, argv[i]); + found_reqs++; + break; + case 'p': + model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); + strcpy(model_out, argv[i]); + found_reqs++; + break; + case 'n': + mots = atoi(argv[i]); + if( mots <= 0 ){ + printf("\nError: choose number of expression patterns to be at least 1\n"); + print_usage(); + } + break; + case 'z': + if( argv[i][0] == 'y' ) zoops = 1; + else if( argv[i][0] == 'n' ) zoops = 0; + else{ + printf("\nError: if using -z option, choose y or n\n"); + print_usage(); + } + break; + case 'v': + if( argv[i][0] == 'y' ) verbose = 1; + else if( argv[i][0] == 'n' ) verbose = 0; + else{ + printf("\nError: if using -v option, choose y or n\n"); + print_usage(); + } + break; + case 'm': + mu = atof(argv[i]); + if( mu<= 0 ){ + printf("\nError: choose mu to be positive\n"); + print_usage(); + } + break; + case 't': + tscale = atof(argv[i]); + if( tscale<= 0 ){ + printf("\nError: choose threshold to be positive\n"); + print_usage(); + } + break; + case 's': + iseedval = -1*atol(argv[i]); + if( iseedval>= 0 ){ + printf("\nError: choose seed to be a large positive integer\n"); + print_usage(); + } + break; + case 'c': + set_togs = 1; + togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); + strcpy( togs, argv[i] ); + break; + case 'a': + if( argv[i][0] == 'y' ) printall = 1; + else if( argv[i][0] == 'n' ) printall = 0; + else{ + printf("\nError: if using -a option, choose y or n\n"); + print_usage(); + } + break; + case 'r': + repeat = atoi(argv[i]); + if( repeat< 0 ){ + printf("\nError: choose repeat to be a non-negative integer\n"); + print_usage(); + } + break; + case 'R': + rsca = atof(argv[i]); + if( rsca <= 0 ){ + printf("\nError: choose repeat scalar to be positive\n"); + print_usage(); + } + break; + default: + printf("\nError: unkown option: -%c\n", argv[i-1][1]); + print_usage(); + } + } + // double check that the correct number of required inputs has been read in + if( found_reqs != 4 ){ + printf("\nError: improper number of required inputs found\n"); + print_usage(); + } +} + +/* + * usage info + */ +void print_usage(){ + printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n"); + printf("Options:\n"); + printf(" -n number of expression patterns to find (positive integer, default 1)\n"); + printf(" -z enforce ZOOPS model (y or n, default n)\n"); + printf(" -m scalar for psuedocounts (positive real number, default 0.1)\n"); + printf(" -t scalar for threshold (positive real number, default 1.0)\n"); + printf(" -s seed (positive long integer, default chosen from date and time)\n"); + printf(" -c column toggle file, used to ignore certain classifications (default: all on)\n"); + printf(" -v verboseness, turn on/off printing to stdout (y or n, default: y)\n"); + printf(" -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n"); + printf(" -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n"); + printf(" than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n"); + printf(" -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n"); + printf(" by this value to set the threshold for turning off columns (positive real number, default 1.0)\n"); + printf("\n"); + exit(0); +} + +/* + * function: read data from filename into the peaklist data structure + * notes on assumed format of data: + * peak lines start with "PEAK:" field + * gene lines start with "GENE:" field + * gene lines with only 1 classification are ignored + * assume all fields are <5000 characters + */ +void read_data( char *filename ){ + int j,m,n; + char val[5000], other[5000]; + FILE *fp; + + /* + * first count number of peaks, and allocate + */ + fp = fopen(filename,"rt"); + if( fp == NULL ){ + printf("Error: can't open file %s\n", filename); + exit(0); + } + numpeaks = 0; + while( fscanf(fp, "%s", &val ) != EOF ){ + if( contains( val, "PEAK" ) == 1 ){ + numpeaks++; + } + } + fclose(fp); + peaklist = (peak *) malloc(numpeaks*sizeof(peak)); + + /* + * now count number of genes w/>1 classification, and allocate + * also count number of classifications, and allocate that at the end + * n tracks number of genes, j field; subtract genes that don't have full classifications + * m tracks peak index; record peak info + */ + fp = fopen(filename,"rt"); + n=-1; + m=-1; + j=0; + sprintf(other,"NA"); + numclasses=0; + while( fscanf(fp, "%s", &val ) != EOF ){ + if( contains( val, "PEAK" ) == 1 ){ + if( j==7 ) ; + else{ + if( j-6 > numclasses ) numclasses = j-6; + } + if( m>=0 ){ + peaklist[m].numgenes = n; + peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); + } + n=0; + j=0; + if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other); + m++; + sprintf(other,"NA"); + } + else if( contains( val, "GENE" ) == 1 ){ + n++; + j=0; + } + else if( n==0 ){ + if( j==0 ) strcpy(peaklist[m].chrom, val); + if( j==1 ) peaklist[m].pkposn = atol(val); + if( j> 1 ){ + if( strcmp(other, "NA") == 0 ) strcpy(other, val); + else sprintf(other,"%s %s", other, val); + } + j++; + } + else{ + j++; + } + } + peaklist[m].numgenes = n; + peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); + strcpy( peaklist[m].info, other); + fclose(fp); + + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int)); + } + } + + /* + * now read in gene info + */ + fp = fopen(filename,"rt"); + m=-1; + j=0; + while( fscanf(fp, "%s", &val ) != EOF ){ + if( contains( val, "PEAK" ) == 1 ){ + if( j==7 ) peaklist[m].genelist[n].used = 0; + if( j==6+numclasses) peaklist[m].genelist[n].used = 1; + n=-1; + j=0; + m++; + } + else if( contains( val, "GENE" ) == 1 ){ + if( j==7 ) peaklist[m].genelist[n].used = 0; + if( j==6+numclasses ) peaklist[m].genelist[n].used = 1; + n++; + j=0; + } + else if( n==-1 ){ + } + else { + if( j==0 ) strcpy( peaklist[m].genelist[n].id, val ); + if( j==1 ) strcpy( peaklist[m].genelist[n].name, val ); + if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val ); + if( j==3 ) peaklist[m].genelist[n].start = atol(val); + if( j==4 ) peaklist[m].genelist[n].end = atol(val); + if( j==5 ) peaklist[m].genelist[n].sense = atoi(val); + if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val); + j++; + } + } + if( j==7 ) peaklist[m].genelist[n].used = 0; + if( j==6+numclasses) peaklist[m].genelist[n].used = 1; + fclose(fp); + + /* + * now count how many genes are to be used for each + */ + for(m=0; m< numpeaks; m++){ + peaklist[m].numused = 0; + for(n=0; n< peaklist[m].numgenes; n++){ + peaklist[m].numused+= peaklist[m].genelist[n].used; + } + } +} + +void read_bkgrnd( char *filename ){ + int j,n; + char val[500]; + FILE *fp; + + /* + * first count number of genes, and allocate + */ + numbk = count_lines( fp, filename );; + bklist = (gene *) malloc(numbk*sizeof(gene)); + + for(n=0; n< numbk; n++){ + bklist[n].values = (int *) malloc(numclasses*sizeof(int)); + } + + /* + * now read in gene info + */ + fp = fopen(filename,"rt"); + n=-1; + j=0; + while( fscanf(fp, "%s", &val ) != EOF ){ + if( contains( val, "_a" ) == 1 ){ + if( j==7 ) bklist[n].used = 0; + if( j==6+numclasses ) bklist[n].used = 1; + j=1; + n++; + strcpy( bklist[n].id, val ); + } + else{ + if( j==1 ) strcpy( bklist[n].name, val ); + if( j==2 ) strcpy( bklist[n].chrom, val ); + if( j==3 ) bklist[n].start = atol(val); + if( j==4 ) bklist[n].end = atol(val); + if( j==5 ) bklist[n].sense = atoi(val); + if( j>=6 ) bklist[n].values[j-6] = atoi(val); + j++; + } + } + if( j==7 ) bklist[n].used = 0; + if( j==6+numclasses) bklist[n].used = 1; + fclose(fp); + + /* + * now count how many genes are to be used for each + */ + bkused = 0; + for(n=0; n< numbk; n++){ + bkused+= bklist[n].used; + } +} + +/* + * make initial guesses of z-values, set erase to 1 + * read in column toggle file if provided + * also histogram background and scale into beta + */ +void initialize( long *seed ){ + int j,m,n; + int val; + FILE *fp; + + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + peaklist[m].genelist[n].erase = 1.0; + peaklist[m].genelist[n].z = 0.0; + } + } + if( peaklist[m].numused > 0 ){ + n = assign_gene( m, seed ); + peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model + } + } + + col_tog = (int *) malloc(numclasses*sizeof(int)); + col_tog_cpy = (int *) malloc(numclasses*sizeof(int)); + if( set_togs == 0 ){ + for(j=0; j< numclasses; j++){ + col_tog[j] = 1; + col_tog_cpy[j] = 1; + } + usedclasses = numclasses; + } + else{ + // first count number of fields + col_togl = 0; + fp = fopen(togs,"rt"); + if( fp == NULL ){ + printf("Error: can't open file %s\n", togs); + exit(0); + } + j=0; + while( fscanf(fp,"%i", &val) != EOF ){ + col_togl++; + } + fclose(fp); + if( col_togl != numclasses ){ + printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses); + exit(0); + } + // now read in values + fp = fopen(togs,"rt"); + j=0; + usedclasses = 0; + while( fscanf(fp,"%i", &val) != EOF ){ + col_tog[j] = val; + col_tog_cpy[j] = val; + j++; + if( val == 1 ) usedclasses++; + if( val != 1 && val != 0 ){ + printf("Error: choose the toggle values to be 0 or 1\n"); + exit(0); + } + } + fclose(fp); + } + + for(n=0; n< numbk; n++){ + bklist[n].z = 1.0; + } + beta_update(); + if( verbose != 0 ) printf(" Indexing genes in first pass\n"); + index_genes(); +} + +/* + * make initial guesses of z-values for a second, etc motif + * recount number of used genes per peak to those with erase > 0.5 + */ +void reinitialize( long *seed ){ + int m,n; + for(m=0; m< numpeaks; m++){ + peaklist[m].numused = 0; + for(n=0; n< peaklist[m].numgenes; n++){ + peaklist[m].genelist[n].z = 0.0; + if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){ + peaklist[m].numused++; + } + } + if( peaklist[m].numused > 0 ){ + n = assign_gene( m, seed ); + peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model + } + } + for(n=0; n< numbk; n++){ + bklist[n].z = 1.0; + } +} + +/* + * histogram background into beta + */ +void beta_update(){ + int j,k,n; + double tot=0.0; + bkhist_update(); + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + Beta[j] = 0.0; + for(k=0; k< 5; k++){ + beta[j][k] = hist[1][j][k]*mu; + Beta[j]+= beta[j][k]; + } +// } + } +} + +/* + * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist + */ +void index_genes(){ + int m,n,nu; + int found; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + found = 0; + nu = 0; + while(nu< numbk && found == 0){ + if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){ + peaklist[m].genelist[n].index = nu; + found = 1; + } + nu++; + } + } + } + } +} + +/* + * update motif model histogram + */ +void hist_update(){ + int j,k,m,n; + double tot=0; + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + for(k=0; k< 5; k++){ + hist[0][j][k] = 0.0; + } +// } + } + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + k = peaklist[m].genelist[n].values[j]+2; + hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; +// } + } + } + } + } + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + for(k=0; k< 5; k++){ + hist[0][j][k]+= beta[j][k]; + hist[0][j][k]/= (tot+Beta[j]);; + } +// } + } +} + +/* + * update background hist using z values from bklist + */ +void bkhist_update(){ + int j,k,n; + double tot=0.0; + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + for(k=0; k< 5; k++){ + hist[1][j][k] = 0; + } +// } + } + for(n=0; n< numbk; n++){ + if( bklist[n].used == 1 ){ + tot+=bklist[n].z; + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + k = bklist[n].values[j]+2; + hist[1][j][k]+= bklist[n].z; +// } + } + } + } + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + for(k=0; k< 5; k++){ + hist[1][j][k]/= tot; + } +// } + } +} + +/* + * update lambda value + */ +void lambda_update(){ + int m,n; + double tot=0.0; + lambda=0; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + lambda+= peaklist[m].genelist[n].z; + tot++; + } + } + } + lambda/=tot; +} + +/* + * update the toggle columns based on which columns have better than average relative entropy + */ +void toggle_update(){ + int j; + double avg=0.0; + avg = rsca*rel_entropy(); + usedclasses=0; + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 1 ){ + if( col_rel_entropy(j) < avg ) col_tog[j] = 0; + if( col_tog[j] == 1 ) usedclasses++; + } + } +} + +/* + * reset col_tog values with their original ones + */ +void toggle_reset(){ + int j; + usedclasses=0; + for(j=0; j< numclasses; j++){ + col_tog[j] = col_tog_cpy[j]; + if( col_tog[j] == 1 ) usedclasses++; + } +} + +/* + * update z + */ +void z_update(){ + int j,m,n; + double a, c, denom; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + // new z value for motif model + a = lprob(m,n,0)+log(lambda); + c = lprob(m,n,1)+log(1-lambda); + if( a> c ){ + denom = a + log(1+exp(c-a)); + } + else{ + denom = c + log(1+exp(a-c)); + } + peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom ); + // peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) ); + // new z value for corresponding gene in background model + bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z; + } + } + } +} + +/* + * update erase parameters + */ +void erase_update(){ + int j,m,n; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z); + } + } + } +} + +/* + * calculate (log) (multinomial) probability of sequence m,n in model l + */ +double lprob( int m, int n, int l ){ + int j,k; + double val=0.0; + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 1 ){ + k = peaklist[m].genelist[n].values[j]+2; + val+= log(hist[l][j][k]); + } + } + return val; +} +double prob( int m, int n, int l ){ + int j,k; + double val=1.0; + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 1 ){ + k = peaklist[m].genelist[n].values[j]+2; + val*= hist[l][j][k]; + } + } + return val; +} + +/* + * copy hist into oldhist before updating + */ +void copy_hist(){ + int j,k,l; + for(l=0; l< 2; l++){ + for(j=0; j< numclasses; j++){ +// if( col_tog[j] == 1 ){ + for(k=0; k< 5; k++){ + oldhist[l][j][k] = hist[l][j][k]; + } +// } + } + } +} + +/* + * compute change in hist + */ +double hist_diff(){ + int j,k,l; + double val=0.0; + for(l=0; l< 2; l++){ + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 1 ){ + for(k=0; k< 5; k++){ + val+= fabs( oldhist[l][j][k]-hist[l][j][k] ); + } + } + } + } + return val; +} + +/* + * calculate the score of a sequence + */ +double score( int m, int n ){ + int j,k; + double val=0.0; + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 1 ){ + k = peaklist[m].genelist[n].values[j]+2; + val+= log( hist[0][j][k]/hist[1][j][k] ); + } + } + return val; +} + +/* + * calculate value of t (threshold) + */ +void t_update(){ + t = tscale*log( (1-lambda)/lambda ); +} + +/* + * quality scores of a motif, averaging over only those sequences picked as targets + * ElogL: log-likelihood of the combination of models, averaged by # genes + * ratio_score: log-likelihood ratio of motif vs background models + * rel_entropy: relative entropy of motif to background + * col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols) + * chisq: chi-square statistic + */ +double ElogL(){ + int m,n; + double val=0.0, tot=0.0; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + val+= peaklist[m].genelist[n].z*lprob(m, n, 0 ) \ + + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\ + + peaklist[m].genelist[n].z*log(lambda) \ + + (1-peaklist[m].genelist[n].z)*log(1-lambda); + tot++; + } + } + } + return val/tot; +} + +double ratio_score(){ + int m,n; + double val=0.0; + double tot=0.0; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].assigned == 1 ){ + val+= lprob(m,n,0)-lprob(m,n,1); + tot++; + } + } + } + return val/tot; +} + +double rel_entropy(){ + int j,k; + double val=0.0; + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 1 ){ + val+= col_rel_entropy( j ); + } + } + return val/usedclasses; +} + +double col_rel_entropy( int j ){ + int k; + double val=0.0; + for(k=0; k< 5; k++){ + if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){ + // do in log base 2 for bits + val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2); + } + } + return val; +} + +double chisq(){ + int j, k; + double val=0.0; + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 1 ){ + for(k=0; k< 5; k++){ + val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k]; + } + } + } + return val; +} + +/* + * number of members of a motif model + */ +int motif_count(){ + int m,n; + int tot=0; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].assigned == 1 ){ + tot++; + } + } + } + return tot; +} + +/* + * assign a random used gene: one that is used and hasn't been erased + */ +int assign_gene( int m, long *seed ){ + int n; + n = floor( peaklist[m].numgenes*ran1(seed) ); + while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){ + n = floor( peaklist[m].numgenes*ran1(seed) ); + } + return n; +} + +/* + * score all genes using the models + */ +void score_genes(){ + int m,n; + for(m=0; m< numpeaks; m++){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + peaklist[m].genelist[n].score= score( m, n ); + } + } + } +} + +/* + * choose targets using the scores, and ZOOPS setting + * zoops = 1 -> 0 or 1 target/strand + * else -> all above threshold + */ +void choose_targets(){ + int m,n; + int best; + for(m=0; m< numpeaks; m++){ + peaklist[m].assgngene = 0; + best=-1; + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 ){ + if( peaklist[m].genelist[n].score > t ){ + peaklist[m].genelist[n].assigned = 1; + peaklist[m].assgngene++; + } + else peaklist[m].genelist[n].assigned = 0; + if( best == -1 ) best = n; + else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n; + } + } + if( zoops == 1 ){ + for(n=0; n< peaklist[m].numgenes; n++){ + if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0; + } + if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1; + else peaklist[m].assgngene = 0; + } + } +} + +/* + * functions to print to output (fp must already be opened for top two) + */ +void print_peak( FILE *fp, int m ){ + int n; + fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info); + for(n=0; n< peaklist[m].numgenes; n++){ + if( printall == 0 ){ + if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ + print_gene( fp, m, n ); + } + } + if( printall == 1 ){ + if( peaklist[m].genelist[n].used == 1 ){ + print_gene( fp, m, n ); + } + } + } +} + +void print_gene( FILE *fp, int m, int n ){ + int j; + if( peaklist[m].genelist[n].used == 1 ){ + fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense); + for(j=0; j< numclasses; j++){ + if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]); + else fprintf(fp," %i", peaklist[m].genelist[n].values[j]); + } + fprintf(fp,"\n"); + } +} + +void print_state(FILE *fp, char *filename){ + int m; + fp = fopen(filename,"w"); + for(m=0; m< numpeaks; m++){ + print_peak(fp, m); + } + fclose(fp); +} + +/* + * this prints a list of <region id> <name> + */ +/**************** +void print_targets( FILE *fp, char *filename ){ + int m,n,nu; + fp = fopen(filename,"w"); + for(m=0; m< numpeaks; m++){ + if( peaklist[m].assgngene > 0 ){ + fprintf(fp,"%s ", peaklist[m].regid); + n=0; + nu=0; + while( n< peaklist[m].numgenes ){ + if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ + if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name); + else fprintf(fp,"%s\n", peaklist[m].genelist[n].name); + nu++; + } + n++; + } + } + } + fclose(fp); +} +*************/ + +/* + * print the motif model and its score + */ +void print_model( FILE *fp, char *filename ){ + int j,k; + fp = fopen(filename,"w"); + fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() ); + fprintf(fp," ++ + 0 - --\n"); + for(j=0; j< numclasses; j++){ + if( j< 10 ) fprintf(fp," "); + if( col_tog[j] == 0 ) fprintf(fp,"#"); + else fprintf(fp," "); + fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) ); + for(k=4; k>=0; k--){ + if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) ); + else fprintf(fp," NA"); + } + fprintf(fp,"\n"); + } + fclose(fp); +} + +/* + * prints a double with 3 digits, aligned by spaces + * longest would be -999.000, so 8 characters + * shortest would be 1.000, 5 characters + * up to 3 spaces + */ +char *nice_d( double val ){ + char *returnval; + char *dval; + char *spaces; + int i, numspaces=0; + + returnval = (char *) malloc(10*sizeof(char)); + dval = (char *) malloc(10*sizeof(char)); + spaces = (char *) malloc(10*sizeof(char)); + + sprintf(dval,"%.3f", val); + + if( val >= 0 ) numspaces++; + if( fabs(val) < 100 ){ + if( fabs(val) < 10 ) numspaces+=2; + else numspaces++; + } + if( numspaces == 0 ) sprintf(spaces,""); + if( numspaces == 1 ) sprintf(spaces," "); + if( numspaces == 2 ) sprintf(spaces," "); + if( numspaces == 3 ) sprintf(spaces," "); + sprintf(returnval,"%s%s", spaces, dval); + return returnval; +} + +/* + * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc + */ +/************** +void print_annotpeak( FILE *fp, char *filename, int motif ){ + int m; + fp = fopen(filename,"w"); + for(m=0; m< numpeaks; m++){ + if( peaklist[m].assgngene > 0 ){ + fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end ); + fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height ); + fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n"); + } + } + close(fp); +} +*************/ +