Mercurial > repos > mmaiensc > ember

/*
 * ember.c : Expectation Maximization of Binding and Expression pRofiles
 * ----------------------------------------------------------------
 * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994).
 * 	See Eqs. 3-13, plus "Implementation of MM" section for detailed description
 * changes are as follows:
 * 	background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts
 * 	background model is determined by full list of genes (second input); z-values of genes in first input are subtracted
 *	   update of background can be turned off by commenting out 8th line of z_update()
 *	   in this case, also comment out index_genes at end of initialize(), as it will save some time
 * 	sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities
 * 	a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm,
 * 	   only adds a restriction on which genes are chosen as targets in choose_targets()
 * general notes:
 * 	the common dimensions are:
 * 	   numpeaks - number of peaks, size of peaklist, indexed by m
 * 	   peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n
 * 	   numclasses - number of comparisons made (i.e. motif width), indexed by j
 * 	   5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k
 * 	   2 - number of models (motif=0 and background=1), indexed by l
 * 	   nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased)
 *      col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions)
 *
 * things to add/consider:
 * 	- best measure of motif quality score?
 * 	- want to ensure that the same sequence can't get assigned to more than one motif?
 */

#include "all.h"

typedef struct {
	char id[50];
	char name[500];
	char chrom[50];
	long start;
	long end;
	int sense;
	double z;
	double erase;
	double score;
	int index;	// will refer to index of this id in bklist
	int used;	// set to 0 if a non-classified gene
	int assigned;	// set to 1 if assigned
	int *values;
} gene;

typedef struct {
	char info[5000];
	char chrom[50];
	long pkposn;
	int numgenes;
	int numused;
	int assgngene;	// set to # targets picked for this gene
	gene *genelist;
} peak;

// FREE PARAMETERS TO CHOOSE IN COMMAND LINE
double mu;	// scale background histogram by mu for psuedocounts
double tscale;	// scales the threshold (to make choices more or less stringent)
int zoops;	// set to 1 to enforce ZOOPS model (Zero Or One Per Strand)
int printall;	// set to 1 to print all genes, not just targets
int mots;	// number of motifs to search for
int repeat;	// number of times to wean columns and repeat search
double rsca;	// percent of average to set relative entropy cutoff
// END PARAMETERS TO CHOOSE

char *input, *background, *pref, *togs;
char *target_out, *list_out, *model_out, *annot_out;
long iseedval;
int numpeaks, numclasses, usedclasses, numbk, bkused, nmots;
peak *peaklist;
gene *bklist;
double ***hist, ***oldhist, lambda, t, **beta;
double *Beta;
int *col_tog, col_togl, set_togs, *col_tog_cpy;
int verbose;

void read_cmd_line( int argc, char **argv );
void print_usage();
void read_data( char *filename );
void read_bkgrnd( char *filename );
void initialize( long *seed );
void reinitialize( long *seed );
void beta_update();
void index_genes();

void hist_update();
void bkhist_update();
void lambda_update();
void toggle_update();
void toggle_reset();
void z_update();
void erase_update();
double lprob( int m, int n, int l );
double prob( int m, int n, int l );
void copy_hist();
double hist_diff();
double score( int m, int n );
void t_update();
double ElogL();
double ratio_score();
double rel_entropy();
double col_rel_entropy( int j );
double chisq();
int motif_count();

int assign_gene( int m, long *seed );
void score_genes();
void choose_targets();

void print_peak( FILE *fp, int i );
void print_gene( FILE *fp, int i, int j );
void print_state( FILE *fp, char *filename );
//void print_targets( FILE *fp, char *filename );
void print_model( FILE *fp, char *filename );
//void print_annotpeak( FILE *fp, char *filename, int motif );
char *nice_d( double val );

int main(int argc, char *argv[]){
	int i,iter,r;
	int pfreq;
	double diff, conv = 1e-6;
	long seed;
	FILE *fout;

	if(argc == 1 ){
		print_usage();
	}

	/*
	 * read command line params
	 */
	read_cmd_line( argc, argv );

	/*
	 * allocate output names
	 */
//	target_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
//	list_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
//	model_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
//	annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char));

	/*
	 * read in data, copy values from global to local variables
	 */
	if( verbose != 0 ) printf("\nReading data and allocating\n");
	read_data( input );
	read_bkgrnd( background );
	nmots = mots;
	seed = iseedval;

	/*
	 * allocate memory for other arrays
	 */
	hist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
	oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
	beta = (double **) alloc2d(sizeof(double),numclasses,5);
	Beta = (double *) malloc(numclasses*sizeof(double));

	/*
	 * find nmots motif models
	 */
	for(i=1; i<= nmots; i++){
		/*
		 * initialize, fill initial parameter values from z-guesses
		 */
		if( verbose != 0 ) printf("Expression Pattern %i,\n   Initializing\n", i);
		if( i==1 ) initialize( &seed );
		else reinitialize( &seed );
		hist_update();
		lambda_update();

		/*
		 * now sample for awhile, repeating repeat times
		 * each repeat starts at the previously refined search (?)
		 */
		for(r=0; r<= repeat; r++){
			if( verbose != 0 ) printf("   Sampling\n");
			if( verbose != 0 && repeat> 0 ) printf("   Repeat %i\n", r );
			diff = 1;
			pfreq = 10;
			iter=1;
			while( diff > conv ){
				z_update();		// E step
				copy_hist();
				hist_update();		// M step
				bkhist_update();	// M step
				lambda_update();	// M step
				diff = hist_diff();
				if( iter%pfreq == 0  && verbose != 0 ) printf("      iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff);
				iter++;
			}
			if( repeat > 0 && r< repeat ) toggle_update();
		}

		/*
		 * score and choose final targets
		 */
		t_update();
		if( verbose != 0 ) printf("   Done, scoring and choosing targets on threshold %.3f\n\n", t);
		score_genes();
		choose_targets();

		/*
		 * print final targets
		 */
//		sprintf( target_out,"%s-%i.targets", pref, i);
//		sprintf( list_out,"%s-%i.list", pref, i);
//		sprintf( model_out,"%s-%i.model", pref, i);
//		sprintf( annot_out,"%s-%i.xls", pref, i);
		print_state(fout, target_out);
		print_model(fout, model_out);
//		print_targets(fout, list_out);
//		print_annotpeak( fout, annot_out, i);

		/*
		 * erase found motifs (only if we're going around again)
		 */
		if( i< nmots ) erase_update();
		if( repeat > 0 ) toggle_reset();
	}

}

/*==================== FUNCTIONS ======================*/

/*
 * read command line inputs and options
 */
void read_cmd_line( int argc, char **argv ){
	int i, j;
	int found_reqs=0;

	// default values
	mots = 1;
	zoops = 0;
	printall = 0;
	mu = 0.1;
	tscale = 1.0;
	iseedval = make_seed();
	set_togs = 0;
	verbose = 1;
	repeat = 0;
	rsca = 1.0;

	// parse options
	for(i=1; i< argc; i++){
		if( argv[i][0] != '-' ) print_usage();
		i++;
		if( i >= argc || argv[i][0] == '-' ) print_usage();
		switch( argv[i-1][1] ){
			case 'i':
				input = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
				strcpy(input, argv[i]);
				found_reqs++;
				break;
			case 'b':
				background = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
				strcpy(background, argv[i]);
				found_reqs++;
				break;
			case 'o':
				target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
				strcpy(target_out, argv[i]);
				found_reqs++;
				break;
			case 'p':
				model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
				strcpy(model_out, argv[i]);
				found_reqs++;
				break;
			case 'n':
				mots = atoi(argv[i]);
				if( mots <= 0 ){
					printf("\nError: choose number of expression patterns to be at least 1\n");
					print_usage();
				}
				break;
			case 'z':
				if( argv[i][0] == 'y' ) zoops = 1;
				else if( argv[i][0] == 'n' ) zoops = 0;
				else{
					printf("\nError: if using -z option, choose y or n\n");
					print_usage();
				}
				break;
			case 'v':
				if( argv[i][0] == 'y' ) verbose = 1;
				else if( argv[i][0] == 'n' ) verbose = 0;
				else{
					printf("\nError: if using -v option, choose y or n\n");
					print_usage();
				}
				break;
			case 'm':
				mu = atof(argv[i]);
				if( mu<= 0 ){
					printf("\nError: choose mu to be positive\n");
					print_usage();
				}
				break;
			case 't':
				tscale = atof(argv[i]);
				if( tscale<= 0 ){
					printf("\nError: choose threshold to be positive\n");
					print_usage();
				}
				break;
			case 's':
				iseedval = -1*atol(argv[i]);
				if( iseedval>= 0 ){
					printf("\nError: choose seed to be a large positive integer\n");
					print_usage();
				}
				break;
			case 'c':
				set_togs = 1;
				togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
				strcpy( togs, argv[i] );
				break;
			case 'a':
				if( argv[i][0] == 'y' ) printall = 1;
				else if( argv[i][0] == 'n' ) printall = 0;
				else{
					printf("\nError: if using -a option, choose y or n\n");
					print_usage();
				}
				break;
			case 'r':
				repeat = atoi(argv[i]);
				if( repeat< 0 ){
					printf("\nError: choose repeat to be a non-negative integer\n");
					print_usage();
				}
				break;
			case 'R':
				rsca = atof(argv[i]);
				if( rsca <= 0 ){
					printf("\nError: choose repeat scalar to be positive\n");
					print_usage();
				}
				break;
			default:
				printf("\nError: unkown option: -%c\n", argv[i-1][1]);
				print_usage();
		}
	}
	// double check that the correct number of required inputs has been read in
	if( found_reqs != 4 ){
		printf("\nError: improper number of required inputs found\n");
		print_usage();
	}
}

/*
 * usage info
 */
void print_usage(){
	printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n");
	printf("Options:\n");
	printf("   -n number of expression patterns to find (positive integer, default 1)\n");
	printf("   -z enforce ZOOPS model (y or n, default n)\n");
	printf("   -m scalar for psuedocounts (positive real number, default 0.1)\n");
	printf("   -t scalar for threshold (positive real number, default 1.0)\n");
	printf("   -s seed (positive long integer, default chosen from date and time)\n");
	printf("   -c column toggle file, used to ignore certain classifications (default: all on)\n");
	printf("   -v verboseness, turn on/off printing to stdout (y or n, default: y)\n");
	printf("   -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n");
	printf("   -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n");
	printf("      than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n");
        printf("   -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n");
	printf("      by this value to set the threshold for turning off columns (positive real number, default 1.0)\n");
	printf("\n");
	exit(0);
}

/*
 * function: read data from filename into the peaklist data structure
 * notes on assumed format of data:
 *    peak lines start with "PEAK:" field
 *    gene lines start with "GENE:" field
 *    gene lines with only 1 classification are ignored
 *    assume all fields are <5000 characters
 */
void read_data( char *filename ){
	int j,m,n;
	char val[5000], other[5000];
	FILE *fp;

	/*
	 * first count number of peaks, and allocate
	 */
	fp = fopen(filename,"rt");
	if( fp == NULL ){
		printf("Error: can't open file %s\n", filename);
		exit(0);
	}
	numpeaks = 0;
	while( fscanf(fp, "%s", &val ) != EOF ){
		if( contains( val, "PEAK" ) == 1 ){
			numpeaks++;
		}
	}
	fclose(fp);
	peaklist = (peak *) malloc(numpeaks*sizeof(peak));

	/*
	 * now count number of genes w/>1 classification, and allocate
	 * also count number of classifications, and allocate that at the end
	 * n tracks number of genes, j field; subtract genes that don't have full classifications
	 * m tracks peak index; record peak info
	 */
	fp = fopen(filename,"rt");
	n=-1;
	m=-1;
	j=0;
	sprintf(other,"NA");
	numclasses=0;
	while( fscanf(fp, "%s", &val ) != EOF ){
		if( contains( val, "PEAK" ) == 1 ){
			if( j==7 ) ;
			else{
				if( j-6 > numclasses ) numclasses = j-6;
			}
			if( m>=0 ){
				peaklist[m].numgenes = n;
				peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
			}
			n=0;
			j=0;
			if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other);
			m++;
			sprintf(other,"NA");
		}
		else if( contains( val, "GENE" ) == 1 ){
			n++;
			j=0;
		}
		else if( n==0 ){
			if( j==0 ) strcpy(peaklist[m].chrom, val);
			if( j==1 ) peaklist[m].pkposn = atol(val);
			if( j> 1 ){
				if( strcmp(other, "NA") == 0 ) strcpy(other, val);
				else sprintf(other,"%s %s", other, val);
			}
			j++;
		}
		else{
			j++;
		}
	}
	peaklist[m].numgenes = n;
	peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
	strcpy( peaklist[m].info, other);
	fclose(fp);

	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int));
		}
	}

	/*
	 * now read in gene info
	 */
	fp = fopen(filename,"rt");
	m=-1;
	j=0;
	while( fscanf(fp, "%s", &val ) != EOF ){
		if( contains( val, "PEAK" ) == 1 ){
			if( j==7 ) peaklist[m].genelist[n].used = 0;
			if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
			n=-1;
			j=0;
			m++;
		}
		else if( contains( val, "GENE" ) == 1 ){
			if( j==7 ) peaklist[m].genelist[n].used = 0;
			if( j==6+numclasses ) peaklist[m].genelist[n].used = 1;
			n++;
			j=0;
		}
		else if( n==-1 ){
		}
		else {
			if( j==0 ) strcpy( peaklist[m].genelist[n].id, val );
			if( j==1 ) strcpy( peaklist[m].genelist[n].name, val );
			if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val );
			if( j==3 ) peaklist[m].genelist[n].start = atol(val);
			if( j==4 ) peaklist[m].genelist[n].end = atol(val);
			if( j==5 ) peaklist[m].genelist[n].sense = atoi(val);
			if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val);
			j++;
		}
	}
	if( j==7 ) peaklist[m].genelist[n].used = 0;
	if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
	fclose(fp);

	/*
	 * now count how many genes are to be used for each
	 */
	for(m=0; m< numpeaks; m++){
		peaklist[m].numused = 0;
		for(n=0; n< peaklist[m].numgenes; n++){
			peaklist[m].numused+= peaklist[m].genelist[n].used;
		}
	}
}

void read_bkgrnd( char *filename ){
	int j,n;
	char val[500];
	FILE *fp;

	/*
	 * first count number of genes, and allocate
	 */
	numbk = count_lines( fp, filename );;
	bklist = (gene *) malloc(numbk*sizeof(gene));

	for(n=0; n< numbk; n++){
		bklist[n].values = (int *) malloc(numclasses*sizeof(int));
	}

	/*
	 * now read in gene info
	 */
	fp = fopen(filename,"rt");
	n=-1;
	j=0;
	while( fscanf(fp, "%s", &val ) != EOF ){
		if( contains( val, "_a" ) == 1 ){
			if( j==7 ) bklist[n].used = 0;
			if( j==6+numclasses ) bklist[n].used = 1;
			j=1;
			n++;
			strcpy( bklist[n].id, val );
		}
		else{
			if( j==1 ) strcpy( bklist[n].name, val );
			if( j==2 ) strcpy( bklist[n].chrom, val );
			if( j==3 ) bklist[n].start = atol(val);
			if( j==4 ) bklist[n].end = atol(val);
			if( j==5 ) bklist[n].sense = atoi(val);
			if( j>=6 ) bklist[n].values[j-6] = atoi(val);
			j++;
		}
	}
	if( j==7 ) bklist[n].used = 0;
	if( j==6+numclasses) bklist[n].used = 1;
	fclose(fp);

	/*
	 * now count how many genes are to be used for each
	 */
	bkused = 0;
	for(n=0; n< numbk; n++){
		bkused+= bklist[n].used;
	}
}

/*
 * make initial guesses of z-values, set erase to 1
 * read in column toggle file if provided
 * also histogram background and scale into beta
 */
void initialize( long *seed ){
	int j,m,n;
	int val;
	FILE *fp;

	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				peaklist[m].genelist[n].erase = 1.0;
				peaklist[m].genelist[n].z = 0.0;
			}
		}
		if( peaklist[m].numused > 0 ){
			n = assign_gene( m, seed );
			peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
		}
	}

	col_tog = (int *) malloc(numclasses*sizeof(int));
	col_tog_cpy = (int *) malloc(numclasses*sizeof(int));
	if( set_togs == 0 ){
		for(j=0; j< numclasses; j++){
			col_tog[j] = 1;
			col_tog_cpy[j] = 1;
		}
		usedclasses = numclasses;
	}
	else{
		// first count number of fields
		col_togl = 0;
		fp = fopen(togs,"rt");
		if( fp == NULL ){
			printf("Error: can't open file %s\n", togs);
			exit(0);
		}
		j=0;
		while( fscanf(fp,"%i", &val) != EOF ){
			col_togl++;
		}
		fclose(fp);
		if( col_togl != numclasses ){
			printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses);
			exit(0);
		}
		// now read in values
		fp = fopen(togs,"rt");
		j=0;
		usedclasses = 0;
		while( fscanf(fp,"%i", &val) != EOF ){
			col_tog[j] = val;
			col_tog_cpy[j] = val;
			j++;
			if( val == 1 ) usedclasses++;
			if( val != 1 && val != 0 ){
				printf("Error: choose the toggle values to be 0 or 1\n");
				exit(0);
			}
		}
		fclose(fp);
	}

	for(n=0; n< numbk; n++){
		bklist[n].z = 1.0;
	}
	beta_update();
	if( verbose != 0 ) printf("   Indexing genes in first pass\n");
	index_genes();
}

/*
 * make initial guesses of z-values for a second, etc motif
 * recount number of used genes per peak to those with erase > 0.5
 */
void reinitialize( long *seed ){
	int m,n;
	for(m=0; m< numpeaks; m++){
		peaklist[m].numused = 0;
		for(n=0; n< peaklist[m].numgenes; n++){
			peaklist[m].genelist[n].z = 0.0;
			if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){
				peaklist[m].numused++;
			}
		}
		if( peaklist[m].numused > 0 ){
			n = assign_gene( m, seed );
			peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
		}
	}
	for(n=0; n< numbk; n++){
		bklist[n].z = 1.0;
	}
}

/*
 * histogram background into beta
 */
void beta_update(){
	int j,k,n;
	double tot=0.0;
	bkhist_update();
	for(j=0; j< numclasses; j++){
//		if( col_tog[j] == 1 ){
			Beta[j] = 0.0;
			for(k=0; k< 5; k++){
				beta[j][k] = hist[1][j][k]*mu;
				Beta[j]+= beta[j][k];
			}
//		}
	}
}

/*
 * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist
 */
void index_genes(){
	int m,n,nu;
	int found;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				found = 0;
				nu = 0;
				while(nu< numbk && found == 0){
					if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){
						peaklist[m].genelist[n].index = nu;
						found = 1;
					}
					nu++;
				}
			}
		}
	}
}

/*
 * update motif model histogram
 */
void hist_update(){
	int j,k,m,n;
	double tot=0;
	for(j=0; j< numclasses; j++){
//		if( col_tog[j] == 1 ){
			for(k=0; k< 5; k++){
				hist[0][j][k] = 0.0;
			}
//		}
	}
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
				for(j=0; j< numclasses; j++){
//					if( col_tog[j] == 1 ){
						k = peaklist[m].genelist[n].values[j]+2;
						hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
//					}
				}
			}
		}
	}
	for(j=0; j< numclasses; j++){
//		if( col_tog[j] == 1 ){
			for(k=0; k< 5; k++){
				hist[0][j][k]+= beta[j][k];
				hist[0][j][k]/= (tot+Beta[j]);;
			}
//		}
	}
}

/*
 * update background hist using z values from bklist
 */
void bkhist_update(){
	int j,k,n;
	double tot=0.0;
	for(j=0; j< numclasses; j++){
//		if( col_tog[j] == 1 ){
			for(k=0; k< 5; k++){
				hist[1][j][k] = 0;
			}
//		}
	}
	for(n=0; n< numbk; n++){
		if( bklist[n].used == 1 ){
			tot+=bklist[n].z;
			for(j=0; j< numclasses; j++){
//				if( col_tog[j] == 1 ){
					k = bklist[n].values[j]+2;
					hist[1][j][k]+= bklist[n].z;
//				}
			}
		}
	}
	for(j=0; j< numclasses; j++){
//		if( col_tog[j] == 1 ){
			for(k=0; k< 5; k++){
				hist[1][j][k]/= tot;
			}
//		}
	}
}

/*
 * update lambda value
 */
void lambda_update(){
	int m,n;
	double tot=0.0;
	lambda=0;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				lambda+= peaklist[m].genelist[n].z;
				tot++;
			}
		}
	}
	lambda/=tot;
}

/*
 * update the toggle columns based on which columns have better than average relative entropy
 */
void toggle_update(){
	int j;
	double avg=0.0;
	avg = rsca*rel_entropy();
	usedclasses=0;
	for(j=0; j< numclasses; j++){
		if( col_tog[j] == 1 ){
			if( col_rel_entropy(j) < avg ) col_tog[j] = 0;
			if( col_tog[j] == 1 ) usedclasses++;
		}
	}
}

/*
 * reset col_tog values with their original ones
 */
void toggle_reset(){
	int j;
	usedclasses=0;
	for(j=0; j< numclasses; j++){
		col_tog[j] = col_tog_cpy[j];
		if( col_tog[j] == 1 ) usedclasses++;
	}
}

/*
 * update z
 */
void z_update(){
	int j,m,n;
	double a, c, denom;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				// new z value for motif model
				a = lprob(m,n,0)+log(lambda);
				c = lprob(m,n,1)+log(1-lambda);
				if( a> c ){
					denom = a + log(1+exp(c-a));
				}
				else{
					denom = c + log(1+exp(a-c));
				}
				peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom );
	//			peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) );
				// new z value for corresponding gene in background model
				bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z;
			}
		}
	}
}

/*
 * update erase parameters
 */
void erase_update(){
	int j,m,n;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z);
			}
		}
	}
}

/*
 * calculate (log) (multinomial) probability of sequence m,n in model l
 */
double lprob( int m, int n, int l ){
	int j,k;
	double val=0.0;
	for(j=0; j< numclasses; j++){
		if( col_tog[j] == 1 ){
			k = peaklist[m].genelist[n].values[j]+2;
			val+= log(hist[l][j][k]);
		}
	}
	return val;
}
double prob( int m, int n, int l ){
	int j,k;
	double val=1.0;
	for(j=0; j< numclasses; j++){
		if( col_tog[j] == 1 ){
			k = peaklist[m].genelist[n].values[j]+2;
			val*= hist[l][j][k];
		}
	}
	return val;
}

/*
 * copy hist into oldhist before updating
 */
void copy_hist(){
	int j,k,l;
	for(l=0; l< 2; l++){
		for(j=0; j< numclasses; j++){
//			if( col_tog[j] == 1 ){
				for(k=0; k< 5; k++){
					oldhist[l][j][k] = hist[l][j][k];
				}
//			}
		}
	}
}

/*
 * compute change in hist
 */
double hist_diff(){
	int j,k,l;
	double val=0.0;
	for(l=0; l< 2; l++){
		for(j=0; j< numclasses; j++){
			if( col_tog[j] == 1 ){
				for(k=0; k< 5; k++){
					val+= fabs( oldhist[l][j][k]-hist[l][j][k] );
				}
			}
		}
	}
	return val;
}

/*
 * calculate the score of a sequence
 */
double score( int m, int n ){
	int j,k;
	double val=0.0;
	for(j=0; j< numclasses; j++){
		if( col_tog[j] == 1 ){
			k = peaklist[m].genelist[n].values[j]+2;
			val+= log( hist[0][j][k]/hist[1][j][k] );
		}
	}
	return val;
}

/*
 * calculate value of t (threshold)
 */
void t_update(){
	t = tscale*log( (1-lambda)/lambda );
}

/*
 * quality scores of a motif, averaging over only those sequences picked as targets
 *   ElogL: log-likelihood of the combination of models, averaged by # genes
 *   ratio_score: log-likelihood ratio of motif vs background models
 *   rel_entropy: relative entropy of motif to background
 *   col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols)
 *   chisq: chi-square statistic
 */
double ElogL(){
	int m,n;
	double val=0.0, tot=0.0;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				val+=   peaklist[m].genelist[n].z*lprob(m, n, 0 )    \
				      + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\
				      + peaklist[m].genelist[n].z*log(lambda)       \
				      + (1-peaklist[m].genelist[n].z)*log(1-lambda);
				tot++;
			}
		}
	}
	return val/tot;
}

double ratio_score(){
	int m,n;
	double val=0.0;
	double tot=0.0;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].assigned == 1 ){
				val+= lprob(m,n,0)-lprob(m,n,1);
				tot++;
			}
		}
	}
	return val/tot;
}

double rel_entropy(){
	int j,k;
	double val=0.0;
	for(j=0; j< numclasses; j++){
		if( col_tog[j] == 1 ){
			val+= col_rel_entropy( j );
		}
	}
	return val/usedclasses;
}

double col_rel_entropy( int j ){
	int k;
	double val=0.0;
	for(k=0; k< 5; k++){
		if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){
			// do in log base 2 for bits
			val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2);
		}
	}
	return val;
}

double chisq(){
	int j, k;
	double val=0.0;
	for(j=0; j< numclasses; j++){
		if( col_tog[j] == 1 ){
			for(k=0; k< 5; k++){
				val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k];
			}
		}
	}
	return val;
}

/*
 * number of members of a motif model
 */
int motif_count(){
	int m,n;
	int tot=0;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].assigned == 1 ){
				tot++;
			}
		}
	}
	return tot;
}

/*
 * assign a random used gene: one that is used and hasn't been erased
 */
int assign_gene( int m, long *seed ){
	int n;
	n = floor( peaklist[m].numgenes*ran1(seed) );
	while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){
		n = floor( peaklist[m].numgenes*ran1(seed) );
	}
	return n;
}

/*
 * score all genes using the models
 */
void score_genes(){
	int m,n;
	for(m=0; m< numpeaks; m++){
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				peaklist[m].genelist[n].score= score( m, n );
			}
		}
	}
}

/*
 * choose targets using the scores, and ZOOPS setting
 * zoops = 1 -> 0 or 1 target/strand
 * else -> all above threshold
 */
void choose_targets(){
	int m,n;
	int best;
	for(m=0; m< numpeaks; m++){
		peaklist[m].assgngene = 0;
		best=-1;
		for(n=0; n< peaklist[m].numgenes; n++){
			if( peaklist[m].genelist[n].used == 1 ){
				if( peaklist[m].genelist[n].score > t ){
					peaklist[m].genelist[n].assigned = 1;
					peaklist[m].assgngene++;
				}
				else peaklist[m].genelist[n].assigned = 0;
				if( best == -1 ) best = n;
				else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n;
			}
		}
		if( zoops == 1 ){
			for(n=0; n< peaklist[m].numgenes; n++){
				if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0;
			}
			if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1;
			else peaklist[m].assgngene = 0;
		}
	}
}

/*
 * functions to print to output (fp must already be opened for top two)
 */
void print_peak( FILE *fp, int m ){
	int n;
	fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info);
	for(n=0; n< peaklist[m].numgenes; n++){
		if( printall == 0 ){
			if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
				print_gene( fp, m, n );
			}
		}
		if( printall == 1 ){
			if( peaklist[m].genelist[n].used == 1 ){
				print_gene( fp, m, n );
			}
		}
	}
}

void print_gene( FILE *fp, int m, int n ){
	int j;
	if( peaklist[m].genelist[n].used == 1 ){
		fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense);
		for(j=0; j< numclasses; j++){
			if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]);
			else fprintf(fp," %i", peaklist[m].genelist[n].values[j]);
		}
		fprintf(fp,"\n");
	}
}

void print_state(FILE *fp, char *filename){
	int m;
	fp = fopen(filename,"w");
	for(m=0; m< numpeaks; m++){
		print_peak(fp, m);
	}
	fclose(fp);
}

/*
 * this prints a list of <region id> <name>
 */
/****************
void print_targets( FILE *fp, char *filename ){
	int m,n,nu;
	fp = fopen(filename,"w");
	for(m=0; m< numpeaks; m++){
		if( peaklist[m].assgngene > 0 ){
			fprintf(fp,"%s ", peaklist[m].regid);
			n=0;
			nu=0;
			while( n< peaklist[m].numgenes ){
				if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
					if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name);
					else fprintf(fp,"%s\n", peaklist[m].genelist[n].name);
					nu++;
				}
				n++;
			}
		}
	}
	fclose(fp);
}
*************/

/*
 * print the motif model and its score
 */
void print_model( FILE *fp, char *filename ){
	int j,k;
	fp = fopen(filename,"w");
	fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() );
	fprintf(fp,"             ++       +       0       -      --\n");
	for(j=0; j< numclasses; j++){
		if( j< 10 ) fprintf(fp," ");
		if( col_tog[j] == 0 ) fprintf(fp,"#");
		else fprintf(fp," ");
		fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) );
		for(k=4; k>=0; k--){
			if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) );
			else fprintf(fp,"      NA");
		}
		fprintf(fp,"\n");
	}
	fclose(fp);
}

/*
 * prints a double with 3 digits, aligned by spaces
 * longest would be -999.000, so 8 characters
 * shortest would be 1.000, 5 characters
 * up to 3 spaces
 */
char *nice_d( double val ){
	char *returnval;
	char *dval;
	char *spaces;
	int i, numspaces=0;

	returnval = (char *) malloc(10*sizeof(char));
	dval = (char *) malloc(10*sizeof(char));
	spaces = (char *) malloc(10*sizeof(char));

	sprintf(dval,"%.3f", val);

	if( val >= 0 ) numspaces++;
	if( fabs(val) < 100 ){
		if( fabs(val) < 10 ) numspaces+=2;
		else numspaces++;
	}
	if( numspaces == 0 ) sprintf(spaces,"");
	if( numspaces == 1 ) sprintf(spaces," ");
	if( numspaces == 2 ) sprintf(spaces,"  ");
	if( numspaces == 3 ) sprintf(spaces,"   ");
	sprintf(returnval,"%s%s", spaces, dval);
	return returnval;
}

/*
 * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc
 */
/**************
void print_annotpeak( FILE *fp, char *filename, int motif ){
	int m;
	fp = fopen(filename,"w");
	for(m=0; m< numpeaks; m++){
		if( peaklist[m].assgngene > 0 ){
			fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end );
			fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height );
			fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n");
		}
	}
	close(fp);
}
*************/
author	mmaiensc
date	Thu, 22 Mar 2012 14:07:11 -0400
parents	003f802d4c7d
children