Mercurial > repos > mmaiensc > ember

diff GALAXY_FILES/tools/EMBER/Ember_Galaxy.c @ 0:003f802d4c7d
Uploaded
author: mmaiensc
date: Wed, 29 Feb 2012 15:03:33 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GALAXY_FILES/tools/EMBER/Ember_Galaxy.c	Wed Feb 29 15:03:33 2012 -0500
@@ -0,0 +1,1227 @@
+/* 
+ * ember.c : Expectation Maximization of Binding and Expression pRofiles
+ * ----------------------------------------------------------------
+ * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994).
+ * 	See Eqs. 3-13, plus "Implementation of MM" section for detailed description
+ * changes are as follows:
+ * 	background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts
+ * 	background model is determined by full list of genes (second input); z-values of genes in first input are subtracted
+ *	   update of background can be turned off by commenting out 8th line of z_update()
+ *	   in this case, also comment out index_genes at end of initialize(), as it will save some time
+ * 	sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities
+ * 	a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm, 
+ * 	   only adds a restriction on which genes are chosen as targets in choose_targets()
+ * general notes:
+ * 	the common dimensions are:
+ * 	   numpeaks - number of peaks, size of peaklist, indexed by m
+ * 	   peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n
+ * 	   numclasses - number of comparisons made (i.e. motif width), indexed by j
+ * 	   5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k
+ * 	   2 - number of models (motif=0 and background=1), indexed by l
+ * 	   nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased)
+ *      col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions)
+ *
+ * things to add/consider:
+ * 	- best measure of motif quality score?
+ * 	- want to ensure that the same sequence can't get assigned to more than one motif?
+ */
+
+#include "all.h"
+
+typedef struct {
+	char id[50];
+	char name[500];
+	char chrom[50];
+	long start;
+	long end;
+	int sense;
+	double z;
+	double erase;
+	double score;
+	int index;	// will refer to index of this id in bklist
+	int used;	// set to 0 if a non-classified gene
+	int assigned;	// set to 1 if assigned
+	int *values;
+} gene;
+
+typedef struct {
+	char info[5000];
+	char chrom[50];
+	long pkposn;
+	int numgenes;
+	int numused;
+	int assgngene;	// set to # targets picked for this gene
+	gene *genelist;
+} peak;
+
+// FREE PARAMETERS TO CHOOSE IN COMMAND LINE
+double mu;	// scale background histogram by mu for psuedocounts
+double tscale;	// scales the threshold (to make choices more or less stringent)
+int zoops;	// set to 1 to enforce ZOOPS model (Zero Or One Per Strand)
+int printall;	// set to 1 to print all genes, not just targets
+int mots;	// number of motifs to search for
+int repeat;	// number of times to wean columns and repeat search
+double rsca;	// percent of average to set relative entropy cutoff
+// END PARAMETERS TO CHOOSE
+
+char *input, *background, *pref, *togs;
+char *target_out, *list_out, *model_out, *annot_out;
+long iseedval;
+int numpeaks, numclasses, usedclasses, numbk, bkused, nmots;
+peak *peaklist;
+gene *bklist;
+double ***hist, ***oldhist, lambda, t, **beta;
+double *Beta;
+int *col_tog, col_togl, set_togs, *col_tog_cpy;
+int verbose;
+
+void read_cmd_line( int argc, char **argv );
+void print_usage();
+void read_data( char *filename );
+void read_bkgrnd( char *filename );
+void initialize( long *seed );
+void reinitialize( long *seed );
+void beta_update();
+void index_genes();
+
+void hist_update();
+void bkhist_update();
+void lambda_update();
+void toggle_update();
+void toggle_reset();
+void z_update();
+void erase_update();
+double lprob( int m, int n, int l );
+double prob( int m, int n, int l );
+void copy_hist();
+double hist_diff();
+double score( int m, int n );
+void t_update();
+double ElogL();
+double ratio_score();
+double rel_entropy();
+double col_rel_entropy( int j );
+double chisq();
+int motif_count();
+
+int assign_gene( int m, long *seed );
+void score_genes();
+void choose_targets();
+
+void print_peak( FILE *fp, int i );
+void print_gene( FILE *fp, int i, int j );
+void print_state( FILE *fp, char *filename );
+//void print_targets( FILE *fp, char *filename );
+void print_model( FILE *fp, char *filename );
+//void print_annotpeak( FILE *fp, char *filename, int motif );
+char *nice_d( double val );
+
+int main(int argc, char *argv[]){
+	int i,iter,r;
+	int pfreq;
+	double diff, conv = 1e-6;
+	long seed;
+	FILE *fout;
+        
+	if(argc == 1 ){
+		print_usage();
+	}
+
+	/*
+	 * read command line params
+	 */
+	read_cmd_line( argc, argv );
+
+	/*
+	 * allocate output names
+	 */
+//	target_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
+//	list_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
+//	model_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
+//	annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
+
+	/*
+	 * read in data, copy values from global to local variables
+	 */
+	if( verbose != 0 ) printf("\nReading data and allocating\n");
+	read_data( input );
+	read_bkgrnd( background );
+	nmots = mots;
+	seed = iseedval;
+	
+	/*
+	 * allocate memory for other arrays
+	 */
+	hist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
+	oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
+	beta = (double **) alloc2d(sizeof(double),numclasses,5);
+	Beta = (double *) malloc(numclasses*sizeof(double));
+
+	/*
+	 * find nmots motif models
+	 */
+	for(i=1; i<= nmots; i++){
+		/*
+		 * initialize, fill initial parameter values from z-guesses
+		 */
+		if( verbose != 0 ) printf("Expression Pattern %i,\n   Initializing\n", i);
+		if( i==1 ) initialize( &seed );
+		else reinitialize( &seed );
+		hist_update();
+		lambda_update();
+	
+		/*
+		 * now sample for awhile, repeating repeat times
+		 * each repeat starts at the previously refined search (?)
+		 */
+		for(r=0; r<= repeat; r++){
+			if( verbose != 0 ) printf("   Sampling\n");
+			if( verbose != 0 && repeat> 0 ) printf("   Repeat %i\n", r );
+			diff = 1;
+			pfreq = 10;
+			iter=1;
+			while( diff > conv ){
+				z_update();		// E step
+				copy_hist();		
+				hist_update();		// M step
+				bkhist_update();	// M step
+				lambda_update();	// M step
+				diff = hist_diff();
+				if( iter%pfreq == 0  && verbose != 0 ) printf("      iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff);
+				iter++;
+			}
+			if( repeat > 0 && r< repeat ) toggle_update();
+		}
+	
+		/*
+		 * score and choose final targets
+		 */
+		t_update();
+		if( verbose != 0 ) printf("   Done, scoring and choosing targets on threshold %.3f\n\n", t);
+		score_genes();
+		choose_targets();
+	
+		/*
+		 * print final targets
+		 */
+//		sprintf( target_out,"%s-%i.targets", pref, i);
+//		sprintf( list_out,"%s-%i.list", pref, i);
+//		sprintf( model_out,"%s-%i.model", pref, i);
+//		sprintf( annot_out,"%s-%i.xls", pref, i);
+		print_state(fout, target_out);
+		print_model(fout, model_out);
+//		print_targets(fout, list_out);
+//		print_annotpeak( fout, annot_out, i);
+	
+		/*
+		 * erase found motifs (only if we're going around again)
+		 */
+		if( i< nmots ) erase_update();
+		if( repeat > 0 ) toggle_reset();
+	}
+
+}
+
+/*==================== FUNCTIONS ======================*/
+
+/*
+ * read command line inputs and options
+ */
+void read_cmd_line( int argc, char **argv ){
+	int i, j;
+	int found_reqs=0;
+	
+	// default values
+	mots = 1;
+	zoops = 0;
+	printall = 0;
+	mu = 0.1;
+	tscale = 1.0;
+	iseedval = make_seed();
+	set_togs = 0;
+	verbose = 1;
+	repeat = 0;
+	rsca = 1.0;
+
+	// parse options
+	for(i=1; i< argc; i++){
+		if( argv[i][0] != '-' ) print_usage();
+		i++;
+		if( i >= argc || argv[i][0] == '-' ) print_usage();
+		switch( argv[i-1][1] ){
+			case 'i':
+				input = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
+				strcpy(input, argv[i]);
+				found_reqs++;
+				break;
+			case 'b':
+				background = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
+				strcpy(background, argv[i]);
+				found_reqs++;
+				break;
+			case 'o': 
+				target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
+				strcpy(target_out, argv[i]);
+				found_reqs++;
+				break;
+			case 'p': 
+				model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
+				strcpy(model_out, argv[i]);
+				found_reqs++;
+				break;
+			case 'n':
+				mots = atoi(argv[i]);
+				if( mots <= 0 ){
+					printf("\nError: choose number of expression patterns to be at least 1\n");
+					print_usage();
+				}
+				break;
+			case 'z':
+				if( argv[i][0] == 'y' ) zoops = 1;
+				else if( argv[i][0] == 'n' ) zoops = 0;
+				else{
+					printf("\nError: if using -z option, choose y or n\n");
+					print_usage();
+				}
+				break;
+			case 'v':
+				if( argv[i][0] == 'y' ) verbose = 1;
+				else if( argv[i][0] == 'n' ) verbose = 0;
+				else{
+					printf("\nError: if using -v option, choose y or n\n");
+					print_usage();
+				}
+				break;
+			case 'm':
+				mu = atof(argv[i]);
+				if( mu<= 0 ){
+					printf("\nError: choose mu to be positive\n");
+					print_usage();
+				}
+				break;
+			case 't':
+				tscale = atof(argv[i]);
+				if( tscale<= 0 ){
+					printf("\nError: choose threshold to be positive\n");
+					print_usage();
+				}
+				break;
+			case 's':
+				iseedval = -1*atol(argv[i]);
+				if( iseedval>= 0 ){
+					printf("\nError: choose seed to be a large positive integer\n");
+					print_usage();
+				}
+				break;
+			case 'c':
+				set_togs = 1;
+				togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
+				strcpy( togs, argv[i] );
+				break;
+			case 'a':
+				if( argv[i][0] == 'y' ) printall = 1;
+				else if( argv[i][0] == 'n' ) printall = 0;
+				else{
+					printf("\nError: if using -a option, choose y or n\n");
+					print_usage();
+				}
+				break;
+			case 'r':
+				repeat = atoi(argv[i]);
+				if( repeat< 0 ){
+					printf("\nError: choose repeat to be a non-negative integer\n");
+					print_usage();
+				}
+				break;
+			case 'R':
+				rsca = atof(argv[i]);
+				if( rsca <= 0 ){
+					printf("\nError: choose repeat scalar to be positive\n");
+					print_usage();
+				}
+				break;
+			default:
+				printf("\nError: unkown option: -%c\n", argv[i-1][1]);
+				print_usage();
+		}
+	}
+	// double check that the correct number of required inputs has been read in
+	if( found_reqs != 4 ){
+		printf("\nError: improper number of required inputs found\n");
+		print_usage();
+	}
+}
+
+/*
+ * usage info
+ */
+void print_usage(){
+	printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n");
+	printf("Options:\n");
+	printf("   -n number of expression patterns to find (positive integer, default 1)\n");
+	printf("   -z enforce ZOOPS model (y or n, default n)\n");
+	printf("   -m scalar for psuedocounts (positive real number, default 0.1)\n");
+	printf("   -t scalar for threshold (positive real number, default 1.0)\n");
+	printf("   -s seed (positive long integer, default chosen from date and time)\n");
+	printf("   -c column toggle file, used to ignore certain classifications (default: all on)\n");
+	printf("   -v verboseness, turn on/off printing to stdout (y or n, default: y)\n");
+	printf("   -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n");
+	printf("   -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n");
+	printf("      than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n");
+        printf("   -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n");
+	printf("      by this value to set the threshold for turning off columns (positive real number, default 1.0)\n");
+	printf("\n");
+	exit(0);
+}
+
+/*
+ * function: read data from filename into the peaklist data structure
+ * notes on assumed format of data:
+ *    peak lines start with "PEAK:" field
+ *    gene lines start with "GENE:" field
+ *    gene lines with only 1 classification are ignored
+ *    assume all fields are <5000 characters
+ */
+void read_data( char *filename ){
+	int j,m,n;
+	char val[5000], other[5000];
+	FILE *fp;
+
+	/*
+	 * first count number of peaks, and allocate
+	 */
+	fp = fopen(filename,"rt");
+	if( fp == NULL ){
+		printf("Error: can't open file %s\n", filename);
+		exit(0);
+	}
+	numpeaks = 0;
+	while( fscanf(fp, "%s", &val ) != EOF ){
+		if( contains( val, "PEAK" ) == 1 ){
+			numpeaks++;
+		}
+	}
+	fclose(fp);
+	peaklist = (peak *) malloc(numpeaks*sizeof(peak));
+
+	/*
+	 * now count number of genes w/>1 classification, and allocate
+	 * also count number of classifications, and allocate that at the end
+	 * n tracks number of genes, j field; subtract genes that don't have full classifications
+	 * m tracks peak index; record peak info
+	 */
+	fp = fopen(filename,"rt");
+	n=-1;
+	m=-1;
+	j=0;
+	sprintf(other,"NA");
+	numclasses=0;
+	while( fscanf(fp, "%s", &val ) != EOF ){
+		if( contains( val, "PEAK" ) == 1 ){
+			if( j==7 ) ;
+			else{
+				if( j-6 > numclasses ) numclasses = j-6;
+			}
+			if( m>=0 ){
+				peaklist[m].numgenes = n;
+				peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
+			}
+			n=0;
+			j=0;
+			if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other);
+			m++;
+			sprintf(other,"NA");
+		}
+		else if( contains( val, "GENE" ) == 1 ){
+			n++;
+			j=0;
+		}
+		else if( n==0 ){
+			if( j==0 ) strcpy(peaklist[m].chrom, val);
+			if( j==1 ) peaklist[m].pkposn = atol(val);
+			if( j> 1 ){
+				if( strcmp(other, "NA") == 0 ) strcpy(other, val);
+				else sprintf(other,"%s %s", other, val);
+			}
+			j++;
+		}
+		else{
+			j++;
+		}
+	}
+	peaklist[m].numgenes = n;
+	peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
+	strcpy( peaklist[m].info, other);
+	fclose(fp);
+
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int));
+		}
+	}
+
+	/*
+	 * now read in gene info
+	 */
+	fp = fopen(filename,"rt");
+	m=-1;
+	j=0;
+	while( fscanf(fp, "%s", &val ) != EOF ){
+		if( contains( val, "PEAK" ) == 1 ){
+			if( j==7 ) peaklist[m].genelist[n].used = 0;
+			if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
+			n=-1;
+			j=0;
+			m++;
+		}
+		else if( contains( val, "GENE" ) == 1 ){
+			if( j==7 ) peaklist[m].genelist[n].used = 0;
+			if( j==6+numclasses ) peaklist[m].genelist[n].used = 1;
+			n++;
+			j=0;
+		}
+		else if( n==-1 ){
+		}
+		else {
+			if( j==0 ) strcpy( peaklist[m].genelist[n].id, val );
+			if( j==1 ) strcpy( peaklist[m].genelist[n].name, val );
+			if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val );
+			if( j==3 ) peaklist[m].genelist[n].start = atol(val);
+			if( j==4 ) peaklist[m].genelist[n].end = atol(val);
+			if( j==5 ) peaklist[m].genelist[n].sense = atoi(val);
+			if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val);
+			j++;
+		}
+	}
+	if( j==7 ) peaklist[m].genelist[n].used = 0;
+	if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
+	fclose(fp);
+
+	/*
+	 * now count how many genes are to be used for each
+	 */
+	for(m=0; m< numpeaks; m++){
+		peaklist[m].numused = 0;
+		for(n=0; n< peaklist[m].numgenes; n++){
+			peaklist[m].numused+= peaklist[m].genelist[n].used;
+		}
+	}
+}
+
+void read_bkgrnd( char *filename ){
+	int j,n;
+	char val[500];
+	FILE *fp;
+
+	/*
+	 * first count number of genes, and allocate
+	 */
+	numbk = count_lines( fp, filename );;
+	bklist = (gene *) malloc(numbk*sizeof(gene));
+
+	for(n=0; n< numbk; n++){
+		bklist[n].values = (int *) malloc(numclasses*sizeof(int));
+	}
+
+	/*
+	 * now read in gene info
+	 */
+	fp = fopen(filename,"rt");
+	n=-1;
+	j=0;
+	while( fscanf(fp, "%s", &val ) != EOF ){
+		if( contains( val, "_a" ) == 1 ){
+			if( j==7 ) bklist[n].used = 0;
+			if( j==6+numclasses ) bklist[n].used = 1;
+			j=1;
+			n++;
+			strcpy( bklist[n].id, val );
+		}
+		else{
+			if( j==1 ) strcpy( bklist[n].name, val );
+			if( j==2 ) strcpy( bklist[n].chrom, val );
+			if( j==3 ) bklist[n].start = atol(val);
+			if( j==4 ) bklist[n].end = atol(val);
+			if( j==5 ) bklist[n].sense = atoi(val);
+			if( j>=6 ) bklist[n].values[j-6] = atoi(val);
+			j++;
+		}
+	}
+	if( j==7 ) bklist[n].used = 0;
+	if( j==6+numclasses) bklist[n].used = 1;
+	fclose(fp);
+
+	/*
+	 * now count how many genes are to be used for each
+	 */
+	bkused = 0;
+	for(n=0; n< numbk; n++){
+		bkused+= bklist[n].used;
+	}
+}
+
+/*
+ * make initial guesses of z-values, set erase to 1
+ * read in column toggle file if provided
+ * also histogram background and scale into beta
+ */
+void initialize( long *seed ){
+	int j,m,n;
+	int val;
+	FILE *fp;
+
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				peaklist[m].genelist[n].erase = 1.0;
+				peaklist[m].genelist[n].z = 0.0; 
+			}
+		}
+		if( peaklist[m].numused > 0 ){
+			n = assign_gene( m, seed );
+			peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
+		}
+	}
+
+	col_tog = (int *) malloc(numclasses*sizeof(int));
+	col_tog_cpy = (int *) malloc(numclasses*sizeof(int));
+	if( set_togs == 0 ){
+		for(j=0; j< numclasses; j++){
+			col_tog[j] = 1;
+			col_tog_cpy[j] = 1;
+		}
+		usedclasses = numclasses;
+	}
+	else{
+		// first count number of fields
+		col_togl = 0;
+		fp = fopen(togs,"rt");
+		if( fp == NULL ){
+			printf("Error: can't open file %s\n", togs);
+			exit(0);
+		}
+		j=0;
+		while( fscanf(fp,"%i", &val) != EOF ){
+			col_togl++;
+		}
+		fclose(fp);
+		if( col_togl != numclasses ){
+			printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses);
+			exit(0);
+		}
+		// now read in values
+		fp = fopen(togs,"rt");
+		j=0;
+		usedclasses = 0;
+		while( fscanf(fp,"%i", &val) != EOF ){
+			col_tog[j] = val;
+			col_tog_cpy[j] = val;
+			j++;
+			if( val == 1 ) usedclasses++;
+			if( val != 1 && val != 0 ){
+				printf("Error: choose the toggle values to be 0 or 1\n");
+				exit(0);
+			}
+		}
+		fclose(fp);
+	}
+
+	for(n=0; n< numbk; n++){
+		bklist[n].z = 1.0;
+	}
+	beta_update();
+	if( verbose != 0 ) printf("   Indexing genes in first pass\n");
+	index_genes();
+}
+
+/*
+ * make initial guesses of z-values for a second, etc motif
+ * recount number of used genes per peak to those with erase > 0.5
+ */
+void reinitialize( long *seed ){
+	int m,n;
+	for(m=0; m< numpeaks; m++){
+		peaklist[m].numused = 0;
+		for(n=0; n< peaklist[m].numgenes; n++){
+			peaklist[m].genelist[n].z = 0.0;
+			if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){
+				peaklist[m].numused++;
+			}
+		}
+		if( peaklist[m].numused > 0 ){
+			n = assign_gene( m, seed );
+			peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
+		}
+	}
+	for(n=0; n< numbk; n++){
+		bklist[n].z = 1.0;
+	}
+}
+
+/*
+ * histogram background into beta
+ */
+void beta_update(){
+	int j,k,n;
+	double tot=0.0;
+	bkhist_update();
+	for(j=0; j< numclasses; j++){
+//		if( col_tog[j] == 1 ){
+			Beta[j] = 0.0;
+			for(k=0; k< 5; k++){
+				beta[j][k] = hist[1][j][k]*mu;
+				Beta[j]+= beta[j][k];
+			}
+//		}
+	}
+}
+
+/*
+ * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist
+ */
+void index_genes(){
+	int m,n,nu;
+	int found;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				found = 0;
+				nu = 0;
+				while(nu< numbk && found == 0){
+					if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){
+						peaklist[m].genelist[n].index = nu;
+						found = 1;
+					}
+					nu++;
+				}
+			}
+		}
+	}
+}
+
+/*
+ * update motif model histogram
+ */
+void hist_update(){
+	int j,k,m,n;
+	double tot=0;
+	for(j=0; j< numclasses; j++){
+//		if( col_tog[j] == 1 ){
+			for(k=0; k< 5; k++){
+				hist[0][j][k] = 0.0;
+			}
+//		}
+	}
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
+				for(j=0; j< numclasses; j++){
+//					if( col_tog[j] == 1 ){
+						k = peaklist[m].genelist[n].values[j]+2;
+						hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
+//					}
+				}
+			}
+		}
+	}
+	for(j=0; j< numclasses; j++){
+//		if( col_tog[j] == 1 ){
+			for(k=0; k< 5; k++){
+				hist[0][j][k]+= beta[j][k];
+				hist[0][j][k]/= (tot+Beta[j]);;
+			}
+//		}
+	}
+}
+
+/*
+ * update background hist using z values from bklist
+ */
+void bkhist_update(){
+	int j,k,n;
+	double tot=0.0;
+	for(j=0; j< numclasses; j++){
+//		if( col_tog[j] == 1 ){
+			for(k=0; k< 5; k++){
+				hist[1][j][k] = 0;
+			}
+//		}
+	}
+	for(n=0; n< numbk; n++){
+		if( bklist[n].used == 1 ){
+			tot+=bklist[n].z;
+			for(j=0; j< numclasses; j++){
+//				if( col_tog[j] == 1 ){
+					k = bklist[n].values[j]+2;
+					hist[1][j][k]+= bklist[n].z;
+//				}
+			}
+		}
+	}
+	for(j=0; j< numclasses; j++){
+//		if( col_tog[j] == 1 ){
+			for(k=0; k< 5; k++){
+				hist[1][j][k]/= tot;
+			}
+//		}
+	}
+}
+
+/*
+ * update lambda value
+ */
+void lambda_update(){
+	int m,n;
+	double tot=0.0;
+	lambda=0;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				lambda+= peaklist[m].genelist[n].z;
+				tot++;
+			}
+		}
+	}
+	lambda/=tot;
+}
+
+/*
+ * update the toggle columns based on which columns have better than average relative entropy
+ */
+void toggle_update(){
+	int j;
+	double avg=0.0;
+	avg = rsca*rel_entropy();
+	usedclasses=0;
+	for(j=0; j< numclasses; j++){
+		if( col_tog[j] == 1 ){
+			if( col_rel_entropy(j) < avg ) col_tog[j] = 0;
+			if( col_tog[j] == 1 ) usedclasses++;
+		}
+	}
+}
+
+/*
+ * reset col_tog values with their original ones
+ */
+void toggle_reset(){
+	int j;
+	usedclasses=0;
+	for(j=0; j< numclasses; j++){
+		col_tog[j] = col_tog_cpy[j];
+		if( col_tog[j] == 1 ) usedclasses++;
+	}
+}
+
+/*
+ * update z
+ */
+void z_update(){
+	int j,m,n;
+	double a, c, denom;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				// new z value for motif model
+				a = lprob(m,n,0)+log(lambda);
+				c = lprob(m,n,1)+log(1-lambda);
+				if( a> c ){
+					denom = a + log(1+exp(c-a));
+				}
+				else{
+					denom = c + log(1+exp(a-c));
+				}
+				peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom );
+	//			peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) );
+				// new z value for corresponding gene in background model
+				bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z;
+			}
+		}
+	}
+}
+
+/*
+ * update erase parameters 
+ */
+void erase_update(){
+	int j,m,n;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z);
+			}
+		}
+	}
+}
+
+/*
+ * calculate (log) (multinomial) probability of sequence m,n in model l
+ */
+double lprob( int m, int n, int l ){
+	int j,k;
+	double val=0.0;
+	for(j=0; j< numclasses; j++){
+		if( col_tog[j] == 1 ){
+			k = peaklist[m].genelist[n].values[j]+2;
+			val+= log(hist[l][j][k]);
+		}
+	}
+	return val;
+}
+double prob( int m, int n, int l ){
+	int j,k;
+	double val=1.0;
+	for(j=0; j< numclasses; j++){
+		if( col_tog[j] == 1 ){
+			k = peaklist[m].genelist[n].values[j]+2;
+			val*= hist[l][j][k];
+		}
+	}
+	return val;
+}
+
+/*
+ * copy hist into oldhist before updating
+ */
+void copy_hist(){
+	int j,k,l;
+	for(l=0; l< 2; l++){
+		for(j=0; j< numclasses; j++){
+//			if( col_tog[j] == 1 ){
+				for(k=0; k< 5; k++){
+					oldhist[l][j][k] = hist[l][j][k];
+				}
+//			}
+		}
+	}
+}
+
+/*
+ * compute change in hist
+ */
+double hist_diff(){
+	int j,k,l;
+	double val=0.0;
+	for(l=0; l< 2; l++){
+		for(j=0; j< numclasses; j++){
+			if( col_tog[j] == 1 ){
+				for(k=0; k< 5; k++){
+					val+= fabs( oldhist[l][j][k]-hist[l][j][k] );
+				}
+			}
+		}
+	}
+	return val;
+}
+
+/*
+ * calculate the score of a sequence
+ */
+double score( int m, int n ){
+	int j,k;
+	double val=0.0;
+	for(j=0; j< numclasses; j++){
+		if( col_tog[j] == 1 ){
+			k = peaklist[m].genelist[n].values[j]+2;
+			val+= log( hist[0][j][k]/hist[1][j][k] );
+		}
+	}
+	return val;
+}
+
+/*
+ * calculate value of t (threshold)
+ */
+void t_update(){
+	t = tscale*log( (1-lambda)/lambda );
+}
+
+/*
+ * quality scores of a motif, averaging over only those sequences picked as targets
+ *   ElogL: log-likelihood of the combination of models, averaged by # genes
+ *   ratio_score: log-likelihood ratio of motif vs background models
+ *   rel_entropy: relative entropy of motif to background
+ *   col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols)
+ *   chisq: chi-square statistic
+ */
+double ElogL(){
+	int m,n;
+	double val=0.0, tot=0.0;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				val+=   peaklist[m].genelist[n].z*lprob(m, n, 0 )    \
+				      + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\
+				      + peaklist[m].genelist[n].z*log(lambda)       \
+				      + (1-peaklist[m].genelist[n].z)*log(1-lambda);
+				tot++;
+			}
+		}
+	}
+	return val/tot;
+}
+
+double ratio_score(){
+	int m,n;
+	double val=0.0;
+	double tot=0.0;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].assigned == 1 ){
+				val+= lprob(m,n,0)-lprob(m,n,1);
+				tot++;
+			}
+		}
+	}
+	return val/tot;
+}
+
+double rel_entropy(){
+	int j,k;
+	double val=0.0;
+	for(j=0; j< numclasses; j++){
+		if( col_tog[j] == 1 ){
+			val+= col_rel_entropy( j );
+		}
+	}
+	return val/usedclasses;
+}
+
+double col_rel_entropy( int j ){
+	int k;
+	double val=0.0;
+	for(k=0; k< 5; k++){
+		if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){
+			// do in log base 2 for bits
+			val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2);
+		}
+	}
+	return val;
+}
+
+double chisq(){
+	int j, k;
+	double val=0.0;
+	for(j=0; j< numclasses; j++){
+		if( col_tog[j] == 1 ){
+			for(k=0; k< 5; k++){
+				val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k];
+			}
+		}
+	}
+	return val;
+}
+	
+/*
+ * number of members of a motif model
+ */
+int motif_count(){
+	int m,n;
+	int tot=0;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].assigned == 1 ){
+				tot++;
+			}
+		}
+	}
+	return tot;
+}
+
+/*
+ * assign a random used gene: one that is used and hasn't been erased
+ */
+int assign_gene( int m, long *seed ){
+	int n;
+	n = floor( peaklist[m].numgenes*ran1(seed) );
+	while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){
+		n = floor( peaklist[m].numgenes*ran1(seed) );
+	}
+	return n;
+}
+
+/*
+ * score all genes using the models
+ */
+void score_genes(){
+	int m,n;
+	for(m=0; m< numpeaks; m++){
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				peaklist[m].genelist[n].score= score( m, n );
+			}
+		}
+	}	
+}
+
+/*
+ * choose targets using the scores, and ZOOPS setting
+ * zoops = 1 -> 0 or 1 target/strand
+ * else -> all above threshold
+ */
+void choose_targets(){
+	int m,n;
+	int best;
+	for(m=0; m< numpeaks; m++){
+		peaklist[m].assgngene = 0;
+		best=-1;
+		for(n=0; n< peaklist[m].numgenes; n++){
+			if( peaklist[m].genelist[n].used == 1 ){
+				if( peaklist[m].genelist[n].score > t ){
+					peaklist[m].genelist[n].assigned = 1;
+					peaklist[m].assgngene++;
+				}
+				else peaklist[m].genelist[n].assigned = 0;
+				if( best == -1 ) best = n;
+				else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n;
+			}
+		}
+		if( zoops == 1 ){
+			for(n=0; n< peaklist[m].numgenes; n++){
+				if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0;
+			}
+			if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1;
+			else peaklist[m].assgngene = 0;
+		}
+	}
+}
+
+/*
+ * functions to print to output (fp must already be opened for top two)
+ */
+void print_peak( FILE *fp, int m ){
+	int n;
+	fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info);
+	for(n=0; n< peaklist[m].numgenes; n++){
+		if( printall == 0 ){
+			if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
+				print_gene( fp, m, n );
+			}
+		}
+		if( printall == 1 ){
+			if( peaklist[m].genelist[n].used == 1 ){
+				print_gene( fp, m, n );
+			}
+		}
+	}
+}
+
+void print_gene( FILE *fp, int m, int n ){
+	int j;
+	if( peaklist[m].genelist[n].used == 1 ){
+		fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense);
+		for(j=0; j< numclasses; j++){
+			if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]);
+			else fprintf(fp," %i", peaklist[m].genelist[n].values[j]);
+		}
+		fprintf(fp,"\n");
+	}
+}
+
+void print_state(FILE *fp, char *filename){
+	int m;
+	fp = fopen(filename,"w");
+	for(m=0; m< numpeaks; m++){
+		print_peak(fp, m);
+	}
+	fclose(fp);
+}
+
+/*
+ * this prints a list of <region id> <name>
+ */
+/****************
+void print_targets( FILE *fp, char *filename ){
+	int m,n,nu;
+	fp = fopen(filename,"w");
+	for(m=0; m< numpeaks; m++){
+		if( peaklist[m].assgngene > 0 ){
+			fprintf(fp,"%s ", peaklist[m].regid);
+			n=0;
+			nu=0;
+			while( n< peaklist[m].numgenes ){
+				if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
+					if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name);
+					else fprintf(fp,"%s\n", peaklist[m].genelist[n].name);
+					nu++;
+				}
+				n++;
+			}
+		}
+	}
+	fclose(fp);
+}
+*************/
+
+/*
+ * print the motif model and its score
+ */
+void print_model( FILE *fp, char *filename ){
+	int j,k;
+	fp = fopen(filename,"w");
+	fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() );
+	fprintf(fp,"             ++       +       0       -      --\n");
+	for(j=0; j< numclasses; j++){
+		if( j< 10 ) fprintf(fp," ");
+		if( col_tog[j] == 0 ) fprintf(fp,"#");
+		else fprintf(fp," ");
+		fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) );
+		for(k=4; k>=0; k--){
+			if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) );
+			else fprintf(fp,"      NA");
+		}
+		fprintf(fp,"\n");
+	}
+	fclose(fp);
+}
+
+/*
+ * prints a double with 3 digits, aligned by spaces
+ * longest would be -999.000, so 8 characters
+ * shortest would be 1.000, 5 characters
+ * up to 3 spaces
+ */
+char *nice_d( double val ){
+	char *returnval;
+	char *dval;
+	char *spaces;
+	int i, numspaces=0;
+
+	returnval = (char *) malloc(10*sizeof(char));	
+	dval = (char *) malloc(10*sizeof(char));
+	spaces = (char *) malloc(10*sizeof(char));
+	
+	sprintf(dval,"%.3f", val);
+	
+	if( val >= 0 ) numspaces++;
+	if( fabs(val) < 100 ){
+		if( fabs(val) < 10 ) numspaces+=2;
+		else numspaces++;
+	}
+	if( numspaces == 0 ) sprintf(spaces,"");
+	if( numspaces == 1 ) sprintf(spaces," ");
+	if( numspaces == 2 ) sprintf(spaces,"  ");
+	if( numspaces == 3 ) sprintf(spaces,"   ");
+	sprintf(returnval,"%s%s", spaces, dval);
+	return returnval;
+}
+
+/*
+ * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc
+ */
+/**************
+void print_annotpeak( FILE *fp, char *filename, int motif ){
+	int m;
+	fp = fopen(filename,"w");
+	for(m=0; m< numpeaks; m++){
+		if( peaklist[m].assgngene > 0 ){
+			fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end );
+			fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height );
+			fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n");
+		}
+	}
+	close(fp);
+}
+*************/
+
author	mmaiensc
date	Wed, 29 Feb 2012 15:03:33 -0500
parents
children