comparison GALAXY_FILES/tools/EMBER/Ember_Galaxy.c @ 3:037c3edda16e

Uploaded
author mmaiensc
date Thu, 22 Mar 2012 13:49:52 -0400
parents 003f802d4c7d
children
comparison
equal deleted inserted replaced
2:1a84b8178b45 3:037c3edda16e
1 /*
2 * ember.c : Expectation Maximization of Binding and Expression pRofiles
3 * ----------------------------------------------------------------
4 * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994).
5 * See Eqs. 3-13, plus "Implementation of MM" section for detailed description
6 * changes are as follows:
7 * background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts
8 * background model is determined by full list of genes (second input); z-values of genes in first input are subtracted
9 * update of background can be turned off by commenting out 8th line of z_update()
10 * in this case, also comment out index_genes at end of initialize(), as it will save some time
11 * sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities
12 * a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm,
13 * only adds a restriction on which genes are chosen as targets in choose_targets()
14 * general notes:
15 * the common dimensions are:
16 * numpeaks - number of peaks, size of peaklist, indexed by m
17 * peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n
18 * numclasses - number of comparisons made (i.e. motif width), indexed by j
19 * 5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k
20 * 2 - number of models (motif=0 and background=1), indexed by l
21 * nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased)
22 * col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions)
23 *
24 * things to add/consider:
25 * - best measure of motif quality score?
26 * - want to ensure that the same sequence can't get assigned to more than one motif?
27 */
28
29 #include "all.h"
30
31 typedef struct {
32 char id[50];
33 char name[500];
34 char chrom[50];
35 long start;
36 long end;
37 int sense;
38 double z;
39 double erase;
40 double score;
41 int index; // will refer to index of this id in bklist
42 int used; // set to 0 if a non-classified gene
43 int assigned; // set to 1 if assigned
44 int *values;
45 } gene;
46
47 typedef struct {
48 char info[5000];
49 char chrom[50];
50 long pkposn;
51 int numgenes;
52 int numused;
53 int assgngene; // set to # targets picked for this gene
54 gene *genelist;
55 } peak;
56
57 // FREE PARAMETERS TO CHOOSE IN COMMAND LINE
58 double mu; // scale background histogram by mu for psuedocounts
59 double tscale; // scales the threshold (to make choices more or less stringent)
60 int zoops; // set to 1 to enforce ZOOPS model (Zero Or One Per Strand)
61 int printall; // set to 1 to print all genes, not just targets
62 int mots; // number of motifs to search for
63 int repeat; // number of times to wean columns and repeat search
64 double rsca; // percent of average to set relative entropy cutoff
65 // END PARAMETERS TO CHOOSE
66
67 char *input, *background, *pref, *togs;
68 char *target_out, *list_out, *model_out, *annot_out;
69 long iseedval;
70 int numpeaks, numclasses, usedclasses, numbk, bkused, nmots;
71 peak *peaklist;
72 gene *bklist;
73 double ***hist, ***oldhist, lambda, t, **beta;
74 double *Beta;
75 int *col_tog, col_togl, set_togs, *col_tog_cpy;
76 int verbose;
77
78 void read_cmd_line( int argc, char **argv );
79 void print_usage();
80 void read_data( char *filename );
81 void read_bkgrnd( char *filename );
82 void initialize( long *seed );
83 void reinitialize( long *seed );
84 void beta_update();
85 void index_genes();
86
87 void hist_update();
88 void bkhist_update();
89 void lambda_update();
90 void toggle_update();
91 void toggle_reset();
92 void z_update();
93 void erase_update();
94 double lprob( int m, int n, int l );
95 double prob( int m, int n, int l );
96 void copy_hist();
97 double hist_diff();
98 double score( int m, int n );
99 void t_update();
100 double ElogL();
101 double ratio_score();
102 double rel_entropy();
103 double col_rel_entropy( int j );
104 double chisq();
105 int motif_count();
106
107 int assign_gene( int m, long *seed );
108 void score_genes();
109 void choose_targets();
110
111 void print_peak( FILE *fp, int i );
112 void print_gene( FILE *fp, int i, int j );
113 void print_state( FILE *fp, char *filename );
114 //void print_targets( FILE *fp, char *filename );
115 void print_model( FILE *fp, char *filename );
116 //void print_annotpeak( FILE *fp, char *filename, int motif );
117 char *nice_d( double val );
118
119 int main(int argc, char *argv[]){
120 int i,iter,r;
121 int pfreq;
122 double diff, conv = 1e-6;
123 long seed;
124 FILE *fout;
125
126 if(argc == 1 ){
127 print_usage();
128 }
129
130 /*
131 * read command line params
132 */
133 read_cmd_line( argc, argv );
134
135 /*
136 * allocate output names
137 */
138 // target_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
139 // list_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
140 // model_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
141 // annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
142
143 /*
144 * read in data, copy values from global to local variables
145 */
146 if( verbose != 0 ) printf("\nReading data and allocating\n");
147 read_data( input );
148 read_bkgrnd( background );
149 nmots = mots;
150 seed = iseedval;
151
152 /*
153 * allocate memory for other arrays
154 */
155 hist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
156 oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
157 beta = (double **) alloc2d(sizeof(double),numclasses,5);
158 Beta = (double *) malloc(numclasses*sizeof(double));
159
160 /*
161 * find nmots motif models
162 */
163 for(i=1; i<= nmots; i++){
164 /*
165 * initialize, fill initial parameter values from z-guesses
166 */
167 if( verbose != 0 ) printf("Expression Pattern %i,\n Initializing\n", i);
168 if( i==1 ) initialize( &seed );
169 else reinitialize( &seed );
170 hist_update();
171 lambda_update();
172
173 /*
174 * now sample for awhile, repeating repeat times
175 * each repeat starts at the previously refined search (?)
176 */
177 for(r=0; r<= repeat; r++){
178 if( verbose != 0 ) printf(" Sampling\n");
179 if( verbose != 0 && repeat> 0 ) printf(" Repeat %i\n", r );
180 diff = 1;
181 pfreq = 10;
182 iter=1;
183 while( diff > conv ){
184 z_update(); // E step
185 copy_hist();
186 hist_update(); // M step
187 bkhist_update(); // M step
188 lambda_update(); // M step
189 diff = hist_diff();
190 if( iter%pfreq == 0 && verbose != 0 ) printf(" iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff);
191 iter++;
192 }
193 if( repeat > 0 && r< repeat ) toggle_update();
194 }
195
196 /*
197 * score and choose final targets
198 */
199 t_update();
200 if( verbose != 0 ) printf(" Done, scoring and choosing targets on threshold %.3f\n\n", t);
201 score_genes();
202 choose_targets();
203
204 /*
205 * print final targets
206 */
207 // sprintf( target_out,"%s-%i.targets", pref, i);
208 // sprintf( list_out,"%s-%i.list", pref, i);
209 // sprintf( model_out,"%s-%i.model", pref, i);
210 // sprintf( annot_out,"%s-%i.xls", pref, i);
211 print_state(fout, target_out);
212 print_model(fout, model_out);
213 // print_targets(fout, list_out);
214 // print_annotpeak( fout, annot_out, i);
215
216 /*
217 * erase found motifs (only if we're going around again)
218 */
219 if( i< nmots ) erase_update();
220 if( repeat > 0 ) toggle_reset();
221 }
222
223 }
224
225 /*==================== FUNCTIONS ======================*/
226
227 /*
228 * read command line inputs and options
229 */
230 void read_cmd_line( int argc, char **argv ){
231 int i, j;
232 int found_reqs=0;
233
234 // default values
235 mots = 1;
236 zoops = 0;
237 printall = 0;
238 mu = 0.1;
239 tscale = 1.0;
240 iseedval = make_seed();
241 set_togs = 0;
242 verbose = 1;
243 repeat = 0;
244 rsca = 1.0;
245
246 // parse options
247 for(i=1; i< argc; i++){
248 if( argv[i][0] != '-' ) print_usage();
249 i++;
250 if( i >= argc || argv[i][0] == '-' ) print_usage();
251 switch( argv[i-1][1] ){
252 case 'i':
253 input = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
254 strcpy(input, argv[i]);
255 found_reqs++;
256 break;
257 case 'b':
258 background = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
259 strcpy(background, argv[i]);
260 found_reqs++;
261 break;
262 case 'o':
263 target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
264 strcpy(target_out, argv[i]);
265 found_reqs++;
266 break;
267 case 'p':
268 model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
269 strcpy(model_out, argv[i]);
270 found_reqs++;
271 break;
272 case 'n':
273 mots = atoi(argv[i]);
274 if( mots <= 0 ){
275 printf("\nError: choose number of expression patterns to be at least 1\n");
276 print_usage();
277 }
278 break;
279 case 'z':
280 if( argv[i][0] == 'y' ) zoops = 1;
281 else if( argv[i][0] == 'n' ) zoops = 0;
282 else{
283 printf("\nError: if using -z option, choose y or n\n");
284 print_usage();
285 }
286 break;
287 case 'v':
288 if( argv[i][0] == 'y' ) verbose = 1;
289 else if( argv[i][0] == 'n' ) verbose = 0;
290 else{
291 printf("\nError: if using -v option, choose y or n\n");
292 print_usage();
293 }
294 break;
295 case 'm':
296 mu = atof(argv[i]);
297 if( mu<= 0 ){
298 printf("\nError: choose mu to be positive\n");
299 print_usage();
300 }
301 break;
302 case 't':
303 tscale = atof(argv[i]);
304 if( tscale<= 0 ){
305 printf("\nError: choose threshold to be positive\n");
306 print_usage();
307 }
308 break;
309 case 's':
310 iseedval = -1*atol(argv[i]);
311 if( iseedval>= 0 ){
312 printf("\nError: choose seed to be a large positive integer\n");
313 print_usage();
314 }
315 break;
316 case 'c':
317 set_togs = 1;
318 togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
319 strcpy( togs, argv[i] );
320 break;
321 case 'a':
322 if( argv[i][0] == 'y' ) printall = 1;
323 else if( argv[i][0] == 'n' ) printall = 0;
324 else{
325 printf("\nError: if using -a option, choose y or n\n");
326 print_usage();
327 }
328 break;
329 case 'r':
330 repeat = atoi(argv[i]);
331 if( repeat< 0 ){
332 printf("\nError: choose repeat to be a non-negative integer\n");
333 print_usage();
334 }
335 break;
336 case 'R':
337 rsca = atof(argv[i]);
338 if( rsca <= 0 ){
339 printf("\nError: choose repeat scalar to be positive\n");
340 print_usage();
341 }
342 break;
343 default:
344 printf("\nError: unkown option: -%c\n", argv[i-1][1]);
345 print_usage();
346 }
347 }
348 // double check that the correct number of required inputs has been read in
349 if( found_reqs != 4 ){
350 printf("\nError: improper number of required inputs found\n");
351 print_usage();
352 }
353 }
354
355 /*
356 * usage info
357 */
358 void print_usage(){
359 printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n");
360 printf("Options:\n");
361 printf(" -n number of expression patterns to find (positive integer, default 1)\n");
362 printf(" -z enforce ZOOPS model (y or n, default n)\n");
363 printf(" -m scalar for psuedocounts (positive real number, default 0.1)\n");
364 printf(" -t scalar for threshold (positive real number, default 1.0)\n");
365 printf(" -s seed (positive long integer, default chosen from date and time)\n");
366 printf(" -c column toggle file, used to ignore certain classifications (default: all on)\n");
367 printf(" -v verboseness, turn on/off printing to stdout (y or n, default: y)\n");
368 printf(" -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n");
369 printf(" -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n");
370 printf(" than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n");
371 printf(" -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n");
372 printf(" by this value to set the threshold for turning off columns (positive real number, default 1.0)\n");
373 printf("\n");
374 exit(0);
375 }
376
377 /*
378 * function: read data from filename into the peaklist data structure
379 * notes on assumed format of data:
380 * peak lines start with "PEAK:" field
381 * gene lines start with "GENE:" field
382 * gene lines with only 1 classification are ignored
383 * assume all fields are <5000 characters
384 */
385 void read_data( char *filename ){
386 int j,m,n;
387 char val[5000], other[5000];
388 FILE *fp;
389
390 /*
391 * first count number of peaks, and allocate
392 */
393 fp = fopen(filename,"rt");
394 if( fp == NULL ){
395 printf("Error: can't open file %s\n", filename);
396 exit(0);
397 }
398 numpeaks = 0;
399 while( fscanf(fp, "%s", &val ) != EOF ){
400 if( contains( val, "PEAK" ) == 1 ){
401 numpeaks++;
402 }
403 }
404 fclose(fp);
405 peaklist = (peak *) malloc(numpeaks*sizeof(peak));
406
407 /*
408 * now count number of genes w/>1 classification, and allocate
409 * also count number of classifications, and allocate that at the end
410 * n tracks number of genes, j field; subtract genes that don't have full classifications
411 * m tracks peak index; record peak info
412 */
413 fp = fopen(filename,"rt");
414 n=-1;
415 m=-1;
416 j=0;
417 sprintf(other,"NA");
418 numclasses=0;
419 while( fscanf(fp, "%s", &val ) != EOF ){
420 if( contains( val, "PEAK" ) == 1 ){
421 if( j==7 ) ;
422 else{
423 if( j-6 > numclasses ) numclasses = j-6;
424 }
425 if( m>=0 ){
426 peaklist[m].numgenes = n;
427 peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
428 }
429 n=0;
430 j=0;
431 if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other);
432 m++;
433 sprintf(other,"NA");
434 }
435 else if( contains( val, "GENE" ) == 1 ){
436 n++;
437 j=0;
438 }
439 else if( n==0 ){
440 if( j==0 ) strcpy(peaklist[m].chrom, val);
441 if( j==1 ) peaklist[m].pkposn = atol(val);
442 if( j> 1 ){
443 if( strcmp(other, "NA") == 0 ) strcpy(other, val);
444 else sprintf(other,"%s %s", other, val);
445 }
446 j++;
447 }
448 else{
449 j++;
450 }
451 }
452 peaklist[m].numgenes = n;
453 peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
454 strcpy( peaklist[m].info, other);
455 fclose(fp);
456
457 for(m=0; m< numpeaks; m++){
458 for(n=0; n< peaklist[m].numgenes; n++){
459 peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int));
460 }
461 }
462
463 /*
464 * now read in gene info
465 */
466 fp = fopen(filename,"rt");
467 m=-1;
468 j=0;
469 while( fscanf(fp, "%s", &val ) != EOF ){
470 if( contains( val, "PEAK" ) == 1 ){
471 if( j==7 ) peaklist[m].genelist[n].used = 0;
472 if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
473 n=-1;
474 j=0;
475 m++;
476 }
477 else if( contains( val, "GENE" ) == 1 ){
478 if( j==7 ) peaklist[m].genelist[n].used = 0;
479 if( j==6+numclasses ) peaklist[m].genelist[n].used = 1;
480 n++;
481 j=0;
482 }
483 else if( n==-1 ){
484 }
485 else {
486 if( j==0 ) strcpy( peaklist[m].genelist[n].id, val );
487 if( j==1 ) strcpy( peaklist[m].genelist[n].name, val );
488 if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val );
489 if( j==3 ) peaklist[m].genelist[n].start = atol(val);
490 if( j==4 ) peaklist[m].genelist[n].end = atol(val);
491 if( j==5 ) peaklist[m].genelist[n].sense = atoi(val);
492 if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val);
493 j++;
494 }
495 }
496 if( j==7 ) peaklist[m].genelist[n].used = 0;
497 if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
498 fclose(fp);
499
500 /*
501 * now count how many genes are to be used for each
502 */
503 for(m=0; m< numpeaks; m++){
504 peaklist[m].numused = 0;
505 for(n=0; n< peaklist[m].numgenes; n++){
506 peaklist[m].numused+= peaklist[m].genelist[n].used;
507 }
508 }
509 }
510
511 void read_bkgrnd( char *filename ){
512 int j,n;
513 char val[500];
514 FILE *fp;
515
516 /*
517 * first count number of genes, and allocate
518 */
519 numbk = count_lines( fp, filename );;
520 bklist = (gene *) malloc(numbk*sizeof(gene));
521
522 for(n=0; n< numbk; n++){
523 bklist[n].values = (int *) malloc(numclasses*sizeof(int));
524 }
525
526 /*
527 * now read in gene info
528 */
529 fp = fopen(filename,"rt");
530 n=-1;
531 j=0;
532 while( fscanf(fp, "%s", &val ) != EOF ){
533 if( contains( val, "_a" ) == 1 ){
534 if( j==7 ) bklist[n].used = 0;
535 if( j==6+numclasses ) bklist[n].used = 1;
536 j=1;
537 n++;
538 strcpy( bklist[n].id, val );
539 }
540 else{
541 if( j==1 ) strcpy( bklist[n].name, val );
542 if( j==2 ) strcpy( bklist[n].chrom, val );
543 if( j==3 ) bklist[n].start = atol(val);
544 if( j==4 ) bklist[n].end = atol(val);
545 if( j==5 ) bklist[n].sense = atoi(val);
546 if( j>=6 ) bklist[n].values[j-6] = atoi(val);
547 j++;
548 }
549 }
550 if( j==7 ) bklist[n].used = 0;
551 if( j==6+numclasses) bklist[n].used = 1;
552 fclose(fp);
553
554 /*
555 * now count how many genes are to be used for each
556 */
557 bkused = 0;
558 for(n=0; n< numbk; n++){
559 bkused+= bklist[n].used;
560 }
561 }
562
563 /*
564 * make initial guesses of z-values, set erase to 1
565 * read in column toggle file if provided
566 * also histogram background and scale into beta
567 */
568 void initialize( long *seed ){
569 int j,m,n;
570 int val;
571 FILE *fp;
572
573 for(m=0; m< numpeaks; m++){
574 for(n=0; n< peaklist[m].numgenes; n++){
575 if( peaklist[m].genelist[n].used == 1 ){
576 peaklist[m].genelist[n].erase = 1.0;
577 peaklist[m].genelist[n].z = 0.0;
578 }
579 }
580 if( peaklist[m].numused > 0 ){
581 n = assign_gene( m, seed );
582 peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
583 }
584 }
585
586 col_tog = (int *) malloc(numclasses*sizeof(int));
587 col_tog_cpy = (int *) malloc(numclasses*sizeof(int));
588 if( set_togs == 0 ){
589 for(j=0; j< numclasses; j++){
590 col_tog[j] = 1;
591 col_tog_cpy[j] = 1;
592 }
593 usedclasses = numclasses;
594 }
595 else{
596 // first count number of fields
597 col_togl = 0;
598 fp = fopen(togs,"rt");
599 if( fp == NULL ){
600 printf("Error: can't open file %s\n", togs);
601 exit(0);
602 }
603 j=0;
604 while( fscanf(fp,"%i", &val) != EOF ){
605 col_togl++;
606 }
607 fclose(fp);
608 if( col_togl != numclasses ){
609 printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses);
610 exit(0);
611 }
612 // now read in values
613 fp = fopen(togs,"rt");
614 j=0;
615 usedclasses = 0;
616 while( fscanf(fp,"%i", &val) != EOF ){
617 col_tog[j] = val;
618 col_tog_cpy[j] = val;
619 j++;
620 if( val == 1 ) usedclasses++;
621 if( val != 1 && val != 0 ){
622 printf("Error: choose the toggle values to be 0 or 1\n");
623 exit(0);
624 }
625 }
626 fclose(fp);
627 }
628
629 for(n=0; n< numbk; n++){
630 bklist[n].z = 1.0;
631 }
632 beta_update();
633 if( verbose != 0 ) printf(" Indexing genes in first pass\n");
634 index_genes();
635 }
636
637 /*
638 * make initial guesses of z-values for a second, etc motif
639 * recount number of used genes per peak to those with erase > 0.5
640 */
641 void reinitialize( long *seed ){
642 int m,n;
643 for(m=0; m< numpeaks; m++){
644 peaklist[m].numused = 0;
645 for(n=0; n< peaklist[m].numgenes; n++){
646 peaklist[m].genelist[n].z = 0.0;
647 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){
648 peaklist[m].numused++;
649 }
650 }
651 if( peaklist[m].numused > 0 ){
652 n = assign_gene( m, seed );
653 peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
654 }
655 }
656 for(n=0; n< numbk; n++){
657 bklist[n].z = 1.0;
658 }
659 }
660
661 /*
662 * histogram background into beta
663 */
664 void beta_update(){
665 int j,k,n;
666 double tot=0.0;
667 bkhist_update();
668 for(j=0; j< numclasses; j++){
669 // if( col_tog[j] == 1 ){
670 Beta[j] = 0.0;
671 for(k=0; k< 5; k++){
672 beta[j][k] = hist[1][j][k]*mu;
673 Beta[j]+= beta[j][k];
674 }
675 // }
676 }
677 }
678
679 /*
680 * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist
681 */
682 void index_genes(){
683 int m,n,nu;
684 int found;
685 for(m=0; m< numpeaks; m++){
686 for(n=0; n< peaklist[m].numgenes; n++){
687 if( peaklist[m].genelist[n].used == 1 ){
688 found = 0;
689 nu = 0;
690 while(nu< numbk && found == 0){
691 if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){
692 peaklist[m].genelist[n].index = nu;
693 found = 1;
694 }
695 nu++;
696 }
697 }
698 }
699 }
700 }
701
702 /*
703 * update motif model histogram
704 */
705 void hist_update(){
706 int j,k,m,n;
707 double tot=0;
708 for(j=0; j< numclasses; j++){
709 // if( col_tog[j] == 1 ){
710 for(k=0; k< 5; k++){
711 hist[0][j][k] = 0.0;
712 }
713 // }
714 }
715 for(m=0; m< numpeaks; m++){
716 for(n=0; n< peaklist[m].numgenes; n++){
717 if( peaklist[m].genelist[n].used == 1 ){
718 tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
719 for(j=0; j< numclasses; j++){
720 // if( col_tog[j] == 1 ){
721 k = peaklist[m].genelist[n].values[j]+2;
722 hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
723 // }
724 }
725 }
726 }
727 }
728 for(j=0; j< numclasses; j++){
729 // if( col_tog[j] == 1 ){
730 for(k=0; k< 5; k++){
731 hist[0][j][k]+= beta[j][k];
732 hist[0][j][k]/= (tot+Beta[j]);;
733 }
734 // }
735 }
736 }
737
738 /*
739 * update background hist using z values from bklist
740 */
741 void bkhist_update(){
742 int j,k,n;
743 double tot=0.0;
744 for(j=0; j< numclasses; j++){
745 // if( col_tog[j] == 1 ){
746 for(k=0; k< 5; k++){
747 hist[1][j][k] = 0;
748 }
749 // }
750 }
751 for(n=0; n< numbk; n++){
752 if( bklist[n].used == 1 ){
753 tot+=bklist[n].z;
754 for(j=0; j< numclasses; j++){
755 // if( col_tog[j] == 1 ){
756 k = bklist[n].values[j]+2;
757 hist[1][j][k]+= bklist[n].z;
758 // }
759 }
760 }
761 }
762 for(j=0; j< numclasses; j++){
763 // if( col_tog[j] == 1 ){
764 for(k=0; k< 5; k++){
765 hist[1][j][k]/= tot;
766 }
767 // }
768 }
769 }
770
771 /*
772 * update lambda value
773 */
774 void lambda_update(){
775 int m,n;
776 double tot=0.0;
777 lambda=0;
778 for(m=0; m< numpeaks; m++){
779 for(n=0; n< peaklist[m].numgenes; n++){
780 if( peaklist[m].genelist[n].used == 1 ){
781 lambda+= peaklist[m].genelist[n].z;
782 tot++;
783 }
784 }
785 }
786 lambda/=tot;
787 }
788
789 /*
790 * update the toggle columns based on which columns have better than average relative entropy
791 */
792 void toggle_update(){
793 int j;
794 double avg=0.0;
795 avg = rsca*rel_entropy();
796 usedclasses=0;
797 for(j=0; j< numclasses; j++){
798 if( col_tog[j] == 1 ){
799 if( col_rel_entropy(j) < avg ) col_tog[j] = 0;
800 if( col_tog[j] == 1 ) usedclasses++;
801 }
802 }
803 }
804
805 /*
806 * reset col_tog values with their original ones
807 */
808 void toggle_reset(){
809 int j;
810 usedclasses=0;
811 for(j=0; j< numclasses; j++){
812 col_tog[j] = col_tog_cpy[j];
813 if( col_tog[j] == 1 ) usedclasses++;
814 }
815 }
816
817 /*
818 * update z
819 */
820 void z_update(){
821 int j,m,n;
822 double a, c, denom;
823 for(m=0; m< numpeaks; m++){
824 for(n=0; n< peaklist[m].numgenes; n++){
825 if( peaklist[m].genelist[n].used == 1 ){
826 // new z value for motif model
827 a = lprob(m,n,0)+log(lambda);
828 c = lprob(m,n,1)+log(1-lambda);
829 if( a> c ){
830 denom = a + log(1+exp(c-a));
831 }
832 else{
833 denom = c + log(1+exp(a-c));
834 }
835 peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom );
836 // peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) );
837 // new z value for corresponding gene in background model
838 bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z;
839 }
840 }
841 }
842 }
843
844 /*
845 * update erase parameters
846 */
847 void erase_update(){
848 int j,m,n;
849 for(m=0; m< numpeaks; m++){
850 for(n=0; n< peaklist[m].numgenes; n++){
851 if( peaklist[m].genelist[n].used == 1 ){
852 peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z);
853 }
854 }
855 }
856 }
857
858 /*
859 * calculate (log) (multinomial) probability of sequence m,n in model l
860 */
861 double lprob( int m, int n, int l ){
862 int j,k;
863 double val=0.0;
864 for(j=0; j< numclasses; j++){
865 if( col_tog[j] == 1 ){
866 k = peaklist[m].genelist[n].values[j]+2;
867 val+= log(hist[l][j][k]);
868 }
869 }
870 return val;
871 }
872 double prob( int m, int n, int l ){
873 int j,k;
874 double val=1.0;
875 for(j=0; j< numclasses; j++){
876 if( col_tog[j] == 1 ){
877 k = peaklist[m].genelist[n].values[j]+2;
878 val*= hist[l][j][k];
879 }
880 }
881 return val;
882 }
883
884 /*
885 * copy hist into oldhist before updating
886 */
887 void copy_hist(){
888 int j,k,l;
889 for(l=0; l< 2; l++){
890 for(j=0; j< numclasses; j++){
891 // if( col_tog[j] == 1 ){
892 for(k=0; k< 5; k++){
893 oldhist[l][j][k] = hist[l][j][k];
894 }
895 // }
896 }
897 }
898 }
899
900 /*
901 * compute change in hist
902 */
903 double hist_diff(){
904 int j,k,l;
905 double val=0.0;
906 for(l=0; l< 2; l++){
907 for(j=0; j< numclasses; j++){
908 if( col_tog[j] == 1 ){
909 for(k=0; k< 5; k++){
910 val+= fabs( oldhist[l][j][k]-hist[l][j][k] );
911 }
912 }
913 }
914 }
915 return val;
916 }
917
918 /*
919 * calculate the score of a sequence
920 */
921 double score( int m, int n ){
922 int j,k;
923 double val=0.0;
924 for(j=0; j< numclasses; j++){
925 if( col_tog[j] == 1 ){
926 k = peaklist[m].genelist[n].values[j]+2;
927 val+= log( hist[0][j][k]/hist[1][j][k] );
928 }
929 }
930 return val;
931 }
932
933 /*
934 * calculate value of t (threshold)
935 */
936 void t_update(){
937 t = tscale*log( (1-lambda)/lambda );
938 }
939
940 /*
941 * quality scores of a motif, averaging over only those sequences picked as targets
942 * ElogL: log-likelihood of the combination of models, averaged by # genes
943 * ratio_score: log-likelihood ratio of motif vs background models
944 * rel_entropy: relative entropy of motif to background
945 * col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols)
946 * chisq: chi-square statistic
947 */
948 double ElogL(){
949 int m,n;
950 double val=0.0, tot=0.0;
951 for(m=0; m< numpeaks; m++){
952 for(n=0; n< peaklist[m].numgenes; n++){
953 if( peaklist[m].genelist[n].used == 1 ){
954 val+= peaklist[m].genelist[n].z*lprob(m, n, 0 ) \
955 + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\
956 + peaklist[m].genelist[n].z*log(lambda) \
957 + (1-peaklist[m].genelist[n].z)*log(1-lambda);
958 tot++;
959 }
960 }
961 }
962 return val/tot;
963 }
964
965 double ratio_score(){
966 int m,n;
967 double val=0.0;
968 double tot=0.0;
969 for(m=0; m< numpeaks; m++){
970 for(n=0; n< peaklist[m].numgenes; n++){
971 if( peaklist[m].genelist[n].assigned == 1 ){
972 val+= lprob(m,n,0)-lprob(m,n,1);
973 tot++;
974 }
975 }
976 }
977 return val/tot;
978 }
979
980 double rel_entropy(){
981 int j,k;
982 double val=0.0;
983 for(j=0; j< numclasses; j++){
984 if( col_tog[j] == 1 ){
985 val+= col_rel_entropy( j );
986 }
987 }
988 return val/usedclasses;
989 }
990
991 double col_rel_entropy( int j ){
992 int k;
993 double val=0.0;
994 for(k=0; k< 5; k++){
995 if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){
996 // do in log base 2 for bits
997 val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2);
998 }
999 }
1000 return val;
1001 }
1002
1003 double chisq(){
1004 int j, k;
1005 double val=0.0;
1006 for(j=0; j< numclasses; j++){
1007 if( col_tog[j] == 1 ){
1008 for(k=0; k< 5; k++){
1009 val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k];
1010 }
1011 }
1012 }
1013 return val;
1014 }
1015
1016 /*
1017 * number of members of a motif model
1018 */
1019 int motif_count(){
1020 int m,n;
1021 int tot=0;
1022 for(m=0; m< numpeaks; m++){
1023 for(n=0; n< peaklist[m].numgenes; n++){
1024 if( peaklist[m].genelist[n].assigned == 1 ){
1025 tot++;
1026 }
1027 }
1028 }
1029 return tot;
1030 }
1031
1032 /*
1033 * assign a random used gene: one that is used and hasn't been erased
1034 */
1035 int assign_gene( int m, long *seed ){
1036 int n;
1037 n = floor( peaklist[m].numgenes*ran1(seed) );
1038 while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){
1039 n = floor( peaklist[m].numgenes*ran1(seed) );
1040 }
1041 return n;
1042 }
1043
1044 /*
1045 * score all genes using the models
1046 */
1047 void score_genes(){
1048 int m,n;
1049 for(m=0; m< numpeaks; m++){
1050 for(n=0; n< peaklist[m].numgenes; n++){
1051 if( peaklist[m].genelist[n].used == 1 ){
1052 peaklist[m].genelist[n].score= score( m, n );
1053 }
1054 }
1055 }
1056 }
1057
1058 /*
1059 * choose targets using the scores, and ZOOPS setting
1060 * zoops = 1 -> 0 or 1 target/strand
1061 * else -> all above threshold
1062 */
1063 void choose_targets(){
1064 int m,n;
1065 int best;
1066 for(m=0; m< numpeaks; m++){
1067 peaklist[m].assgngene = 0;
1068 best=-1;
1069 for(n=0; n< peaklist[m].numgenes; n++){
1070 if( peaklist[m].genelist[n].used == 1 ){
1071 if( peaklist[m].genelist[n].score > t ){
1072 peaklist[m].genelist[n].assigned = 1;
1073 peaklist[m].assgngene++;
1074 }
1075 else peaklist[m].genelist[n].assigned = 0;
1076 if( best == -1 ) best = n;
1077 else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n;
1078 }
1079 }
1080 if( zoops == 1 ){
1081 for(n=0; n< peaklist[m].numgenes; n++){
1082 if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0;
1083 }
1084 if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1;
1085 else peaklist[m].assgngene = 0;
1086 }
1087 }
1088 }
1089
1090 /*
1091 * functions to print to output (fp must already be opened for top two)
1092 */
1093 void print_peak( FILE *fp, int m ){
1094 int n;
1095 fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info);
1096 for(n=0; n< peaklist[m].numgenes; n++){
1097 if( printall == 0 ){
1098 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
1099 print_gene( fp, m, n );
1100 }
1101 }
1102 if( printall == 1 ){
1103 if( peaklist[m].genelist[n].used == 1 ){
1104 print_gene( fp, m, n );
1105 }
1106 }
1107 }
1108 }
1109
1110 void print_gene( FILE *fp, int m, int n ){
1111 int j;
1112 if( peaklist[m].genelist[n].used == 1 ){
1113 fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense);
1114 for(j=0; j< numclasses; j++){
1115 if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]);
1116 else fprintf(fp," %i", peaklist[m].genelist[n].values[j]);
1117 }
1118 fprintf(fp,"\n");
1119 }
1120 }
1121
1122 void print_state(FILE *fp, char *filename){
1123 int m;
1124 fp = fopen(filename,"w");
1125 for(m=0; m< numpeaks; m++){
1126 print_peak(fp, m);
1127 }
1128 fclose(fp);
1129 }
1130
1131 /*
1132 * this prints a list of <region id> <name>
1133 */
1134 /****************
1135 void print_targets( FILE *fp, char *filename ){
1136 int m,n,nu;
1137 fp = fopen(filename,"w");
1138 for(m=0; m< numpeaks; m++){
1139 if( peaklist[m].assgngene > 0 ){
1140 fprintf(fp,"%s ", peaklist[m].regid);
1141 n=0;
1142 nu=0;
1143 while( n< peaklist[m].numgenes ){
1144 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
1145 if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name);
1146 else fprintf(fp,"%s\n", peaklist[m].genelist[n].name);
1147 nu++;
1148 }
1149 n++;
1150 }
1151 }
1152 }
1153 fclose(fp);
1154 }
1155 *************/
1156
1157 /*
1158 * print the motif model and its score
1159 */
1160 void print_model( FILE *fp, char *filename ){
1161 int j,k;
1162 fp = fopen(filename,"w");
1163 fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() );
1164 fprintf(fp," ++ + 0 - --\n");
1165 for(j=0; j< numclasses; j++){
1166 if( j< 10 ) fprintf(fp," ");
1167 if( col_tog[j] == 0 ) fprintf(fp,"#");
1168 else fprintf(fp," ");
1169 fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) );
1170 for(k=4; k>=0; k--){
1171 if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) );
1172 else fprintf(fp," NA");
1173 }
1174 fprintf(fp,"\n");
1175 }
1176 fclose(fp);
1177 }
1178
1179 /*
1180 * prints a double with 3 digits, aligned by spaces
1181 * longest would be -999.000, so 8 characters
1182 * shortest would be 1.000, 5 characters
1183 * up to 3 spaces
1184 */
1185 char *nice_d( double val ){
1186 char *returnval;
1187 char *dval;
1188 char *spaces;
1189 int i, numspaces=0;
1190
1191 returnval = (char *) malloc(10*sizeof(char));
1192 dval = (char *) malloc(10*sizeof(char));
1193 spaces = (char *) malloc(10*sizeof(char));
1194
1195 sprintf(dval,"%.3f", val);
1196
1197 if( val >= 0 ) numspaces++;
1198 if( fabs(val) < 100 ){
1199 if( fabs(val) < 10 ) numspaces+=2;
1200 else numspaces++;
1201 }
1202 if( numspaces == 0 ) sprintf(spaces,"");
1203 if( numspaces == 1 ) sprintf(spaces," ");
1204 if( numspaces == 2 ) sprintf(spaces," ");
1205 if( numspaces == 3 ) sprintf(spaces," ");
1206 sprintf(returnval,"%s%s", spaces, dval);
1207 return returnval;
1208 }
1209
1210 /*
1211 * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc
1212 */
1213 /**************
1214 void print_annotpeak( FILE *fp, char *filename, int motif ){
1215 int m;
1216 fp = fopen(filename,"w");
1217 for(m=0; m< numpeaks; m++){
1218 if( peaklist[m].assgngene > 0 ){
1219 fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end );
1220 fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height );
1221 fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n");
1222 }
1223 }
1224 close(fp);
1225 }
1226 *************/
1227