Mercurial > repos > mmaiensc > ember
comparison GALAXY_FILES/tools/EMBER/Ember_Galaxy.c @ 3:037c3edda16e
Uploaded
author | mmaiensc |
---|---|
date | Thu, 22 Mar 2012 13:49:52 -0400 |
parents | 003f802d4c7d |
children |
comparison
equal
deleted
inserted
replaced
2:1a84b8178b45 | 3:037c3edda16e |
---|---|
1 /* | |
2 * ember.c : Expectation Maximization of Binding and Expression pRofiles | |
3 * ---------------------------------------------------------------- | |
4 * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994). | |
5 * See Eqs. 3-13, plus "Implementation of MM" section for detailed description | |
6 * changes are as follows: | |
7 * background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts | |
8 * background model is determined by full list of genes (second input); z-values of genes in first input are subtracted | |
9 * update of background can be turned off by commenting out 8th line of z_update() | |
10 * in this case, also comment out index_genes at end of initialize(), as it will save some time | |
11 * sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities | |
12 * a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm, | |
13 * only adds a restriction on which genes are chosen as targets in choose_targets() | |
14 * general notes: | |
15 * the common dimensions are: | |
16 * numpeaks - number of peaks, size of peaklist, indexed by m | |
17 * peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n | |
18 * numclasses - number of comparisons made (i.e. motif width), indexed by j | |
19 * 5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k | |
20 * 2 - number of models (motif=0 and background=1), indexed by l | |
21 * nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased) | |
22 * col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions) | |
23 * | |
24 * things to add/consider: | |
25 * - best measure of motif quality score? | |
26 * - want to ensure that the same sequence can't get assigned to more than one motif? | |
27 */ | |
28 | |
29 #include "all.h" | |
30 | |
31 typedef struct { | |
32 char id[50]; | |
33 char name[500]; | |
34 char chrom[50]; | |
35 long start; | |
36 long end; | |
37 int sense; | |
38 double z; | |
39 double erase; | |
40 double score; | |
41 int index; // will refer to index of this id in bklist | |
42 int used; // set to 0 if a non-classified gene | |
43 int assigned; // set to 1 if assigned | |
44 int *values; | |
45 } gene; | |
46 | |
47 typedef struct { | |
48 char info[5000]; | |
49 char chrom[50]; | |
50 long pkposn; | |
51 int numgenes; | |
52 int numused; | |
53 int assgngene; // set to # targets picked for this gene | |
54 gene *genelist; | |
55 } peak; | |
56 | |
57 // FREE PARAMETERS TO CHOOSE IN COMMAND LINE | |
58 double mu; // scale background histogram by mu for psuedocounts | |
59 double tscale; // scales the threshold (to make choices more or less stringent) | |
60 int zoops; // set to 1 to enforce ZOOPS model (Zero Or One Per Strand) | |
61 int printall; // set to 1 to print all genes, not just targets | |
62 int mots; // number of motifs to search for | |
63 int repeat; // number of times to wean columns and repeat search | |
64 double rsca; // percent of average to set relative entropy cutoff | |
65 // END PARAMETERS TO CHOOSE | |
66 | |
67 char *input, *background, *pref, *togs; | |
68 char *target_out, *list_out, *model_out, *annot_out; | |
69 long iseedval; | |
70 int numpeaks, numclasses, usedclasses, numbk, bkused, nmots; | |
71 peak *peaklist; | |
72 gene *bklist; | |
73 double ***hist, ***oldhist, lambda, t, **beta; | |
74 double *Beta; | |
75 int *col_tog, col_togl, set_togs, *col_tog_cpy; | |
76 int verbose; | |
77 | |
78 void read_cmd_line( int argc, char **argv ); | |
79 void print_usage(); | |
80 void read_data( char *filename ); | |
81 void read_bkgrnd( char *filename ); | |
82 void initialize( long *seed ); | |
83 void reinitialize( long *seed ); | |
84 void beta_update(); | |
85 void index_genes(); | |
86 | |
87 void hist_update(); | |
88 void bkhist_update(); | |
89 void lambda_update(); | |
90 void toggle_update(); | |
91 void toggle_reset(); | |
92 void z_update(); | |
93 void erase_update(); | |
94 double lprob( int m, int n, int l ); | |
95 double prob( int m, int n, int l ); | |
96 void copy_hist(); | |
97 double hist_diff(); | |
98 double score( int m, int n ); | |
99 void t_update(); | |
100 double ElogL(); | |
101 double ratio_score(); | |
102 double rel_entropy(); | |
103 double col_rel_entropy( int j ); | |
104 double chisq(); | |
105 int motif_count(); | |
106 | |
107 int assign_gene( int m, long *seed ); | |
108 void score_genes(); | |
109 void choose_targets(); | |
110 | |
111 void print_peak( FILE *fp, int i ); | |
112 void print_gene( FILE *fp, int i, int j ); | |
113 void print_state( FILE *fp, char *filename ); | |
114 //void print_targets( FILE *fp, char *filename ); | |
115 void print_model( FILE *fp, char *filename ); | |
116 //void print_annotpeak( FILE *fp, char *filename, int motif ); | |
117 char *nice_d( double val ); | |
118 | |
119 int main(int argc, char *argv[]){ | |
120 int i,iter,r; | |
121 int pfreq; | |
122 double diff, conv = 1e-6; | |
123 long seed; | |
124 FILE *fout; | |
125 | |
126 if(argc == 1 ){ | |
127 print_usage(); | |
128 } | |
129 | |
130 /* | |
131 * read command line params | |
132 */ | |
133 read_cmd_line( argc, argv ); | |
134 | |
135 /* | |
136 * allocate output names | |
137 */ | |
138 // target_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); | |
139 // list_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); | |
140 // model_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); | |
141 // annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); | |
142 | |
143 /* | |
144 * read in data, copy values from global to local variables | |
145 */ | |
146 if( verbose != 0 ) printf("\nReading data and allocating\n"); | |
147 read_data( input ); | |
148 read_bkgrnd( background ); | |
149 nmots = mots; | |
150 seed = iseedval; | |
151 | |
152 /* | |
153 * allocate memory for other arrays | |
154 */ | |
155 hist = (double ***) alloc3d(sizeof(double),2,numclasses,5); | |
156 oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5); | |
157 beta = (double **) alloc2d(sizeof(double),numclasses,5); | |
158 Beta = (double *) malloc(numclasses*sizeof(double)); | |
159 | |
160 /* | |
161 * find nmots motif models | |
162 */ | |
163 for(i=1; i<= nmots; i++){ | |
164 /* | |
165 * initialize, fill initial parameter values from z-guesses | |
166 */ | |
167 if( verbose != 0 ) printf("Expression Pattern %i,\n Initializing\n", i); | |
168 if( i==1 ) initialize( &seed ); | |
169 else reinitialize( &seed ); | |
170 hist_update(); | |
171 lambda_update(); | |
172 | |
173 /* | |
174 * now sample for awhile, repeating repeat times | |
175 * each repeat starts at the previously refined search (?) | |
176 */ | |
177 for(r=0; r<= repeat; r++){ | |
178 if( verbose != 0 ) printf(" Sampling\n"); | |
179 if( verbose != 0 && repeat> 0 ) printf(" Repeat %i\n", r ); | |
180 diff = 1; | |
181 pfreq = 10; | |
182 iter=1; | |
183 while( diff > conv ){ | |
184 z_update(); // E step | |
185 copy_hist(); | |
186 hist_update(); // M step | |
187 bkhist_update(); // M step | |
188 lambda_update(); // M step | |
189 diff = hist_diff(); | |
190 if( iter%pfreq == 0 && verbose != 0 ) printf(" iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff); | |
191 iter++; | |
192 } | |
193 if( repeat > 0 && r< repeat ) toggle_update(); | |
194 } | |
195 | |
196 /* | |
197 * score and choose final targets | |
198 */ | |
199 t_update(); | |
200 if( verbose != 0 ) printf(" Done, scoring and choosing targets on threshold %.3f\n\n", t); | |
201 score_genes(); | |
202 choose_targets(); | |
203 | |
204 /* | |
205 * print final targets | |
206 */ | |
207 // sprintf( target_out,"%s-%i.targets", pref, i); | |
208 // sprintf( list_out,"%s-%i.list", pref, i); | |
209 // sprintf( model_out,"%s-%i.model", pref, i); | |
210 // sprintf( annot_out,"%s-%i.xls", pref, i); | |
211 print_state(fout, target_out); | |
212 print_model(fout, model_out); | |
213 // print_targets(fout, list_out); | |
214 // print_annotpeak( fout, annot_out, i); | |
215 | |
216 /* | |
217 * erase found motifs (only if we're going around again) | |
218 */ | |
219 if( i< nmots ) erase_update(); | |
220 if( repeat > 0 ) toggle_reset(); | |
221 } | |
222 | |
223 } | |
224 | |
225 /*==================== FUNCTIONS ======================*/ | |
226 | |
227 /* | |
228 * read command line inputs and options | |
229 */ | |
230 void read_cmd_line( int argc, char **argv ){ | |
231 int i, j; | |
232 int found_reqs=0; | |
233 | |
234 // default values | |
235 mots = 1; | |
236 zoops = 0; | |
237 printall = 0; | |
238 mu = 0.1; | |
239 tscale = 1.0; | |
240 iseedval = make_seed(); | |
241 set_togs = 0; | |
242 verbose = 1; | |
243 repeat = 0; | |
244 rsca = 1.0; | |
245 | |
246 // parse options | |
247 for(i=1; i< argc; i++){ | |
248 if( argv[i][0] != '-' ) print_usage(); | |
249 i++; | |
250 if( i >= argc || argv[i][0] == '-' ) print_usage(); | |
251 switch( argv[i-1][1] ){ | |
252 case 'i': | |
253 input = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); | |
254 strcpy(input, argv[i]); | |
255 found_reqs++; | |
256 break; | |
257 case 'b': | |
258 background = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); | |
259 strcpy(background, argv[i]); | |
260 found_reqs++; | |
261 break; | |
262 case 'o': | |
263 target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); | |
264 strcpy(target_out, argv[i]); | |
265 found_reqs++; | |
266 break; | |
267 case 'p': | |
268 model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); | |
269 strcpy(model_out, argv[i]); | |
270 found_reqs++; | |
271 break; | |
272 case 'n': | |
273 mots = atoi(argv[i]); | |
274 if( mots <= 0 ){ | |
275 printf("\nError: choose number of expression patterns to be at least 1\n"); | |
276 print_usage(); | |
277 } | |
278 break; | |
279 case 'z': | |
280 if( argv[i][0] == 'y' ) zoops = 1; | |
281 else if( argv[i][0] == 'n' ) zoops = 0; | |
282 else{ | |
283 printf("\nError: if using -z option, choose y or n\n"); | |
284 print_usage(); | |
285 } | |
286 break; | |
287 case 'v': | |
288 if( argv[i][0] == 'y' ) verbose = 1; | |
289 else if( argv[i][0] == 'n' ) verbose = 0; | |
290 else{ | |
291 printf("\nError: if using -v option, choose y or n\n"); | |
292 print_usage(); | |
293 } | |
294 break; | |
295 case 'm': | |
296 mu = atof(argv[i]); | |
297 if( mu<= 0 ){ | |
298 printf("\nError: choose mu to be positive\n"); | |
299 print_usage(); | |
300 } | |
301 break; | |
302 case 't': | |
303 tscale = atof(argv[i]); | |
304 if( tscale<= 0 ){ | |
305 printf("\nError: choose threshold to be positive\n"); | |
306 print_usage(); | |
307 } | |
308 break; | |
309 case 's': | |
310 iseedval = -1*atol(argv[i]); | |
311 if( iseedval>= 0 ){ | |
312 printf("\nError: choose seed to be a large positive integer\n"); | |
313 print_usage(); | |
314 } | |
315 break; | |
316 case 'c': | |
317 set_togs = 1; | |
318 togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); | |
319 strcpy( togs, argv[i] ); | |
320 break; | |
321 case 'a': | |
322 if( argv[i][0] == 'y' ) printall = 1; | |
323 else if( argv[i][0] == 'n' ) printall = 0; | |
324 else{ | |
325 printf("\nError: if using -a option, choose y or n\n"); | |
326 print_usage(); | |
327 } | |
328 break; | |
329 case 'r': | |
330 repeat = atoi(argv[i]); | |
331 if( repeat< 0 ){ | |
332 printf("\nError: choose repeat to be a non-negative integer\n"); | |
333 print_usage(); | |
334 } | |
335 break; | |
336 case 'R': | |
337 rsca = atof(argv[i]); | |
338 if( rsca <= 0 ){ | |
339 printf("\nError: choose repeat scalar to be positive\n"); | |
340 print_usage(); | |
341 } | |
342 break; | |
343 default: | |
344 printf("\nError: unkown option: -%c\n", argv[i-1][1]); | |
345 print_usage(); | |
346 } | |
347 } | |
348 // double check that the correct number of required inputs has been read in | |
349 if( found_reqs != 4 ){ | |
350 printf("\nError: improper number of required inputs found\n"); | |
351 print_usage(); | |
352 } | |
353 } | |
354 | |
355 /* | |
356 * usage info | |
357 */ | |
358 void print_usage(){ | |
359 printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n"); | |
360 printf("Options:\n"); | |
361 printf(" -n number of expression patterns to find (positive integer, default 1)\n"); | |
362 printf(" -z enforce ZOOPS model (y or n, default n)\n"); | |
363 printf(" -m scalar for psuedocounts (positive real number, default 0.1)\n"); | |
364 printf(" -t scalar for threshold (positive real number, default 1.0)\n"); | |
365 printf(" -s seed (positive long integer, default chosen from date and time)\n"); | |
366 printf(" -c column toggle file, used to ignore certain classifications (default: all on)\n"); | |
367 printf(" -v verboseness, turn on/off printing to stdout (y or n, default: y)\n"); | |
368 printf(" -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n"); | |
369 printf(" -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n"); | |
370 printf(" than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n"); | |
371 printf(" -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n"); | |
372 printf(" by this value to set the threshold for turning off columns (positive real number, default 1.0)\n"); | |
373 printf("\n"); | |
374 exit(0); | |
375 } | |
376 | |
377 /* | |
378 * function: read data from filename into the peaklist data structure | |
379 * notes on assumed format of data: | |
380 * peak lines start with "PEAK:" field | |
381 * gene lines start with "GENE:" field | |
382 * gene lines with only 1 classification are ignored | |
383 * assume all fields are <5000 characters | |
384 */ | |
385 void read_data( char *filename ){ | |
386 int j,m,n; | |
387 char val[5000], other[5000]; | |
388 FILE *fp; | |
389 | |
390 /* | |
391 * first count number of peaks, and allocate | |
392 */ | |
393 fp = fopen(filename,"rt"); | |
394 if( fp == NULL ){ | |
395 printf("Error: can't open file %s\n", filename); | |
396 exit(0); | |
397 } | |
398 numpeaks = 0; | |
399 while( fscanf(fp, "%s", &val ) != EOF ){ | |
400 if( contains( val, "PEAK" ) == 1 ){ | |
401 numpeaks++; | |
402 } | |
403 } | |
404 fclose(fp); | |
405 peaklist = (peak *) malloc(numpeaks*sizeof(peak)); | |
406 | |
407 /* | |
408 * now count number of genes w/>1 classification, and allocate | |
409 * also count number of classifications, and allocate that at the end | |
410 * n tracks number of genes, j field; subtract genes that don't have full classifications | |
411 * m tracks peak index; record peak info | |
412 */ | |
413 fp = fopen(filename,"rt"); | |
414 n=-1; | |
415 m=-1; | |
416 j=0; | |
417 sprintf(other,"NA"); | |
418 numclasses=0; | |
419 while( fscanf(fp, "%s", &val ) != EOF ){ | |
420 if( contains( val, "PEAK" ) == 1 ){ | |
421 if( j==7 ) ; | |
422 else{ | |
423 if( j-6 > numclasses ) numclasses = j-6; | |
424 } | |
425 if( m>=0 ){ | |
426 peaklist[m].numgenes = n; | |
427 peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); | |
428 } | |
429 n=0; | |
430 j=0; | |
431 if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other); | |
432 m++; | |
433 sprintf(other,"NA"); | |
434 } | |
435 else if( contains( val, "GENE" ) == 1 ){ | |
436 n++; | |
437 j=0; | |
438 } | |
439 else if( n==0 ){ | |
440 if( j==0 ) strcpy(peaklist[m].chrom, val); | |
441 if( j==1 ) peaklist[m].pkposn = atol(val); | |
442 if( j> 1 ){ | |
443 if( strcmp(other, "NA") == 0 ) strcpy(other, val); | |
444 else sprintf(other,"%s %s", other, val); | |
445 } | |
446 j++; | |
447 } | |
448 else{ | |
449 j++; | |
450 } | |
451 } | |
452 peaklist[m].numgenes = n; | |
453 peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); | |
454 strcpy( peaklist[m].info, other); | |
455 fclose(fp); | |
456 | |
457 for(m=0; m< numpeaks; m++){ | |
458 for(n=0; n< peaklist[m].numgenes; n++){ | |
459 peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int)); | |
460 } | |
461 } | |
462 | |
463 /* | |
464 * now read in gene info | |
465 */ | |
466 fp = fopen(filename,"rt"); | |
467 m=-1; | |
468 j=0; | |
469 while( fscanf(fp, "%s", &val ) != EOF ){ | |
470 if( contains( val, "PEAK" ) == 1 ){ | |
471 if( j==7 ) peaklist[m].genelist[n].used = 0; | |
472 if( j==6+numclasses) peaklist[m].genelist[n].used = 1; | |
473 n=-1; | |
474 j=0; | |
475 m++; | |
476 } | |
477 else if( contains( val, "GENE" ) == 1 ){ | |
478 if( j==7 ) peaklist[m].genelist[n].used = 0; | |
479 if( j==6+numclasses ) peaklist[m].genelist[n].used = 1; | |
480 n++; | |
481 j=0; | |
482 } | |
483 else if( n==-1 ){ | |
484 } | |
485 else { | |
486 if( j==0 ) strcpy( peaklist[m].genelist[n].id, val ); | |
487 if( j==1 ) strcpy( peaklist[m].genelist[n].name, val ); | |
488 if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val ); | |
489 if( j==3 ) peaklist[m].genelist[n].start = atol(val); | |
490 if( j==4 ) peaklist[m].genelist[n].end = atol(val); | |
491 if( j==5 ) peaklist[m].genelist[n].sense = atoi(val); | |
492 if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val); | |
493 j++; | |
494 } | |
495 } | |
496 if( j==7 ) peaklist[m].genelist[n].used = 0; | |
497 if( j==6+numclasses) peaklist[m].genelist[n].used = 1; | |
498 fclose(fp); | |
499 | |
500 /* | |
501 * now count how many genes are to be used for each | |
502 */ | |
503 for(m=0; m< numpeaks; m++){ | |
504 peaklist[m].numused = 0; | |
505 for(n=0; n< peaklist[m].numgenes; n++){ | |
506 peaklist[m].numused+= peaklist[m].genelist[n].used; | |
507 } | |
508 } | |
509 } | |
510 | |
511 void read_bkgrnd( char *filename ){ | |
512 int j,n; | |
513 char val[500]; | |
514 FILE *fp; | |
515 | |
516 /* | |
517 * first count number of genes, and allocate | |
518 */ | |
519 numbk = count_lines( fp, filename );; | |
520 bklist = (gene *) malloc(numbk*sizeof(gene)); | |
521 | |
522 for(n=0; n< numbk; n++){ | |
523 bklist[n].values = (int *) malloc(numclasses*sizeof(int)); | |
524 } | |
525 | |
526 /* | |
527 * now read in gene info | |
528 */ | |
529 fp = fopen(filename,"rt"); | |
530 n=-1; | |
531 j=0; | |
532 while( fscanf(fp, "%s", &val ) != EOF ){ | |
533 if( contains( val, "_a" ) == 1 ){ | |
534 if( j==7 ) bklist[n].used = 0; | |
535 if( j==6+numclasses ) bklist[n].used = 1; | |
536 j=1; | |
537 n++; | |
538 strcpy( bklist[n].id, val ); | |
539 } | |
540 else{ | |
541 if( j==1 ) strcpy( bklist[n].name, val ); | |
542 if( j==2 ) strcpy( bklist[n].chrom, val ); | |
543 if( j==3 ) bklist[n].start = atol(val); | |
544 if( j==4 ) bklist[n].end = atol(val); | |
545 if( j==5 ) bklist[n].sense = atoi(val); | |
546 if( j>=6 ) bklist[n].values[j-6] = atoi(val); | |
547 j++; | |
548 } | |
549 } | |
550 if( j==7 ) bklist[n].used = 0; | |
551 if( j==6+numclasses) bklist[n].used = 1; | |
552 fclose(fp); | |
553 | |
554 /* | |
555 * now count how many genes are to be used for each | |
556 */ | |
557 bkused = 0; | |
558 for(n=0; n< numbk; n++){ | |
559 bkused+= bklist[n].used; | |
560 } | |
561 } | |
562 | |
563 /* | |
564 * make initial guesses of z-values, set erase to 1 | |
565 * read in column toggle file if provided | |
566 * also histogram background and scale into beta | |
567 */ | |
568 void initialize( long *seed ){ | |
569 int j,m,n; | |
570 int val; | |
571 FILE *fp; | |
572 | |
573 for(m=0; m< numpeaks; m++){ | |
574 for(n=0; n< peaklist[m].numgenes; n++){ | |
575 if( peaklist[m].genelist[n].used == 1 ){ | |
576 peaklist[m].genelist[n].erase = 1.0; | |
577 peaklist[m].genelist[n].z = 0.0; | |
578 } | |
579 } | |
580 if( peaklist[m].numused > 0 ){ | |
581 n = assign_gene( m, seed ); | |
582 peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model | |
583 } | |
584 } | |
585 | |
586 col_tog = (int *) malloc(numclasses*sizeof(int)); | |
587 col_tog_cpy = (int *) malloc(numclasses*sizeof(int)); | |
588 if( set_togs == 0 ){ | |
589 for(j=0; j< numclasses; j++){ | |
590 col_tog[j] = 1; | |
591 col_tog_cpy[j] = 1; | |
592 } | |
593 usedclasses = numclasses; | |
594 } | |
595 else{ | |
596 // first count number of fields | |
597 col_togl = 0; | |
598 fp = fopen(togs,"rt"); | |
599 if( fp == NULL ){ | |
600 printf("Error: can't open file %s\n", togs); | |
601 exit(0); | |
602 } | |
603 j=0; | |
604 while( fscanf(fp,"%i", &val) != EOF ){ | |
605 col_togl++; | |
606 } | |
607 fclose(fp); | |
608 if( col_togl != numclasses ){ | |
609 printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses); | |
610 exit(0); | |
611 } | |
612 // now read in values | |
613 fp = fopen(togs,"rt"); | |
614 j=0; | |
615 usedclasses = 0; | |
616 while( fscanf(fp,"%i", &val) != EOF ){ | |
617 col_tog[j] = val; | |
618 col_tog_cpy[j] = val; | |
619 j++; | |
620 if( val == 1 ) usedclasses++; | |
621 if( val != 1 && val != 0 ){ | |
622 printf("Error: choose the toggle values to be 0 or 1\n"); | |
623 exit(0); | |
624 } | |
625 } | |
626 fclose(fp); | |
627 } | |
628 | |
629 for(n=0; n< numbk; n++){ | |
630 bklist[n].z = 1.0; | |
631 } | |
632 beta_update(); | |
633 if( verbose != 0 ) printf(" Indexing genes in first pass\n"); | |
634 index_genes(); | |
635 } | |
636 | |
637 /* | |
638 * make initial guesses of z-values for a second, etc motif | |
639 * recount number of used genes per peak to those with erase > 0.5 | |
640 */ | |
641 void reinitialize( long *seed ){ | |
642 int m,n; | |
643 for(m=0; m< numpeaks; m++){ | |
644 peaklist[m].numused = 0; | |
645 for(n=0; n< peaklist[m].numgenes; n++){ | |
646 peaklist[m].genelist[n].z = 0.0; | |
647 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){ | |
648 peaklist[m].numused++; | |
649 } | |
650 } | |
651 if( peaklist[m].numused > 0 ){ | |
652 n = assign_gene( m, seed ); | |
653 peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model | |
654 } | |
655 } | |
656 for(n=0; n< numbk; n++){ | |
657 bklist[n].z = 1.0; | |
658 } | |
659 } | |
660 | |
661 /* | |
662 * histogram background into beta | |
663 */ | |
664 void beta_update(){ | |
665 int j,k,n; | |
666 double tot=0.0; | |
667 bkhist_update(); | |
668 for(j=0; j< numclasses; j++){ | |
669 // if( col_tog[j] == 1 ){ | |
670 Beta[j] = 0.0; | |
671 for(k=0; k< 5; k++){ | |
672 beta[j][k] = hist[1][j][k]*mu; | |
673 Beta[j]+= beta[j][k]; | |
674 } | |
675 // } | |
676 } | |
677 } | |
678 | |
679 /* | |
680 * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist | |
681 */ | |
682 void index_genes(){ | |
683 int m,n,nu; | |
684 int found; | |
685 for(m=0; m< numpeaks; m++){ | |
686 for(n=0; n< peaklist[m].numgenes; n++){ | |
687 if( peaklist[m].genelist[n].used == 1 ){ | |
688 found = 0; | |
689 nu = 0; | |
690 while(nu< numbk && found == 0){ | |
691 if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){ | |
692 peaklist[m].genelist[n].index = nu; | |
693 found = 1; | |
694 } | |
695 nu++; | |
696 } | |
697 } | |
698 } | |
699 } | |
700 } | |
701 | |
702 /* | |
703 * update motif model histogram | |
704 */ | |
705 void hist_update(){ | |
706 int j,k,m,n; | |
707 double tot=0; | |
708 for(j=0; j< numclasses; j++){ | |
709 // if( col_tog[j] == 1 ){ | |
710 for(k=0; k< 5; k++){ | |
711 hist[0][j][k] = 0.0; | |
712 } | |
713 // } | |
714 } | |
715 for(m=0; m< numpeaks; m++){ | |
716 for(n=0; n< peaklist[m].numgenes; n++){ | |
717 if( peaklist[m].genelist[n].used == 1 ){ | |
718 tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; | |
719 for(j=0; j< numclasses; j++){ | |
720 // if( col_tog[j] == 1 ){ | |
721 k = peaklist[m].genelist[n].values[j]+2; | |
722 hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; | |
723 // } | |
724 } | |
725 } | |
726 } | |
727 } | |
728 for(j=0; j< numclasses; j++){ | |
729 // if( col_tog[j] == 1 ){ | |
730 for(k=0; k< 5; k++){ | |
731 hist[0][j][k]+= beta[j][k]; | |
732 hist[0][j][k]/= (tot+Beta[j]);; | |
733 } | |
734 // } | |
735 } | |
736 } | |
737 | |
738 /* | |
739 * update background hist using z values from bklist | |
740 */ | |
741 void bkhist_update(){ | |
742 int j,k,n; | |
743 double tot=0.0; | |
744 for(j=0; j< numclasses; j++){ | |
745 // if( col_tog[j] == 1 ){ | |
746 for(k=0; k< 5; k++){ | |
747 hist[1][j][k] = 0; | |
748 } | |
749 // } | |
750 } | |
751 for(n=0; n< numbk; n++){ | |
752 if( bklist[n].used == 1 ){ | |
753 tot+=bklist[n].z; | |
754 for(j=0; j< numclasses; j++){ | |
755 // if( col_tog[j] == 1 ){ | |
756 k = bklist[n].values[j]+2; | |
757 hist[1][j][k]+= bklist[n].z; | |
758 // } | |
759 } | |
760 } | |
761 } | |
762 for(j=0; j< numclasses; j++){ | |
763 // if( col_tog[j] == 1 ){ | |
764 for(k=0; k< 5; k++){ | |
765 hist[1][j][k]/= tot; | |
766 } | |
767 // } | |
768 } | |
769 } | |
770 | |
771 /* | |
772 * update lambda value | |
773 */ | |
774 void lambda_update(){ | |
775 int m,n; | |
776 double tot=0.0; | |
777 lambda=0; | |
778 for(m=0; m< numpeaks; m++){ | |
779 for(n=0; n< peaklist[m].numgenes; n++){ | |
780 if( peaklist[m].genelist[n].used == 1 ){ | |
781 lambda+= peaklist[m].genelist[n].z; | |
782 tot++; | |
783 } | |
784 } | |
785 } | |
786 lambda/=tot; | |
787 } | |
788 | |
789 /* | |
790 * update the toggle columns based on which columns have better than average relative entropy | |
791 */ | |
792 void toggle_update(){ | |
793 int j; | |
794 double avg=0.0; | |
795 avg = rsca*rel_entropy(); | |
796 usedclasses=0; | |
797 for(j=0; j< numclasses; j++){ | |
798 if( col_tog[j] == 1 ){ | |
799 if( col_rel_entropy(j) < avg ) col_tog[j] = 0; | |
800 if( col_tog[j] == 1 ) usedclasses++; | |
801 } | |
802 } | |
803 } | |
804 | |
805 /* | |
806 * reset col_tog values with their original ones | |
807 */ | |
808 void toggle_reset(){ | |
809 int j; | |
810 usedclasses=0; | |
811 for(j=0; j< numclasses; j++){ | |
812 col_tog[j] = col_tog_cpy[j]; | |
813 if( col_tog[j] == 1 ) usedclasses++; | |
814 } | |
815 } | |
816 | |
817 /* | |
818 * update z | |
819 */ | |
820 void z_update(){ | |
821 int j,m,n; | |
822 double a, c, denom; | |
823 for(m=0; m< numpeaks; m++){ | |
824 for(n=0; n< peaklist[m].numgenes; n++){ | |
825 if( peaklist[m].genelist[n].used == 1 ){ | |
826 // new z value for motif model | |
827 a = lprob(m,n,0)+log(lambda); | |
828 c = lprob(m,n,1)+log(1-lambda); | |
829 if( a> c ){ | |
830 denom = a + log(1+exp(c-a)); | |
831 } | |
832 else{ | |
833 denom = c + log(1+exp(a-c)); | |
834 } | |
835 peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom ); | |
836 // peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) ); | |
837 // new z value for corresponding gene in background model | |
838 bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z; | |
839 } | |
840 } | |
841 } | |
842 } | |
843 | |
844 /* | |
845 * update erase parameters | |
846 */ | |
847 void erase_update(){ | |
848 int j,m,n; | |
849 for(m=0; m< numpeaks; m++){ | |
850 for(n=0; n< peaklist[m].numgenes; n++){ | |
851 if( peaklist[m].genelist[n].used == 1 ){ | |
852 peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z); | |
853 } | |
854 } | |
855 } | |
856 } | |
857 | |
858 /* | |
859 * calculate (log) (multinomial) probability of sequence m,n in model l | |
860 */ | |
861 double lprob( int m, int n, int l ){ | |
862 int j,k; | |
863 double val=0.0; | |
864 for(j=0; j< numclasses; j++){ | |
865 if( col_tog[j] == 1 ){ | |
866 k = peaklist[m].genelist[n].values[j]+2; | |
867 val+= log(hist[l][j][k]); | |
868 } | |
869 } | |
870 return val; | |
871 } | |
872 double prob( int m, int n, int l ){ | |
873 int j,k; | |
874 double val=1.0; | |
875 for(j=0; j< numclasses; j++){ | |
876 if( col_tog[j] == 1 ){ | |
877 k = peaklist[m].genelist[n].values[j]+2; | |
878 val*= hist[l][j][k]; | |
879 } | |
880 } | |
881 return val; | |
882 } | |
883 | |
884 /* | |
885 * copy hist into oldhist before updating | |
886 */ | |
887 void copy_hist(){ | |
888 int j,k,l; | |
889 for(l=0; l< 2; l++){ | |
890 for(j=0; j< numclasses; j++){ | |
891 // if( col_tog[j] == 1 ){ | |
892 for(k=0; k< 5; k++){ | |
893 oldhist[l][j][k] = hist[l][j][k]; | |
894 } | |
895 // } | |
896 } | |
897 } | |
898 } | |
899 | |
900 /* | |
901 * compute change in hist | |
902 */ | |
903 double hist_diff(){ | |
904 int j,k,l; | |
905 double val=0.0; | |
906 for(l=0; l< 2; l++){ | |
907 for(j=0; j< numclasses; j++){ | |
908 if( col_tog[j] == 1 ){ | |
909 for(k=0; k< 5; k++){ | |
910 val+= fabs( oldhist[l][j][k]-hist[l][j][k] ); | |
911 } | |
912 } | |
913 } | |
914 } | |
915 return val; | |
916 } | |
917 | |
918 /* | |
919 * calculate the score of a sequence | |
920 */ | |
921 double score( int m, int n ){ | |
922 int j,k; | |
923 double val=0.0; | |
924 for(j=0; j< numclasses; j++){ | |
925 if( col_tog[j] == 1 ){ | |
926 k = peaklist[m].genelist[n].values[j]+2; | |
927 val+= log( hist[0][j][k]/hist[1][j][k] ); | |
928 } | |
929 } | |
930 return val; | |
931 } | |
932 | |
933 /* | |
934 * calculate value of t (threshold) | |
935 */ | |
936 void t_update(){ | |
937 t = tscale*log( (1-lambda)/lambda ); | |
938 } | |
939 | |
940 /* | |
941 * quality scores of a motif, averaging over only those sequences picked as targets | |
942 * ElogL: log-likelihood of the combination of models, averaged by # genes | |
943 * ratio_score: log-likelihood ratio of motif vs background models | |
944 * rel_entropy: relative entropy of motif to background | |
945 * col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols) | |
946 * chisq: chi-square statistic | |
947 */ | |
948 double ElogL(){ | |
949 int m,n; | |
950 double val=0.0, tot=0.0; | |
951 for(m=0; m< numpeaks; m++){ | |
952 for(n=0; n< peaklist[m].numgenes; n++){ | |
953 if( peaklist[m].genelist[n].used == 1 ){ | |
954 val+= peaklist[m].genelist[n].z*lprob(m, n, 0 ) \ | |
955 + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\ | |
956 + peaklist[m].genelist[n].z*log(lambda) \ | |
957 + (1-peaklist[m].genelist[n].z)*log(1-lambda); | |
958 tot++; | |
959 } | |
960 } | |
961 } | |
962 return val/tot; | |
963 } | |
964 | |
965 double ratio_score(){ | |
966 int m,n; | |
967 double val=0.0; | |
968 double tot=0.0; | |
969 for(m=0; m< numpeaks; m++){ | |
970 for(n=0; n< peaklist[m].numgenes; n++){ | |
971 if( peaklist[m].genelist[n].assigned == 1 ){ | |
972 val+= lprob(m,n,0)-lprob(m,n,1); | |
973 tot++; | |
974 } | |
975 } | |
976 } | |
977 return val/tot; | |
978 } | |
979 | |
980 double rel_entropy(){ | |
981 int j,k; | |
982 double val=0.0; | |
983 for(j=0; j< numclasses; j++){ | |
984 if( col_tog[j] == 1 ){ | |
985 val+= col_rel_entropy( j ); | |
986 } | |
987 } | |
988 return val/usedclasses; | |
989 } | |
990 | |
991 double col_rel_entropy( int j ){ | |
992 int k; | |
993 double val=0.0; | |
994 for(k=0; k< 5; k++){ | |
995 if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){ | |
996 // do in log base 2 for bits | |
997 val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2); | |
998 } | |
999 } | |
1000 return val; | |
1001 } | |
1002 | |
1003 double chisq(){ | |
1004 int j, k; | |
1005 double val=0.0; | |
1006 for(j=0; j< numclasses; j++){ | |
1007 if( col_tog[j] == 1 ){ | |
1008 for(k=0; k< 5; k++){ | |
1009 val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k]; | |
1010 } | |
1011 } | |
1012 } | |
1013 return val; | |
1014 } | |
1015 | |
1016 /* | |
1017 * number of members of a motif model | |
1018 */ | |
1019 int motif_count(){ | |
1020 int m,n; | |
1021 int tot=0; | |
1022 for(m=0; m< numpeaks; m++){ | |
1023 for(n=0; n< peaklist[m].numgenes; n++){ | |
1024 if( peaklist[m].genelist[n].assigned == 1 ){ | |
1025 tot++; | |
1026 } | |
1027 } | |
1028 } | |
1029 return tot; | |
1030 } | |
1031 | |
1032 /* | |
1033 * assign a random used gene: one that is used and hasn't been erased | |
1034 */ | |
1035 int assign_gene( int m, long *seed ){ | |
1036 int n; | |
1037 n = floor( peaklist[m].numgenes*ran1(seed) ); | |
1038 while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){ | |
1039 n = floor( peaklist[m].numgenes*ran1(seed) ); | |
1040 } | |
1041 return n; | |
1042 } | |
1043 | |
1044 /* | |
1045 * score all genes using the models | |
1046 */ | |
1047 void score_genes(){ | |
1048 int m,n; | |
1049 for(m=0; m< numpeaks; m++){ | |
1050 for(n=0; n< peaklist[m].numgenes; n++){ | |
1051 if( peaklist[m].genelist[n].used == 1 ){ | |
1052 peaklist[m].genelist[n].score= score( m, n ); | |
1053 } | |
1054 } | |
1055 } | |
1056 } | |
1057 | |
1058 /* | |
1059 * choose targets using the scores, and ZOOPS setting | |
1060 * zoops = 1 -> 0 or 1 target/strand | |
1061 * else -> all above threshold | |
1062 */ | |
1063 void choose_targets(){ | |
1064 int m,n; | |
1065 int best; | |
1066 for(m=0; m< numpeaks; m++){ | |
1067 peaklist[m].assgngene = 0; | |
1068 best=-1; | |
1069 for(n=0; n< peaklist[m].numgenes; n++){ | |
1070 if( peaklist[m].genelist[n].used == 1 ){ | |
1071 if( peaklist[m].genelist[n].score > t ){ | |
1072 peaklist[m].genelist[n].assigned = 1; | |
1073 peaklist[m].assgngene++; | |
1074 } | |
1075 else peaklist[m].genelist[n].assigned = 0; | |
1076 if( best == -1 ) best = n; | |
1077 else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n; | |
1078 } | |
1079 } | |
1080 if( zoops == 1 ){ | |
1081 for(n=0; n< peaklist[m].numgenes; n++){ | |
1082 if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0; | |
1083 } | |
1084 if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1; | |
1085 else peaklist[m].assgngene = 0; | |
1086 } | |
1087 } | |
1088 } | |
1089 | |
1090 /* | |
1091 * functions to print to output (fp must already be opened for top two) | |
1092 */ | |
1093 void print_peak( FILE *fp, int m ){ | |
1094 int n; | |
1095 fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info); | |
1096 for(n=0; n< peaklist[m].numgenes; n++){ | |
1097 if( printall == 0 ){ | |
1098 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ | |
1099 print_gene( fp, m, n ); | |
1100 } | |
1101 } | |
1102 if( printall == 1 ){ | |
1103 if( peaklist[m].genelist[n].used == 1 ){ | |
1104 print_gene( fp, m, n ); | |
1105 } | |
1106 } | |
1107 } | |
1108 } | |
1109 | |
1110 void print_gene( FILE *fp, int m, int n ){ | |
1111 int j; | |
1112 if( peaklist[m].genelist[n].used == 1 ){ | |
1113 fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense); | |
1114 for(j=0; j< numclasses; j++){ | |
1115 if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]); | |
1116 else fprintf(fp," %i", peaklist[m].genelist[n].values[j]); | |
1117 } | |
1118 fprintf(fp,"\n"); | |
1119 } | |
1120 } | |
1121 | |
1122 void print_state(FILE *fp, char *filename){ | |
1123 int m; | |
1124 fp = fopen(filename,"w"); | |
1125 for(m=0; m< numpeaks; m++){ | |
1126 print_peak(fp, m); | |
1127 } | |
1128 fclose(fp); | |
1129 } | |
1130 | |
1131 /* | |
1132 * this prints a list of <region id> <name> | |
1133 */ | |
1134 /**************** | |
1135 void print_targets( FILE *fp, char *filename ){ | |
1136 int m,n,nu; | |
1137 fp = fopen(filename,"w"); | |
1138 for(m=0; m< numpeaks; m++){ | |
1139 if( peaklist[m].assgngene > 0 ){ | |
1140 fprintf(fp,"%s ", peaklist[m].regid); | |
1141 n=0; | |
1142 nu=0; | |
1143 while( n< peaklist[m].numgenes ){ | |
1144 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ | |
1145 if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name); | |
1146 else fprintf(fp,"%s\n", peaklist[m].genelist[n].name); | |
1147 nu++; | |
1148 } | |
1149 n++; | |
1150 } | |
1151 } | |
1152 } | |
1153 fclose(fp); | |
1154 } | |
1155 *************/ | |
1156 | |
1157 /* | |
1158 * print the motif model and its score | |
1159 */ | |
1160 void print_model( FILE *fp, char *filename ){ | |
1161 int j,k; | |
1162 fp = fopen(filename,"w"); | |
1163 fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() ); | |
1164 fprintf(fp," ++ + 0 - --\n"); | |
1165 for(j=0; j< numclasses; j++){ | |
1166 if( j< 10 ) fprintf(fp," "); | |
1167 if( col_tog[j] == 0 ) fprintf(fp,"#"); | |
1168 else fprintf(fp," "); | |
1169 fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) ); | |
1170 for(k=4; k>=0; k--){ | |
1171 if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) ); | |
1172 else fprintf(fp," NA"); | |
1173 } | |
1174 fprintf(fp,"\n"); | |
1175 } | |
1176 fclose(fp); | |
1177 } | |
1178 | |
1179 /* | |
1180 * prints a double with 3 digits, aligned by spaces | |
1181 * longest would be -999.000, so 8 characters | |
1182 * shortest would be 1.000, 5 characters | |
1183 * up to 3 spaces | |
1184 */ | |
1185 char *nice_d( double val ){ | |
1186 char *returnval; | |
1187 char *dval; | |
1188 char *spaces; | |
1189 int i, numspaces=0; | |
1190 | |
1191 returnval = (char *) malloc(10*sizeof(char)); | |
1192 dval = (char *) malloc(10*sizeof(char)); | |
1193 spaces = (char *) malloc(10*sizeof(char)); | |
1194 | |
1195 sprintf(dval,"%.3f", val); | |
1196 | |
1197 if( val >= 0 ) numspaces++; | |
1198 if( fabs(val) < 100 ){ | |
1199 if( fabs(val) < 10 ) numspaces+=2; | |
1200 else numspaces++; | |
1201 } | |
1202 if( numspaces == 0 ) sprintf(spaces,""); | |
1203 if( numspaces == 1 ) sprintf(spaces," "); | |
1204 if( numspaces == 2 ) sprintf(spaces," "); | |
1205 if( numspaces == 3 ) sprintf(spaces," "); | |
1206 sprintf(returnval,"%s%s", spaces, dval); | |
1207 return returnval; | |
1208 } | |
1209 | |
1210 /* | |
1211 * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc | |
1212 */ | |
1213 /************** | |
1214 void print_annotpeak( FILE *fp, char *filename, int motif ){ | |
1215 int m; | |
1216 fp = fopen(filename,"w"); | |
1217 for(m=0; m< numpeaks; m++){ | |
1218 if( peaklist[m].assgngene > 0 ){ | |
1219 fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end ); | |
1220 fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height ); | |
1221 fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n"); | |
1222 } | |
1223 } | |
1224 close(fp); | |
1225 } | |
1226 *************/ | |
1227 |