0
|
1 /*
|
|
2 * ember.c : Expectation Maximization of Binding and Expression pRofiles
|
|
3 * ----------------------------------------------------------------
|
|
4 * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994).
|
|
5 * See Eqs. 3-13, plus "Implementation of MM" section for detailed description
|
|
6 * changes are as follows:
|
|
7 * background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts
|
|
8 * background model is determined by full list of genes (second input); z-values of genes in first input are subtracted
|
|
9 * update of background can be turned off by commenting out 8th line of z_update()
|
|
10 * in this case, also comment out index_genes at end of initialize(), as it will save some time
|
|
11 * sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities
|
|
12 * a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm,
|
|
13 * only adds a restriction on which genes are chosen as targets in choose_targets()
|
|
14 * general notes:
|
|
15 * the common dimensions are:
|
|
16 * numpeaks - number of peaks, size of peaklist, indexed by m
|
|
17 * peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n
|
|
18 * numclasses - number of comparisons made (i.e. motif width), indexed by j
|
|
19 * 5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k
|
|
20 * 2 - number of models (motif=0 and background=1), indexed by l
|
|
21 * nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased)
|
|
22 * col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions)
|
|
23 *
|
|
24 * things to add/consider:
|
|
25 * - best measure of motif quality score?
|
|
26 * - want to ensure that the same sequence can't get assigned to more than one motif?
|
|
27 */
|
|
28
|
|
29 #include "all.h"
|
|
30
|
|
31 typedef struct {
|
|
32 char id[50];
|
|
33 char name[500];
|
|
34 char chrom[50];
|
|
35 long start;
|
|
36 long end;
|
|
37 int sense;
|
|
38 double z;
|
|
39 double erase;
|
|
40 double score;
|
|
41 int index; // will refer to index of this id in bklist
|
|
42 int used; // set to 0 if a non-classified gene
|
|
43 int assigned; // set to 1 if assigned
|
|
44 int *values;
|
|
45 } gene;
|
|
46
|
|
47 typedef struct {
|
|
48 char info[5000];
|
|
49 char chrom[50];
|
|
50 long pkposn;
|
|
51 int numgenes;
|
|
52 int numused;
|
|
53 int assgngene; // set to # targets picked for this gene
|
|
54 gene *genelist;
|
|
55 } peak;
|
|
56
|
|
57 // FREE PARAMETERS TO CHOOSE IN COMMAND LINE
|
|
58 double mu; // scale background histogram by mu for psuedocounts
|
|
59 double tscale; // scales the threshold (to make choices more or less stringent)
|
|
60 int zoops; // set to 1 to enforce ZOOPS model (Zero Or One Per Strand)
|
|
61 int printall; // set to 1 to print all genes, not just targets
|
|
62 int mots; // number of motifs to search for
|
|
63 int repeat; // number of times to wean columns and repeat search
|
|
64 double rsca; // percent of average to set relative entropy cutoff
|
|
65 // END PARAMETERS TO CHOOSE
|
|
66
|
|
67 char *input, *background, *pref, *togs;
|
|
68 char *target_out, *list_out, *model_out, *annot_out;
|
|
69 long iseedval;
|
|
70 int numpeaks, numclasses, usedclasses, numbk, bkused, nmots;
|
|
71 peak *peaklist;
|
|
72 gene *bklist;
|
|
73 double ***hist, ***oldhist, lambda, t, **beta;
|
|
74 double *Beta;
|
|
75 int *col_tog, col_togl, set_togs, *col_tog_cpy;
|
|
76 int verbose;
|
|
77
|
|
78 void read_cmd_line( int argc, char **argv );
|
|
79 void print_usage();
|
|
80 void read_data( char *filename );
|
|
81 void read_bkgrnd( char *filename );
|
|
82 void initialize( long *seed );
|
|
83 void reinitialize( long *seed );
|
|
84 void beta_update();
|
|
85 void index_genes();
|
|
86
|
|
87 void hist_update();
|
|
88 void bkhist_update();
|
|
89 void lambda_update();
|
|
90 void toggle_update();
|
|
91 void toggle_reset();
|
|
92 void z_update();
|
|
93 void erase_update();
|
|
94 double lprob( int m, int n, int l );
|
|
95 double prob( int m, int n, int l );
|
|
96 void copy_hist();
|
|
97 double hist_diff();
|
|
98 double score( int m, int n );
|
|
99 void t_update();
|
|
100 double ElogL();
|
|
101 double ratio_score();
|
|
102 double rel_entropy();
|
|
103 double col_rel_entropy( int j );
|
|
104 double chisq();
|
|
105 int motif_count();
|
|
106
|
|
107 int assign_gene( int m, long *seed );
|
|
108 void score_genes();
|
|
109 void choose_targets();
|
|
110
|
|
111 void print_peak( FILE *fp, int i );
|
|
112 void print_gene( FILE *fp, int i, int j );
|
|
113 void print_state( FILE *fp, char *filename );
|
|
114 //void print_targets( FILE *fp, char *filename );
|
|
115 void print_model( FILE *fp, char *filename );
|
|
116 //void print_annotpeak( FILE *fp, char *filename, int motif );
|
|
117 char *nice_d( double val );
|
|
118
|
|
119 int main(int argc, char *argv[]){
|
|
120 int i,iter,r;
|
|
121 int pfreq;
|
|
122 double diff, conv = 1e-6;
|
|
123 long seed;
|
|
124 FILE *fout;
|
|
125
|
|
126 if(argc == 1 ){
|
|
127 print_usage();
|
|
128 }
|
|
129
|
|
130 /*
|
|
131 * read command line params
|
|
132 */
|
|
133 read_cmd_line( argc, argv );
|
|
134
|
|
135 /*
|
|
136 * allocate output names
|
|
137 */
|
|
138 // target_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
|
|
139 // list_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
|
|
140 // model_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
|
|
141 // annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
|
|
142
|
|
143 /*
|
|
144 * read in data, copy values from global to local variables
|
|
145 */
|
|
146 if( verbose != 0 ) printf("\nReading data and allocating\n");
|
|
147 read_data( input );
|
|
148 read_bkgrnd( background );
|
|
149 nmots = mots;
|
|
150 seed = iseedval;
|
|
151
|
|
152 /*
|
|
153 * allocate memory for other arrays
|
|
154 */
|
|
155 hist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
|
|
156 oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
|
|
157 beta = (double **) alloc2d(sizeof(double),numclasses,5);
|
|
158 Beta = (double *) malloc(numclasses*sizeof(double));
|
|
159
|
|
160 /*
|
|
161 * find nmots motif models
|
|
162 */
|
|
163 for(i=1; i<= nmots; i++){
|
|
164 /*
|
|
165 * initialize, fill initial parameter values from z-guesses
|
|
166 */
|
|
167 if( verbose != 0 ) printf("Expression Pattern %i,\n Initializing\n", i);
|
|
168 if( i==1 ) initialize( &seed );
|
|
169 else reinitialize( &seed );
|
|
170 hist_update();
|
|
171 lambda_update();
|
|
172
|
|
173 /*
|
|
174 * now sample for awhile, repeating repeat times
|
|
175 * each repeat starts at the previously refined search (?)
|
|
176 */
|
|
177 for(r=0; r<= repeat; r++){
|
|
178 if( verbose != 0 ) printf(" Sampling\n");
|
|
179 if( verbose != 0 && repeat> 0 ) printf(" Repeat %i\n", r );
|
|
180 diff = 1;
|
|
181 pfreq = 10;
|
|
182 iter=1;
|
|
183 while( diff > conv ){
|
|
184 z_update(); // E step
|
|
185 copy_hist();
|
|
186 hist_update(); // M step
|
|
187 bkhist_update(); // M step
|
|
188 lambda_update(); // M step
|
|
189 diff = hist_diff();
|
|
190 if( iter%pfreq == 0 && verbose != 0 ) printf(" iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff);
|
|
191 iter++;
|
|
192 }
|
|
193 if( repeat > 0 && r< repeat ) toggle_update();
|
|
194 }
|
|
195
|
|
196 /*
|
|
197 * score and choose final targets
|
|
198 */
|
|
199 t_update();
|
|
200 if( verbose != 0 ) printf(" Done, scoring and choosing targets on threshold %.3f\n\n", t);
|
|
201 score_genes();
|
|
202 choose_targets();
|
|
203
|
|
204 /*
|
|
205 * print final targets
|
|
206 */
|
|
207 // sprintf( target_out,"%s-%i.targets", pref, i);
|
|
208 // sprintf( list_out,"%s-%i.list", pref, i);
|
|
209 // sprintf( model_out,"%s-%i.model", pref, i);
|
|
210 // sprintf( annot_out,"%s-%i.xls", pref, i);
|
|
211 print_state(fout, target_out);
|
|
212 print_model(fout, model_out);
|
|
213 // print_targets(fout, list_out);
|
|
214 // print_annotpeak( fout, annot_out, i);
|
|
215
|
|
216 /*
|
|
217 * erase found motifs (only if we're going around again)
|
|
218 */
|
|
219 if( i< nmots ) erase_update();
|
|
220 if( repeat > 0 ) toggle_reset();
|
|
221 }
|
|
222
|
|
223 }
|
|
224
|
|
225 /*==================== FUNCTIONS ======================*/
|
|
226
|
|
227 /*
|
|
228 * read command line inputs and options
|
|
229 */
|
|
230 void read_cmd_line( int argc, char **argv ){
|
|
231 int i, j;
|
|
232 int found_reqs=0;
|
|
233
|
|
234 // default values
|
|
235 mots = 1;
|
|
236 zoops = 0;
|
|
237 printall = 0;
|
|
238 mu = 0.1;
|
|
239 tscale = 1.0;
|
|
240 iseedval = make_seed();
|
|
241 set_togs = 0;
|
|
242 verbose = 1;
|
|
243 repeat = 0;
|
|
244 rsca = 1.0;
|
|
245
|
|
246 // parse options
|
|
247 for(i=1; i< argc; i++){
|
|
248 if( argv[i][0] != '-' ) print_usage();
|
|
249 i++;
|
|
250 if( i >= argc || argv[i][0] == '-' ) print_usage();
|
|
251 switch( argv[i-1][1] ){
|
|
252 case 'i':
|
|
253 input = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
|
|
254 strcpy(input, argv[i]);
|
|
255 found_reqs++;
|
|
256 break;
|
|
257 case 'b':
|
|
258 background = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
|
|
259 strcpy(background, argv[i]);
|
|
260 found_reqs++;
|
|
261 break;
|
|
262 case 'o':
|
|
263 target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
|
|
264 strcpy(target_out, argv[i]);
|
|
265 found_reqs++;
|
|
266 break;
|
|
267 case 'p':
|
|
268 model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
|
|
269 strcpy(model_out, argv[i]);
|
|
270 found_reqs++;
|
|
271 break;
|
|
272 case 'n':
|
|
273 mots = atoi(argv[i]);
|
|
274 if( mots <= 0 ){
|
|
275 printf("\nError: choose number of expression patterns to be at least 1\n");
|
|
276 print_usage();
|
|
277 }
|
|
278 break;
|
|
279 case 'z':
|
|
280 if( argv[i][0] == 'y' ) zoops = 1;
|
|
281 else if( argv[i][0] == 'n' ) zoops = 0;
|
|
282 else{
|
|
283 printf("\nError: if using -z option, choose y or n\n");
|
|
284 print_usage();
|
|
285 }
|
|
286 break;
|
|
287 case 'v':
|
|
288 if( argv[i][0] == 'y' ) verbose = 1;
|
|
289 else if( argv[i][0] == 'n' ) verbose = 0;
|
|
290 else{
|
|
291 printf("\nError: if using -v option, choose y or n\n");
|
|
292 print_usage();
|
|
293 }
|
|
294 break;
|
|
295 case 'm':
|
|
296 mu = atof(argv[i]);
|
|
297 if( mu<= 0 ){
|
|
298 printf("\nError: choose mu to be positive\n");
|
|
299 print_usage();
|
|
300 }
|
|
301 break;
|
|
302 case 't':
|
|
303 tscale = atof(argv[i]);
|
|
304 if( tscale<= 0 ){
|
|
305 printf("\nError: choose threshold to be positive\n");
|
|
306 print_usage();
|
|
307 }
|
|
308 break;
|
|
309 case 's':
|
|
310 iseedval = -1*atol(argv[i]);
|
|
311 if( iseedval>= 0 ){
|
|
312 printf("\nError: choose seed to be a large positive integer\n");
|
|
313 print_usage();
|
|
314 }
|
|
315 break;
|
|
316 case 'c':
|
|
317 set_togs = 1;
|
|
318 togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
|
|
319 strcpy( togs, argv[i] );
|
|
320 break;
|
|
321 case 'a':
|
|
322 if( argv[i][0] == 'y' ) printall = 1;
|
|
323 else if( argv[i][0] == 'n' ) printall = 0;
|
|
324 else{
|
|
325 printf("\nError: if using -a option, choose y or n\n");
|
|
326 print_usage();
|
|
327 }
|
|
328 break;
|
|
329 case 'r':
|
|
330 repeat = atoi(argv[i]);
|
|
331 if( repeat< 0 ){
|
|
332 printf("\nError: choose repeat to be a non-negative integer\n");
|
|
333 print_usage();
|
|
334 }
|
|
335 break;
|
|
336 case 'R':
|
|
337 rsca = atof(argv[i]);
|
|
338 if( rsca <= 0 ){
|
|
339 printf("\nError: choose repeat scalar to be positive\n");
|
|
340 print_usage();
|
|
341 }
|
|
342 break;
|
|
343 default:
|
|
344 printf("\nError: unkown option: -%c\n", argv[i-1][1]);
|
|
345 print_usage();
|
|
346 }
|
|
347 }
|
|
348 // double check that the correct number of required inputs has been read in
|
|
349 if( found_reqs != 4 ){
|
|
350 printf("\nError: improper number of required inputs found\n");
|
|
351 print_usage();
|
|
352 }
|
|
353 }
|
|
354
|
|
355 /*
|
|
356 * usage info
|
|
357 */
|
|
358 void print_usage(){
|
|
359 printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n");
|
|
360 printf("Options:\n");
|
|
361 printf(" -n number of expression patterns to find (positive integer, default 1)\n");
|
|
362 printf(" -z enforce ZOOPS model (y or n, default n)\n");
|
|
363 printf(" -m scalar for psuedocounts (positive real number, default 0.1)\n");
|
|
364 printf(" -t scalar for threshold (positive real number, default 1.0)\n");
|
|
365 printf(" -s seed (positive long integer, default chosen from date and time)\n");
|
|
366 printf(" -c column toggle file, used to ignore certain classifications (default: all on)\n");
|
|
367 printf(" -v verboseness, turn on/off printing to stdout (y or n, default: y)\n");
|
|
368 printf(" -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n");
|
|
369 printf(" -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n");
|
|
370 printf(" than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n");
|
|
371 printf(" -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n");
|
|
372 printf(" by this value to set the threshold for turning off columns (positive real number, default 1.0)\n");
|
|
373 printf("\n");
|
|
374 exit(0);
|
|
375 }
|
|
376
|
|
377 /*
|
|
378 * function: read data from filename into the peaklist data structure
|
|
379 * notes on assumed format of data:
|
|
380 * peak lines start with "PEAK:" field
|
|
381 * gene lines start with "GENE:" field
|
|
382 * gene lines with only 1 classification are ignored
|
|
383 * assume all fields are <5000 characters
|
|
384 */
|
|
385 void read_data( char *filename ){
|
|
386 int j,m,n;
|
|
387 char val[5000], other[5000];
|
|
388 FILE *fp;
|
|
389
|
|
390 /*
|
|
391 * first count number of peaks, and allocate
|
|
392 */
|
|
393 fp = fopen(filename,"rt");
|
|
394 if( fp == NULL ){
|
|
395 printf("Error: can't open file %s\n", filename);
|
|
396 exit(0);
|
|
397 }
|
|
398 numpeaks = 0;
|
|
399 while( fscanf(fp, "%s", &val ) != EOF ){
|
|
400 if( contains( val, "PEAK" ) == 1 ){
|
|
401 numpeaks++;
|
|
402 }
|
|
403 }
|
|
404 fclose(fp);
|
|
405 peaklist = (peak *) malloc(numpeaks*sizeof(peak));
|
|
406
|
|
407 /*
|
|
408 * now count number of genes w/>1 classification, and allocate
|
|
409 * also count number of classifications, and allocate that at the end
|
|
410 * n tracks number of genes, j field; subtract genes that don't have full classifications
|
|
411 * m tracks peak index; record peak info
|
|
412 */
|
|
413 fp = fopen(filename,"rt");
|
|
414 n=-1;
|
|
415 m=-1;
|
|
416 j=0;
|
|
417 sprintf(other,"NA");
|
|
418 numclasses=0;
|
|
419 while( fscanf(fp, "%s", &val ) != EOF ){
|
|
420 if( contains( val, "PEAK" ) == 1 ){
|
|
421 if( j==7 ) ;
|
|
422 else{
|
|
423 if( j-6 > numclasses ) numclasses = j-6;
|
|
424 }
|
|
425 if( m>=0 ){
|
|
426 peaklist[m].numgenes = n;
|
|
427 peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
|
|
428 }
|
|
429 n=0;
|
|
430 j=0;
|
|
431 if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other);
|
|
432 m++;
|
|
433 sprintf(other,"NA");
|
|
434 }
|
|
435 else if( contains( val, "GENE" ) == 1 ){
|
|
436 n++;
|
|
437 j=0;
|
|
438 }
|
|
439 else if( n==0 ){
|
|
440 if( j==0 ) strcpy(peaklist[m].chrom, val);
|
|
441 if( j==1 ) peaklist[m].pkposn = atol(val);
|
|
442 if( j> 1 ){
|
|
443 if( strcmp(other, "NA") == 0 ) strcpy(other, val);
|
|
444 else sprintf(other,"%s %s", other, val);
|
|
445 }
|
|
446 j++;
|
|
447 }
|
|
448 else{
|
|
449 j++;
|
|
450 }
|
|
451 }
|
|
452 peaklist[m].numgenes = n;
|
|
453 peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
|
|
454 strcpy( peaklist[m].info, other);
|
|
455 fclose(fp);
|
|
456
|
|
457 for(m=0; m< numpeaks; m++){
|
|
458 for(n=0; n< peaklist[m].numgenes; n++){
|
|
459 peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int));
|
|
460 }
|
|
461 }
|
|
462
|
|
463 /*
|
|
464 * now read in gene info
|
|
465 */
|
|
466 fp = fopen(filename,"rt");
|
|
467 m=-1;
|
|
468 j=0;
|
|
469 while( fscanf(fp, "%s", &val ) != EOF ){
|
|
470 if( contains( val, "PEAK" ) == 1 ){
|
|
471 if( j==7 ) peaklist[m].genelist[n].used = 0;
|
|
472 if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
|
|
473 n=-1;
|
|
474 j=0;
|
|
475 m++;
|
|
476 }
|
|
477 else if( contains( val, "GENE" ) == 1 ){
|
|
478 if( j==7 ) peaklist[m].genelist[n].used = 0;
|
|
479 if( j==6+numclasses ) peaklist[m].genelist[n].used = 1;
|
|
480 n++;
|
|
481 j=0;
|
|
482 }
|
|
483 else if( n==-1 ){
|
|
484 }
|
|
485 else {
|
|
486 if( j==0 ) strcpy( peaklist[m].genelist[n].id, val );
|
|
487 if( j==1 ) strcpy( peaklist[m].genelist[n].name, val );
|
|
488 if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val );
|
|
489 if( j==3 ) peaklist[m].genelist[n].start = atol(val);
|
|
490 if( j==4 ) peaklist[m].genelist[n].end = atol(val);
|
|
491 if( j==5 ) peaklist[m].genelist[n].sense = atoi(val);
|
|
492 if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val);
|
|
493 j++;
|
|
494 }
|
|
495 }
|
|
496 if( j==7 ) peaklist[m].genelist[n].used = 0;
|
|
497 if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
|
|
498 fclose(fp);
|
|
499
|
|
500 /*
|
|
501 * now count how many genes are to be used for each
|
|
502 */
|
|
503 for(m=0; m< numpeaks; m++){
|
|
504 peaklist[m].numused = 0;
|
|
505 for(n=0; n< peaklist[m].numgenes; n++){
|
|
506 peaklist[m].numused+= peaklist[m].genelist[n].used;
|
|
507 }
|
|
508 }
|
|
509 }
|
|
510
|
|
511 void read_bkgrnd( char *filename ){
|
|
512 int j,n;
|
|
513 char val[500];
|
|
514 FILE *fp;
|
|
515
|
|
516 /*
|
|
517 * first count number of genes, and allocate
|
|
518 */
|
|
519 numbk = count_lines( fp, filename );;
|
|
520 bklist = (gene *) malloc(numbk*sizeof(gene));
|
|
521
|
|
522 for(n=0; n< numbk; n++){
|
|
523 bklist[n].values = (int *) malloc(numclasses*sizeof(int));
|
|
524 }
|
|
525
|
|
526 /*
|
|
527 * now read in gene info
|
|
528 */
|
|
529 fp = fopen(filename,"rt");
|
|
530 n=-1;
|
|
531 j=0;
|
|
532 while( fscanf(fp, "%s", &val ) != EOF ){
|
|
533 if( contains( val, "_a" ) == 1 ){
|
|
534 if( j==7 ) bklist[n].used = 0;
|
|
535 if( j==6+numclasses ) bklist[n].used = 1;
|
|
536 j=1;
|
|
537 n++;
|
|
538 strcpy( bklist[n].id, val );
|
|
539 }
|
|
540 else{
|
|
541 if( j==1 ) strcpy( bklist[n].name, val );
|
|
542 if( j==2 ) strcpy( bklist[n].chrom, val );
|
|
543 if( j==3 ) bklist[n].start = atol(val);
|
|
544 if( j==4 ) bklist[n].end = atol(val);
|
|
545 if( j==5 ) bklist[n].sense = atoi(val);
|
|
546 if( j>=6 ) bklist[n].values[j-6] = atoi(val);
|
|
547 j++;
|
|
548 }
|
|
549 }
|
|
550 if( j==7 ) bklist[n].used = 0;
|
|
551 if( j==6+numclasses) bklist[n].used = 1;
|
|
552 fclose(fp);
|
|
553
|
|
554 /*
|
|
555 * now count how many genes are to be used for each
|
|
556 */
|
|
557 bkused = 0;
|
|
558 for(n=0; n< numbk; n++){
|
|
559 bkused+= bklist[n].used;
|
|
560 }
|
|
561 }
|
|
562
|
|
563 /*
|
|
564 * make initial guesses of z-values, set erase to 1
|
|
565 * read in column toggle file if provided
|
|
566 * also histogram background and scale into beta
|
|
567 */
|
|
568 void initialize( long *seed ){
|
|
569 int j,m,n;
|
|
570 int val;
|
|
571 FILE *fp;
|
|
572
|
|
573 for(m=0; m< numpeaks; m++){
|
|
574 for(n=0; n< peaklist[m].numgenes; n++){
|
|
575 if( peaklist[m].genelist[n].used == 1 ){
|
|
576 peaklist[m].genelist[n].erase = 1.0;
|
|
577 peaklist[m].genelist[n].z = 0.0;
|
|
578 }
|
|
579 }
|
|
580 if( peaklist[m].numused > 0 ){
|
|
581 n = assign_gene( m, seed );
|
|
582 peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
|
|
583 }
|
|
584 }
|
|
585
|
|
586 col_tog = (int *) malloc(numclasses*sizeof(int));
|
|
587 col_tog_cpy = (int *) malloc(numclasses*sizeof(int));
|
|
588 if( set_togs == 0 ){
|
|
589 for(j=0; j< numclasses; j++){
|
|
590 col_tog[j] = 1;
|
|
591 col_tog_cpy[j] = 1;
|
|
592 }
|
|
593 usedclasses = numclasses;
|
|
594 }
|
|
595 else{
|
|
596 // first count number of fields
|
|
597 col_togl = 0;
|
|
598 fp = fopen(togs,"rt");
|
|
599 if( fp == NULL ){
|
|
600 printf("Error: can't open file %s\n", togs);
|
|
601 exit(0);
|
|
602 }
|
|
603 j=0;
|
|
604 while( fscanf(fp,"%i", &val) != EOF ){
|
|
605 col_togl++;
|
|
606 }
|
|
607 fclose(fp);
|
|
608 if( col_togl != numclasses ){
|
|
609 printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses);
|
|
610 exit(0);
|
|
611 }
|
|
612 // now read in values
|
|
613 fp = fopen(togs,"rt");
|
|
614 j=0;
|
|
615 usedclasses = 0;
|
|
616 while( fscanf(fp,"%i", &val) != EOF ){
|
|
617 col_tog[j] = val;
|
|
618 col_tog_cpy[j] = val;
|
|
619 j++;
|
|
620 if( val == 1 ) usedclasses++;
|
|
621 if( val != 1 && val != 0 ){
|
|
622 printf("Error: choose the toggle values to be 0 or 1\n");
|
|
623 exit(0);
|
|
624 }
|
|
625 }
|
|
626 fclose(fp);
|
|
627 }
|
|
628
|
|
629 for(n=0; n< numbk; n++){
|
|
630 bklist[n].z = 1.0;
|
|
631 }
|
|
632 beta_update();
|
|
633 if( verbose != 0 ) printf(" Indexing genes in first pass\n");
|
|
634 index_genes();
|
|
635 }
|
|
636
|
|
637 /*
|
|
638 * make initial guesses of z-values for a second, etc motif
|
|
639 * recount number of used genes per peak to those with erase > 0.5
|
|
640 */
|
|
641 void reinitialize( long *seed ){
|
|
642 int m,n;
|
|
643 for(m=0; m< numpeaks; m++){
|
|
644 peaklist[m].numused = 0;
|
|
645 for(n=0; n< peaklist[m].numgenes; n++){
|
|
646 peaklist[m].genelist[n].z = 0.0;
|
|
647 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){
|
|
648 peaklist[m].numused++;
|
|
649 }
|
|
650 }
|
|
651 if( peaklist[m].numused > 0 ){
|
|
652 n = assign_gene( m, seed );
|
|
653 peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
|
|
654 }
|
|
655 }
|
|
656 for(n=0; n< numbk; n++){
|
|
657 bklist[n].z = 1.0;
|
|
658 }
|
|
659 }
|
|
660
|
|
661 /*
|
|
662 * histogram background into beta
|
|
663 */
|
|
664 void beta_update(){
|
|
665 int j,k,n;
|
|
666 double tot=0.0;
|
|
667 bkhist_update();
|
|
668 for(j=0; j< numclasses; j++){
|
|
669 // if( col_tog[j] == 1 ){
|
|
670 Beta[j] = 0.0;
|
|
671 for(k=0; k< 5; k++){
|
|
672 beta[j][k] = hist[1][j][k]*mu;
|
|
673 Beta[j]+= beta[j][k];
|
|
674 }
|
|
675 // }
|
|
676 }
|
|
677 }
|
|
678
|
|
679 /*
|
|
680 * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist
|
|
681 */
|
|
682 void index_genes(){
|
|
683 int m,n,nu;
|
|
684 int found;
|
|
685 for(m=0; m< numpeaks; m++){
|
|
686 for(n=0; n< peaklist[m].numgenes; n++){
|
|
687 if( peaklist[m].genelist[n].used == 1 ){
|
|
688 found = 0;
|
|
689 nu = 0;
|
|
690 while(nu< numbk && found == 0){
|
|
691 if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){
|
|
692 peaklist[m].genelist[n].index = nu;
|
|
693 found = 1;
|
|
694 }
|
|
695 nu++;
|
|
696 }
|
|
697 }
|
|
698 }
|
|
699 }
|
|
700 }
|
|
701
|
|
702 /*
|
|
703 * update motif model histogram
|
|
704 */
|
|
705 void hist_update(){
|
|
706 int j,k,m,n;
|
|
707 double tot=0;
|
|
708 for(j=0; j< numclasses; j++){
|
|
709 // if( col_tog[j] == 1 ){
|
|
710 for(k=0; k< 5; k++){
|
|
711 hist[0][j][k] = 0.0;
|
|
712 }
|
|
713 // }
|
|
714 }
|
|
715 for(m=0; m< numpeaks; m++){
|
|
716 for(n=0; n< peaklist[m].numgenes; n++){
|
|
717 if( peaklist[m].genelist[n].used == 1 ){
|
|
718 tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
|
|
719 for(j=0; j< numclasses; j++){
|
|
720 // if( col_tog[j] == 1 ){
|
|
721 k = peaklist[m].genelist[n].values[j]+2;
|
|
722 hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
|
|
723 // }
|
|
724 }
|
|
725 }
|
|
726 }
|
|
727 }
|
|
728 for(j=0; j< numclasses; j++){
|
|
729 // if( col_tog[j] == 1 ){
|
|
730 for(k=0; k< 5; k++){
|
|
731 hist[0][j][k]+= beta[j][k];
|
|
732 hist[0][j][k]/= (tot+Beta[j]);;
|
|
733 }
|
|
734 // }
|
|
735 }
|
|
736 }
|
|
737
|
|
738 /*
|
|
739 * update background hist using z values from bklist
|
|
740 */
|
|
741 void bkhist_update(){
|
|
742 int j,k,n;
|
|
743 double tot=0.0;
|
|
744 for(j=0; j< numclasses; j++){
|
|
745 // if( col_tog[j] == 1 ){
|
|
746 for(k=0; k< 5; k++){
|
|
747 hist[1][j][k] = 0;
|
|
748 }
|
|
749 // }
|
|
750 }
|
|
751 for(n=0; n< numbk; n++){
|
|
752 if( bklist[n].used == 1 ){
|
|
753 tot+=bklist[n].z;
|
|
754 for(j=0; j< numclasses; j++){
|
|
755 // if( col_tog[j] == 1 ){
|
|
756 k = bklist[n].values[j]+2;
|
|
757 hist[1][j][k]+= bklist[n].z;
|
|
758 // }
|
|
759 }
|
|
760 }
|
|
761 }
|
|
762 for(j=0; j< numclasses; j++){
|
|
763 // if( col_tog[j] == 1 ){
|
|
764 for(k=0; k< 5; k++){
|
|
765 hist[1][j][k]/= tot;
|
|
766 }
|
|
767 // }
|
|
768 }
|
|
769 }
|
|
770
|
|
771 /*
|
|
772 * update lambda value
|
|
773 */
|
|
774 void lambda_update(){
|
|
775 int m,n;
|
|
776 double tot=0.0;
|
|
777 lambda=0;
|
|
778 for(m=0; m< numpeaks; m++){
|
|
779 for(n=0; n< peaklist[m].numgenes; n++){
|
|
780 if( peaklist[m].genelist[n].used == 1 ){
|
|
781 lambda+= peaklist[m].genelist[n].z;
|
|
782 tot++;
|
|
783 }
|
|
784 }
|
|
785 }
|
|
786 lambda/=tot;
|
|
787 }
|
|
788
|
|
789 /*
|
|
790 * update the toggle columns based on which columns have better than average relative entropy
|
|
791 */
|
|
792 void toggle_update(){
|
|
793 int j;
|
|
794 double avg=0.0;
|
|
795 avg = rsca*rel_entropy();
|
|
796 usedclasses=0;
|
|
797 for(j=0; j< numclasses; j++){
|
|
798 if( col_tog[j] == 1 ){
|
|
799 if( col_rel_entropy(j) < avg ) col_tog[j] = 0;
|
|
800 if( col_tog[j] == 1 ) usedclasses++;
|
|
801 }
|
|
802 }
|
|
803 }
|
|
804
|
|
805 /*
|
|
806 * reset col_tog values with their original ones
|
|
807 */
|
|
808 void toggle_reset(){
|
|
809 int j;
|
|
810 usedclasses=0;
|
|
811 for(j=0; j< numclasses; j++){
|
|
812 col_tog[j] = col_tog_cpy[j];
|
|
813 if( col_tog[j] == 1 ) usedclasses++;
|
|
814 }
|
|
815 }
|
|
816
|
|
817 /*
|
|
818 * update z
|
|
819 */
|
|
820 void z_update(){
|
|
821 int j,m,n;
|
|
822 double a, c, denom;
|
|
823 for(m=0; m< numpeaks; m++){
|
|
824 for(n=0; n< peaklist[m].numgenes; n++){
|
|
825 if( peaklist[m].genelist[n].used == 1 ){
|
|
826 // new z value for motif model
|
|
827 a = lprob(m,n,0)+log(lambda);
|
|
828 c = lprob(m,n,1)+log(1-lambda);
|
|
829 if( a> c ){
|
|
830 denom = a + log(1+exp(c-a));
|
|
831 }
|
|
832 else{
|
|
833 denom = c + log(1+exp(a-c));
|
|
834 }
|
|
835 peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom );
|
|
836 // peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) );
|
|
837 // new z value for corresponding gene in background model
|
|
838 bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z;
|
|
839 }
|
|
840 }
|
|
841 }
|
|
842 }
|
|
843
|
|
844 /*
|
|
845 * update erase parameters
|
|
846 */
|
|
847 void erase_update(){
|
|
848 int j,m,n;
|
|
849 for(m=0; m< numpeaks; m++){
|
|
850 for(n=0; n< peaklist[m].numgenes; n++){
|
|
851 if( peaklist[m].genelist[n].used == 1 ){
|
|
852 peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z);
|
|
853 }
|
|
854 }
|
|
855 }
|
|
856 }
|
|
857
|
|
858 /*
|
|
859 * calculate (log) (multinomial) probability of sequence m,n in model l
|
|
860 */
|
|
861 double lprob( int m, int n, int l ){
|
|
862 int j,k;
|
|
863 double val=0.0;
|
|
864 for(j=0; j< numclasses; j++){
|
|
865 if( col_tog[j] == 1 ){
|
|
866 k = peaklist[m].genelist[n].values[j]+2;
|
|
867 val+= log(hist[l][j][k]);
|
|
868 }
|
|
869 }
|
|
870 return val;
|
|
871 }
|
|
872 double prob( int m, int n, int l ){
|
|
873 int j,k;
|
|
874 double val=1.0;
|
|
875 for(j=0; j< numclasses; j++){
|
|
876 if( col_tog[j] == 1 ){
|
|
877 k = peaklist[m].genelist[n].values[j]+2;
|
|
878 val*= hist[l][j][k];
|
|
879 }
|
|
880 }
|
|
881 return val;
|
|
882 }
|
|
883
|
|
884 /*
|
|
885 * copy hist into oldhist before updating
|
|
886 */
|
|
887 void copy_hist(){
|
|
888 int j,k,l;
|
|
889 for(l=0; l< 2; l++){
|
|
890 for(j=0; j< numclasses; j++){
|
|
891 // if( col_tog[j] == 1 ){
|
|
892 for(k=0; k< 5; k++){
|
|
893 oldhist[l][j][k] = hist[l][j][k];
|
|
894 }
|
|
895 // }
|
|
896 }
|
|
897 }
|
|
898 }
|
|
899
|
|
900 /*
|
|
901 * compute change in hist
|
|
902 */
|
|
903 double hist_diff(){
|
|
904 int j,k,l;
|
|
905 double val=0.0;
|
|
906 for(l=0; l< 2; l++){
|
|
907 for(j=0; j< numclasses; j++){
|
|
908 if( col_tog[j] == 1 ){
|
|
909 for(k=0; k< 5; k++){
|
|
910 val+= fabs( oldhist[l][j][k]-hist[l][j][k] );
|
|
911 }
|
|
912 }
|
|
913 }
|
|
914 }
|
|
915 return val;
|
|
916 }
|
|
917
|
|
918 /*
|
|
919 * calculate the score of a sequence
|
|
920 */
|
|
921 double score( int m, int n ){
|
|
922 int j,k;
|
|
923 double val=0.0;
|
|
924 for(j=0; j< numclasses; j++){
|
|
925 if( col_tog[j] == 1 ){
|
|
926 k = peaklist[m].genelist[n].values[j]+2;
|
|
927 val+= log( hist[0][j][k]/hist[1][j][k] );
|
|
928 }
|
|
929 }
|
|
930 return val;
|
|
931 }
|
|
932
|
|
933 /*
|
|
934 * calculate value of t (threshold)
|
|
935 */
|
|
936 void t_update(){
|
|
937 t = tscale*log( (1-lambda)/lambda );
|
|
938 }
|
|
939
|
|
940 /*
|
|
941 * quality scores of a motif, averaging over only those sequences picked as targets
|
|
942 * ElogL: log-likelihood of the combination of models, averaged by # genes
|
|
943 * ratio_score: log-likelihood ratio of motif vs background models
|
|
944 * rel_entropy: relative entropy of motif to background
|
|
945 * col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols)
|
|
946 * chisq: chi-square statistic
|
|
947 */
|
|
948 double ElogL(){
|
|
949 int m,n;
|
|
950 double val=0.0, tot=0.0;
|
|
951 for(m=0; m< numpeaks; m++){
|
|
952 for(n=0; n< peaklist[m].numgenes; n++){
|
|
953 if( peaklist[m].genelist[n].used == 1 ){
|
|
954 val+= peaklist[m].genelist[n].z*lprob(m, n, 0 ) \
|
|
955 + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\
|
|
956 + peaklist[m].genelist[n].z*log(lambda) \
|
|
957 + (1-peaklist[m].genelist[n].z)*log(1-lambda);
|
|
958 tot++;
|
|
959 }
|
|
960 }
|
|
961 }
|
|
962 return val/tot;
|
|
963 }
|
|
964
|
|
965 double ratio_score(){
|
|
966 int m,n;
|
|
967 double val=0.0;
|
|
968 double tot=0.0;
|
|
969 for(m=0; m< numpeaks; m++){
|
|
970 for(n=0; n< peaklist[m].numgenes; n++){
|
|
971 if( peaklist[m].genelist[n].assigned == 1 ){
|
|
972 val+= lprob(m,n,0)-lprob(m,n,1);
|
|
973 tot++;
|
|
974 }
|
|
975 }
|
|
976 }
|
|
977 return val/tot;
|
|
978 }
|
|
979
|
|
980 double rel_entropy(){
|
|
981 int j,k;
|
|
982 double val=0.0;
|
|
983 for(j=0; j< numclasses; j++){
|
|
984 if( col_tog[j] == 1 ){
|
|
985 val+= col_rel_entropy( j );
|
|
986 }
|
|
987 }
|
|
988 return val/usedclasses;
|
|
989 }
|
|
990
|
|
991 double col_rel_entropy( int j ){
|
|
992 int k;
|
|
993 double val=0.0;
|
|
994 for(k=0; k< 5; k++){
|
|
995 if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){
|
|
996 // do in log base 2 for bits
|
|
997 val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2);
|
|
998 }
|
|
999 }
|
|
1000 return val;
|
|
1001 }
|
|
1002
|
|
1003 double chisq(){
|
|
1004 int j, k;
|
|
1005 double val=0.0;
|
|
1006 for(j=0; j< numclasses; j++){
|
|
1007 if( col_tog[j] == 1 ){
|
|
1008 for(k=0; k< 5; k++){
|
|
1009 val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k];
|
|
1010 }
|
|
1011 }
|
|
1012 }
|
|
1013 return val;
|
|
1014 }
|
|
1015
|
|
1016 /*
|
|
1017 * number of members of a motif model
|
|
1018 */
|
|
1019 int motif_count(){
|
|
1020 int m,n;
|
|
1021 int tot=0;
|
|
1022 for(m=0; m< numpeaks; m++){
|
|
1023 for(n=0; n< peaklist[m].numgenes; n++){
|
|
1024 if( peaklist[m].genelist[n].assigned == 1 ){
|
|
1025 tot++;
|
|
1026 }
|
|
1027 }
|
|
1028 }
|
|
1029 return tot;
|
|
1030 }
|
|
1031
|
|
1032 /*
|
|
1033 * assign a random used gene: one that is used and hasn't been erased
|
|
1034 */
|
|
1035 int assign_gene( int m, long *seed ){
|
|
1036 int n;
|
|
1037 n = floor( peaklist[m].numgenes*ran1(seed) );
|
|
1038 while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){
|
|
1039 n = floor( peaklist[m].numgenes*ran1(seed) );
|
|
1040 }
|
|
1041 return n;
|
|
1042 }
|
|
1043
|
|
1044 /*
|
|
1045 * score all genes using the models
|
|
1046 */
|
|
1047 void score_genes(){
|
|
1048 int m,n;
|
|
1049 for(m=0; m< numpeaks; m++){
|
|
1050 for(n=0; n< peaklist[m].numgenes; n++){
|
|
1051 if( peaklist[m].genelist[n].used == 1 ){
|
|
1052 peaklist[m].genelist[n].score= score( m, n );
|
|
1053 }
|
|
1054 }
|
|
1055 }
|
|
1056 }
|
|
1057
|
|
1058 /*
|
|
1059 * choose targets using the scores, and ZOOPS setting
|
|
1060 * zoops = 1 -> 0 or 1 target/strand
|
|
1061 * else -> all above threshold
|
|
1062 */
|
|
1063 void choose_targets(){
|
|
1064 int m,n;
|
|
1065 int best;
|
|
1066 for(m=0; m< numpeaks; m++){
|
|
1067 peaklist[m].assgngene = 0;
|
|
1068 best=-1;
|
|
1069 for(n=0; n< peaklist[m].numgenes; n++){
|
|
1070 if( peaklist[m].genelist[n].used == 1 ){
|
|
1071 if( peaklist[m].genelist[n].score > t ){
|
|
1072 peaklist[m].genelist[n].assigned = 1;
|
|
1073 peaklist[m].assgngene++;
|
|
1074 }
|
|
1075 else peaklist[m].genelist[n].assigned = 0;
|
|
1076 if( best == -1 ) best = n;
|
|
1077 else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n;
|
|
1078 }
|
|
1079 }
|
|
1080 if( zoops == 1 ){
|
|
1081 for(n=0; n< peaklist[m].numgenes; n++){
|
|
1082 if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0;
|
|
1083 }
|
|
1084 if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1;
|
|
1085 else peaklist[m].assgngene = 0;
|
|
1086 }
|
|
1087 }
|
|
1088 }
|
|
1089
|
|
1090 /*
|
|
1091 * functions to print to output (fp must already be opened for top two)
|
|
1092 */
|
|
1093 void print_peak( FILE *fp, int m ){
|
|
1094 int n;
|
|
1095 fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info);
|
|
1096 for(n=0; n< peaklist[m].numgenes; n++){
|
|
1097 if( printall == 0 ){
|
|
1098 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
|
|
1099 print_gene( fp, m, n );
|
|
1100 }
|
|
1101 }
|
|
1102 if( printall == 1 ){
|
|
1103 if( peaklist[m].genelist[n].used == 1 ){
|
|
1104 print_gene( fp, m, n );
|
|
1105 }
|
|
1106 }
|
|
1107 }
|
|
1108 }
|
|
1109
|
|
1110 void print_gene( FILE *fp, int m, int n ){
|
|
1111 int j;
|
|
1112 if( peaklist[m].genelist[n].used == 1 ){
|
|
1113 fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense);
|
|
1114 for(j=0; j< numclasses; j++){
|
|
1115 if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]);
|
|
1116 else fprintf(fp," %i", peaklist[m].genelist[n].values[j]);
|
|
1117 }
|
|
1118 fprintf(fp,"\n");
|
|
1119 }
|
|
1120 }
|
|
1121
|
|
1122 void print_state(FILE *fp, char *filename){
|
|
1123 int m;
|
|
1124 fp = fopen(filename,"w");
|
|
1125 for(m=0; m< numpeaks; m++){
|
|
1126 print_peak(fp, m);
|
|
1127 }
|
|
1128 fclose(fp);
|
|
1129 }
|
|
1130
|
|
1131 /*
|
|
1132 * this prints a list of <region id> <name>
|
|
1133 */
|
|
1134 /****************
|
|
1135 void print_targets( FILE *fp, char *filename ){
|
|
1136 int m,n,nu;
|
|
1137 fp = fopen(filename,"w");
|
|
1138 for(m=0; m< numpeaks; m++){
|
|
1139 if( peaklist[m].assgngene > 0 ){
|
|
1140 fprintf(fp,"%s ", peaklist[m].regid);
|
|
1141 n=0;
|
|
1142 nu=0;
|
|
1143 while( n< peaklist[m].numgenes ){
|
|
1144 if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
|
|
1145 if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name);
|
|
1146 else fprintf(fp,"%s\n", peaklist[m].genelist[n].name);
|
|
1147 nu++;
|
|
1148 }
|
|
1149 n++;
|
|
1150 }
|
|
1151 }
|
|
1152 }
|
|
1153 fclose(fp);
|
|
1154 }
|
|
1155 *************/
|
|
1156
|
|
1157 /*
|
|
1158 * print the motif model and its score
|
|
1159 */
|
|
1160 void print_model( FILE *fp, char *filename ){
|
|
1161 int j,k;
|
|
1162 fp = fopen(filename,"w");
|
|
1163 fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() );
|
|
1164 fprintf(fp," ++ + 0 - --\n");
|
|
1165 for(j=0; j< numclasses; j++){
|
|
1166 if( j< 10 ) fprintf(fp," ");
|
|
1167 if( col_tog[j] == 0 ) fprintf(fp,"#");
|
|
1168 else fprintf(fp," ");
|
|
1169 fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) );
|
|
1170 for(k=4; k>=0; k--){
|
|
1171 if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) );
|
|
1172 else fprintf(fp," NA");
|
|
1173 }
|
|
1174 fprintf(fp,"\n");
|
|
1175 }
|
|
1176 fclose(fp);
|
|
1177 }
|
|
1178
|
|
1179 /*
|
|
1180 * prints a double with 3 digits, aligned by spaces
|
|
1181 * longest would be -999.000, so 8 characters
|
|
1182 * shortest would be 1.000, 5 characters
|
|
1183 * up to 3 spaces
|
|
1184 */
|
|
1185 char *nice_d( double val ){
|
|
1186 char *returnval;
|
|
1187 char *dval;
|
|
1188 char *spaces;
|
|
1189 int i, numspaces=0;
|
|
1190
|
|
1191 returnval = (char *) malloc(10*sizeof(char));
|
|
1192 dval = (char *) malloc(10*sizeof(char));
|
|
1193 spaces = (char *) malloc(10*sizeof(char));
|
|
1194
|
|
1195 sprintf(dval,"%.3f", val);
|
|
1196
|
|
1197 if( val >= 0 ) numspaces++;
|
|
1198 if( fabs(val) < 100 ){
|
|
1199 if( fabs(val) < 10 ) numspaces+=2;
|
|
1200 else numspaces++;
|
|
1201 }
|
|
1202 if( numspaces == 0 ) sprintf(spaces,"");
|
|
1203 if( numspaces == 1 ) sprintf(spaces," ");
|
|
1204 if( numspaces == 2 ) sprintf(spaces," ");
|
|
1205 if( numspaces == 3 ) sprintf(spaces," ");
|
|
1206 sprintf(returnval,"%s%s", spaces, dval);
|
|
1207 return returnval;
|
|
1208 }
|
|
1209
|
|
1210 /*
|
|
1211 * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc
|
|
1212 */
|
|
1213 /**************
|
|
1214 void print_annotpeak( FILE *fp, char *filename, int motif ){
|
|
1215 int m;
|
|
1216 fp = fopen(filename,"w");
|
|
1217 for(m=0; m< numpeaks; m++){
|
|
1218 if( peaklist[m].assgngene > 0 ){
|
|
1219 fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end );
|
|
1220 fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height );
|
|
1221 fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n");
|
|
1222 }
|
|
1223 }
|
|
1224 close(fp);
|
|
1225 }
|
|
1226 *************/
|
|
1227
|