comparison src/flock1.c @ 1:7eab80f86779 draft

add FLOCK
author immport-devteam
date Mon, 27 Feb 2017 13:26:09 -0500
parents
children
comparison
equal deleted inserted replaced
0:8d951baf795f 1:7eab80f86779
1 /*****************************************************************************
2
3 FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann)
4
5 Author: (Max) Yu Qian, Ph.D.
6
7 Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
8
9 Development: November 2005 ~ Forever
10
11 Algorithm Status: May 2007: Release 1.0
12
13 Usage: flock data_file
14 Note: the input file format must be channel values and the delimiter between two values must be a tab.
15
16 Changes made July 23, 2010: made errors to STDERR
17 Changes made Nov 4, 2010: added one more error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR;
18 MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got
19
20 ******************************************************************************/
21 #include <time.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <math.h>
25 #include <string.h>
26 #include <sys/stat.h>
27 #include <unistd.h>
28 #include <assert.h>
29
30
31 #define DEBUG 0
32 #define LINE_LEN 1024
33 #define FILE_NAME_LEN 128
34 #define PARA_NAME_LEN 64
35 #define MAX_VALUE 1000000000
36 #define MIN_GRID 6
37 #define MAX_GRID 50
38
39 #define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max
40 #define KMEANS_TERM 100
41 //#define MAX_NUM_POP 30
42
43
44 int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID);
45
46 /************* Read basic info of the source file ****************************/
47 void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID)
48 {
49 char src[LINE_LEN];
50 char current_name[64];
51 char prv;
52
53 int num_rows=0;
54 int num_columns=0;
55 int ch='\n';
56 int prev='\n';
57 int time_pos=0;
58 int i=0;
59 int j=0;
60 int sw=0;
61
62 src[0]='\0';
63 fgets(src, LINE_LEN, f_src);
64
65 if ((src[0]=='F') && (src[1]=='C') && (src[2]=='S'))
66 {
67 fprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\n");
68 abort();
69 }
70
71 name_string[0]='\0';
72 current_name[0]='\0';
73 prv='\n';
74
75 // skip space and tab characters
76 while ((src[i]==' ') || (src[i]=='\t'))
77 i++;
78
79 // repeat until the end of line is reached
80 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!='\r'))
81 {
82 current_name[j]=src[i];
83
84 if ((src[i]=='\t') && (prv!='\t')) //a complete word
85 {
86 current_name[j]='\0';
87
88 if (0!=strcmp(current_name,"Time"))
89 {
90 num_columns++; //num_columns does not inlcude the column of Time
91 time_pos++;
92 if (sw) {
93 strcat(name_string,"\t");
94 }
95 strcat(name_string,current_name);
96 sw = 1;
97 }
98 else
99 {
100 *time_ID=time_pos;
101 }
102
103
104 current_name[0]='\0';
105 j=0;
106 }
107
108 if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
109 {
110 current_name[0]='\0';
111 j=0;
112 }
113
114 if (src[i]!='\t')
115 j++;
116
117 prv=src[i];
118 i++;
119 }
120
121 if (prv!='\t') //the last one hasn't been retrieved
122 {
123 current_name[j]='\0';
124
125 if (0!=strcmp(current_name,"Time"))
126 {
127 num_columns++;
128 strcat(name_string,"\t");
129 strcat(name_string,current_name);
130 time_pos++;
131 }
132 else
133 {
134 *time_ID=time_pos;
135 }
136
137
138 }
139 if (DEBUG==1)
140 {
141 printf("time_ID is %d\n",*time_ID);
142 printf("name_string is %s\n",name_string);
143 }
144
145 //start computing # of rows
146
147 while ((ch = fgetc(f_src))!= EOF )
148 {
149 if (ch == '\n')
150 {
151 ++num_rows;
152 }
153 prev = ch;
154 }
155 if (prev!='\n')
156 ++num_rows;
157
158 //added on July 23, 2010
159 if (num_rows<50)
160 {
161 fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010
162 abort();
163 }
164
165 *file_Len=num_rows;
166 *num_dm=num_columns;
167
168 printf("original file size is %d; number of dimensions is %d\n", *file_Len, *num_dm);
169 }
170
171
172
173 /************************************* Read the source file into uncomp_data **************************************/
174 void readsource(FILE *f_src, int file_Len, int num_dm, double **uncomp_data, int time_ID)
175 {
176 int time_pass=0; //to mark whether the time_ID has been passed
177 int index=0;
178
179 int i=0;
180 int j=0;
181 int t=0;
182
183 char src[LINE_LEN];
184 char xc[LINE_LEN/10];
185
186 src[0]='\0';
187 fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
188
189 while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
190 {
191 src[0]='\0';
192 fgets(src,LINE_LEN,f_src);
193 i=0;
194 time_pass=0;
195
196 if (time_ID==-1)
197 {
198 for (t=0;t<num_dm;t++) //there is no time_ID
199 {
200 xc[0]='\0';
201 j=0;
202 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
203 {
204 xc[j]=src[i];
205 i++;
206 j++;
207 }
208
209 xc[j]='\0';
210 i++;
211
212 uncomp_data[index][t]=atof(xc);
213 }
214 }
215 else
216 {
217 for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
218 {
219 xc[0]='\0';
220 j=0;
221 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
222 {
223 xc[j]=src[i];
224 i++;
225 j++;
226 }
227
228 xc[j]='\0';
229 i++;
230
231 if (t==time_ID)
232 {
233 time_pass=1;
234 continue;
235 }
236
237 if (time_pass)
238 uncomp_data[index][t-1]=atof(xc);
239 else
240 uncomp_data[index][t]=atof(xc);
241 }
242 }
243 index++;
244 //fprintf(fout_ID,"%s",src);
245 } //end of while
246
247 if (DEBUG == 1)
248 {
249 printf("the last line of the source data is:\n");
250 for (j=0;j<num_dm;j++)
251 printf("%f ",uncomp_data[index-1][j]);
252 printf("\n");
253 }
254 }
255
256
257 /**************************************** Normalization ******************************************/
258 void tran(double **orig_data, int file_Len, int num_dm, int norm_used, double **matrix_to_cluster)
259 {
260 int i=0;
261 int j=0;
262
263 double biggest=0;
264 double smallest=MAX_VALUE;
265
266 double *aver; //average of each column
267 double *std; //standard deviation of each column
268
269 aver=(double*)malloc(sizeof(double)*file_Len);
270 memset(aver,0,sizeof(double)*file_Len);
271
272 std=(double*)malloc(sizeof(double)*file_Len);
273 memset(std,0,sizeof(double)*file_Len);
274
275 if (norm_used==2) //z-score normalization
276 {
277 for (j=0;j<num_dm;j++)
278 {
279 aver[j]=0;
280 for (i=0;i<file_Len;i++)
281 aver[j]=aver[j]+orig_data[i][j];
282 aver[j]=aver[j]/(double)file_Len;
283
284 std[j]=0;
285 for (i=0;i<file_Len;i++)
286 std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]);
287 std[j]=sqrt(std[j]/(double)file_Len);
288
289 for (i=0;i<file_Len;i++)
290 matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization
291 }
292 }
293
294 if (norm_used==1) //0-1 min-max normalization
295 {
296 for (j=0;j<num_dm;j++)
297 {
298 biggest=0;
299 smallest=MAX_VALUE;
300 for (i=0;i<file_Len;i++)
301 {
302 if (orig_data[i][j]>biggest)
303 biggest=orig_data[i][j];
304 if (orig_data[i][j]<smallest)
305 smallest=orig_data[i][j];
306 }
307
308 for (i=0;i<file_Len;i++)
309 {
310 if (biggest==smallest)
311 matrix_to_cluster[i][j]=biggest;
312 else
313 matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
314 }
315 }
316 }
317
318 if (norm_used==0) //no normalization
319 {
320 for (i=0;i<file_Len;i++)
321 for (j=0;j<num_dm;j++)
322 matrix_to_cluster[i][j]=orig_data[i][j];
323 }
324
325 }
326
327
328
329 /********************************************** RadixSort *******************************************/
330 /* Perform a radix sort using each dimension from the original data as a radix.
331 * Outputs:
332 * sorted_seq -- a permutation vector mapping the ordered list onto the original data.
333 * (sorted_seq[i] -> index in the original data of the ith element of the ordered list)
334 * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct
335 * of the sorting procedure.
336 * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
337 * by grid_ID)
338 */
339
340 void radixsort_flock(int **position,int file_Len,int num_dm,int num_bin,int *sorted_seq,int *num_nonempty,int *grid_ID)
341 {
342 int i=0;
343 int length=0;
344 int start=0;
345 int prev_ID=0;
346 int curr_ID=0;
347
348 int j=0;
349 int t=0;
350 int p=0;
351 int loc=0;
352 int temp=0;
353 int equal=0;
354
355 int *count; //count[i]=j means there are j numbers having value i at the processing digit
356 int *index; //index[i]=j means the starting position of grid i is j
357 int *cp; //current position
358 int *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
359 int *seq; //temporary sequence
360
361 count=(int*)malloc(sizeof(int)*num_bin);
362 memset(count,0,sizeof(int)*num_bin);
363
364 cp=(int*)malloc(sizeof(int)*num_bin);
365 memset(cp,0,sizeof(int)*num_bin);
366
367 index=(int*)malloc(sizeof(int)*num_bin); // initialized below
368
369 seq=(int*)malloc(sizeof(int)*file_Len);
370 memset(seq,0,sizeof(int)*file_Len);
371
372 mark=(int*)malloc(sizeof(int)*file_Len);
373 memset(mark,0,sizeof(int)*file_Len);
374
375 for (i=0;i<file_Len;i++)
376 {
377 sorted_seq[i]=i;
378 mark[i]=0;
379 seq[i]=0;
380 }
381 for (i=0;i<num_bin;i++)
382 {
383 index[i]=0;
384 cp[i]=0;
385 count[i]=0;
386 }
387
388 for (j=0;j<num_dm;j++)
389 {
390 if (j==0) //compute the initial values of mark
391 {
392 for (i=0;i<file_Len;i++)
393 count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension
394
395 index[0] = 0;
396 for (i=0;i<num_bin-1;i++)
397 {
398 index[i+1]=index[i]+count[i]; //index[k]=x means k segment starts at x (in the ordered list)
399 if ((index[i+1]>0) && (index[i+1]<=file_Len))
400 {
401 mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
402 }
403 else
404 {
405 printf("out of myboundary for mark at index[i+1]-1.\n");
406 }
407 }
408 mark[file_Len-1]=1;
409
410 for (i=0;i<file_Len;i++)
411 {
412 /* Build a permutation vector for the partially ordered data. Store the PV in sorted_seq */
413 loc=position[i][j];
414 temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j
415 sorted_seq[temp]=i; //sorted_seq[i]=temp is also another way to sort
416 cp[loc]++;
417 }
418 }
419 else
420 {
421 //reset count, index, loc, temp, cp, start, and length
422 length=0;
423 loc=0;
424 temp=0;
425 start=0;
426 for (p=0;p<num_bin;p++)
427 {
428 cp[p]=0;
429 count[p]=0;
430 index[p]=0;
431 }
432
433 for (i=0;i<file_Len;i++)
434 {
435 int iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order.
436 if (mark[i]!=1)
437 {
438 /* Count the number of items in each bin of
439 dimension j, BUT we are going to reset at the end
440 of each "part". Thus, the total effect is to do
441 a sort by bin on the jth dimension for each group
442 of data that has been identical for the
443 dimensions processed up to this point. This is
444 the standard radix sort procedure, but doing it
445 this way saves us having to allocate buckets to
446 hold the data in each group of "identical-so-far"
447 elements. */
448 count[position[iperm][j]]++; //count[position[i][j]]++;
449 length++; // This is the total length of the part, irrespective of the value of the jth component
450 // (actually, less one, since we don't increment for the final element below)
451 }
452 if (mark[i]==1)
453 {
454 //length++;
455 count[position[iperm][j]]++;//count[position[i][j]]++; //the current point must be counted in
456 start=i-length; //this part starts from start to i: [start,i]
457 /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part.
458 This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively,
459 plus we could multi-thread over the parts. (Hmmm...)
460 */
461 index[0] = start; // Let index give the offset within the whole ordered list.
462 for (t=0;t<num_bin-1;t++)
463 {
464 index[t+1]=index[t]+count[t];
465
466 if ((index[t+1]<=file_Len) && (index[t+1]>0))
467 {
468 mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
469 }
470
471 }
472 mark[i]=1;
473
474 /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i
475 the PV will be completely updated for the partial ordering up to the current radix. */
476 for (t=start;t<=i;t++)
477 {
478 loc=position[sorted_seq[t]][j];//loc=position[t][j];
479 temp=index[loc]+cp[loc];
480 if ((temp<file_Len) && (temp>=0))
481 {
482 // seq is a temporary because we have to maintain the old PV until we have finished this step.
483 seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort
484 cp[loc]++;
485 }
486 else
487 {
488 printf("out of myboundary for seq at temp.\n");
489 }
490 }
491
492 for (t=start;t<=i;t++)
493 {
494 // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through
495 // entry i in the ordered list.
496 if ((t>=0) && (t<file_Len))
497 sorted_seq[t]=seq[t];
498 else
499 printf("out of myboundary for seq and sorted_seq at t.\n");
500 }
501 //reset count, index, seq, length, and cp
502 length=0;
503 loc=0;
504 temp=0;
505 for (p=0;p<num_bin;p++)
506 {
507 cp[p]=0;
508 count[p]=0;
509 index[p]=0;
510 }
511 }
512 }//end for i
513 }//end else
514 }//end for j
515
516 /* sorted_seq[] now contains the ordered list for all radices. mark[] gives the boundaries between groups of elements that are
517 identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */
518
519 for (i=0;i<file_Len;i++)
520 grid_ID[i]=0; //in case the initial value hasn't been assigned
521 *num_nonempty=1; //starting from 1!
522
523 /* assign the "grid" identifiers for all of the data. A grid will be what we were calling a "part" above. We will number them
524 serially and tag the *unordered* data with the grid IDs. We will also count the number of populated grids (in general there will
525 be many possible combinations of bin values that simply never occur) */
526
527 for (i=1;i<file_Len;i++)
528 {
529 equal=1;
530 prev_ID=sorted_seq[i-1];
531 curr_ID=sorted_seq[i];
532 for (j=0;j<num_dm;j++)
533 {
534 if (position[prev_ID][j]!=position[curr_ID][j])
535 {
536 equal=0; //not equal
537 break;
538 }
539 }
540
541 if (equal)
542 {
543 grid_ID[curr_ID]=grid_ID[prev_ID];
544 }
545 else
546 {
547 *num_nonempty=*num_nonempty+1;
548 grid_ID[curr_ID]=grid_ID[prev_ID]+1;
549 }
550 //all_grid_vol[grid_ID[curr_ID]]++;
551 }
552
553 //free memory
554 free(count);
555 free(index);
556 free(cp);
557 free(seq);
558 free(mark);
559
560 }
561
562 /********************************************** Compute Position of Events ************************************************/
563 void compute_position(double **data_in, int file_Len, int num_dm, int num_bin, int **position, double *interval)
564 {
565 /* What we are really doing here is binning the data, with the bins
566 spanning the range of the data and number of bins = num_bin */
567 int i=0;
568 int j=0;
569
570 double *small; //small[j] is the smallest value within dimension j
571 double *big; //big[j] is the biggest value within dimension j
572
573 small=(double*)malloc(sizeof(double)*num_dm);
574 memset(small,0,sizeof(double)*num_dm);
575
576 big=(double*)malloc(sizeof(double)*num_dm);
577 memset(big,0,sizeof(double)*num_dm);
578
579
580 for (j=0;j<num_dm;j++)
581 {
582 big[j]=MAX_VALUE*(-1);
583 small[j]=MAX_VALUE;
584 for (i=0;i<file_Len;i++)
585 {
586 if (data_in[i][j]>big[j])
587 big[j]=data_in[i][j];
588
589 if (data_in[i][j]<small[j])
590 small[j]=data_in[i][j];
591 }
592
593 interval[j]=(big[j]-small[j])/(double)num_bin; //interval is computed using the biggest value and smallest value instead of the channel limit
594 /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */
595 /* I don't think so: num_bin is the number of bins */
596 }
597
598 for (j=0;j<num_dm;j++)
599 {
600 for (i=0;i<file_Len;i++)
601 {
602 if (data_in[i][j]>=big[j])
603 position[i][j]=num_bin-1;
604 else
605 {
606 position[i][j]=(int)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
607 if ((position[i][j]>=num_bin) || (position[i][j]<0))
608 {
609 //printf("position mis-computed in density analysis!\n");
610 //exit(0);
611 fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
612 abort();
613
614 }
615 }
616 }
617 }
618
619
620 free(small);
621 free(big);
622 }
623
624 /********************************************** select_bin to select the number of bins **********************************/
625 //num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
626 /* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned
627 * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we
628 * maximize a merit function, then return the results obtained using the optimal number of bins.
629 *
630 * Outputs:
631 * position -- binned data values
632 * sorted_seq -- permutation vector mapping the ordered list to the original data
633 * all_grid_ID -- grid to which each data element was assigned.
634 * num_nonempty -- number of distinct values assumed by all_grid_ID
635 * interval -- bin width for each data dimension
636 * return value -- the number of bins selected.
637 */
638
639 int select_bin(double **normalized_data, int file_Len, int num_dm, int min_grid, int max_grid, int **position, int *sorted_seq,
640 int *all_grid_ID, int *num_nonempty, double *interval, int user_num_bin)
641 {
642
643 int num_bin=0;
644 int select_num_bin=0;
645 int m=0;
646 int n=0;
647
648 int i=0;
649 int bin_scope=0;
650 int temp_num_nonempty=0;
651
652 int *temp_grid_ID;
653 int *temp_sorted_seq;
654 int **temp_position;
655
656 //sorted_seq[i]=j means the event j ranks i
657
658 double temp_index=0;
659 double *bin_index;
660 double *temp_interval;
661
662
663
664 temp_grid_ID=(int *)malloc(sizeof(int)*file_Len);
665 memset(temp_grid_ID,0,sizeof(int)*file_Len);
666
667 temp_sorted_seq=(int *)malloc(sizeof(int)*file_Len);
668 memset(temp_sorted_seq,0,sizeof(int)*file_Len);
669
670 temp_position=(int **)malloc(sizeof(int*)*file_Len);
671 memset(temp_position,0,sizeof(int*)*file_Len);
672 for (m=0;m<file_Len;m++)
673 {
674 temp_position[m]=(int*)malloc(sizeof(int)*num_dm);
675 memset(temp_position[m],0,sizeof(int)*num_dm);
676 }
677
678 temp_interval=(double*)malloc(sizeof(double)*num_dm);
679 memset(temp_interval,0,sizeof(double)*num_dm);
680
681 bin_scope=max_grid-min_grid+1;
682 bin_index=(double *)malloc(sizeof(double)*bin_scope);
683 memset(bin_index,0,sizeof(double)*bin_scope);
684
685 i=0;
686
687 for (num_bin=min_grid;num_bin<=max_grid;num_bin++)
688 {
689 /* compute_position bins the data into num_bin bins. Each
690 dimension is binned independently.
691
692 Outputs:
693 temp_position[i][j] -- bin for the jth component of data element i.
694 temp_interval[j] -- bin-width for the jth component
695 */
696 compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval);
697 radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID);
698
699 /* our figure of merit is the number of non-empty grids divided by number of bins per dimension.
700 We declare victory when we have found a local maximum */
701 bin_index[i]=((double)temp_num_nonempty)/((double)num_bin);
702 if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95)
703 break;
704 if ((bin_index[i]<temp_index) && (user_num_bin==0))
705 break;
706 if ((user_num_bin==num_bin-1) && (user_num_bin!=0))
707 break;
708
709 /* Since we have accepted this trial bin, copy all the temporary results into
710 the output buffers */
711 memcpy(all_grid_ID,temp_grid_ID,sizeof(int)*file_Len);
712 memcpy(sorted_seq,temp_sorted_seq,sizeof(int)*file_Len);
713 memcpy(interval,temp_interval,sizeof(double)*num_dm);
714
715 for (m=0;m<file_Len;m++)
716 for (n=0;n<num_dm;n++)
717 position[m][n]=temp_position[m][n];
718
719 temp_index=bin_index[i];
720 select_num_bin=num_bin;
721 num_nonempty[0]=temp_num_nonempty;
722 i++;
723 }
724
725 if ((select_num_bin<min_grid) || (select_num_bin>max_grid))
726 {
727 fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 4, 2010
728 exit(0);
729 }
730
731 if (temp_index==0)
732 {
733 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
734 abort();
735 }
736
737
738 free(temp_grid_ID);
739 free(temp_sorted_seq);
740 free(bin_index);
741 free(temp_interval);
742
743 for (m=0;m<file_Len;m++)
744 free(temp_position[m]);
745 free(temp_position);
746
747 return select_num_bin;
748 }
749
750 /************************************* Select dense grids **********************************/
751 // compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol
752 // den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse);
753 /*
754 * Prune away grids that are insufficiently "dense" (i.e., contain too few data items)
755 *
756 * Outputs:
757 * num_dense_grids -- number of dense grids
758 * num_dense_events -- total number of data items in all dense grids
759 * dense_grid_reverse -- mapping from list of all grids to list of dense grids.
760 * return value -- density cutoff for separating dense from non-dense grids.
761 */
762
763 int select_dense(int file_Len, int *all_grid_ID, int num_nonempty, int *num_dense_grids, int *num_dense_events, int *dense_grid_reverse, int den_t_event)
764 {
765
766
767 int i=0;
768 int vol_ID=0;
769 int biggest_size=0; //biggest grid_size, to define grid_size_index
770 int biggest_index=0;
771 //int actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid
772 //int num_remain=0; //number of remaining grids with different density thresholds
773 int temp_num_dense_grids=0;
774 int temp_num_dense_events=0;
775
776 int *grid_size_index;
777 int *all_grid_vol;
778 int *grid_density_index;
779
780 //double den_average=0;
781 // double avr_index=0;
782
783
784 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
785 // Compute all_grid_vol
786 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
787 all_grid_vol=(int *)malloc(sizeof(int)*num_nonempty);
788 memset(all_grid_vol,0,sizeof(int)*num_nonempty);
789
790 /* Grid "volume" is just the number of data contained in the grid. */
791 for (i=0;i<file_Len;i++)
792 {
793 vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]];
794 all_grid_vol[vol_ID]++; //all_grid_vol[i]=j means grid i has j points
795 }
796
797
798
799 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
800 // Compute grid_size_index (histogram of grid sizes)
801 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
802
803 for (i=0;i<num_nonempty;i++)
804 {
805 if (biggest_size<all_grid_vol[i])
806 {
807 biggest_size=all_grid_vol[i];
808 }
809 }
810
811 //added on July 23, 2010
812 if (biggest_size<3)
813 {
814 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
815 abort();
816 }
817
818 grid_size_index=(int*)malloc(sizeof(int)*biggest_size);
819 memset(grid_size_index,0,sizeof(int)*biggest_size);
820
821 for (i=0;i<num_nonempty;i++)
822 {
823 grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1
824 }
825
826
827
828 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
829 // Compute den_cutoff
830 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
831
832 grid_density_index=(int *)malloc(sizeof(int)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3
833 memset(grid_density_index,0,sizeof(int)*(biggest_size-2));
834
835 if (den_t_event==0) //user doesn't define the density threshold
836 {
837 biggest_index=0;
838
839 for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined
840 {
841 grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]);
842 if (biggest_index<grid_density_index[i-1])
843 {
844 biggest_index=grid_density_index[i-1];
845 den_t_event=i+1;
846 }
847 }
848 }
849
850 if (den_t_event==0) //if something is wrong
851 den_t_event=3;
852
853 for (i=0;i<num_nonempty;i++)
854 if (all_grid_vol[i]>=den_t_event)
855 temp_num_dense_grids++;
856
857 if (temp_num_dense_grids<=1)
858 {
859 //modified on July 23, 2010
860 //printf("a too high density threshold is set! Please decrease your density threshold.\n");
861 //exit(0);
862 fprintf(stderr,"a too high density threshold is set! Please decrease your density threshold.\n"); //modified on July 23, 2010
863 abort();
864 }
865
866 if (temp_num_dense_grids>=100000)
867 {
868 //modified on July 23, 2010
869 //printf("a too low density threshold is set! Please increase your density threshold.\n");
870 //exit(0);
871 fprintf(stderr,"a too low density threshold is set! Please increase your density threshold.\n"); //modified on July 23, 2010
872 abort();
873 }
874
875 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
876 // Compute dense_grid_reverse
877 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
878 temp_num_dense_grids=0;
879 temp_num_dense_events=0;
880 for (i=0;i<num_nonempty;i++)
881 {
882 dense_grid_reverse[i]=-1;
883
884 if (all_grid_vol[i]>=den_t_event)
885 {
886 dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
887 temp_num_dense_grids++;
888 temp_num_dense_events+=all_grid_vol[i];
889 }
890 }
891
892 num_dense_grids[0]=temp_num_dense_grids;
893 num_dense_events[0]=temp_num_dense_events;
894
895 free(grid_size_index);
896 free(all_grid_vol);
897
898 return den_t_event;
899 }
900
901 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
902 // Compute densegridID_To_gridevent and eventID_To_denseventID
903 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
904 // grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
905 /*
906 * Filter the data so that only the data belonging to dense grids is left
907 *
908 * Output:
909 * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids.
910 * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids.
911 *
912 */
913
914 void grid_To_event(int file_Len, int *dense_grid_reverse, int *all_grid_ID, int *eventID_To_denseventID, int *densegridID_To_gridevent)
915 {
916 int i=0;
917 int dense_grid_ID=0;
918 int dense_event_ID=0;
919
920 for (i=0;i<file_Len;i++)
921 {
922 dense_grid_ID=dense_grid_reverse[all_grid_ID[i]];
923 eventID_To_denseventID[i]=-1;
924 if (dense_grid_ID!=-1) //for point (i) belonging to dense grids
925 {
926 eventID_To_denseventID[i]=dense_event_ID;
927 dense_event_ID++;
928
929 if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected
930 densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point
931 }
932 }
933
934
935 }
936
937 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
938 // Compute dense_grid_seq
939 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
940 // generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq);
941 /* Construct a table of binned data values for each dense grid.
942 *
943 * Output:
944 *
945 * dense_grid_seq -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values)
946 */
947
948 void generate_grid_seq(int num_dm, int num_dense_grids, int *densegridID_To_gridevent, int **position, int **dense_grid_seq)
949 {
950
951 int i=0;
952 int j=0;
953 int ReventID=0; //representative event ID of the dense grid
954
955 for (i=0;i<num_dense_grids;i++)
956 {
957 ReventID = densegridID_To_gridevent[i];
958
959 for (j=0;j<num_dm;j++)
960 dense_grid_seq[i][j]=position[ReventID][j];
961 }
962 }
963
964 //compare two vectors
965 int compare_value(int num_dm, int *search_value, int *seq_value)
966 {
967 int i=0;
968
969 for (i=0;i<num_dm;i++)
970 {
971 if (search_value[i]<seq_value[i])
972 return 1;
973 if (search_value[i]>seq_value[i])
974 return -1;
975 if (search_value[i]==seq_value[i])
976 continue;
977 }
978 return 0;
979 }
980
981 //binary search the dense_grid_seq to return the dense grid ID if it exists
982 int binary_search(int num_dense_grids, int num_dm, int *search_value, int **dense_grid_seq)
983 {
984
985 int low=0;
986 int high=0;
987 int mid=0;
988 int comp_result=0;
989 int match=0;
990 //int found=0;
991
992 low = 0;
993 high = num_dense_grids-1;
994
995 while (low <= high)
996 {
997 mid = (int)((low + high)/2);
998
999 comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
1000
1001
1002 switch(comp_result)
1003 {
1004 case 1:
1005 high=mid-1;
1006 break;
1007 case -1:
1008 low=mid+1;
1009 break;
1010 case 0:
1011 match=1;
1012 break;
1013 }
1014 if (match==1)
1015 break;
1016 }
1017
1018
1019
1020 if (match==1)
1021 {
1022 return mid;
1023 }
1024 else
1025 return -1;
1026 }
1027
1028
1029 /********************************************** Computing Centers Using IDs **********************************************/
1030
1031 void ID2Center(double **data_in, int file_Len, int num_dm, int *eventID_To_denseventID, int num_clust, int *cluster_ID, double **population_center)
1032 {
1033 int i=0;
1034 int j=0;
1035 int ID=0;
1036 int eventID=0;
1037 int *size_c;
1038
1039
1040
1041 size_c=(int *)malloc(sizeof(int)*num_clust);
1042 memset(size_c,0,sizeof(int)*num_clust);
1043
1044 for (i=0;i<num_clust;i++)
1045 for (j=0;j<num_dm;j++)
1046 population_center[i][j]=0;
1047
1048 for (i=0;i<file_Len;i++)
1049 {
1050 eventID=eventID_To_denseventID[i];
1051
1052 if (eventID!=-1) //only events in dense grids count
1053 {
1054 ID=cluster_ID[eventID];
1055
1056 if (ID==-1)
1057 {
1058 //modified on July 23, 2010
1059 //printf("ID==-1! in ID2Center\n");
1060 //exit(0);
1061 fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010
1062 abort();
1063 }
1064
1065 for (j=0;j<num_dm;j++)
1066 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
1067
1068 size_c[ID]++;
1069 }
1070 }
1071
1072
1073 for (i=0;i<num_clust;i++)
1074 {
1075 for (j=0;j<num_dm;j++)
1076 if (size_c[i]!=0)
1077 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
1078 else
1079 population_center[i][j]=0;
1080 }
1081
1082 free(size_c);
1083
1084 }
1085
1086 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1087 // Compute Population Center with all events
1088 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1089 void ID2Center_all(double **data_in, int file_Len, int num_dm, int num_clust, int *cluster_ID, double **population_center)
1090 {
1091 int i=0;
1092 int j=0;
1093 int ID=0;
1094 int *size_c;
1095
1096
1097
1098 size_c=(int *)malloc(sizeof(int)*num_clust);
1099 memset(size_c,0,sizeof(int)*num_clust);
1100
1101 for (i=0;i<num_clust;i++)
1102 for (j=0;j<num_dm;j++)
1103 population_center[i][j]=0;
1104
1105 for (i=0;i<file_Len;i++)
1106 {
1107 ID=cluster_ID[i];
1108
1109 if (ID==-1)
1110 {
1111 //commented on July 23, 2010
1112 //printf("ID==-1! in ID2Center_all\n");
1113 //exit(0);
1114 fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010
1115 abort();
1116 }
1117
1118 for (j=0;j<num_dm;j++)
1119 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
1120
1121 size_c[ID]++;
1122 }
1123
1124
1125 for (i=0;i<num_clust;i++)
1126 {
1127 for (j=0;j<num_dm;j++)
1128 if (size_c[i]!=0)
1129 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
1130 else
1131 population_center[i][j]=0;
1132 }
1133
1134 free(size_c);
1135
1136 }
1137
1138 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1139 // Merge neighboring grids to clusters
1140 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1141
1142 int merge_grids(double **normalized_data, double *interval, int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids,
1143 int *dense_grid_reverse, int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID,
1144 int *cluster_ID, int *grid_ID, int *grid_clusterID)
1145 {
1146
1147
1148 int i=0;
1149 int j=0;
1150 int t=0;
1151 int p=0;
1152 int num_clust=0;
1153 int ReventID=0;
1154 int denseID=0;
1155 int neighbor_ID=0;
1156 //int temp_grid=0;
1157
1158 int *grid_value;
1159 int *search_value;
1160
1161 int **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node
1162
1163 double real_dist=0;
1164 double **norm_grid_center;
1165
1166 norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids);
1167 memset(norm_grid_center,0,sizeof(double*)*num_dense_grids);
1168
1169 for (i=0;i<num_dense_grids;i++)
1170 {
1171 norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm);
1172 memset(norm_grid_center[i],0,sizeof(double)*num_dm);
1173 }
1174
1175 for (i=0;i<file_Len;i++)
1176 {
1177 denseID=eventID_To_denseventID[i];
1178 if (denseID!=-1) //only dense events can enter
1179 {
1180 grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]];
1181
1182 if (grid_ID[denseID]==-1)
1183 {
1184 fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n"); //modified on July 23, 2010
1185 abort();
1186 }
1187 }
1188 }
1189
1190
1191 /* Find centroid (in the normalized data) for each dense grid */
1192 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center);
1193
1194 //printf("pass the grid ID2 center\n"); //commmented on July 23, 2010
1195
1196
1197 graph_of_grid=(int **)malloc(sizeof(int*)*num_dense_grids);
1198 memset(graph_of_grid,0,sizeof(int*)*num_dense_grids);
1199 for (i=0;i<num_dense_grids;i++)
1200 {
1201 graph_of_grid[i]=(int *)malloc(sizeof(int)*num_dm);
1202 memset(graph_of_grid[i],0,sizeof(int)*num_dm);
1203
1204
1205 for (j=0;j<num_dm;j++)
1206 graph_of_grid[i][j]=-1;
1207 }
1208
1209 grid_value=(int *)malloc(sizeof(int)*num_dm);
1210 memset(grid_value,0,sizeof(int)*num_dm);
1211
1212 search_value=(int *)malloc(sizeof(int)*num_dm);
1213 memset(search_value,0,sizeof(int)*num_dm);
1214
1215
1216 for (i=0;i<num_dense_grids;i++)
1217 {
1218 ReventID=densegridID_To_gridevent[i];
1219
1220 for (j=0;j<num_dm;j++)
1221 {
1222 grid_value[j]=position[ReventID][j];
1223
1224 }
1225
1226
1227 /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in
1228 the chosen dimension. If the neighbor's centroid is not too far away, add it to this grid's neighbor
1229 list. */
1230 for (t=0;t<num_dm;t++)
1231 {
1232 for (p=0;p<num_dm;p++)
1233 search_value[p]=grid_value[p];
1234
1235 if (grid_value[t]==num_bin-1)
1236 continue;
1237
1238 search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side
1239
1240 neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq);
1241
1242 if (neighbor_ID!=-1)
1243 {
1244 real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t];
1245
1246 if (real_dist<0)
1247 real_dist=-real_dist;
1248
1249 if (real_dist<2*interval[t])
1250 graph_of_grid[i][t]=neighbor_ID;
1251 }
1252 }
1253 grid_clusterID[i]=i; //initialize grid_clusterID
1254 }
1255
1256
1257 //graph constructed
1258 //DFS-based search begins
1259
1260 /* Use a depth-first search to construct a list of connected subgraphs (= "clusters").
1261 Note our graph as we have constructed it is a DAG, so we can use that to our advantage
1262 in our search. */
1263 // num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID);
1264 num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID);
1265
1266
1267 //computes grid_ID and cluster_ID
1268 for (i=0;i<file_Len;i++)
1269 {
1270 denseID=eventID_To_denseventID[i];
1271 if (denseID!=-1) //only dense events can enter
1272 {
1273 cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]];
1274 //if (cluster_ID[denseID]==-1)
1275 // printf("catch you!\n");
1276 }
1277 }
1278
1279 free(search_value);
1280 free(grid_value);
1281
1282 for (i=0;i<num_dense_grids;i++)
1283 {
1284 free(graph_of_grid[i]);
1285 free(norm_grid_center[i]);
1286 }
1287 free(graph_of_grid);
1288 free(norm_grid_center);
1289
1290 return num_clust;
1291 }
1292
1293 /********************************************* Merge Clusters to Populations *******************************************/
1294 // This is the function future work can be on because it is about how to cluster the about 500 points accurately
1295
1296 int merge_clusters(int num_clust, int num_dm, double *interval, double **cluster_center, int *cluster_populationID, int max_num_pop)
1297 {
1298 int num_population=0;
1299 int temp_num_population=0;
1300
1301 int i=0;
1302 int j=0;
1303 int t=0;
1304 int merge=0;
1305 int smid=0;
1306 int bgid=0;
1307 double merge_dist=1.1; //initial value of merge_dist*interval should be slightly larger than the bin width
1308
1309 int *map_ID;
1310
1311 double diff=0;
1312
1313 map_ID=(int*)malloc(sizeof(int)*num_clust);
1314 memset(map_ID,0,sizeof(int)*num_clust);
1315
1316 while ((num_population>max_num_pop) || (num_population<=1))
1317 {
1318
1319 if (num_population<=1)
1320 merge_dist=merge_dist-0.1;
1321
1322 if (num_population>max_num_pop)
1323 merge_dist=merge_dist+0.1;
1324
1325
1326 for (i=0;i<num_clust;i++)
1327 cluster_populationID[i]=i;
1328
1329 for (i=0;i<num_clust-1;i++)
1330 {
1331 for (j=i+1;j<num_clust;j++)
1332 {
1333 merge=1;
1334
1335 for (t=0;t<num_dm;t++)
1336 {
1337 diff=cluster_center[i][t]-cluster_center[j][t];
1338
1339 if (diff<0)
1340 diff=-diff;
1341 if (diff>(merge_dist*interval[t]))
1342 merge=0;
1343 }
1344
1345 if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
1346 {
1347 if (cluster_populationID[i]<cluster_populationID[j]) //they could not be equal
1348 {
1349 smid = cluster_populationID[i];
1350 bgid = cluster_populationID[j];
1351 }
1352 else
1353 {
1354 smid = cluster_populationID[j];
1355 bgid = cluster_populationID[i];
1356 }
1357 for (t=0;t<num_clust;t++)
1358 {
1359 if (cluster_populationID[t]==bgid)
1360 cluster_populationID[t]=smid;
1361 }
1362 }
1363 }
1364 }
1365
1366
1367
1368 for (i=0;i<num_clust;i++)
1369 map_ID[i]=-1;
1370
1371 for (i=0;i<num_clust;i++)
1372 map_ID[cluster_populationID[i]]=1;
1373
1374 num_population=0;
1375 for (i=0;i<num_clust;i++)
1376 {
1377 if (map_ID[i]==1)
1378 {
1379 map_ID[i]=num_population;
1380 num_population++;
1381 }
1382 }
1383
1384 if ((temp_num_population>max_num_pop) && (num_population==1))
1385 break;
1386 else
1387 temp_num_population=num_population;
1388
1389 if (num_clust<=1)
1390 break;
1391 }
1392
1393 for (i=0;i<num_clust;i++)
1394 cluster_populationID[i]=map_ID[cluster_populationID[i]];
1395
1396 free(map_ID);
1397
1398 return num_population;
1399 }
1400
1401 ///////////////////////////////////////////////////////
1402 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1403 double kmeans(double **Matrix, int k, double kmean_term, int file_Len, int num_dm, int *shortest_id, double **center)
1404 {
1405
1406 int i=0;
1407 int j=0;
1408 int t=0;
1409 int random=0;
1410 int random1=0;
1411 int random2=0;
1412 int times=0;
1413 int dist_used=0; //0 is Euclidean and 1 is Pearson
1414 int random_init=0; //0: not use random seeds;
1415 int real_Len=0;
1416 int skipped=0;
1417
1418 int *num; //num[i]=t means the ith cluster has t points
1419
1420 double vvv=1.0; // the biggest variation;
1421 double distance=0.0;
1422 double xv=0.0;
1423 double variation=0.0;
1424
1425 double mean_dx=0;
1426 double mean_dy=0;
1427 double sum_var=0;
1428 double dx=0;
1429 double dy=0;
1430 double sd_x=0;
1431 double sd_y=0;
1432 double diff=0;
1433 double distortion=0;
1434
1435 double shortest_distance;
1436
1437 double *temp_center;
1438
1439 double **sum;
1440
1441 temp_center = (double *)malloc(sizeof(double)*num_dm);
1442 memset(temp_center,0,sizeof(double)*num_dm);
1443
1444 if (random_init)
1445 {
1446 for (i=0;i<k;i++)
1447 {
1448 random1=rand()*rand();
1449 random2=abs((random1%5)+1);
1450 for (t=0;t<random2;t++)
1451 random2=random2*rand()+rand();
1452
1453 random=abs(random2%file_Len);
1454 for (j=0;j<num_dm;j++)
1455 center[i][j]=Matrix[random][j];
1456 }
1457 }
1458
1459
1460 num = (int *)malloc(sizeof(int)*k);
1461 memset(num,0,sizeof(int)*k);
1462
1463 sum = (double **)malloc(sizeof(double*)*k);
1464 memset(sum,0,sizeof(double*)*k);
1465 for (i=0;i<k;i++)
1466 {
1467 sum[i] = (double *)malloc(sizeof(double)*num_dm);
1468 memset(sum[i],0,sizeof(double)*num_dm);
1469 }
1470
1471
1472 times=0;
1473 real_Len=0;
1474
1475 while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1)))
1476 {
1477 for (i=0;i<k;i++)
1478 {
1479 num[i]=0;
1480 for (j=0;j<num_dm;j++)
1481 sum[i][j]=0.0;
1482 }
1483
1484 for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j]
1485 {
1486 skipped = 0;
1487 shortest_distance=MAX_VALUE;
1488 for (j=0;j<k;j++) //for each center j
1489 {
1490 distance=0.0;
1491
1492 if (dist_used==0) //Euclidean distance
1493 {
1494 for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions
1495 {
1496
1497 diff=center[j][t]-Matrix[i][t];
1498
1499 diff=diff*diff;
1500
1501 distance = distance+diff; //here we have a weight for each dimension
1502 }
1503 }
1504 else //pearson correlation
1505 {
1506 mean_dx=0.0;
1507 mean_dy=0.0;
1508 sum_var=0.0;
1509 dx=0.0;
1510 dy=0.0;
1511 sd_x=0.0;
1512 sd_y=0.0;
1513 for (t=0;t<num_dm;t++)
1514 {
1515 mean_dx+=center[j][t];
1516 mean_dy+=Matrix[i][t];
1517 }
1518 mean_dx=mean_dx/(double)num_dm;
1519 mean_dy=mean_dy/(double)num_dm;
1520
1521 for (t=0;t<num_dm;t++)
1522 {
1523 dx=center[j][t]-mean_dx;
1524 dy=Matrix[i][t]-mean_dy;
1525 sum_var+=dx*dy;
1526 sd_x+=dx*dx;
1527 sd_y+=dy*dy;
1528 }
1529 if (sqrt(sd_x*sd_y)==0)
1530 distance = 1.0;
1531 else
1532 distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
1533 //printf("distance=%f\n",distance);
1534 } //pearson correlation ends
1535
1536 if ((distance<shortest_distance) && (skipped == 0))
1537 {
1538 shortest_distance=distance;
1539 shortest_id[i]=j;
1540 }
1541 }//end for j
1542 real_Len++;
1543 num[shortest_id[i]]=num[shortest_id[i]]+1;
1544 for (t=0;t<num_dm;t++)
1545 sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];
1546 }//end for i
1547 /* recompute the centers */
1548 //compute_mean(group);
1549 vvv=0.0;
1550 for (j=0;j<k;j++)
1551 {
1552 memcpy(temp_center,center[j],sizeof(double)*num_dm);
1553 variation=0.0;
1554 if (num[j]!=0)
1555 {
1556 for (t=0;t<num_dm;t++)
1557 {
1558 center[j][t]=sum[j][t]/(double)num[j];
1559 xv=(temp_center[t]-center[j][t]);
1560 variation=variation+xv*xv;
1561 }
1562 }
1563
1564 if (variation>vvv)
1565 vvv=variation; //vvv is the biggest variation among the k clusters;
1566 }
1567 //compute_variation;
1568 times++;
1569 } //end for while
1570
1571
1572
1573
1574 free(num);
1575
1576 for (i=0;i<k;i++)
1577 free(sum[i]);
1578 free(sum);
1579 free(temp_center);
1580
1581
1582 return distortion;
1583
1584 }
1585
1586 //////////////////////////////////////////////////////
1587 /*************************** Show *****************************/
1588 void show(double **Matrix, int *cluster_id, int file_Len, int k, int num_dm, char *name_string)
1589 {
1590 int situ1=0;
1591 int situ2=0;
1592
1593 int i=0;
1594 int id=0;
1595 int j=0;
1596 int t=0;
1597
1598 int *size_c;
1599
1600
1601
1602 int **size_mybound_1;
1603 int **size_mybound_2;
1604 int **size_mybound_3;
1605 int **size_mybound_0;
1606
1607 double interval=0.0;
1608
1609 double *big;
1610 double *small;
1611
1612
1613 double **center;
1614 double **mybound;
1615
1616 int **prof; //prof[i][j]=1 means population i is + at parameter j
1617
1618 FILE *fpcnt_id; //proportion id
1619 //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
1620 FILE *fprof_id; //profile_id
1621
1622 big=(double *)malloc(sizeof(double)*num_dm);
1623 memset(big,0,sizeof(double)*num_dm);
1624
1625 small=(double *)malloc(sizeof(double)*num_dm);
1626 memset(small,0,sizeof(double)*num_dm);
1627
1628 for (i=0;i<num_dm;i++)
1629 {
1630 big[i]=0.0;
1631 small[i]=(double)MAX_VALUE;
1632 }
1633
1634
1635 size_c=(int *)malloc(sizeof(int)*k);
1636 memset(size_c,0,sizeof(int)*k);
1637
1638 center=(double**)malloc(sizeof(double*)*k);
1639 memset(center,0,sizeof(double*)*k);
1640 for (i=0;i<k;i++)
1641 {
1642 center[i]=(double*)malloc(sizeof(double)*num_dm);
1643 memset(center[i],0,sizeof(double)*num_dm);
1644 }
1645
1646 mybound=(double**)malloc(sizeof(double*)*num_dm);
1647 memset(mybound,0,sizeof(double*)*num_dm);
1648 for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
1649 {
1650 mybound[i]=(double*)malloc(sizeof(double)*3);
1651 memset(mybound[i],0,sizeof(double)*3);
1652 }
1653
1654 prof=(int **)malloc(sizeof(int*)*k);
1655 memset(prof,0,sizeof(int*)*k);
1656 for (i=0;i<k;i++)
1657 {
1658 prof[i]=(int *)malloc(sizeof(int)*num_dm);
1659 memset(prof[i],0,sizeof(int)*num_dm);
1660 }
1661
1662
1663 for (i=0;i<file_Len;i++)
1664 {
1665 id=cluster_id[i];
1666 for (j=0;j<num_dm;j++)
1667 {
1668 center[id][j]=center[id][j]+Matrix[i][j];
1669 if (big[j]<Matrix[i][j])
1670 big[j]=Matrix[i][j];
1671 if (small[j]>Matrix[i][j])
1672 small[j]=Matrix[i][j];
1673 }
1674
1675 size_c[id]++;
1676 }
1677
1678 for (i=0;i<k;i++)
1679 for (j=0;j<num_dm;j++)
1680 {
1681 if (size_c[i]!=0)
1682 center[i][j]=(center[i][j]/(double)(size_c[i]));
1683 else
1684 center[i][j]=0;
1685 }
1686
1687 for (j=0;j<num_dm;j++)
1688 {
1689 interval=((big[j]-small[j])/4.0);
1690 //printf("interval[%d] is %f\n",j,interval);
1691 for (i=0;i<3;i++)
1692 mybound[j][i]=small[j]+((double)(i+1)*interval);
1693 }
1694
1695
1696 size_mybound_0=(int **)malloc(sizeof(int*)*k);
1697 memset(size_mybound_0,0,sizeof(int*)*k);
1698
1699 for (i=0;i<k;i++)
1700 {
1701 size_mybound_0[i]=(int*)malloc(sizeof(int)*num_dm);
1702 memset(size_mybound_0[i],0,sizeof(int)*num_dm);
1703 }
1704
1705 size_mybound_1=(int **)malloc(sizeof(int*)*k);
1706 memset(size_mybound_1,0,sizeof(int*)*k);
1707
1708 for (i=0;i<k;i++)
1709 {
1710 size_mybound_1[i]=(int*)malloc(sizeof(int)*num_dm);
1711 memset(size_mybound_1[i],0,sizeof(int)*num_dm);
1712 }
1713
1714 size_mybound_2=(int **)malloc(sizeof(int*)*k);
1715 memset(size_mybound_2,0,sizeof(int*)*k);
1716
1717 for (i=0;i<k;i++)
1718 {
1719 size_mybound_2[i]=(int*)malloc(sizeof(int)*num_dm);
1720 memset(size_mybound_2[i],0,sizeof(int)*num_dm);
1721 }
1722
1723 size_mybound_3=(int **)malloc(sizeof(int*)*k);
1724 memset(size_mybound_3,0,sizeof(int*)*k);
1725
1726 for (i=0;i<k;i++)
1727 {
1728 size_mybound_3[i]=(int*)malloc(sizeof(int)*num_dm);
1729 memset(size_mybound_3[i],0,sizeof(int)*num_dm);
1730 }
1731
1732 for (i=0;i<file_Len;i++)
1733 for (j=0;j<num_dm;j++)
1734 {
1735 if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
1736 size_mybound_0[cluster_id[i]][j]++;
1737 else
1738 {
1739 if (Matrix[i][j]<mybound[j][1])
1740 size_mybound_1[cluster_id[i]][j]++;
1741 else
1742 {
1743 if (Matrix[i][j]<mybound[j][2])
1744 size_mybound_2[cluster_id[i]][j]++;
1745 else
1746 //if (Matrix[i][j]!=big[j]) //the biggest values excluded
1747 size_mybound_3[cluster_id[i]][j]++;
1748 }
1749
1750 }
1751 }
1752
1753 fprof_id=fopen("profile.txt","w");
1754 fprintf(fprof_id,"Population_ID\t");
1755 fprintf(fprof_id,"%s\n",name_string);
1756
1757 for (i=0;i<k;i++)
1758 {
1759 fprintf(fprof_id,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
1760 for (j=0;j<num_dm;j++)
1761 {
1762
1763 if (size_mybound_0[i][j]>size_mybound_1[i][j])
1764 situ1=0;
1765 else
1766 situ1=1;
1767 if (size_mybound_2[i][j]>size_mybound_3[i][j])
1768 situ2=2;
1769 else
1770 situ2=3;
1771
1772 if ((situ1==0) && (situ2==2))
1773 {
1774 if (size_mybound_0[i][j]>size_mybound_2[i][j])
1775 prof[i][j]=0;
1776 else
1777 prof[i][j]=2;
1778 }
1779 if ((situ1==0) && (situ2==3))
1780 {
1781 if (size_mybound_0[i][j]>size_mybound_3[i][j])
1782 prof[i][j]=0;
1783 else
1784 prof[i][j]=3;
1785 }
1786 if ((situ1==1) && (situ2==2))
1787 {
1788 if (size_mybound_1[i][j]>size_mybound_2[i][j])
1789 prof[i][j]=1;
1790 else
1791 prof[i][j]=2;
1792 }
1793 if ((situ1==1) && (situ2==3))
1794 {
1795 if (size_mybound_1[i][j]>size_mybound_3[i][j])
1796 prof[i][j]=1;
1797 else
1798 prof[i][j]=3;
1799 }
1800
1801 //begin to output profile
1802 if (j==num_dm-1)
1803 {
1804 if (prof[i][j]==0)
1805 fprintf(fprof_id,"1\n");
1806 if (prof[i][j]==1)
1807 fprintf(fprof_id,"2\n");
1808 if (prof[i][j]==2)
1809 fprintf(fprof_id,"3\n");
1810 if (prof[i][j]==3)
1811 fprintf(fprof_id,"4\n");
1812 }
1813 else
1814 {
1815 if (prof[i][j]==0)
1816 fprintf(fprof_id,"1\t");
1817 if (prof[i][j]==1)
1818 fprintf(fprof_id,"2\t");
1819 if (prof[i][j]==2)
1820 fprintf(fprof_id,"3\t");
1821 if (prof[i][j]==3)
1822 fprintf(fprof_id,"4\t");
1823 }
1824 }
1825 }
1826 fclose(fprof_id);
1827
1828 ///////////////////////////////////////////////////////////
1829
1830
1831 fpcnt_id=fopen("percentage.txt","w");
1832 fprintf(fpcnt_id,"Population_ID\tPercentage\n");
1833
1834 for (t=0;t<k;t++)
1835 {
1836 fprintf(fpcnt_id,"%d\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len); //t changed to t+1 to start from 1 instead of 0: April 16, 2009
1837 }
1838 fclose(fpcnt_id);
1839
1840 free(big);
1841 free(small);
1842 free(size_c);
1843
1844 for (i=0;i<k;i++)
1845 {
1846 free(center[i]);
1847 free(prof[i]);
1848 free(size_mybound_0[i]);
1849 free(size_mybound_1[i]);
1850 free(size_mybound_2[i]);
1851 free(size_mybound_3[i]);
1852 }
1853 free(center);
1854 free(prof);
1855 free(size_mybound_0);
1856 free(size_mybound_1);
1857 free(size_mybound_2);
1858 free(size_mybound_3);
1859
1860 for (i=0;i<num_dm;i++)
1861 free(mybound[i]);
1862 free(mybound);
1863
1864 }
1865
1866
1867
1868 /******************************************************** Main Function **************************************************/
1869 //for normalized data, there are five variables:
1870 //cluster_ID
1871 //population_center
1872 //grid_clusterID
1873 //grid_ID
1874 //grid_Center
1875
1876 //the same five variables exist for the original data
1877 //however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data
1878 //also, data + cluster_ID -> population_center
1879 //data + grid_ID -> grid_Center
1880
1881 /* what is the final output */
1882 //the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
1883 //Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)
1884
1885 /* what information should select_bin output */
1886 //the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
1887 //is unknown, therefore I must use a prescreening to output
1888 //how many bins I should use
1889 //the number of dense grids
1890 //total number of events in the dense grids
1891
1892 /* basic procedure of main function */
1893 //1. read raw file and normalize the raw file
1894 //2. select_bin
1895 //3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID.
1896 //4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
1897 //5. release normalized data; allocate memory for grid_Center and population_center
1898 //6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers
1899
1900 int main (int argc, char **argv)
1901 {
1902 //inputs
1903 FILE *f_src; //source file pointer
1904
1905 FILE *f_out; //coordinates
1906 FILE *f_cid; //population-ID of events
1907 FILE *f_ctr; //centroids of populations
1908 FILE *f_results; //coordinates file event and population column
1909 FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
1910 FILE *f_parameters; //number of bins and density calculated by
1911 //the algorithm. Used to update the database
1912 FILE *f_properties; //Properties file used by Image generation software
1913
1914 char para_name_string[LINE_LEN];
1915
1916 int time_ID=-1;
1917 int num_bin=0; //the bins I will use on each dimension
1918
1919 int file_Len=0; //number of events
1920 int num_dm=0;
1921 int num_clust=0;
1922 int num_dense_events=0;
1923 int num_dense_grids=0;
1924 int num_nonempty=0;
1925 int num_population=0;
1926 //int temp=0;
1927
1928 //below are read from configuration file
1929 int i=0;
1930 int j=0;
1931 int max_num_pop=0;
1932
1933 int den_t_event=0;
1934
1935 int *grid_clusterID; //(dense)gridID to clusterID
1936 int *grid_ID; //(dense)eventID to gridID
1937 int *cluster_ID; //(dense)eventID to clusterID
1938 int *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
1939 int *all_grid_ID; //gridID for all events
1940 int *densegridID_To_gridevent;
1941 int *sorted_seq;
1942 int *dense_grid_reverse;
1943 int *population_ID; //denseeventID to populationID
1944 int *cluster_populationID; //clusterID to populationID
1945 int *grid_populationID; //gridID to populationID
1946 int *all_population_ID; //populationID of event
1947
1948 int **position;
1949 int **dense_grid_seq;
1950
1951 double *interval;
1952
1953 double **population_center; //population centroids in the raw/original data
1954 double **cluster_center; //cluster centroids in the raw/original data
1955
1956 double **input_data;
1957 double **normalized_data;
1958
1959 int min = 999999;
1960 int max = 0;
1961
1962 printf( "Starting time:\t\t\t\t");
1963 fflush(stdout);
1964 system("/bin/date");
1965 /////////////////////////////////////////////////////////////
1966
1967 if ((argc!=2) && (argc!=4) && (argc!=5))
1968 {
1969 //modified on Jul 23, 2010
1970 //printf("usage:\n");
1971 //printf("basic mode: flock data_file\n");
1972 //printf("advanced mode: flock data_file num_bin density_index\n");
1973 //exit(0);
1974
1975 fprintf(stderr,"Incorrect number of input parameters!\n"); //modified on July 23, 2010
1976 //fprintf(stderr,"basic mode: flock data_file\n"); //modified on July 23, 2010
1977 //fprintf(stderr,"advanced mode1: flock data_file num_bin density_index\n"); //modified on July 23, 2010
1978 fprintf(stderr,"advanced mode: flock data_file num_bin density_index max_num_pop\n"); //added on Dec 16, 2010 for GenePattern
1979 abort();
1980 }
1981
1982 f_src=fopen(argv[1],"r");
1983
1984 if (argc==2)
1985 {
1986 max_num_pop=30; //default value, maximum 30 clusters
1987 }
1988
1989 if (argc==4)
1990 {
1991 num_bin=atoi(argv[2]);
1992 printf("num_bin is %d\n",num_bin);
1993
1994 den_t_event=atoi(argv[3]);
1995 printf("density_index is %d\n",den_t_event);
1996
1997 max_num_pop=30;
1998
1999 if (((num_bin<6) && (num_bin!=0)) || (num_bin>29))
2000 {
2001 fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n");
2002 abort();
2003 }
2004
2005 if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99))
2006 {
2007 fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n");
2008 abort();
2009 }
2010 }
2011
2012 if (argc==5)
2013 {
2014 num_bin=atoi(argv[2]);
2015 printf("num_bin is %d\n",num_bin);
2016
2017 den_t_event=atoi(argv[3]);
2018 printf("density_index is %d\n",den_t_event);
2019
2020 max_num_pop=atoi(argv[4]);
2021 printf("max number of clusters is %d\n",max_num_pop);
2022
2023 if (((num_bin<6) && (num_bin!=0)) || (num_bin>29))
2024 {
2025 fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n");
2026 abort();
2027 }
2028
2029 if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99))
2030 {
2031 fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n");
2032 abort();
2033 }
2034
2035 if ((max_num_pop<5) || (max_num_pop>999))
2036 {
2037 fprintf(stderr,"Incorrect input range of maximum number of populations, which should be larger than 4 and smaller than 1000\n");
2038 abort();
2039 }
2040 }
2041
2042
2043 getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters
2044
2045 /************************************************* Read the data *****************************************************/
2046
2047 rewind(f_src); //reset the file pointer
2048
2049 input_data = (double **)malloc(sizeof(double*)*file_Len);
2050 memset(input_data,0,sizeof(double*)*file_Len);
2051 for (i=0;i<file_Len;i++)
2052 {
2053 input_data[i]=(double *)malloc(sizeof(double)*num_dm);
2054 memset(input_data[i],0,sizeof(double)*num_dm);
2055 }
2056
2057 readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data;
2058
2059 fclose(f_src);
2060
2061 normalized_data=(double **)malloc(sizeof(double*)*file_Len);
2062 memset(normalized_data,0,sizeof(double*)*file_Len);
2063 for (i=0;i<file_Len;i++)
2064 {
2065 normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
2066 memset(normalized_data[i],0,sizeof(double)*num_dm);
2067 }
2068
2069 tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data);
2070
2071
2072 position=(int **)malloc(sizeof(int*)*file_Len);
2073 memset(position,0,sizeof(int*)*file_Len);
2074 for (i=0;i<file_Len;i++)
2075 {
2076 position[i]=(int*)malloc(sizeof(int)*num_dm);
2077 memset(position[i],0,sizeof(int)*num_dm);
2078 }
2079
2080 all_grid_ID=(int *)malloc(sizeof(int)*file_Len);
2081 memset(all_grid_ID,0,sizeof(int)*file_Len);
2082
2083 sorted_seq=(int*)malloc(sizeof(int)*file_Len);
2084 memset(sorted_seq,0,sizeof(int)*file_Len);
2085
2086 interval=(double*)malloc(sizeof(double)*num_dm);
2087 memset(interval,0,sizeof(double)*num_dm);
2088
2089 /************************************************* select_bin *************************************************/
2090
2091 if (num_bin>=1) //num_bin has been selected by user
2092 select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
2093 else //num_bin has not been selected by user
2094 {
2095 num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
2096 printf("selected bin number is %d\n",num_bin);
2097 }
2098 printf("number of non-empty grids is %d\n",num_nonempty);
2099
2100
2101
2102 /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
2103 free(sorted_seq);
2104
2105
2106 dense_grid_reverse=(int*)malloc(sizeof(int)*num_nonempty);
2107 memset(dense_grid_reverse,0,sizeof(int)*num_nonempty);
2108
2109 /************************************************* select_dense **********************************************/
2110
2111 if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
2112 select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
2113 else
2114 {
2115 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
2116 printf("automated selected density threshold is %d\n",den_t_event);
2117 }
2118
2119 printf("Number of dense grids is %d\n",num_dense_grids);
2120
2121 densegridID_To_gridevent = (int *)malloc(sizeof(int)*num_dense_grids);
2122 memset(densegridID_To_gridevent,0,sizeof(int)*num_dense_grids);
2123
2124 for (i=0;i<num_dense_grids;i++)
2125 densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1
2126
2127
2128 eventID_To_denseventID=(int *)malloc(sizeof(int)*file_Len);
2129 memset(eventID_To_denseventID,0,sizeof(int)*file_Len); //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
2130
2131
2132 grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
2133
2134
2135 dense_grid_seq=(int **)malloc(sizeof(int*)*num_dense_grids);
2136 memset(dense_grid_seq,0,sizeof(int*)*num_dense_grids);
2137 for (i=0;i<num_dense_grids;i++)
2138 {
2139 dense_grid_seq[i]=(int *)malloc(sizeof(int)*num_dm);
2140 memset(dense_grid_seq[i],0,sizeof(int)*num_dm);
2141 }
2142
2143
2144 /* Look up the binned data values for each dense grid */
2145 generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq);
2146
2147
2148 /************************************************* allocate memory *********************************************/
2149
2150 grid_clusterID=(int *)malloc(sizeof(int)*num_dense_grids);
2151 memset(grid_clusterID,0,sizeof(int)*num_dense_grids);
2152
2153 grid_ID=(int *)malloc(sizeof(int)*num_dense_events);
2154 memset(grid_ID,0,sizeof(int)*num_dense_events);
2155
2156 cluster_ID=(int *)malloc(sizeof(int)*num_dense_events);
2157 memset(cluster_ID,0,sizeof(int)*num_dense_events);
2158
2159
2160 /*********************************************** merge_grids ***********************************************/
2161 //int merge_grids(int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids, int *dense_grid_rank, int *dense_grid_reverse,
2162 // int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID,
2163 // int *cluster_ID, int *grid_ID, int *grid_clusterID)
2164
2165 num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID);
2166
2167 printf("computed number of groups is %d\n",num_clust);
2168
2169
2170 /************************************** release unnecessary memory and allocate memory and compute centers **********************************/
2171
2172
2173 for (i=0;i<file_Len;i++)
2174 free(position[i]);
2175 free(position);
2176
2177 for (i=0;i<num_dense_grids;i++)
2178 free(dense_grid_seq[i]);
2179 free(dense_grid_seq);
2180
2181 free(dense_grid_reverse);
2182
2183 free(densegridID_To_gridevent);
2184 free(all_grid_ID);
2185
2186 // cluster_center ////////////////////////////////////////////////////////////////////////////////////////////////////////
2187
2188 cluster_center=(double **)malloc(sizeof(double*)*num_clust);
2189 memset(cluster_center,0,sizeof(double*)*num_clust);
2190 for (i=0;i<num_clust;i++)
2191 {
2192 cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
2193 memset(cluster_center[i],0,sizeof(double)*num_dm);
2194 }
2195
2196 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data
2197
2198 //printf("pass the first ID2center\n"); //commented on July 23, 2010
2199
2200 /*** population_ID and grid_populationID **/
2201
2202 cluster_populationID=(int*)malloc(sizeof(int)*num_clust);
2203 memset(cluster_populationID,0,sizeof(int)*num_clust);
2204
2205 grid_populationID=(int*)malloc(sizeof(int)*num_dense_grids);
2206 memset(grid_populationID,0,sizeof(int)*num_dense_grids);
2207
2208 population_ID=(int*)malloc(sizeof(int)*num_dense_events);
2209 memset(population_ID,0,sizeof(int)*num_dense_events);
2210
2211 num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID,max_num_pop);
2212
2213
2214
2215 for (i=0;i<num_clust;i++)
2216 free(cluster_center[i]);
2217 free(cluster_center);
2218
2219 free(interval);
2220
2221 for (i=0;i<num_dense_grids;i++)
2222 {
2223 grid_populationID[i]=cluster_populationID[grid_clusterID[i]];
2224 }
2225
2226 for (i=0;i<num_dense_events;i++)
2227 {
2228 population_ID[i]=cluster_populationID[cluster_ID[i]];
2229 }
2230
2231 printf("computed number of populations is %d\n",num_population);
2232
2233
2234
2235 // population_center /////////////////////////////////////////////////////////////////////////////////////////////////////
2236
2237
2238 population_center=(double **)malloc(sizeof(double*)*num_population);
2239 memset(population_center,0,sizeof(double*)*num_population);
2240 for (i=0;i<num_population;i++)
2241 {
2242 population_center[i]=(double*)malloc(sizeof(double)*num_dm);
2243 memset(population_center[i],0,sizeof(double)*num_dm);
2244 }
2245
2246
2247
2248 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data
2249
2250
2251 // show ////////////////////////////////////////////////////////////////////////////////
2252 all_population_ID=(int*)malloc(sizeof(int)*file_Len);
2253 memset(all_population_ID,0,sizeof(int)*file_Len);
2254
2255 kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center);
2256 show(input_data, all_population_ID, file_Len, num_population, num_dm, para_name_string);
2257
2258 ID2Center_all(input_data,file_Len,num_dm,num_population,all_population_ID,population_center);
2259
2260
2261 f_cid=fopen("population_id.txt","w");
2262 f_ctr=fopen("population_center.txt","w");
2263 f_out=fopen("coordinates.txt","w");
2264 f_results=fopen("flock_results.txt","w");
2265
2266 /*
2267 f_parameters=fopen("parameters.txt","w");
2268 fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
2269 fprintf(f_parameters,"Density\t%f\n",aver_index);
2270 fclose(f_parameters);
2271 */
2272
2273 for (i=0;i<file_Len;i++)
2274 fprintf(f_cid,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
2275
2276 /*
2277 * New to check for min/max to add to parameters.txt
2278 *
2279 */
2280
2281 fprintf(f_out,"%s\n",para_name_string);
2282 //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string);
2283 fprintf(f_results,"%s\tPopulation\n",para_name_string);
2284 for (i=0;i<file_Len;i++)
2285 {
2286 for (j=0;j<num_dm;j++)
2287 {
2288 if (input_data[i][j] < min) {
2289 min = (int)input_data[i][j];
2290 }
2291 if (input_data[i][j] > max) {
2292 max = (int)input_data[i][j];
2293 }
2294 if (j==num_dm-1)
2295 {
2296 fprintf(f_out,"%d\n",(int)input_data[i][j]);
2297 fprintf(f_results,"%d\t",(int)input_data[i][j]);
2298 }
2299 else
2300 {
2301 fprintf(f_out,"%d\t",(int)input_data[i][j]);
2302 fprintf(f_results,"%d\t",(int)input_data[i][j]);
2303 }
2304 }
2305 //fprintf(f_results,"%d\t",i + 1);
2306 fprintf(f_results,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
2307 }
2308
2309 /*
2310 f_parameters=fopen("parameters.txt","w");
2311 fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
2312 fprintf(f_parameters,"Density\t%d\n",den_t_event);
2313 fprintf(f_parameters,"Min\t%d\n",min);
2314 fprintf(f_parameters,"Max\t%d\n",max);
2315 fclose(f_parameters);
2316 */
2317
2318 f_properties=fopen("fcs.properties","w");
2319 fprintf(f_properties,"Bins=%d\n",num_bin);
2320 fprintf(f_properties,"Density=%d\n",den_t_event);
2321 fprintf(f_properties,"Min=%d\n",min);
2322 fprintf(f_properties,"Max=%d\n",max);
2323 fprintf(f_properties,"Populations=%d\n",num_population);
2324 fprintf(f_properties,"Events=%d\n",file_Len);
2325 fprintf(f_properties,"Markers=%d\n",num_dm);
2326 fclose(f_properties);
2327
2328
2329 for (i=0;i<num_population;i++) {
2330 /* Add if we want to include population id in the output
2331 */
2332 fprintf(f_ctr,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
2333
2334 for (j=0;j<num_dm;j++) {
2335 if (j==num_dm-1)
2336 fprintf(f_ctr,"%.0f\n",population_center[i][j]);
2337 else
2338 fprintf(f_ctr,"%.0f\t",population_center[i][j]);
2339 }
2340 }
2341
2342 //added April 16, 2009
2343 f_mfi=fopen("MFI.txt","w");
2344
2345 for (i=0;i<num_population;i++)
2346 {
2347 fprintf(f_mfi,"%d\t",i+1);
2348
2349 for (j=0;j<num_dm;j++)
2350 {
2351 if (j==num_dm-1)
2352 fprintf(f_mfi,"%.0f\n",population_center[i][j]);
2353 else
2354 fprintf(f_mfi,"%.0f\t",population_center[i][j]);
2355 }
2356 }
2357 fclose(f_mfi);
2358
2359 //ended April 16, 2009
2360
2361 fclose(f_cid);
2362 fclose(f_ctr);
2363 fclose(f_out);
2364 fclose(f_results);
2365
2366
2367 for (i=0;i<num_population;i++)
2368 {
2369 free(population_center[i]);
2370 }
2371 free(population_center);
2372
2373
2374 for (i=0;i<file_Len;i++)
2375 free(normalized_data[i]);
2376 free(normalized_data);
2377
2378 free(grid_populationID);
2379
2380 free(cluster_populationID);
2381 free(grid_clusterID);
2382 free(cluster_ID);
2383
2384 for (i=0;i<file_Len;i++)
2385 free(input_data[i]);
2386 free(input_data);
2387
2388 free(grid_ID);
2389 free(population_ID);
2390 free(all_population_ID);
2391 free(eventID_To_denseventID);
2392
2393 ///////////////////////////////////////////////////////////
2394 printf("Ending time:\t\t\t\t");
2395 fflush(stdout);
2396 system("/bin/date");
2397
2398 return 0;
2399
2400 }