comparison src/flock2.c @ 1:7eab80f86779 draft

add FLOCK
author immport-devteam
date Mon, 27 Feb 2017 13:26:09 -0500
parents
children
comparison
equal deleted inserted replaced
0:8d951baf795f 1:7eab80f86779
1 ///////////
2 // Changes made:
3 // 1. Added another parameter: number of populations
4 // 2. Added a hierarchical merging step based on density change between centroids of two hyper-regions
5 // 3. Picked the longest dimensions between the population centroids to judge whether the two parts should be merged
6 // 4. Removed checking time parameter
7 // 5. Output error to stderr
8 // 6. Fixed the bug of density threshold always = 3
9 // 7. Added another error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR
10 // 8. Fixed a bug for 2D data by using K=e*K
11 // 9. Added some header files, may not be necessary
12 // 10. Added a lower bound (at least two) for number of populations
13 /***************************************************************************************************************************************
14
15 FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann)
16
17 Author: (Max) Yu Qian, Ph.D.
18
19 Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
20
21 Development: November 2005 ~ Forever
22
23 Status: July 2010: Release 2.0
24
25 Usage: flock data_file
26 Note: the input file format must be channel values and the delimiter between two values must be a tab.
27
28
29
30 ****************************************************************************************************************************************/
31
32 #include <time.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <math.h>
36 #include <string.h>
37 #include <sys/stat.h>
38 #include <unistd.h>
39 #include <assert.h>
40
41
42
43 #define DEBUG 0
44 #define LINE_LEN 1024
45 #define FILE_NAME_LEN 128
46 #define PARA_NAME_LEN 64
47 #define MAX_VALUE 1000000000
48 #define MIN_GRID 6
49 #define MAX_GRID 50
50 #define E_T 1.0
51
52 #define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max
53 #define KMEANS_TERM 10
54 #define MAX_POP_NUM 128
55
56 #ifndef max
57 #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
58 #endif
59
60 #ifndef min
61 #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
62 #endif
63
64 static long **Gr=0;
65 static long *gcID = 0; /* grid cluster IDs */
66 static long *cluster_count=0; /* count of nodes per cluster */
67 static long ndense=0;
68 static long ndim=0;
69 /* cid changes between depth-first searches, but is constant within a
70 single search, so it goes here. */
71 static long cid=0;
72 /* Do a depth-first search for a single connected component in graph
73 * G. Start from node, tag the nodes found with cid, and record
74 * the tags in grid_clusterID. Also, record the node count in
75 * cluster_count. If we find a node that has already been assigned to
76 * a cluster, that means we're merging two clusters, so zero out the
77 * old cid's node count.
78 *
79 * Note that our graph is constructed as a DAG, so we can be
80 * guaranteed to terminate without checking for cycles.
81 *
82 * Note2: this function can potentially recurse to depth = ndense.
83 * Plan stack size accordingly.
84 *
85 * Output:
86 *
87 * grid_clusterID[] -- array where we tag the nodes.
88 * cluster_count[] -- count of the number of nodes per cluster.
89 */
90 static void merge_cluster(long from, long into)
91 {
92 int i;
93
94 for(i=0; i<ndense; ++i)
95 if(gcID[i] == from)
96 gcID[i] = into;
97 }
98
99 void depth_first(long node)
100 {
101 long i;
102
103 if(gcID[node] == cid) // we're guaranteed no cycles, but it is possible to reach a node
104 return; // through two different paths in the same cluster. This early
105 // return saves us some unneeded work.
106
107 /* Check to see if we are merging a cluster */
108 if(gcID[node] >= 0) {
109 /* We are, so zero the count for the old cluster. */
110 cluster_count[ gcID[node] ] = 0;
111 merge_cluster(gcID[node], cid);
112 return;
113 }
114
115 /* Update for this node */
116 gcID[node] = cid;
117 cluster_count[cid]++;
118
119 /* Recursively search the child nodes */
120 for(i=0; i<ndim; ++i)
121 if(Gr[node][i] >= 0) /* This is a child node */
122 depth_first(Gr[node][i]);
123 }
124
125 void bail(const char *msg)
126 {
127 fprintf(stderr,"%s",msg);
128 exit(0);
129 }
130
131
132 static void check_clusters(long *gcID, int ndense)
133 {
134 int i;
135
136 for(i=0; i<ndense; ++i)
137 if(gcID[i] < 0) {
138 fprintf(stderr,"faulty cluster id at i= %d\n",i);
139 exit(0);
140 }
141 }
142
143
144
145 long find_connected(long **G, long n_dense_grids, long num_dm, long *grid_clusterID)
146 {
147 long nclust=0; /* number of clusters found */
148 long i;
149 long *subfac;
150 long clustid=0;
151 int subval=0,nempty=0;
152
153 int sz = n_dense_grids*sizeof(long);
154 cluster_count = malloc(sz);
155 if(!cluster_count)
156 bail("find_connected: Unable to allocate %zd bytes.\n");
157 memset(cluster_count,0,sz);
158
159 /* set up the statics that will be used in the DFS */
160 Gr=G;
161 gcID = grid_clusterID;
162 ndense = n_dense_grids;
163 ndim = num_dm;
164
165 for(i=0;i<ndense;++i)
166 grid_clusterID[i] = -1;
167
168 for(i=0;i<ndense;++i) {
169 if(grid_clusterID[i] < 0) { /* grid hasn't been assigned yet */
170 cid = nclust++;
171 depth_first(i);
172 }
173 }
174
175
176
177 /* At this point we probably have some clusters that are empty due to merging.
178 We want to compact the cluster numbering to eliminate the empty clusters. */
179
180 subfac = malloc(sz);
181 if(!subfac)
182 bail("find_connected: Unable to allocate %zd bytes.\n");
183 subval=0;
184 nempty=0;
185
186 /* cluster #i needs to have its ID decremented by 1 for each empty cluster with
187 ID < i. Precaclulate the decrements in this loop: */
188 for(i=0;i<nclust;++i) {
189 //clustid = grid_clusterID[i];
190 if(cluster_count[i] == 0) {
191 subval++;
192 nempty++;
193 }
194 subfac[i] = subval;
195 }
196
197 //printf("nempty is %d\n",nempty);
198
199 /* Now apply the decrements to all of the dense grids */
200 for(i=0;i<ndense;++i) {
201 clustid = grid_clusterID[i];
202 grid_clusterID[i] -= subfac[clustid];
203 }
204
205
206
207 /* correct the number of clusters found */
208 nclust -= nempty;
209
210 //printf("nclust is %d\n",nclust);
211
212 return nclust;
213 }
214
215 /************************************* Read basic info of the source file **************************************/
216 /************************************* Read basic info of the source file **************************************/
217 void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, long *time_ID)
218 {
219 char src[LINE_LEN];
220 char current_name[64];
221 char prv;
222
223 long num_rows=0;
224 long num_columns=0;
225 long ch='\n';
226 long prev='\n';
227 long time_pos=0;
228 long i=0;
229 long j=0;
230 int sw=0;
231
232 src[0]='\0';
233 fgets(src, LINE_LEN, f_src);
234
235 name_string[0]='\0';
236 current_name[0]='\0';
237 prv='\n';
238
239 while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters
240 i++;
241
242 while ((src[i]!='\0') && (src[i]!='\n')) //repeat until the end of the line
243 {
244 current_name[j]=src[i];
245
246 if ((src[i]=='\t') && (prv!='\t')) //a complete word
247 {
248 current_name[j]='\0';
249
250 if (0!=strcmp(current_name,"Time"))
251 {
252 num_columns++; //num_columns does not inlcude the column of Time
253 time_pos++;
254 if (sw) {
255 strcat(name_string,"\t");
256 }
257 strcat(name_string,current_name);
258 sw = 1;
259 }
260 else
261 {
262 *time_ID=time_pos;
263 }
264
265 current_name[0]='\0';
266 j=0;
267 }
268
269 if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
270 {
271 current_name[0]='\0';
272 j=0;
273 }
274
275 if (src[i]!='\t')
276 j++;
277
278 prv=src[i];
279 i++;
280 }
281
282 //name_string[j]='\0';
283 if (prv!='\t') //the last one hasn't been retrieved
284 {
285 current_name[j]='\0';
286 if (0!=strcmp(current_name,"Time"))
287 {
288 num_columns++;
289 strcat(name_string,"\t");
290 strcat(name_string,current_name);
291 time_pos++;
292 }
293 else
294 {
295 *time_ID=time_pos;
296 }
297 }
298 if (DEBUG==1)
299 {
300 printf("time_ID is %ld\n",*time_ID);
301 printf("name_string is %s\n",name_string);
302 }
303
304 //start computing # of rows
305
306 while ((ch = fgetc(f_src))!= EOF )
307 {
308 if (ch == '\n')
309 {
310 ++num_rows;
311 }
312 prev = ch;
313 }
314 if (prev!='\n')
315 ++num_rows;
316
317 if (num_rows<50)
318 {
319 fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010
320 exit(0);
321 }
322
323 *file_Len=num_rows;
324 *num_dm=num_columns;
325
326 printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm);
327 }
328
329
330
331 /************************************* Read the source file into uncomp_data **************************************/
332 void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, long time_ID)
333 {
334 //long time_pass=0; //to mark whether the time_ID has been passed
335 long index=0;
336
337 long i=0;
338 long j=0;
339 long t=0;
340
341 char src[LINE_LEN];
342 char xc[LINE_LEN/10];
343
344 src[0]='\0';
345 fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
346
347 while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
348 {
349 src[0]='\0';
350 fgets(src,LINE_LEN,f_src);
351 i=0;
352 //time_pass=0;
353
354 //if (time_ID==-1) //there is no time_ID
355 // {
356 for (t=0;t<num_dm;t++)
357 {
358 xc[0]='\0';
359 j=0;
360 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
361 {
362 xc[j]=src[i];
363 i++;
364 j++;
365 }
366
367 xc[j]='\0';
368 i++;
369
370 uncomp_data[index][t]=atof(xc);
371 }
372 // }
373 /*else
374 {
375 for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
376 {
377 xc[0]='\0';
378 j=0;
379 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
380 {
381 xc[j]=src[i];
382 i++;
383 j++;
384 }
385
386 xc[j]='\0';
387 i++;
388
389 if (t==time_ID)
390 {
391 time_pass=1;
392 continue;
393 }
394
395 if (time_pass)
396 uncomp_data[index][t-1]=atof(xc);
397 else
398 uncomp_data[index][t]=atof(xc);
399 }
400 }*/ //commented by Yu Qian on Aug 31, 2010 as we do not want to check time_ID anymore
401 index++;
402 //fprintf(fout_ID,"%s",src);
403 } //end of while
404
405 if (DEBUG == 1)
406 {
407 printf("the last line of the source data is:\n");
408 for (j=0;j<num_dm;j++)
409 printf("%f ",uncomp_data[index-1][j]);
410 printf("\n");
411 }
412 }
413
414
415 /**************************************** Normalization ******************************************/
416 void tran(double **orig_data, long file_Len, long num_dm, long norm_used, double **matrix_to_cluster)
417 {
418 long i=0;
419 long j=0;
420
421 double biggest=0;
422 double smallest=MAX_VALUE;
423
424 double *aver; //average of each column
425 double *std; //standard deviation of each column
426
427 aver=(double*)malloc(sizeof(double)*file_Len);
428 memset(aver,0,sizeof(double)*file_Len);
429
430 std=(double*)malloc(sizeof(double)*file_Len);
431 memset(std,0,sizeof(double)*file_Len);
432
433 if (norm_used==2) //z-score normalization
434 {
435 for (j=0;j<num_dm;j++)
436 {
437 aver[j]=0;
438 for (i=0;i<file_Len;i++)
439 aver[j]=aver[j]+orig_data[i][j];
440 aver[j]=aver[j]/(double)file_Len;
441
442 std[j]=0;
443 for (i=0;i<file_Len;i++)
444 std[j]=std[j]+((orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]));
445 std[j]=sqrt(std[j]/(double)file_Len);
446
447 for (i=0;i<file_Len;i++)
448 matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization
449 }
450 }
451
452 if (norm_used==1) //0-1 min-max normalization
453 {
454 for (j=0;j<num_dm;j++)
455 {
456 biggest=0;
457 smallest=MAX_VALUE;
458 for (i=0;i<file_Len;i++)
459 {
460 if (orig_data[i][j]>biggest)
461 biggest=orig_data[i][j];
462 if (orig_data[i][j]<smallest)
463 smallest=orig_data[i][j];
464 }
465
466 for (i=0;i<file_Len;i++)
467 {
468 if (biggest==smallest)
469 matrix_to_cluster[i][j]=biggest;
470 else
471 matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
472 }
473 }
474 }
475
476 if (norm_used==0) //no normalization
477 {
478 for (i=0;i<file_Len;i++)
479 for (j=0;j<num_dm;j++)
480 matrix_to_cluster[i][j]=orig_data[i][j];
481 }
482
483 }
484
485
486
487 /********************************************** RadixSort *******************************************/
488 /* Perform a radix sort using each dimension from the original data as a radix.
489 * Outputs:
490 * sorted_seq -- a permutation vector mapping the ordered list onto the original data.
491 * (sorted_seq[i] -> index in the original data of the ith element of the ordered list)
492 * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct
493 * of the sorting procedure.
494 * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
495 * by grid_ID)
496 */
497
498 void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID)
499 {
500 long i=0;
501 long length=0;
502 long start=0;
503 long prev_ID=0;
504 long curr_ID=0;
505
506 long j=0;
507 long t=0;
508 long p=0;
509 long loc=0;
510 long temp=0;
511 long equal=0;
512
513 long *count; //count[i]=j means there are j numbers having value i at the processing digit
514 long *index; //index[i]=j means the starting position of grid i is j
515 long *cp; //current position
516 long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
517 long *seq; //temporary sequence
518
519 count=(long*)malloc(sizeof(long)*num_bin);
520 memset(count,0,sizeof(long)*num_bin);
521
522 cp=(long*)malloc(sizeof(long)*num_bin);
523 memset(cp,0,sizeof(long)*num_bin);
524
525 index=(long*)malloc(sizeof(long)*num_bin); // initialized below
526
527 seq=(long*)malloc(sizeof(long)*file_Len);
528 memset(seq,0,sizeof(long)*file_Len);
529
530 mark=(long*)malloc(sizeof(long)*file_Len);
531 memset(mark,0,sizeof(long)*file_Len);
532
533 for (i=0;i<file_Len;i++)
534 {
535 sorted_seq[i]=i;
536 mark[i]=0;
537 seq[i]=0;
538 }
539 for (i=0;i<num_bin;i++)
540 {
541 index[i]=0;
542 cp[i]=0;
543 count[i]=0;
544 }
545
546 for (j=0;j<num_dm;j++)
547 {
548 if (j==0) //compute the initial values of mark
549 {
550 for (i=0;i<file_Len;i++)
551 count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension
552
553 index[0] = 0;
554 for (i=0;i<num_bin-1;i++)
555 {
556 index[i+1]=index[i]+count[i]; //index[k]=x means k segment starts at x (in the ordered list)
557 if ((index[i+1]>0) && (index[i+1]<=file_Len))
558 {
559 mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
560 }
561 else
562 {
563 printf("out of myboundary for mark at index[i+1]-1.\n");
564 }
565 }
566 mark[file_Len-1]=1;
567
568 for (i=0;i<file_Len;i++)
569 {
570 /* Build a permutation vector for the partially ordered data. Store the PV in sorted_seq */
571 loc=position[i][j];
572 temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j
573 sorted_seq[temp]=i; //sorted_seq[i]=temp is also another way to sort
574 cp[loc]++;
575 }
576 }
577 else
578 {
579 //reset count, index, loc, temp, cp, start, and length
580 length=0;
581 loc=0;
582 temp=0;
583 start=0;
584 for (p=0;p<num_bin;p++)
585 {
586 cp[p]=0;
587 count[p]=0;
588 index[p]=0;
589 }
590
591 for (i=0;i<file_Len;i++)
592 {
593 long iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order.
594 if (mark[i]!=1)
595 {
596 /* Count the number of items in each bin of
597 dimension j, BUT we are going to reset at the end
598 of each "part". Thus, the total effect is to do
599 a sort by bin on the jth dimension for each group
600 of data that has been identical for the
601 dimensions processed up to this point. This is
602 the standard radix sort procedure, but doing it
603 this way saves us having to allocate buckets to
604 hold the data in each group of "identical-so-far"
605 elements. */
606 count[position[iperm][j]]++; //count[position[i][j]]++;
607 length++; // This is the total length of the part, irrespective of the value of the jth component
608 // (actually, less one, since we don't increment for the final element below)
609 }
610 if (mark[i]==1)
611 {
612 //length++;
613 count[position[iperm][j]]++;//count[position[i][j]]++; //the current point must be counted in
614 start=i-length; //this part starts from start to i: [start,i]
615 /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part.
616 This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively,
617 plus we could multi-thread over the parts. (Hmmm...)
618 */
619 index[0] = start; // Let index give the offset within the whole ordered list.
620 for (t=0;t<num_bin-1;t++)
621 {
622 index[t+1]=index[t]+count[t];
623
624 if ((index[t+1]<=file_Len) && (index[t+1]>0))
625 {
626 mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
627 }
628
629 }
630 mark[i]=1;
631
632 /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i
633 the PV will be completely updated for the partial ordering up to the current radix. */
634 for (t=start;t<=i;t++)
635 {
636 loc=position[sorted_seq[t]][j];//loc=position[t][j];
637 temp=index[loc]+cp[loc];
638 if ((temp<file_Len) && (temp>=0))
639 {
640 // seq is a temporary because we have to maintain the old PV until we have finished this step.
641 seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort
642 cp[loc]++;
643 }
644 else
645 {
646 printf("out of myboundary for seq at temp.\n");
647 }
648 }
649
650 for (t=start;t<=i;t++)
651 {
652 // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through
653 // entry i in the ordered list.
654 if ((t>=0) && (t<file_Len))
655 sorted_seq[t]=seq[t];
656 else
657 printf("out of myboundary for seq and sorted_seq at t.\n");
658 }
659 //reset count, index, seq, length, and cp
660 length=0;
661 loc=0;
662 temp=0;
663 for (p=0;p<num_bin;p++)
664 {
665 cp[p]=0;
666 count[p]=0;
667 index[p]=0;
668 }
669 }
670 }//end for i
671 }//end else
672 }//end for j
673
674 /* sorted_seq[] now contains the ordered list for all radices. mark[] gives the boundaries between groups of elements that are
675 identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */
676
677 for (i=0;i<file_Len;i++)
678 grid_ID[i]=0; //in case the initial value hasn't been assigned
679 *num_nonempty=1; //starting from 1!
680
681 /* assign the "grid" identifiers for all of the data. A grid will be what we were calling a "part" above. We will number them
682 serially and tag the *unordered* data with the grid IDs. We will also count the number of populated grids (in general there will
683 be many possible combinations of bin values that simply never occur) */
684
685 for (i=1;i<file_Len;i++)
686 {
687 equal=1;
688 prev_ID=sorted_seq[i-1];
689 curr_ID=sorted_seq[i];
690 for (j=0;j<num_dm;j++)
691 {
692 if (position[prev_ID][j]!=position[curr_ID][j])
693 {
694 equal=0; //not equal
695 break;
696 }
697 }
698
699 if (equal)
700 {
701 grid_ID[curr_ID]=grid_ID[prev_ID];
702 }
703 else
704 {
705 *num_nonempty=*num_nonempty+1;
706 grid_ID[curr_ID]=grid_ID[prev_ID]+1;
707 }
708 //all_grid_vol[grid_ID[curr_ID]]++;
709 }
710
711 //free memory
712 free(count);
713 free(index);
714 free(cp);
715 free(seq);
716 free(mark);
717
718 }
719
720 /********************************************** Compute Position of Events ************************************************/
721 void compute_position(double **data_in, long file_Len, long num_dm, long num_bin, long **position, double *interval)
722 {
723 /* What we are really doing here is binning the data, with the bins
724 spanning the range of the data and number of bins = num_bin */
725 long i=0;
726 long j=0;
727
728 double *small; //small[j] is the smallest value within dimension j
729 double *big; //big[j] is the biggest value within dimension j
730
731 small=(double*)malloc(sizeof(double)*num_dm);
732 memset(small,0,sizeof(double)*num_dm);
733
734 big=(double*)malloc(sizeof(double)*num_dm);
735 memset(big,0,sizeof(double)*num_dm);
736
737
738 for (j=0;j<num_dm;j++)
739 {
740 big[j]=MAX_VALUE*(-1);
741 small[j]=MAX_VALUE;
742 for (i=0;i<file_Len;i++)
743 {
744 if (data_in[i][j]>big[j])
745 big[j]=data_in[i][j];
746
747 if (data_in[i][j]<small[j])
748 small[j]=data_in[i][j];
749 }
750
751 interval[j]=(big[j]-small[j])/(double)num_bin; //interval is computed using the biggest value and smallest value instead of the channel limit
752 /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */
753 }
754
755 for (j=0;j<num_dm;j++)
756 {
757 for (i=0;i<file_Len;i++)
758 {
759 if (data_in[i][j]>=big[j])
760 position[i][j]=num_bin-1;
761 else
762 {
763 position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
764 if ((position[i][j]>=num_bin) || (position[i][j]<0))
765 {
766 //printf("position mis-computed in density analysis!\n");
767 //exit(0);
768 fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
769 exit(0);
770 }
771 }
772 }
773 }
774
775
776 free(small);
777 free(big);
778 }
779
780 /********************************************** select_bin to select the number of bins **********************************/
781 //num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
782 /* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned
783 * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we
784 * maximize a merit function, then return the results obtained using the optimal number of bins.
785 *
786 * Outputs:
787 * position -- binned data values
788 * sorted_seq -- permutation vector mapping the ordered list to the original data
789 * all_grid_ID -- grid to which each data element was assigned.
790 * num_nonempty -- number of distinct values assumed by all_grid_ID
791 * interval -- bin width for each data dimension
792 * return value -- the number of bins selected.
793 */
794
795 long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq,
796 long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin)
797 {
798
799 long num_bin=0;
800 long select_num_bin=0;
801 long m=0;
802 long n=0;
803
804 long i=0;
805 long bin_scope=0;
806 long temp_num_nonempty=0;
807
808 long *temp_grid_ID;
809 long *temp_sorted_seq;
810 long **temp_position;
811
812 //sorted_seq[i]=j means the event j ranks i
813
814 double temp_index=0;
815 double *bin_index;
816 double *temp_interval;
817
818
819 temp_grid_ID=(long *)malloc(sizeof(long)*file_Len);
820 memset(temp_grid_ID,0,sizeof(long)*file_Len);
821
822 temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len);
823 memset(temp_sorted_seq,0,sizeof(long)*file_Len);
824
825 temp_position=(long **)malloc(sizeof(long*)*file_Len);
826 memset(temp_position,0,sizeof(long*)*file_Len);
827 for (m=0;m<file_Len;m++)
828 {
829 temp_position[m]=(long*)malloc(sizeof(long)*num_dm);
830 memset(temp_position[m],0,sizeof(long)*num_dm);
831 }
832
833 temp_interval=(double*)malloc(sizeof(double)*num_dm);
834 memset(temp_interval,0,sizeof(double)*num_dm);
835
836 bin_scope=max_grid-min_grid+1;
837 bin_index=(double *)malloc(sizeof(double)*bin_scope);
838 memset(bin_index,0,sizeof(double)*bin_scope);
839
840 i=0;
841
842 for (num_bin=min_grid;num_bin<=max_grid;num_bin++)
843 {
844
845 temp_num_nonempty=0;
846 /* compute_position bins the data into num_bin bins. Each
847 dimension is binned independently.
848
849 Outputs:
850 temp_position[i][j] -- bin for the jth component of data element i.
851 temp_interval[j] -- bin-width for the jth component
852 */
853 compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval);
854 radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID);
855
856 /* our figure of merit is the number of non-empty grids divided by number of bins per dimension.
857 We declare victory when we have found a local maximum */
858 bin_index[i]=((double)temp_num_nonempty)/((double)num_bin);
859 if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95)
860 break;
861 if ((bin_index[i]<temp_index) && (user_num_bin==0))
862 break;
863 if ((user_num_bin==num_bin-1) && (user_num_bin!=0))
864 break;
865
866 /* Since we have accepted this trial bin, copy all the temporary results into
867 the output buffers */
868 memcpy(all_grid_ID,temp_grid_ID,sizeof(long)*file_Len);
869 memcpy(sorted_seq,temp_sorted_seq,sizeof(long)*file_Len);
870 memcpy(interval,temp_interval,sizeof(double)*num_dm);
871
872 for (m=0;m<file_Len;m++)
873 for (n=0;n<num_dm;n++)
874 position[m][n]=temp_position[m][n];
875
876 temp_index=bin_index[i];
877 select_num_bin=num_bin;
878 num_nonempty[0]=temp_num_nonempty;
879 i++;
880 }
881
882 if ((select_num_bin<min_grid) || (select_num_bin>max_grid))
883 {
884 fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010
885 exit(0);
886 }
887
888 if (temp_index==0)
889 {
890 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
891 exit(0);
892 }
893
894 free(temp_grid_ID);
895 free(temp_sorted_seq);
896 free(bin_index);
897 free(temp_interval);
898
899 for (m=0;m<file_Len;m++)
900 free(temp_position[m]);
901 free(temp_position);
902
903 return select_num_bin;
904 }
905
906 /************************************* Select dense grids **********************************/
907 // compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol
908 // den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse);
909 /*
910 * Prune away grids that are insufficiently "dense" (i.e., contain too few data items)
911 *
912 * Outputs:
913 * num_dense_grids -- number of dense grids
914 * num_dense_events -- total number of data items in all dense grids
915 * dense_grid_reverse -- mapping from list of all grids to list of dense grids.
916 * return value -- density cutoff for separating dense from non-dense grids.
917 */
918
919 int select_dense(long file_Len, long *all_grid_ID, long num_nonempty, long *num_dense_grids, long *num_dense_events, long *dense_grid_reverse, int den_t_event)
920 {
921
922
923 long i=0;
924 long vol_ID=0;
925 long biggest_size=0; //biggest grid_size, to define grid_size_index
926 long biggest_index=0;
927 //long actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid
928 //long num_remain=0; //number of remaining grids with different density thresholds
929 long temp_num_dense_grids=0;
930 long temp_num_dense_events=0;
931
932 long *grid_size_index;
933 long *all_grid_vol;
934 long *grid_density_index;
935
936 //double den_average=0;
937 // double avr_index=0;
938
939
940 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
941 // Compute all_grid_vol
942 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
943 all_grid_vol=(long *)malloc(sizeof(long)*num_nonempty);
944 memset(all_grid_vol,0,sizeof(long)*num_nonempty);
945
946 /* Grid "volume" is just the number of data contained in the grid. */
947 for (i=0;i<file_Len;i++)
948 {
949 vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]];
950 all_grid_vol[vol_ID]++; //all_grid_vol[i]=j means grid i has j points
951 }
952
953
954
955 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
956 // Compute grid_size_index (histogram of grid sizes)
957 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
958
959 for (i=0;i<num_nonempty;i++)
960 {
961 if (biggest_size<all_grid_vol[i])
962 {
963 biggest_size=all_grid_vol[i];
964 }
965 }
966
967 if (biggest_size<3)
968 {
969 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
970 exit(0);
971 }
972
973
974 grid_size_index=(long*)malloc(sizeof(long)*biggest_size);
975 memset(grid_size_index,0,sizeof(long)*biggest_size);
976
977 for (i=0;i<num_nonempty;i++)
978 {
979 grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1
980 }
981
982
983
984 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
985 // Compute den_cutoff
986 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
987
988 grid_density_index=(long *)malloc(sizeof(long)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3
989 memset(grid_density_index,0,sizeof(long)*(biggest_size-2));
990
991 if (den_t_event==0)
992 {
993 biggest_index=0;
994
995 for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined
996 {
997 grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]);
998 if (biggest_index<grid_density_index[i-1])
999 {
1000 biggest_index=grid_density_index[i-1];
1001 den_t_event=i+1;
1002 }
1003 }
1004 }
1005
1006 if (den_t_event==0) //if biggest_size==3
1007 {
1008 fprintf(stderr,"the densest hyperregion has only 3 events, smaller than the user-specified value. Therefore the density threshold is automatically changed to 3.\n");
1009 den_t_event=3;
1010 }
1011
1012 for (i=0;i<num_nonempty;i++)
1013 if (all_grid_vol[i]>=den_t_event)
1014 temp_num_dense_grids++;
1015
1016 if (temp_num_dense_grids<=1)
1017 {
1018 //exit(0);
1019 //printf("a too high density threshold is set! Please decrease your density threshold.\n");
1020 fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010
1021 exit(0);
1022 }
1023
1024 if (temp_num_dense_grids>=100000)
1025 {
1026 //modified on July 23, 2010
1027 //printf("a too low density threshold is set! Please increase your density threshold.\n");
1028 //exit(0);
1029 fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010
1030 exit(0);
1031 }
1032
1033 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1034 // Compute dense_grid_reverse
1035 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1036 temp_num_dense_grids=0;
1037 temp_num_dense_events=0;
1038 for (i=0;i<num_nonempty;i++)
1039 {
1040 dense_grid_reverse[i]=-1;
1041
1042 if (all_grid_vol[i]>=den_t_event)
1043 {
1044 dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
1045 temp_num_dense_grids++;
1046 temp_num_dense_events+=all_grid_vol[i];
1047 }
1048 }
1049
1050 num_dense_grids[0]=temp_num_dense_grids;
1051 num_dense_events[0]=temp_num_dense_events;
1052
1053 free(grid_size_index);
1054 free(all_grid_vol);
1055
1056 return den_t_event;
1057 }
1058
1059 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1060 // Compute densegridID_To_gridevent and eventID_To_denseventID
1061 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1062 // grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
1063 /*
1064 * Filter the data so that only the data belonging to dense grids is left
1065 *
1066 * Output:
1067 * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids.
1068 * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids.
1069 *
1070 */
1071
1072 void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent)
1073 {
1074 long i=0;
1075 long dense_grid_ID=0;
1076 long dense_event_ID=0;
1077
1078 for (i=0;i<file_Len;i++)
1079 {
1080 dense_grid_ID=dense_grid_reverse[all_grid_ID[i]];
1081 eventID_To_denseventID[i]=-1;
1082 if (dense_grid_ID!=-1) //for point (i) belonging to dense grids
1083 {
1084 eventID_To_denseventID[i]=dense_event_ID;
1085 dense_event_ID++;
1086
1087 if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected
1088 densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point
1089 }
1090 }
1091
1092
1093 }
1094
1095 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1096 // Compute dense_grid_seq
1097 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1098 // generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq);
1099 /* Construct a table of binned data values for each dense grid.
1100 *
1101 * Output:
1102 *
1103 * dense_grid_seq -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values)
1104 */
1105
1106 void generate_grid_seq(long num_dm, long num_dense_grids, long *densegridID_To_gridevent, long **position, long **dense_grid_seq)
1107 {
1108
1109 long i=0;
1110 long j=0;
1111 long ReventID=0; //representative event ID of the dense grid
1112
1113 for (i=0;i<num_dense_grids;i++)
1114 {
1115 ReventID = densegridID_To_gridevent[i];
1116
1117 for (j=0;j<num_dm;j++)
1118 dense_grid_seq[i][j]=position[ReventID][j];
1119 }
1120 }
1121
1122 //compare two vectors
1123 long compare_value(long num_dm, long *search_value, long *seq_value)
1124 {
1125 long i=0;
1126
1127 for (i=0;i<num_dm;i++)
1128 {
1129 if (search_value[i]<seq_value[i])
1130 return 1;
1131 if (search_value[i]>seq_value[i])
1132 return -1;
1133 if (search_value[i]==seq_value[i])
1134 continue;
1135 }
1136 return 0;
1137 }
1138
1139 //binary search the dense_grid_seq to return the dense grid ID if it exists
1140 long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq)
1141 {
1142
1143 long low=0;
1144 long high=0;
1145 long mid=0;
1146 long comp_result=0;
1147 long match=0;
1148 //long found=0;
1149
1150 low = 0;
1151 high = num_dense_grids-1;
1152
1153 while (low <= high)
1154 {
1155 mid = (long)((low + high)/2);
1156
1157 comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
1158
1159
1160 switch(comp_result)
1161 {
1162 case 1:
1163 high=mid-1;
1164 break;
1165 case -1:
1166 low=mid+1;
1167 break;
1168 case 0:
1169 match=1;
1170 break;
1171 }
1172 if (match==1)
1173 break;
1174 }
1175
1176
1177
1178 if (match==1)
1179 {
1180 return mid;
1181 }
1182 else
1183 return -1;
1184 }
1185
1186
1187 /********************************************** Computing Centers Using IDs **********************************************/
1188
1189 void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center)
1190 {
1191 long i=0;
1192 long j=0;
1193 long ID=0;
1194 long eventID=0;
1195 long *size_c;
1196
1197
1198
1199 size_c=(long *)malloc(sizeof(long)*num_clust);
1200 memset(size_c,0,sizeof(long)*num_clust);
1201
1202 for (i=0;i<num_clust;i++)
1203 for (j=0;j<num_dm;j++)
1204 population_center[i][j]=0;
1205
1206 for (i=0;i<file_Len;i++)
1207 {
1208 eventID=eventID_To_denseventID[i];
1209
1210 if (eventID!=-1) //only events in dense grids count
1211 {
1212 ID=cluster_ID[eventID];
1213
1214 if (ID==-1)
1215 {
1216 //printf("ID==-1! in ID2Center\n");
1217 //exit(0);
1218 fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010
1219 exit(0);
1220 }
1221
1222 for (j=0;j<num_dm;j++)
1223 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
1224
1225 size_c[ID]++;
1226 }
1227 }
1228
1229
1230 for (i=0;i<num_clust;i++)
1231 {
1232 for (j=0;j<num_dm;j++)
1233 if (size_c[i]!=0)
1234 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
1235 else
1236 {
1237 population_center[i][j]=0;
1238 printf("size_c[%ld]=0 in ID2center\n",i);
1239 }
1240 }
1241
1242 free(size_c);
1243
1244 }
1245
1246 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1247 // Compute Population Center with all events
1248 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1249 void ID2Center_all(double **data_in, long file_Len, long num_dm, long num_clust, long *cluster_ID, double **population_center)
1250 {
1251 long i=0;
1252 long j=0;
1253 long ID=0;
1254 long *size_c;
1255
1256
1257
1258 size_c=(long *)malloc(sizeof(long)*num_clust);
1259 memset(size_c,0,sizeof(long)*num_clust);
1260
1261 for (i=0;i<num_clust;i++)
1262 for (j=0;j<num_dm;j++)
1263 population_center[i][j]=0;
1264
1265
1266 for (i=0;i<file_Len;i++)
1267 {
1268 ID=cluster_ID[i];
1269
1270 if (ID==-1)
1271 {
1272 //printf("ID==-1! in ID2Center_all\n");
1273 //exit(0);
1274 fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010
1275 exit(0);
1276 }
1277
1278 for (j=0;j<num_dm;j++)
1279 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
1280
1281 size_c[ID]++;
1282 }
1283
1284
1285 for (i=0;i<num_clust;i++)
1286 {
1287 for (j=0;j<num_dm;j++)
1288 if (size_c[i]!=0)
1289 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
1290 else
1291 {
1292 population_center[i][j]=0;
1293 printf("size_c[%ld]=0 in ID2center_all\n",i);
1294 }
1295 }
1296
1297
1298 free(size_c);
1299
1300 }
1301
1302 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1303 // Merge neighboring grids to clusters
1304 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1305
1306 long merge_grids(double **normalized_data, double *interval, long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids,
1307 long *dense_grid_reverse, long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
1308 long *cluster_ID, long *grid_ID, long *grid_clusterID)
1309 {
1310
1311
1312 long i=0;
1313 long j=0;
1314 long t=0;
1315 long p=0;
1316 long num_clust=0;
1317 long ReventID=0;
1318 long denseID=0;
1319 long neighbor_ID=0;
1320 //long temp_grid=0;
1321
1322 long *grid_value;
1323 long *search_value;
1324
1325 long **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node
1326
1327 double real_dist=0;
1328 double **norm_grid_center;
1329
1330 norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids);
1331 memset(norm_grid_center,0,sizeof(double*)*num_dense_grids);
1332
1333 for (i=0;i<num_dense_grids;i++)
1334 {
1335 norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm);
1336 memset(norm_grid_center[i],0,sizeof(double)*num_dm);
1337 }
1338
1339 for (i=0;i<file_Len;i++)
1340 {
1341 denseID=eventID_To_denseventID[i];
1342 if (denseID!=-1) //only dense events can enter
1343 {
1344 grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]];
1345
1346 if (grid_ID[denseID]==-1)
1347 {
1348 fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n");
1349 exit(0);
1350 }
1351 }
1352 }
1353
1354
1355 /* Find centroid (in the normalized data) for each dense grid */
1356 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center);
1357
1358 //printf("pass the grid ID2 center\n");
1359
1360
1361 graph_of_grid=(long **)malloc(sizeof(long*)*num_dense_grids);
1362 memset(graph_of_grid,0,sizeof(long*)*num_dense_grids);
1363 for (i=0;i<num_dense_grids;i++)
1364 {
1365 graph_of_grid[i]=(long *)malloc(sizeof(long)*num_dm);
1366 memset(graph_of_grid[i],0,sizeof(long)*num_dm);
1367
1368
1369 for (j=0;j<num_dm;j++)
1370 graph_of_grid[i][j]=-1;
1371 }
1372
1373 grid_value=(long *)malloc(sizeof(long)*num_dm);
1374 memset(grid_value,0,sizeof(long)*num_dm);
1375
1376 search_value=(long *)malloc(sizeof(long)*num_dm);
1377 memset(search_value,0,sizeof(long)*num_dm);
1378
1379
1380 for (i=0;i<num_dense_grids;i++)
1381 {
1382 ReventID=densegridID_To_gridevent[i];
1383
1384 for (j=0;j<num_dm;j++)
1385 {
1386 grid_value[j]=position[ReventID][j];
1387
1388 }
1389
1390
1391 /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in
1392 the chosen dimension. If the neighbor's centroid is not too far away, add it to this grid's neighbor
1393 list. */
1394 for (t=0;t<num_dm;t++)
1395 {
1396 for (p=0;p<num_dm;p++)
1397 search_value[p]=grid_value[p];
1398
1399 if (grid_value[t]==num_bin-1)
1400 continue;
1401
1402 search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side
1403
1404 neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq);
1405
1406 if (neighbor_ID!=-1)
1407 {
1408 real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t];
1409
1410 if (real_dist<0)
1411 real_dist=-real_dist;
1412
1413 if (real_dist<2*interval[t]) //by using 2*interval, this switch is void
1414 graph_of_grid[i][t]=neighbor_ID;
1415 }
1416 }
1417 grid_clusterID[i]=i; //initialize grid_clusterID
1418 }
1419
1420
1421 //graph constructed
1422 //DFS-based search begins
1423
1424 /* Use a depth-first search to construct a list of connected subgraphs (= "clusters").
1425 Note our graph as we have constructed it is a DAG, so we can use that to our advantage
1426 in our search. */
1427 // num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID);
1428 num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID);
1429
1430
1431 //computes grid_ID and cluster_ID
1432 for (i=0;i<file_Len;i++)
1433 {
1434 denseID=eventID_To_denseventID[i];
1435 if (denseID!=-1) //only dense events can enter
1436 {
1437 cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]];
1438 //if (cluster_ID[denseID]==-1)
1439 // printf("catch you!\n");
1440 }
1441 }
1442
1443 free(search_value);
1444 free(grid_value);
1445
1446 for (i=0;i<num_dense_grids;i++)
1447 {
1448 free(graph_of_grid[i]);
1449 free(norm_grid_center[i]);
1450 }
1451 free(graph_of_grid);
1452 free(norm_grid_center);
1453
1454 return num_clust;
1455 }
1456
1457 /********************************************* Merge Clusters to Populations *******************************************/
1458
1459 long merge_clusters(long num_clust, long num_dm, double *interval, double **cluster_center, long *cluster_populationID, long max_num_pop)
1460 {
1461 long num_population=0;
1462 long temp_num_population=0;
1463
1464 long i=0;
1465 long j=0;
1466 long t=0;
1467 long merge=0;
1468 long smid=0;
1469 long bgid=0;
1470 double merge_dist=1.1;
1471
1472 long *map_ID;
1473
1474 double diff=0;
1475
1476 map_ID=(long*)malloc(sizeof(long)*num_clust);
1477 memset(map_ID,0,sizeof(long)*num_clust);
1478
1479 for (i=0;i<num_clust;i++)
1480 {
1481 cluster_populationID[i]=i;
1482 map_ID[i]=i;
1483 }
1484
1485
1486 while (((num_population>max_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1)))
1487 {
1488
1489 if (num_population<=1)
1490 merge_dist=merge_dist-0.1;
1491
1492 if (num_population>max_num_pop)
1493 merge_dist=merge_dist+0.1;
1494
1495
1496 for (i=0;i<num_clust;i++)
1497 cluster_populationID[i]=i;
1498
1499 for (i=0;i<num_clust-1;i++)
1500 {
1501 for (j=i+1;j<num_clust;j++)
1502 {
1503 merge=1;
1504
1505 for (t=0;t<num_dm;t++)
1506 {
1507 diff=cluster_center[i][t]-cluster_center[j][t];
1508
1509 if (diff<0)
1510 diff=-diff;
1511 if (diff>(merge_dist*interval[t]))
1512 merge=0;
1513 }
1514
1515 if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
1516 {
1517 if (cluster_populationID[i]<cluster_populationID[j]) //they could not be equal
1518 {
1519 smid = cluster_populationID[i];
1520 bgid = cluster_populationID[j];
1521 }
1522 else
1523 {
1524 smid = cluster_populationID[j];
1525 bgid = cluster_populationID[i];
1526 }
1527 for (t=0;t<num_clust;t++)
1528 {
1529 if (cluster_populationID[t]==bgid)
1530 cluster_populationID[t]=smid;
1531 }
1532 }
1533 }
1534 }
1535
1536
1537
1538 for (i=0;i<num_clust;i++)
1539 map_ID[i]=-1;
1540
1541 for (i=0;i<num_clust;i++)
1542 map_ID[cluster_populationID[i]]=1;
1543
1544 num_population=0;
1545 for (i=0;i<num_clust;i++)
1546 {
1547 if (map_ID[i]==1)
1548 {
1549 map_ID[i]=num_population;
1550 num_population++;
1551 }
1552 }
1553
1554 if ((temp_num_population>max_num_pop) && (num_population==1))
1555 break;
1556 else
1557 temp_num_population=num_population;
1558
1559 if (num_clust<=1)
1560 break;
1561 } //end while
1562
1563 for (i=0;i<num_clust;i++)
1564 cluster_populationID[i]=map_ID[cluster_populationID[i]];
1565
1566 free(map_ID);
1567
1568 return num_population;
1569 }
1570
1571 ///////////////////////////////////////////////////////
1572 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1573 double kmeans(double **Matrix, long k, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center)
1574 {
1575
1576 long i=0;
1577 long j=0;
1578 long t=0;
1579 long random=0;
1580 long random1=0;
1581 long random2=0;
1582 long times=0;
1583 long dist_used=0; //0 is Euclidean and 1 is Pearson
1584 long random_init=0; //0: not use random seeds;
1585 long real_Len=0;
1586 long skipped=0;
1587
1588 long *num; //num[i]=t means the ith cluster has t points
1589
1590 double vvv=1.0; // the biggest variation;
1591 double distance=0.0;
1592 double xv=0.0;
1593 double variation=0.0;
1594
1595 double mean_dx=0;
1596 double mean_dy=0;
1597 double sum_var=0;
1598 double dx=0;
1599 double dy=0;
1600 double sd_x=0;
1601 double sd_y=0;
1602 double diff=0;
1603 double distortion=0;
1604 double sd=0; //standard deviation
1605 double shortest_distance;
1606
1607 double *temp_center;
1608
1609 double **sum;
1610
1611 temp_center = (double *)malloc(sizeof(double)*num_dm);
1612 memset(temp_center,0,sizeof(double)*num_dm);
1613
1614 if (random_init)
1615 {
1616 for (i=0;i<k;i++)
1617 {
1618 random1=rand()*rand();
1619 random2=abs((random1%5)+1);
1620 for (t=0;t<random2;t++)
1621 random2=random2*rand()+rand();
1622
1623 random=abs(random2%file_Len);
1624 for (j=0;j<num_dm;j++)
1625 center[i][j]=Matrix[random][j];
1626 }
1627 }
1628
1629
1630 num = (long *)malloc(sizeof(long)*k);
1631 memset(num,0,sizeof(long)*k);
1632
1633 sum = (double **)malloc(sizeof(double*)*k);
1634 memset(sum,0,sizeof(double*)*k);
1635 for (i=0;i<k;i++)
1636 {
1637 sum[i] = (double *)malloc(sizeof(double)*num_dm);
1638 memset(sum[i],0,sizeof(double)*num_dm);
1639 }
1640
1641
1642 times=0;
1643 real_Len=0;
1644
1645 while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1)))
1646 {
1647 for (i=0;i<k;i++)
1648 {
1649 num[i]=0;
1650 for (j=0;j<num_dm;j++)
1651 sum[i][j]=0.0;
1652 }
1653
1654 for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j]
1655 {
1656 skipped = 0;
1657 shortest_distance=MAX_VALUE;
1658 for (j=0;j<k;j++) //for each center j
1659 {
1660 distance=0.0;
1661
1662 if (dist_used==0) //Euclidean distance
1663 {
1664 for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions
1665 {
1666
1667 diff=center[j][t]-Matrix[i][t];
1668
1669 diff=diff*diff;
1670 //this K-means is only used for dimension selection, and therefore for 1-dimensional data only, no need to use cube distance
1671
1672 distance = distance+diff; //here we have a weight for each dimension
1673 }
1674 }
1675 else //pearson correlation
1676 {
1677 mean_dx=0.0;
1678 mean_dy=0.0;
1679 sum_var=0.0;
1680 dx=0.0;
1681 dy=0.0;
1682 sd_x=0.0;
1683 sd_y=0.0;
1684 for (t=0;t<num_dm;t++)
1685 {
1686 mean_dx+=center[j][t];
1687 mean_dy+=Matrix[i][t];
1688 }
1689 mean_dx=mean_dx/(double)num_dm;
1690 mean_dy=mean_dy/(double)num_dm;
1691
1692 for (t=0;t<num_dm;t++)
1693 {
1694 dx=center[j][t]-mean_dx;
1695 dy=Matrix[i][t]-mean_dy;
1696 sum_var+=dx*dy;
1697 sd_x+=dx*dx;
1698 sd_y+=dy*dy;
1699 }
1700 if (sqrt(sd_x*sd_y)==0)
1701 distance = 1.0;
1702 else
1703 distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
1704 //printf("distance=%f\n",distance);
1705 } //pearson correlation ends
1706
1707 if ((distance<shortest_distance) && (skipped == 0))
1708 {
1709 shortest_distance=distance;
1710 shortest_id[i]=j;
1711 }
1712 }//end for j
1713 real_Len++;
1714 num[shortest_id[i]]=num[shortest_id[i]]+1;
1715 for (t=0;t<num_dm;t++)
1716 sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];
1717 }//end for i
1718 /* recompute the centers */
1719 //compute_mean(group);
1720 vvv=0.0;
1721 for (j=0;j<k;j++)
1722 {
1723 memcpy(temp_center,center[j],sizeof(double)*num_dm);
1724 variation=0.0;
1725 if (num[j]!=0)
1726 {
1727 for (t=0;t<num_dm;t++)
1728 {
1729 center[j][t]=sum[j][t]/(double)num[j];
1730 xv=(temp_center[t]-center[j][t]);
1731 variation=variation+xv*xv;
1732 }
1733 }
1734
1735 if (variation>vvv)
1736 vvv=variation; //vvv is the biggest variation among the k clusters;
1737 }
1738 //compute_variation;
1739 times++;
1740 } //end for while
1741
1742
1743
1744
1745 free(num);
1746
1747 for (i=0;i<k;i++)
1748 free(sum[i]);
1749 free(sum);
1750 free(temp_center);
1751
1752
1753 return distortion;
1754
1755 }
1756
1757 //////////////////////////////////////////////////////
1758 /*************************** Show *****************************/
1759 void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string)
1760 {
1761 int situ1=0;
1762 int situ2=0;
1763
1764 long i=0;
1765 long id=0;
1766 long j=0;
1767 long info_id=0;
1768 long nearest_id=0;
1769 long insert=0;
1770 long temp=0;
1771 long m=0;
1772 long n=0;
1773 long t=0;
1774
1775 long *size_c;
1776
1777
1778
1779 long **size_mybound_1;
1780 long **size_mybound_2;
1781 long **size_mybound_3;
1782 long **size_mybound_0;
1783
1784 double interval=0.0;
1785
1786 double *big;
1787 double *small;
1788
1789
1790 double **center;
1791 double **mybound;
1792
1793 long **prof; //prof[i][j]=1 means population i is + at parameter j
1794
1795 FILE *fpcnt_id; //proportion id
1796 //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
1797 FILE *fprof_id; //profile_id
1798
1799 big=(double *)malloc(sizeof(double)*num_dm);
1800 memset(big,0,sizeof(double)*num_dm);
1801
1802 small=(double *)malloc(sizeof(double)*num_dm);
1803 memset(small,0,sizeof(double)*num_dm);
1804
1805 for (i=0;i<num_dm;i++)
1806 {
1807 big[i]=0.0;
1808 small[i]=(double)MAX_VALUE;
1809 }
1810
1811
1812 size_c=(long *)malloc(sizeof(long)*k);
1813 memset(size_c,0,sizeof(long)*k);
1814
1815 center=(double**)malloc(sizeof(double*)*k);
1816 memset(center,0,sizeof(double*)*k);
1817 for (i=0;i<k;i++)
1818 {
1819 center[i]=(double*)malloc(sizeof(double)*num_dm);
1820 memset(center[i],0,sizeof(double)*num_dm);
1821 }
1822
1823 mybound=(double**)malloc(sizeof(double*)*num_dm);
1824 memset(mybound,0,sizeof(double*)*num_dm);
1825 for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
1826 {
1827 mybound[i]=(double*)malloc(sizeof(double)*3);
1828 memset(mybound[i],0,sizeof(double)*3);
1829 }
1830
1831 prof=(long **)malloc(sizeof(long*)*k);
1832 memset(prof,0,sizeof(long*)*k);
1833 for (i=0;i<k;i++)
1834 {
1835 prof[i]=(long *)malloc(sizeof(long)*num_dm);
1836 memset(prof[i],0,sizeof(long)*num_dm);
1837 }
1838
1839
1840 for (i=0;i<file_Len;i++)
1841 {
1842 id=cluster_id[i];
1843 for (j=0;j<num_dm;j++)
1844 {
1845 center[id][j]=center[id][j]+Matrix[i][j];
1846 if (big[j]<Matrix[i][j])
1847 big[j]=Matrix[i][j];
1848 if (small[j]>Matrix[i][j])
1849 small[j]=Matrix[i][j];
1850 }
1851
1852 size_c[id]++;
1853 }
1854
1855 for (i=0;i<k;i++)
1856 for (j=0;j<num_dm;j++)
1857 {
1858 if (size_c[i]!=0)
1859 center[i][j]=(center[i][j]/(double)(size_c[i]));
1860 else
1861 center[i][j]=0;
1862 }
1863
1864 for (j=0;j<num_dm;j++)
1865 {
1866 interval=((big[j]-small[j])/4.0);
1867 //printf("interval[%d] is %f\n",j,interval);
1868 for (i=0;i<3;i++)
1869 mybound[j][i]=small[j]+((double)(i+1)*interval);
1870 }
1871
1872
1873 size_mybound_0=(long **)malloc(sizeof(long*)*k);
1874 memset(size_mybound_0,0,sizeof(long*)*k);
1875
1876 for (i=0;i<k;i++)
1877 {
1878 size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm);
1879 memset(size_mybound_0[i],0,sizeof(long)*num_dm);
1880 }
1881
1882 size_mybound_1=(long **)malloc(sizeof(long*)*k);
1883 memset(size_mybound_1,0,sizeof(long*)*k);
1884
1885 for (i=0;i<k;i++)
1886 {
1887 size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm);
1888 memset(size_mybound_1[i],0,sizeof(long)*num_dm);
1889 }
1890
1891 size_mybound_2=(long **)malloc(sizeof(long*)*k);
1892 memset(size_mybound_2,0,sizeof(long*)*k);
1893
1894 for (i=0;i<k;i++)
1895 {
1896 size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm);
1897 memset(size_mybound_2[i],0,sizeof(long)*num_dm);
1898 }
1899
1900 size_mybound_3=(long **)malloc(sizeof(long*)*k);
1901 memset(size_mybound_3,0,sizeof(long*)*k);
1902
1903 for (i=0;i<k;i++)
1904 {
1905 size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm);
1906 memset(size_mybound_3[i],0,sizeof(long)*num_dm);
1907 }
1908
1909 for (i=0;i<file_Len;i++)
1910 for (j=0;j<num_dm;j++)
1911 {
1912 if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
1913 size_mybound_0[cluster_id[i]][j]++;
1914 else
1915 {
1916 if (Matrix[i][j]<mybound[j][1])
1917 size_mybound_1[cluster_id[i]][j]++;
1918 else
1919 {
1920 if (Matrix[i][j]<mybound[j][2])
1921 size_mybound_2[cluster_id[i]][j]++;
1922 else
1923 //if (Matrix[i][j]!=big[j]) //the biggest values excluded
1924 size_mybound_3[cluster_id[i]][j]++;
1925 }
1926
1927 }
1928 }
1929
1930 fprof_id=fopen("profile.txt","w");
1931 fprintf(fprof_id,"Population_ID\t");
1932 fprintf(fprof_id,"%s\n",name_string);
1933
1934 for (i=0;i<k;i++)
1935 {
1936 fprintf(fprof_id,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
1937 for (j=0;j<num_dm;j++)
1938 {
1939
1940 if (size_mybound_0[i][j]>size_mybound_1[i][j])
1941 situ1=0;
1942 else
1943 situ1=1;
1944 if (size_mybound_2[i][j]>size_mybound_3[i][j])
1945 situ2=2;
1946 else
1947 situ2=3;
1948
1949 if ((situ1==0) && (situ2==2))
1950 {
1951 if (size_mybound_0[i][j]>size_mybound_2[i][j])
1952 prof[i][j]=0;
1953 else
1954 prof[i][j]=2;
1955 }
1956 if ((situ1==0) && (situ2==3))
1957 {
1958 if (size_mybound_0[i][j]>size_mybound_3[i][j])
1959 prof[i][j]=0;
1960 else
1961 prof[i][j]=3;
1962 }
1963 if ((situ1==1) && (situ2==2))
1964 {
1965 if (size_mybound_1[i][j]>size_mybound_2[i][j])
1966 prof[i][j]=1;
1967 else
1968 prof[i][j]=2;
1969 }
1970 if ((situ1==1) && (situ2==3))
1971 {
1972 if (size_mybound_1[i][j]>size_mybound_3[i][j])
1973 prof[i][j]=1;
1974 else
1975 prof[i][j]=3;
1976 }
1977
1978 //begin to output profile
1979 if (j==num_dm-1)
1980 {
1981 if (prof[i][j]==0)
1982 fprintf(fprof_id,"1\n");
1983 if (prof[i][j]==1)
1984 fprintf(fprof_id,"2\n");
1985 if (prof[i][j]==2)
1986 fprintf(fprof_id,"3\n");
1987 if (prof[i][j]==3)
1988 fprintf(fprof_id,"4\n");
1989 }
1990 else
1991 {
1992 if (prof[i][j]==0)
1993 fprintf(fprof_id,"1\t");
1994 if (prof[i][j]==1)
1995 fprintf(fprof_id,"2\t");
1996 if (prof[i][j]==2)
1997 fprintf(fprof_id,"3\t");
1998 if (prof[i][j]==3)
1999 fprintf(fprof_id,"4\t");
2000 }
2001 }
2002 }
2003 fclose(fprof_id);
2004
2005 ///////////////////////////////////////////////////////////
2006
2007
2008 fpcnt_id=fopen("percentage.txt","w");
2009 fprintf(fpcnt_id,"Population_ID\tPercentage\n");
2010
2011 for (t=0;t<k;t++)
2012 {
2013 fprintf(fpcnt_id,"%ld\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len); //t changed to t+1 to start from 1 instead of 0: April 16, 2009
2014 }
2015 fclose(fpcnt_id);
2016
2017 free(big);
2018 free(small);
2019 free(size_c);
2020
2021 for (i=0;i<k;i++)
2022 {
2023 free(center[i]);
2024 free(prof[i]);
2025 free(size_mybound_0[i]);
2026 free(size_mybound_1[i]);
2027 free(size_mybound_2[i]);
2028 free(size_mybound_3[i]);
2029 }
2030 free(center);
2031 free(prof);
2032 free(size_mybound_0);
2033 free(size_mybound_1);
2034 free(size_mybound_2);
2035 free(size_mybound_3);
2036
2037 for (i=0;i<num_dm;i++)
2038 free(mybound[i]);
2039 free(mybound);
2040
2041 }
2042 ///////////////////////////////////////////////////////////////////////////
2043
2044 double get_avg_dist(double *population_center, long smaller_pop_ID, long larger_pop_ID, long *population_ID, long num_population, long file_Len,
2045 long num_dm, double **normalized_data, long d1, long d2, long d3, long *size_c)
2046 {
2047 long k=0;
2048 long temp=0;
2049 long i=0;
2050 long j=0;
2051 long t=0;
2052 long current_biggest=0;
2053
2054
2055 double total_dist=0.0;
2056 double distance=0.0;
2057 double dist=0.0;
2058 double biggest_distance=0.0;
2059
2060 long *nearest_neighbors;
2061 double *nearest_distance;
2062
2063 k=min(size_c[smaller_pop_ID],size_c[larger_pop_ID]);
2064
2065 if (k>0)
2066 {
2067 k=(long)sqrt((double)k);
2068 if (num_dm<=3)
2069 k=(long)(2.7183*(double)k); //e*k for low-dimensional space, added Nov. 4, 2010
2070 //printf("the k-nearest k is %d\n",k);
2071 }
2072 else
2073 {
2074 printf("error in k\n");
2075 exit(0);
2076 }
2077
2078 nearest_neighbors=(long *)malloc(sizeof(long)*k);
2079 memset(nearest_neighbors,0,sizeof(long)*k);
2080
2081 nearest_distance=(double *)malloc(sizeof(double)*k);
2082 memset(nearest_distance,0,sizeof(double)*k);
2083
2084 temp=0;
2085
2086 for (i=0;i<file_Len;i++)
2087 {
2088 if ((population_ID[i]==smaller_pop_ID) || (population_ID[i]==larger_pop_ID)) //the event belongs to the pair of populations
2089 {
2090 distance=0.0;
2091 for (t=0;t<num_dm;t++)
2092 {
2093 if ((t!=d1) && (t!=d2) && (t!=d3))
2094 continue;
2095 else
2096 {
2097 dist=population_center[t]-normalized_data[i][t];
2098 distance=distance+dist*dist;
2099 }
2100 }
2101
2102 if (temp<k)
2103 {
2104 nearest_neighbors[temp]=i;
2105 nearest_distance[temp]=distance;
2106 temp++;
2107 }
2108 else
2109 {
2110 biggest_distance=0.0;
2111 for (j=0;j<k;j++)
2112 {
2113 if (nearest_distance[j]>biggest_distance)
2114 {
2115 biggest_distance=nearest_distance[j];
2116 current_biggest=j;
2117 }
2118 }
2119
2120 if (biggest_distance>distance)
2121 {
2122 nearest_neighbors[current_biggest]=i;
2123 nearest_distance[current_biggest]=distance;
2124 }
2125 }
2126 }
2127 }
2128
2129 for (i=0;i<k;i++)
2130 total_dist=total_dist+nearest_distance[i];
2131
2132 total_dist=total_dist/(double)k;
2133
2134 free(nearest_distance);
2135 free(nearest_neighbors);
2136
2137 return total_dist;
2138 }
2139
2140
2141 /******************************************************** Main Function **************************************************/
2142 //for normalized data, there are five variables:
2143 //cluster_ID
2144 //population_center
2145 //grid_clusterID
2146 //grid_ID
2147 //grid_Center
2148
2149 //the same five variables exist for the original data
2150 //however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data
2151 //also, data + cluster_ID -> population_center
2152 //data + grid_ID -> grid_Center
2153
2154 /* what is the final output */
2155 //the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
2156 //Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)
2157
2158 /* what information should select_bin output */
2159 //the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
2160 //is unknown, therefore I must use a prescreening to output
2161 //how many bins I should use
2162 //the number of dense grids
2163 //total number of events in the dense grids
2164
2165 /* basic procedure of main function */
2166 //1. read raw file and normalize the raw file
2167 //2. select_bin
2168 //3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID.
2169 //4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
2170 //5. release normalized data; allocate memory for grid_Center and population_center
2171 //6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers
2172
2173 int main (int argc, char **argv)
2174 {
2175 //inputs
2176 FILE *f_src; //source file pointer
2177
2178 FILE *f_out; //coordinates
2179 FILE *f_cid; //population-ID of events
2180 FILE *f_ctr; //centroids of populations
2181 FILE *f_results; //coordinates file event and population column
2182 FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
2183 FILE *f_parameters; //number of bins and density calculated by
2184 //the algorithm. Used to update the database
2185 FILE *f_properties; //Properties file used by Image generation software
2186
2187 //char tmpbuf[128];
2188
2189 char para_name_string[LINE_LEN];
2190
2191 long time_ID=-1;
2192 long num_bin=0; //the bins I will use on each dimension
2193 long num_pop=0;
2194 long file_Len=0; //number of events
2195 long num_dm=0;
2196 long num_clust=0;
2197 long num_dense_events=0;
2198 long num_dense_grids=0;
2199 long num_nonempty=0;
2200 long num_population=0;
2201 long num_real_pop=0;
2202 long keep_merge=0;
2203 long num_checked_range=0;
2204
2205 //below are read from configuration file
2206 long i=0;
2207 long j=0;
2208 long p=0;
2209 long t=0;
2210 long q=0;
2211 long temp_i=0;
2212 long temp_j=0;
2213 long first_i=0;
2214 long first_j=0;
2215 long d1=0;
2216 long d2=0;
2217 long d3=0;
2218 long d_d=0;
2219 long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d
2220 long index_id=0;
2221 long num_computed_population=0;
2222 long tot_size=0;
2223
2224 int den_t_event=0;
2225
2226 double distance=0.0;
2227 double dist=0.0;
2228 double current_smallest_dist=0;
2229 double max_d_dist=0.0;
2230 double Ei=0.0;
2231 double Ej=0.0;
2232 double E1=0.0;
2233 double E2=0.0;
2234 double E3=0.0;
2235 double Ep=0.0;
2236 double temp=0.0;
2237 double tmp=0.0;
2238
2239 long *grid_clusterID; //(dense)gridID to clusterID
2240 long *grid_ID; //(dense)eventID to gridID
2241 long *cluster_ID; //(dense)eventID to clusterID
2242 long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
2243 long *all_grid_ID; //gridID for all events
2244 long *densegridID_To_gridevent;
2245 long *sorted_seq;
2246 long *dense_grid_reverse;
2247 long *population_ID; //denseeventID to populationID
2248 long *cluster_populationID; //clusterID to populationID
2249 long *grid_populationID; //gridID to populationID
2250 long *all_population_ID; //populationID of event
2251 long *size_c;
2252 long *size_p_2;
2253 long *partit;
2254 long *size_p;
2255 long *temp_size_j;
2256 long *all_computed_pop_ID;
2257
2258
2259 long **position;
2260 long **dense_grid_seq;
2261 long **temp_population_ID;
2262
2263 double *interval;
2264 double *center_1;
2265 double *center_2;
2266 double *center_3;
2267 double *aver;
2268 double *std;
2269 double *center_p_1;
2270 double *center_p_2;
2271 double *center_p_3;
2272
2273 double **population_center; //population centroids in the raw/original data
2274 double **cluster_center; //cluster centroids in the raw/original data
2275 double **new_population_center;
2276 double **temp_population_center;
2277 double **min_pop;
2278 double **max_pop;
2279
2280 double **input_data;
2281 double **normalized_data;
2282
2283 double **real_pop_center;
2284 double **distance_pop;
2285
2286 double ***ind_pop;
2287 double ***ind_pop_3;
2288
2289 int min = 999999;
2290 int max = 0;
2291
2292 printf( "Starting time:\t\t\t\t");
2293 fflush(stdout);
2294 system("/bin/date");
2295
2296 /*
2297 * Windows Version
2298 _strtime( tmpbuf );
2299 printf( "Starting time:\t\t\t\t%s\n", tmpbuf );
2300 _strdate( tmpbuf );
2301 printf( "Starting date:\t\t\t\t%s\n", tmpbuf );
2302 */
2303
2304 /////////////////////////////////////////////////////////////
2305
2306 if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6))
2307 {
2308 fprintf(stderr,"Incorrect number of input parameters!\n");
2309 fprintf(stderr,"usage:\n");
2310 fprintf(stderr,"basic mode: flock data_file\n");
2311 fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n");
2312 fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n");
2313 fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n");
2314 fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n");
2315 exit(0);
2316 }
2317
2318 f_src=fopen(argv[1],"r");
2319
2320 if (argc==3)
2321 {
2322 max_num_pop=atoi(argv[2]);
2323 printf("num of maximum pop is %ld\n",max_num_pop);
2324 }
2325
2326 if (argc==4)
2327 {
2328 num_bin=atoi(argv[2]);
2329 printf("num_bin is %ld\n",num_bin);
2330
2331 den_t_event=atoi(argv[3]);
2332 printf("density_index is %d\n",den_t_event);
2333 }
2334
2335 if (argc==5)
2336 {
2337 num_bin=atoi(argv[2]);
2338 printf("num_bin is %ld\n",num_bin);
2339
2340 den_t_event=atoi(argv[3]);
2341 printf("density_index is %d\n",den_t_event);
2342
2343 num_pop=atoi(argv[4]);
2344 printf("num of pop is %ld\n",num_pop);
2345 }
2346
2347 if (argc==6)
2348 {
2349 num_bin=atoi(argv[2]);
2350 printf("num_bin is %ld\n",num_bin);
2351
2352 den_t_event=atoi(argv[3]);
2353 printf("density_index is %d\n",den_t_event);
2354
2355 num_pop=atoi(argv[4]);
2356 printf("num of pop is %ld\n",num_pop);
2357
2358 max_num_pop=atoi(argv[5]);
2359 printf("num of pop is %ld\n",max_num_pop);
2360 }
2361
2362
2363 if (num_pop==1)
2364 {
2365 printf("it is not allowed to specify only one population\n");
2366 exit(0);
2367 }
2368
2369
2370
2371 getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters
2372
2373
2374 if (max_num_pop==0)
2375 {
2376 max_num_pop=(long)pow(2,num_dm);
2377 if (max_num_pop>MAX_POP_NUM)
2378 max_num_pop=MAX_POP_NUM;
2379 }
2380
2381 /************************************************* Read the data *****************************************************/
2382
2383 rewind(f_src); //reset the file pointer
2384
2385 input_data = (double **)malloc(sizeof(double*)*file_Len);
2386 memset(input_data,0,sizeof(double*)*file_Len);
2387 for (i=0;i<file_Len;i++)
2388 {
2389 input_data[i]=(double *)malloc(sizeof(double)*num_dm);
2390 memset(input_data[i],0,sizeof(double)*num_dm);
2391 }
2392
2393 readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data;
2394
2395 fclose(f_src);
2396
2397 normalized_data=(double **)malloc(sizeof(double*)*file_Len);
2398 memset(normalized_data,0,sizeof(double*)*file_Len);
2399 for (i=0;i<file_Len;i++)
2400 {
2401 normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
2402 memset(normalized_data[i],0,sizeof(double)*num_dm);
2403 }
2404
2405 tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data);
2406
2407
2408 position=(long **)malloc(sizeof(long*)*file_Len);
2409 memset(position,0,sizeof(long*)*file_Len);
2410 for (i=0;i<file_Len;i++)
2411 {
2412 position[i]=(long*)malloc(sizeof(long)*num_dm);
2413 memset(position[i],0,sizeof(long)*num_dm);
2414 }
2415
2416 all_grid_ID=(long *)malloc(sizeof(long)*file_Len);
2417 memset(all_grid_ID,0,sizeof(long)*file_Len);
2418
2419 sorted_seq=(long*)malloc(sizeof(long)*file_Len);
2420 memset(sorted_seq,0,sizeof(long)*file_Len);
2421
2422 interval=(double*)malloc(sizeof(double)*num_dm);
2423 memset(interval,0,sizeof(double)*num_dm);
2424
2425 /************************************************* select_bin *************************************************/
2426
2427 if (num_bin>=1) //num_bin has been selected by user
2428 {
2429 select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
2430 printf("user set bin number is %ld\n",num_bin);
2431 }
2432 else //num_bin has not been selected by user
2433 {
2434 num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
2435 printf("selected bin number is %ld\n",num_bin);
2436 }
2437 printf("number of non-empty grids is %ld\n",num_nonempty);
2438
2439
2440
2441 /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
2442 free(sorted_seq);
2443
2444
2445 dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty);
2446 memset(dense_grid_reverse,0,sizeof(long)*num_nonempty);
2447
2448 /************************************************* select_dense **********************************************/
2449
2450 if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
2451 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
2452 else
2453 {
2454 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
2455 printf("automated selected density threshold is %d\n",den_t_event);
2456 }
2457
2458 printf("Number of dense grids is %ld\n",num_dense_grids);
2459
2460 densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids);
2461 memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids);
2462
2463 for (i=0;i<num_dense_grids;i++)
2464 densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1
2465
2466
2467 eventID_To_denseventID=(long *)malloc(sizeof(long)*file_Len);
2468 memset(eventID_To_denseventID,0,sizeof(long)*file_Len); //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
2469
2470
2471 grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
2472
2473
2474 dense_grid_seq=(long **)malloc(sizeof(long*)*num_dense_grids);
2475 memset(dense_grid_seq,0,sizeof(long*)*num_dense_grids);
2476 for (i=0;i<num_dense_grids;i++)
2477 {
2478 dense_grid_seq[i]=(long *)malloc(sizeof(long)*num_dm);
2479 memset(dense_grid_seq[i],0,sizeof(long)*num_dm);
2480 }
2481
2482
2483 /* Look up the binned data values for each dense grid */
2484 generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq);
2485
2486
2487 /************************************************* allocate memory *********************************************/
2488
2489 grid_clusterID=(long *)malloc(sizeof(long)*num_dense_grids);
2490 memset(grid_clusterID,0,sizeof(long)*num_dense_grids);
2491
2492 grid_ID=(long *)malloc(sizeof(long)*num_dense_events);
2493 memset(grid_ID,0,sizeof(long)*num_dense_events);
2494
2495 cluster_ID=(long *)malloc(sizeof(long)*num_dense_events);
2496 memset(cluster_ID,0,sizeof(long)*num_dense_events);
2497
2498
2499 /*********************************************** merge_grids ***********************************************/
2500 //long merge_grids(long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, long *dense_grid_rank, long *dense_grid_reverse,
2501 // long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
2502 // long *cluster_ID, long *grid_ID, long *grid_clusterID)
2503
2504 num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID);
2505
2506 printf("computed number of groups is %ld\n",num_clust);
2507
2508
2509 /************************************** release unnecessary memory and allocate memory and compute centers **********************************/
2510
2511
2512 for (i=0;i<file_Len;i++)
2513 free(position[i]);
2514 free(position);
2515
2516 for (i=0;i<num_dense_grids;i++)
2517 free(dense_grid_seq[i]);
2518 free(dense_grid_seq);
2519
2520 free(dense_grid_reverse);
2521
2522 free(densegridID_To_gridevent);
2523 free(all_grid_ID);
2524
2525 // cluster_center ////////////////////////////////////////////////////////////////////////////////////////////////////////
2526
2527 cluster_center=(double **)malloc(sizeof(double*)*num_clust);
2528 memset(cluster_center,0,sizeof(double*)*num_clust);
2529 for (i=0;i<num_clust;i++)
2530 {
2531 cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
2532 memset(cluster_center[i],0,sizeof(double)*num_dm);
2533 }
2534
2535 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data
2536
2537 //printf("pass the first ID2center\n");
2538
2539 /*** population_ID and grid_populationID **/
2540
2541 cluster_populationID=(long*)malloc(sizeof(long)*num_clust);
2542 memset(cluster_populationID,0,sizeof(long)*num_clust);
2543
2544 grid_populationID=(long*)malloc(sizeof(long)*num_dense_grids);
2545 memset(grid_populationID,0,sizeof(long)*num_dense_grids);
2546
2547 population_ID=(long*)malloc(sizeof(long)*num_dense_events);
2548 memset(population_ID,0,sizeof(long)*num_dense_events);
2549
2550 num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID, max_num_pop);
2551
2552
2553
2554 for (i=0;i<num_clust;i++)
2555 free(cluster_center[i]);
2556 free(cluster_center);
2557
2558 free(interval);
2559
2560 for (i=0;i<num_dense_grids;i++)
2561 {
2562 grid_populationID[i]=cluster_populationID[grid_clusterID[i]];
2563 }
2564
2565 for (i=0;i<num_dense_events;i++)
2566 {
2567 population_ID[i]=cluster_populationID[cluster_ID[i]];
2568 }
2569
2570 printf("Number of merged groups before second-run partitioning is %ld\n",num_population);
2571
2572
2573
2574 // population_center /////////////////////////////////////////////////////////////////////////////////////////////////////
2575
2576
2577 population_center=(double **)malloc(sizeof(double*)*num_population);
2578 memset(population_center,0,sizeof(double*)*num_population);
2579 for (i=0;i<num_population;i++)
2580 {
2581 population_center[i]=(double*)malloc(sizeof(double)*num_dm);
2582 memset(population_center[i],0,sizeof(double)*num_dm);
2583 }
2584
2585
2586
2587 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data
2588
2589
2590 // show ////////////////////////////////////////////////////////////////////////////////
2591 all_population_ID=(long*)malloc(sizeof(long)*file_Len);
2592 memset(all_population_ID,0,sizeof(long)*file_Len);
2593
2594 kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center);
2595
2596 for (i=0;i<num_population;i++)
2597 free(population_center[i]);
2598
2599 free(population_center);
2600
2601 ////// there needs to be another step to further partition the data
2602
2603
2604 center_p_1=(double *)malloc(sizeof(double)*3);
2605 memset(center_p_1,0,sizeof(double)*3);
2606
2607 center_p_2=(double *)malloc(sizeof(double)*3);
2608 memset(center_p_2,0,sizeof(double)*3);
2609
2610 center_p_3=(double *)malloc(sizeof(double)*3);
2611 memset(center_p_3,0,sizeof(double)*3);
2612
2613 size_p=(long *)malloc(sizeof(long)*num_population);
2614 memset(size_p,0,sizeof(long)*num_population);
2615
2616 temp_size_j=(long *)malloc(sizeof(long)*num_population);
2617 memset(temp_size_j,0,sizeof(long)*num_population);
2618
2619 all_computed_pop_ID=(long *)malloc(sizeof(long)*file_Len);
2620 memset(all_computed_pop_ID,0,sizeof(long)*file_Len);
2621
2622 for (i=0;i<file_Len;i++)
2623 {
2624 size_p[all_population_ID[i]]++;
2625 all_computed_pop_ID[i]=all_population_ID[i];
2626 }
2627
2628 ind_pop=(double***)malloc(sizeof(double**)*num_population);
2629 memset(ind_pop,0,sizeof(double**)*num_population);
2630
2631 ind_pop_3=(double***)malloc(sizeof(double**)*num_population);
2632 memset(ind_pop_3,0,sizeof(double**)*num_population);
2633
2634 for (i=0;i<num_population;i++)
2635 {
2636 ind_pop[i]=(double**)malloc(sizeof(double*)*size_p[i]);
2637 memset(ind_pop[i],0,sizeof(double*)*size_p[i]);
2638
2639 ind_pop_3[i]=(double**)malloc(sizeof(double*)*size_p[i]);
2640 memset(ind_pop_3[i],0,sizeof(double*)*size_p[i]);
2641
2642 for (j=0;j<size_p[i];j++)
2643 {
2644 ind_pop[i][j]=(double*)malloc(sizeof(double)*num_dm);
2645 memset(ind_pop[i][j],0,sizeof(double)*num_dm);
2646
2647 ind_pop_3[i][j]=(double*)malloc(sizeof(double)*3);
2648 memset(ind_pop_3[i][j],0,sizeof(double)*3);
2649 }
2650 temp_size_j[i]=0;
2651 }
2652
2653 for (i=0;i<file_Len;i++) //to generate ind_pop[i]
2654 {
2655 index_id=all_population_ID[i];
2656
2657 j=temp_size_j[index_id];
2658
2659 for (t=0;t<num_dm;t++)
2660 {
2661 ind_pop[index_id][j][t]=input_data[i][t];
2662 }
2663 temp_size_j[index_id]++;
2664 }
2665
2666 aver=(double*)malloc(sizeof(double)*num_dm);
2667 memset(aver,0,sizeof(double)*num_dm);
2668
2669 std=(double*)malloc(sizeof(double)*num_dm);
2670 memset(std,0,sizeof(double)*num_dm);
2671
2672 temp_population_center=(double**)malloc(sizeof(double*)*2);
2673 memset(temp_population_center,0,sizeof(double*)*2);
2674 for (i=0;i<2;i++)
2675 {
2676 temp_population_center[i]=(double*)malloc(sizeof(double)*3);
2677 memset(temp_population_center[i],0,sizeof(double)*3);
2678 }
2679
2680 size_p_2=(long*)malloc(sizeof(long)*2);
2681 memset(size_p_2,0,sizeof(long)*2);
2682
2683 partit=(long*)malloc(sizeof(long)*num_population);
2684 memset(partit,0,sizeof(long)*num_population);
2685
2686 num_computed_population=num_population;
2687
2688 temp_population_ID=(long**)malloc(sizeof(long*)*num_population);
2689 memset(temp_population_ID,0,sizeof(long*)*num_population);
2690
2691 for (i=0;i<num_population;i++)
2692 {
2693 temp_population_ID[i]=(long*)malloc(sizeof(long)*size_p[i]);
2694 memset(temp_population_ID[i],0,sizeof(long)*size_p[i]);
2695 }
2696
2697 //printf("num_population=%d\n",num_population);
2698
2699 for (i=0;i<num_population;i++)
2700 {
2701 partit[i]=0;
2702 tran(ind_pop[i], size_p[i], num_dm, 1, ind_pop[i]);//0-1 normalize ind_pop[i]
2703
2704 //find the 3 dimensions with the largest std for ind_pop[i]
2705 d1=-1;
2706 d2=-1;
2707 d3=-1;
2708
2709 for (t=0;t<num_dm;t++)
2710 {
2711 aver[t]=0;
2712 for (j=0;j<size_p[i];j++)
2713 {
2714 aver[t]=aver[t]+ind_pop[i][j][t];
2715 }
2716 aver[t]=aver[t]/(double)size_p[i];
2717
2718 std[t]=0;
2719 for (j=0;j<size_p[i];j++)
2720 {
2721 std[t]=std[t]+((ind_pop[i][j][t]-aver[t])*(ind_pop[i][j][t]-aver[t]));
2722 }
2723 std[t]=sqrt(std[t]/(double)size_p[i]);
2724 }
2725
2726
2727
2728 for (j=0;j<3;j++)
2729 {
2730 max_d_dist=0;
2731 for (t=0;t<num_dm;t++)
2732 {
2733 if ((t!=d1) && (t!=d2))
2734 {
2735 dist=std[t];
2736 }
2737 else
2738 dist=-1;
2739
2740 if (dist>max_d_dist)
2741 {
2742 max_d_dist=dist;
2743 d_d=t;
2744 }
2745 }
2746
2747 if (j==0)
2748 d1=d_d;
2749 if (j==1)
2750 d2=d_d;
2751 if (j==2)
2752 d3=d_d;
2753 }
2754
2755
2756 for (t=0;t<size_p[i];t++)
2757 {
2758 ind_pop_3[i][t][0]=ind_pop[i][t][d1];
2759 ind_pop_3[i][t][1]=ind_pop[i][t][d2];
2760 ind_pop_3[i][t][2]=ind_pop[i][t][d3];
2761 }
2762
2763
2764 temp_population_center[0][0]=(aver[d1])/2.0;
2765
2766 temp_population_center[0][1]=aver[d2];
2767 temp_population_center[0][2]=aver[d3];
2768
2769 temp_population_center[1][0]=(aver[d1]+1.0)/2.0;
2770
2771 temp_population_center[1][1]=aver[d2];
2772 temp_population_center[1][2]=aver[d3];
2773
2774
2775
2776 //run K-means with K=2
2777 kmeans(ind_pop_3[i], 2, KMEANS_TERM, size_p[i], 3, temp_population_ID[i], temp_population_center);
2778
2779 for (j=0;j<2;j++)
2780 size_p_2[j]=0;
2781
2782 for (j=0;j<size_p[i];j++)
2783 size_p_2[temp_population_ID[i][j]]++;
2784
2785 for (j=0;j<2;j++)
2786 if (size_p_2[j]==0)
2787 printf("size_p_2=0 at i=%ld\n",i);
2788
2789
2790 //check whether the 2 parts should be merged
2791
2792
2793 Ei=get_avg_dist(temp_population_center[0], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
2794 Ej=get_avg_dist(temp_population_center[1], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
2795
2796 for (t=0;t<3;t++)
2797 {
2798 center_p_1[t]=temp_population_center[0][t]*0.25+temp_population_center[1][t]*0.75;
2799 center_p_2[t]=temp_population_center[0][t]*0.5+temp_population_center[1][t]*0.5;
2800 center_p_3[t]=temp_population_center[0][t]*0.75+temp_population_center[1][t]*0.25;
2801 }
2802
2803 E1=get_avg_dist(center_p_1, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
2804
2805 E2=get_avg_dist(center_p_2, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
2806
2807 E3=get_avg_dist(center_p_3, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
2808
2809 //printf("i=%d;E1=%f;E2=%f;E3=%f\n",i,E1,E2,E3);
2810
2811 if (E1<E2)
2812 Ep=E2;
2813 else
2814 Ep=E1;
2815
2816 Ep=max(Ep,E3); //Ep is the most sparse area
2817
2818
2819
2820 if ((Ep>Ei) && (Ep>Ej)) //the two parts should be partitioned
2821 {
2822 partit[i]=num_computed_population;
2823 num_computed_population++;
2824 }
2825
2826 } //end for (i=0;i<num_population;i++)
2827
2828
2829 for (i=0;i<num_population;i++)
2830 temp_size_j[i]=0;
2831
2832 for (i=0;i<file_Len;i++)
2833 {
2834 index_id=all_population_ID[i];
2835 if (partit[index_id]>0)
2836 {
2837 j=temp_size_j[index_id];
2838 if (temp_population_ID[index_id][j]==1)
2839 all_computed_pop_ID[i]=partit[index_id];
2840
2841 temp_size_j[index_id]++;
2842 }
2843 }
2844
2845 printf("Number of groups after partitioning is %ld\n", num_computed_population);
2846
2847
2848 free(size_p_2);
2849
2850 free(aver);
2851 free(std);
2852 free(partit);
2853
2854 free(center_p_1);
2855 free(center_p_2);
2856 free(center_p_3);
2857
2858
2859
2860 for (i=0;i<num_population;i++)
2861 free(temp_population_ID[i]);
2862 free(temp_population_ID);
2863
2864 for (i=0;i<num_population;i++)
2865 {
2866 for (j=0;j<size_p[i];j++)
2867 {
2868 free(ind_pop[i][j]);
2869 free(ind_pop_3[i][j]);
2870 }
2871 }
2872
2873
2874
2875 for (i=0;i<num_population;i++)
2876 {
2877 free(ind_pop[i]);
2878 free(ind_pop_3[i]);
2879 }
2880 free(ind_pop);
2881 free(ind_pop_3);
2882
2883
2884
2885
2886 for (i=0;i<2;i++)
2887 free(temp_population_center[i]);
2888
2889 free(temp_population_center);
2890
2891 free(temp_size_j);
2892 free(size_p);
2893
2894
2895
2896 //update the IDs, Centers, and # of populations
2897 for (i=0;i<file_Len;i++)
2898 {
2899 all_population_ID[i]=all_computed_pop_ID[i];
2900 if (all_population_ID[i]>=num_computed_population)
2901 printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]);
2902 }
2903
2904 num_population=num_computed_population;
2905
2906 free(all_computed_pop_ID);
2907 //end partitioning
2908 // since the num_population has changed, population_center needs to be redefined as below
2909
2910 population_center=(double **)malloc(sizeof(double*)*num_population);
2911 memset(population_center,0,sizeof(double*)*num_population);
2912 for (i=0;i<num_population;i++)
2913 {
2914 population_center[i]=(double*)malloc(sizeof(double)*num_dm);
2915 memset(population_center[i],0,sizeof(double)*num_dm);
2916 }
2917
2918
2919 ID2Center_all(normalized_data,file_Len,num_dm,num_population,all_population_ID,population_center);
2920
2921
2922
2923 ////// end of further partitioning
2924
2925 ////////////////////////////////////////////////////////////
2926 // to further merge populations to avoid overpartitioning
2927 // Added June 20, 2010
2928 ////////////////////////////////////////////////////////////
2929
2930
2931
2932 num_real_pop=num_population;
2933 keep_merge=1;
2934
2935 if (num_pop>num_real_pop)
2936 {
2937 fprintf(stderr,"number of populations specified too large\n");
2938 exit(0);
2939 }
2940
2941 center_1=(double *)malloc(sizeof(double)*num_dm);
2942 memset(center_1,0,sizeof(double)*num_dm);
2943
2944 center_2=(double *)malloc(sizeof(double)*num_dm);
2945 memset(center_2,0,sizeof(double)*num_dm);
2946
2947 center_3=(double *)malloc(sizeof(double)*num_dm);
2948 memset(center_3,0,sizeof(double)*num_dm);
2949
2950
2951
2952 while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2)))
2953 {
2954
2955 keep_merge=0; //so, if no entering merge function, while will exit when num_pop==0
2956
2957 real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop);
2958 memset(real_pop_center,0,sizeof(double*)*num_real_pop);
2959 for (i=0;i<num_real_pop;i++)
2960 {
2961 real_pop_center[i]=(double*)malloc(sizeof(double)*num_dm);
2962 memset(real_pop_center[i],0,sizeof(double)*num_dm);
2963 }
2964
2965 min_pop=(double **)malloc(sizeof(double*)*num_real_pop);
2966 memset(min_pop,0,sizeof(double*)*num_real_pop);
2967 for (i=0;i<num_real_pop;i++)
2968 {
2969 min_pop[i]=(double*)malloc(sizeof(double)*num_dm);
2970 memset(min_pop[i],0,sizeof(double)*num_dm);
2971 }
2972
2973 max_pop=(double **)malloc(sizeof(double*)*num_real_pop);
2974 memset(max_pop,0,sizeof(double*)*num_real_pop);
2975 for (i=0;i<num_real_pop;i++)
2976 {
2977 max_pop[i]=(double*)malloc(sizeof(double)*num_dm);
2978 memset(max_pop[i],0,sizeof(double)*num_dm);
2979 }
2980
2981 if (num_real_pop==num_population)
2982 {
2983 for (i=0;i<num_real_pop;i++)
2984 for (j=0;j<num_dm;j++)
2985 real_pop_center[i][j]=population_center[i][j];
2986 }
2987 else
2988 {
2989 ID2Center_all(normalized_data,file_Len,num_dm,num_real_pop,all_population_ID,real_pop_center);
2990 }
2991
2992 for (i=0;i<num_real_pop;i++)
2993 {
2994 for (j=0;j<num_dm;j++)
2995 {
2996 min_pop[i][j]=MAX_VALUE;
2997
2998 max_pop[i][j]=0;
2999 }
3000 }
3001
3002 for (i=0;i<file_Len;i++)
3003 {
3004 index_id=all_population_ID[i];
3005 for (j=0;j<num_dm;j++)
3006 {
3007 if (normalized_data[i][j]<min_pop[index_id][j])
3008 min_pop[index_id][j]=normalized_data[i][j];
3009
3010 if (normalized_data[i][j]>max_pop[index_id][j])
3011 max_pop[index_id][j]=normalized_data[i][j];
3012 }
3013 }
3014
3015 /* for (i=0;i<num_real_pop;i++)
3016 {
3017 for (j=0;j<num_dm;j++)
3018 {
3019 printf("min_pop is %f\t",min_pop[i][j]);
3020 //printf("max_pop is %f\n",max_pop[i][j]);
3021 }
3022 printf("\n");
3023 }
3024 */
3025
3026
3027 distance_pop=(double **)malloc(sizeof(double*)*num_real_pop);
3028 memset(distance_pop,0,sizeof(double*)*num_real_pop);
3029 for (i=0;i<num_real_pop;i++)
3030 {
3031 distance_pop[i]=(double*)malloc(sizeof(double)*num_real_pop);
3032 memset(distance_pop[i],0,sizeof(double)*num_real_pop);
3033 }
3034
3035 for (i=0;i<num_real_pop-1;i++)
3036 {
3037 for (j=i+1;j<num_real_pop;j++)
3038 {
3039 distance=0;
3040
3041 for (t=0;t<num_dm;t++)
3042 {
3043 dist=real_pop_center[i][t]-real_pop_center[j][t];
3044 distance=distance+dist*dist;
3045 }
3046 distance_pop[i][j]=distance;
3047 distance_pop[j][i]=distance;
3048 }
3049 }
3050
3051 if (num_real_pop>2)
3052 num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances
3053 else
3054 num_checked_range=1;
3055
3056 size_c=(long *)malloc(sizeof(long)*num_real_pop);
3057 memset(size_c,0,sizeof(long)*num_real_pop);
3058
3059 for (i=0;i<file_Len;i++)
3060 size_c[all_population_ID[i]]++;
3061
3062 for (p=0;p<num_checked_range;p++)
3063 {
3064 current_smallest_dist=MAX_VALUE;
3065
3066 for (i=0;i<num_real_pop-1;i++)
3067 {
3068 for (j=i+1;j<num_real_pop;j++)
3069 {
3070 if (distance_pop[i][j]<current_smallest_dist)
3071 {
3072 current_smallest_dist=distance_pop[i][j];
3073
3074 temp_i=i;
3075 temp_j=j;
3076 }
3077 }
3078 }
3079
3080 if (p==0)
3081 {
3082 first_i=temp_i;
3083 first_j=temp_j;
3084 }
3085
3086 distance_pop[temp_i][temp_j]=MAX_VALUE+1; //make sure this pair won't be selected next time
3087
3088 //normalize and calculate std for temp_i and temp_j
3089
3090 //pick the largest 3 dimensions on standard deviation
3091 d1=-1;
3092 d2=-1;
3093 d3=-1;
3094
3095 for (i=0;i<3;i++)
3096 {
3097 max_d_dist=0;
3098 for (j=0;j<num_dm;j++)
3099 {
3100 if ((j!=d1) && (j!=d2))
3101 {
3102 temp=min(min_pop[temp_i][j],min_pop[temp_j][j]);
3103 tmp=max(max_pop[temp_i][j],max_pop[temp_j][j]);
3104 if (tmp<=temp)
3105 {
3106 //printf("min_pop[%d][%d]=%f \t min_pop[%d][%d]=%f \t max_pop[%d][%d]=%f \t max_pop[%d][%d]=%f\n",temp_i,j,min_pop[temp_i][j],temp_j, j, min_pop[temp_j][j],temp_i,j,max_pop[temp_i][j],temp_j,j,max_pop[temp_j][j]);
3107 dist=0;
3108 }
3109 else
3110 dist=(real_pop_center[temp_i][j]-real_pop_center[temp_j][j])/(tmp-temp);
3111 if (dist<0)
3112 dist=-dist;
3113 }
3114 else
3115 dist=-1;
3116
3117 if (dist>max_d_dist)
3118 {
3119 max_d_dist=dist;
3120 d_d=j;
3121 }
3122 }
3123
3124 if (i==0)
3125 d1=d_d;
3126 if (i==1)
3127 d2=d_d;
3128 if (i==2)
3129 d3=d_d;
3130 }
3131
3132 //printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3);
3133
3134 Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
3135 Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
3136
3137 for (t=0;t<num_dm;t++)
3138 {
3139 center_1[t]=real_pop_center[temp_i][t]*0.25+real_pop_center[temp_j][t]*0.75;
3140 center_2[t]=real_pop_center[temp_i][t]*0.5+real_pop_center[temp_j][t]*0.5;
3141 center_3[t]=real_pop_center[temp_i][t]*0.75+real_pop_center[temp_j][t]*0.25;
3142 }
3143
3144 E1=get_avg_dist(center_1, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
3145
3146 E2=get_avg_dist(center_2, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
3147
3148 E3=get_avg_dist(center_3, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
3149
3150 if (E1<E2)
3151 Ep=E2;
3152 else
3153 Ep=E1;
3154
3155 Ep=max(Ep,E3); //Ep is the most sparse area
3156
3157 Ep=Ep*E_T;
3158 //printf("Ep=%f;Ei=%f;Ej=%f\n",Ep,Ei,Ej);
3159
3160 if ((Ep<=Ei) || (Ep<=Ej))//if the most sparse area between i and j are still denser than one of them, temp_i and temp_j should be merged
3161 {
3162 keep_merge=1;
3163 break;
3164 }//end if (Ep)
3165 } //end for p
3166
3167 //printf("keep_merge=%d\n",keep_merge);
3168
3169 for (i=0;i<num_real_pop;i++)
3170 {
3171 free(real_pop_center[i]);
3172 free(distance_pop[i]);
3173 free(min_pop[i]);
3174 free(max_pop[i]);
3175 }
3176 free(real_pop_center);
3177 free(min_pop);
3178 free(max_pop);
3179 free(distance_pop);
3180 free(size_c);
3181
3182 //printf("temp_i=%d;temp_j=%d\n",temp_i,temp_j);
3183
3184 if (keep_merge) //found one within p loop
3185 {
3186 for (i=0;i<file_Len;i++)
3187 {
3188 if (all_population_ID[i]>temp_j)
3189 {
3190 all_population_ID[i]=all_population_ID[i]-1;
3191 }
3192 else
3193 {
3194 if (all_population_ID[i]==temp_j)
3195 {
3196 all_population_ID[i]=temp_i;
3197 }
3198 }
3199 }
3200
3201 num_real_pop--;
3202 }
3203 else
3204 {
3205 if ((num_pop!=0) && (num_pop<num_real_pop)) //not reach the specified num of pop
3206 {
3207 for (i=0;i<file_Len;i++)
3208 {
3209 if (all_population_ID[i]>first_j)
3210 {
3211 all_population_ID[i]=all_population_ID[i]-1;
3212 }
3213 else
3214 {
3215 if (all_population_ID[i]==first_j)
3216 {
3217 all_population_ID[i]=first_i;
3218 }
3219 }
3220 }
3221
3222 num_real_pop--;
3223
3224
3225
3226 }
3227 }
3228 //printf("num_real_pop is %d\n",num_real_pop);
3229
3230 } //end of while
3231
3232
3233
3234 free(center_1);
3235 free(center_2);
3236 free(center_3);
3237 printf("Final number of populations is %ld\n",num_real_pop);
3238
3239 new_population_center=(double **)malloc(sizeof(double*)*num_real_pop);
3240 memset(new_population_center,0,sizeof(double*)*num_real_pop);
3241 for (i=0;i<num_real_pop;i++)
3242 {
3243 new_population_center[i]=(double*)malloc(sizeof(double)*num_dm);
3244 memset(new_population_center[i],0,sizeof(double)*num_dm);
3245 }
3246
3247
3248 ///////////////////////////////////////////
3249 //End of population mapping
3250 ///////////////////////////////////////////
3251
3252 show(input_data, all_population_ID, file_Len, num_real_pop, num_dm, para_name_string);
3253
3254 ID2Center_all(input_data,file_Len,num_dm,num_real_pop,all_population_ID,new_population_center);
3255
3256
3257 f_cid=fopen("population_id.txt","w");
3258 f_ctr=fopen("population_center.txt","w");
3259 f_out=fopen("coordinates.txt","w");
3260 f_results=fopen("flock_results.txt","w");
3261
3262 /*
3263 f_parameters=fopen("parameters.txt","w");
3264 fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
3265 fprintf(f_parameters,"Density\t%f\n",aver_index);
3266 fclose(f_parameters);
3267 */
3268
3269 for (i=0;i<file_Len;i++)
3270 fprintf(f_cid,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
3271
3272 /*
3273 * New to check for min/max to add to parameters.txt
3274 *
3275 */
3276
3277 fprintf(f_out,"%s\n",para_name_string);
3278 //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string);
3279 fprintf(f_results,"%s\tPopulation\n",para_name_string);
3280 for (i=0;i<file_Len;i++)
3281 {
3282 for (j=0;j<num_dm;j++)
3283 {
3284 if (input_data[i][j] < min) {
3285 min = (int)input_data[i][j];
3286 }
3287 if (input_data[i][j] > max) {
3288 max = (int)input_data[i][j];
3289 }
3290 if (j==num_dm-1)
3291 {
3292 fprintf(f_out,"%d\n",(int)input_data[i][j]);
3293 fprintf(f_results,"%d\t",(int)input_data[i][j]);
3294 }
3295 else
3296 {
3297 fprintf(f_out,"%d\t",(int)input_data[i][j]);
3298 fprintf(f_results,"%d\t",(int)input_data[i][j]);
3299 }
3300 }
3301 //fprintf(f_results,"%ld\t",i + 1);
3302 fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
3303 }
3304
3305 /*
3306 f_parameters=fopen("parameters.txt","w");
3307 fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin);
3308 fprintf(f_parameters,"Density\t%d\n",den_t_event);
3309 fprintf(f_parameters,"Min\t%d\n",min);
3310 fprintf(f_parameters,"Max\t%d\n",max);
3311 fclose(f_parameters);
3312 */
3313
3314 f_properties=fopen("fcs.properties","w");
3315 fprintf(f_properties,"Bins=%ld\n",num_bin);
3316 fprintf(f_properties,"Density=%d\n",den_t_event);
3317 fprintf(f_properties,"Min=%d\n",min);
3318 fprintf(f_properties,"Max=%d\n",max);
3319 fprintf(f_properties,"Populations=%ld\n",num_real_pop);
3320 fprintf(f_properties,"Events=%ld\n",file_Len);
3321 fprintf(f_properties,"Markers=%ld\n",num_dm);
3322 fclose(f_properties);
3323
3324 for (i=0;i<num_real_pop;i++) {
3325 /* Add if we want to include population id in the output
3326 */
3327 fprintf(f_ctr,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
3328
3329 for (j=0;j<num_dm;j++) {
3330 if (j==num_dm-1)
3331 fprintf(f_ctr,"%.0f\n",new_population_center[i][j]);
3332 else
3333 fprintf(f_ctr,"%.0f\t",new_population_center[i][j]);
3334 }
3335 }
3336
3337 //added April 16, 2009
3338 f_mfi=fopen("MFI.txt","w");
3339
3340 for (i=0;i<num_real_pop;i++)
3341 {
3342 fprintf(f_mfi,"%ld\t",i+1);
3343
3344 for (j=0;j<num_dm;j++)
3345 {
3346 if (j==num_dm-1)
3347 fprintf(f_mfi,"%.0f\n",new_population_center[i][j]);
3348 else
3349 fprintf(f_mfi,"%.0f\t",new_population_center[i][j]);
3350 }
3351 }
3352 fclose(f_mfi);
3353
3354 //ended April 16, 2009
3355
3356 fclose(f_cid);
3357 fclose(f_ctr);
3358 fclose(f_out);
3359 fclose(f_results);
3360
3361
3362 for (i=0;i<num_population;i++)
3363 free(population_center[i]);
3364
3365 free(population_center);
3366
3367 for (i=0;i<num_real_pop;i++)
3368 free(new_population_center[i]);
3369
3370 free(new_population_center);
3371
3372 for (i=0;i<file_Len;i++)
3373 free(normalized_data[i]);
3374 free(normalized_data);
3375
3376 free(grid_populationID);
3377
3378 free(cluster_populationID);
3379 free(grid_clusterID);
3380 free(cluster_ID);
3381
3382 for (i=0;i<file_Len;i++)
3383 free(input_data[i]);
3384 free(input_data);
3385
3386 free(grid_ID);
3387 free(population_ID);
3388 free(all_population_ID);
3389 free(eventID_To_denseventID);
3390
3391 ///////////////////////////////////////////////////////////
3392 printf("Ending time:\t\t\t\t");
3393 fflush(stdout);
3394 system("/bin/date");
3395
3396 /*
3397 * Windows version
3398 _strtime( tmpbuf );
3399 printf( "Ending time:\t\t\t\t%s\n", tmpbuf );
3400 _strdate( tmpbuf );
3401 printf( "Ending date:\t\t\t\t%s\n", tmpbuf );
3402 */
3403
3404 return 0;
3405 }