Mercurial > repos > immport-devteam > cross_sample
comparison src/flock2.c @ 1:7eab80f86779 draft
add FLOCK
author | immport-devteam |
---|---|
date | Mon, 27 Feb 2017 13:26:09 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:8d951baf795f | 1:7eab80f86779 |
---|---|
1 /////////// | |
2 // Changes made: | |
3 // 1. Added another parameter: number of populations | |
4 // 2. Added a hierarchical merging step based on density change between centroids of two hyper-regions | |
5 // 3. Picked the longest dimensions between the population centroids to judge whether the two parts should be merged | |
6 // 4. Removed checking time parameter | |
7 // 5. Output error to stderr | |
8 // 6. Fixed the bug of density threshold always = 3 | |
9 // 7. Added another error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR | |
10 // 8. Fixed a bug for 2D data by using K=e*K | |
11 // 9. Added some header files, may not be necessary | |
12 // 10. Added a lower bound (at least two) for number of populations | |
13 /*************************************************************************************************************************************** | |
14 | |
15 FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) | |
16 | |
17 Author: (Max) Yu Qian, Ph.D. | |
18 | |
19 Copyright: Scheuermann Lab, Dept. of Pathology, UTSW | |
20 | |
21 Development: November 2005 ~ Forever | |
22 | |
23 Status: July 2010: Release 2.0 | |
24 | |
25 Usage: flock data_file | |
26 Note: the input file format must be channel values and the delimiter between two values must be a tab. | |
27 | |
28 | |
29 | |
30 ****************************************************************************************************************************************/ | |
31 | |
32 #include <time.h> | |
33 #include <stdio.h> | |
34 #include <stdlib.h> | |
35 #include <math.h> | |
36 #include <string.h> | |
37 #include <sys/stat.h> | |
38 #include <unistd.h> | |
39 #include <assert.h> | |
40 | |
41 | |
42 | |
43 #define DEBUG 0 | |
44 #define LINE_LEN 1024 | |
45 #define FILE_NAME_LEN 128 | |
46 #define PARA_NAME_LEN 64 | |
47 #define MAX_VALUE 1000000000 | |
48 #define MIN_GRID 6 | |
49 #define MAX_GRID 50 | |
50 #define E_T 1.0 | |
51 | |
52 #define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max | |
53 #define KMEANS_TERM 10 | |
54 #define MAX_POP_NUM 128 | |
55 | |
56 #ifndef max | |
57 #define max( a, b ) ( ((a) > (b)) ? (a) : (b) ) | |
58 #endif | |
59 | |
60 #ifndef min | |
61 #define min( a, b ) ( ((a) < (b)) ? (a) : (b) ) | |
62 #endif | |
63 | |
64 static long **Gr=0; | |
65 static long *gcID = 0; /* grid cluster IDs */ | |
66 static long *cluster_count=0; /* count of nodes per cluster */ | |
67 static long ndense=0; | |
68 static long ndim=0; | |
69 /* cid changes between depth-first searches, but is constant within a | |
70 single search, so it goes here. */ | |
71 static long cid=0; | |
72 /* Do a depth-first search for a single connected component in graph | |
73 * G. Start from node, tag the nodes found with cid, and record | |
74 * the tags in grid_clusterID. Also, record the node count in | |
75 * cluster_count. If we find a node that has already been assigned to | |
76 * a cluster, that means we're merging two clusters, so zero out the | |
77 * old cid's node count. | |
78 * | |
79 * Note that our graph is constructed as a DAG, so we can be | |
80 * guaranteed to terminate without checking for cycles. | |
81 * | |
82 * Note2: this function can potentially recurse to depth = ndense. | |
83 * Plan stack size accordingly. | |
84 * | |
85 * Output: | |
86 * | |
87 * grid_clusterID[] -- array where we tag the nodes. | |
88 * cluster_count[] -- count of the number of nodes per cluster. | |
89 */ | |
90 static void merge_cluster(long from, long into) | |
91 { | |
92 int i; | |
93 | |
94 for(i=0; i<ndense; ++i) | |
95 if(gcID[i] == from) | |
96 gcID[i] = into; | |
97 } | |
98 | |
99 void depth_first(long node) | |
100 { | |
101 long i; | |
102 | |
103 if(gcID[node] == cid) // we're guaranteed no cycles, but it is possible to reach a node | |
104 return; // through two different paths in the same cluster. This early | |
105 // return saves us some unneeded work. | |
106 | |
107 /* Check to see if we are merging a cluster */ | |
108 if(gcID[node] >= 0) { | |
109 /* We are, so zero the count for the old cluster. */ | |
110 cluster_count[ gcID[node] ] = 0; | |
111 merge_cluster(gcID[node], cid); | |
112 return; | |
113 } | |
114 | |
115 /* Update for this node */ | |
116 gcID[node] = cid; | |
117 cluster_count[cid]++; | |
118 | |
119 /* Recursively search the child nodes */ | |
120 for(i=0; i<ndim; ++i) | |
121 if(Gr[node][i] >= 0) /* This is a child node */ | |
122 depth_first(Gr[node][i]); | |
123 } | |
124 | |
125 void bail(const char *msg) | |
126 { | |
127 fprintf(stderr,"%s",msg); | |
128 exit(0); | |
129 } | |
130 | |
131 | |
132 static void check_clusters(long *gcID, int ndense) | |
133 { | |
134 int i; | |
135 | |
136 for(i=0; i<ndense; ++i) | |
137 if(gcID[i] < 0) { | |
138 fprintf(stderr,"faulty cluster id at i= %d\n",i); | |
139 exit(0); | |
140 } | |
141 } | |
142 | |
143 | |
144 | |
145 long find_connected(long **G, long n_dense_grids, long num_dm, long *grid_clusterID) | |
146 { | |
147 long nclust=0; /* number of clusters found */ | |
148 long i; | |
149 long *subfac; | |
150 long clustid=0; | |
151 int subval=0,nempty=0; | |
152 | |
153 int sz = n_dense_grids*sizeof(long); | |
154 cluster_count = malloc(sz); | |
155 if(!cluster_count) | |
156 bail("find_connected: Unable to allocate %zd bytes.\n"); | |
157 memset(cluster_count,0,sz); | |
158 | |
159 /* set up the statics that will be used in the DFS */ | |
160 Gr=G; | |
161 gcID = grid_clusterID; | |
162 ndense = n_dense_grids; | |
163 ndim = num_dm; | |
164 | |
165 for(i=0;i<ndense;++i) | |
166 grid_clusterID[i] = -1; | |
167 | |
168 for(i=0;i<ndense;++i) { | |
169 if(grid_clusterID[i] < 0) { /* grid hasn't been assigned yet */ | |
170 cid = nclust++; | |
171 depth_first(i); | |
172 } | |
173 } | |
174 | |
175 | |
176 | |
177 /* At this point we probably have some clusters that are empty due to merging. | |
178 We want to compact the cluster numbering to eliminate the empty clusters. */ | |
179 | |
180 subfac = malloc(sz); | |
181 if(!subfac) | |
182 bail("find_connected: Unable to allocate %zd bytes.\n"); | |
183 subval=0; | |
184 nempty=0; | |
185 | |
186 /* cluster #i needs to have its ID decremented by 1 for each empty cluster with | |
187 ID < i. Precaclulate the decrements in this loop: */ | |
188 for(i=0;i<nclust;++i) { | |
189 //clustid = grid_clusterID[i]; | |
190 if(cluster_count[i] == 0) { | |
191 subval++; | |
192 nempty++; | |
193 } | |
194 subfac[i] = subval; | |
195 } | |
196 | |
197 //printf("nempty is %d\n",nempty); | |
198 | |
199 /* Now apply the decrements to all of the dense grids */ | |
200 for(i=0;i<ndense;++i) { | |
201 clustid = grid_clusterID[i]; | |
202 grid_clusterID[i] -= subfac[clustid]; | |
203 } | |
204 | |
205 | |
206 | |
207 /* correct the number of clusters found */ | |
208 nclust -= nempty; | |
209 | |
210 //printf("nclust is %d\n",nclust); | |
211 | |
212 return nclust; | |
213 } | |
214 | |
215 /************************************* Read basic info of the source file **************************************/ | |
216 /************************************* Read basic info of the source file **************************************/ | |
217 void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, long *time_ID) | |
218 { | |
219 char src[LINE_LEN]; | |
220 char current_name[64]; | |
221 char prv; | |
222 | |
223 long num_rows=0; | |
224 long num_columns=0; | |
225 long ch='\n'; | |
226 long prev='\n'; | |
227 long time_pos=0; | |
228 long i=0; | |
229 long j=0; | |
230 int sw=0; | |
231 | |
232 src[0]='\0'; | |
233 fgets(src, LINE_LEN, f_src); | |
234 | |
235 name_string[0]='\0'; | |
236 current_name[0]='\0'; | |
237 prv='\n'; | |
238 | |
239 while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters | |
240 i++; | |
241 | |
242 while ((src[i]!='\0') && (src[i]!='\n')) //repeat until the end of the line | |
243 { | |
244 current_name[j]=src[i]; | |
245 | |
246 if ((src[i]=='\t') && (prv!='\t')) //a complete word | |
247 { | |
248 current_name[j]='\0'; | |
249 | |
250 if (0!=strcmp(current_name,"Time")) | |
251 { | |
252 num_columns++; //num_columns does not inlcude the column of Time | |
253 time_pos++; | |
254 if (sw) { | |
255 strcat(name_string,"\t"); | |
256 } | |
257 strcat(name_string,current_name); | |
258 sw = 1; | |
259 } | |
260 else | |
261 { | |
262 *time_ID=time_pos; | |
263 } | |
264 | |
265 current_name[0]='\0'; | |
266 j=0; | |
267 } | |
268 | |
269 if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space | |
270 { | |
271 current_name[0]='\0'; | |
272 j=0; | |
273 } | |
274 | |
275 if (src[i]!='\t') | |
276 j++; | |
277 | |
278 prv=src[i]; | |
279 i++; | |
280 } | |
281 | |
282 //name_string[j]='\0'; | |
283 if (prv!='\t') //the last one hasn't been retrieved | |
284 { | |
285 current_name[j]='\0'; | |
286 if (0!=strcmp(current_name,"Time")) | |
287 { | |
288 num_columns++; | |
289 strcat(name_string,"\t"); | |
290 strcat(name_string,current_name); | |
291 time_pos++; | |
292 } | |
293 else | |
294 { | |
295 *time_ID=time_pos; | |
296 } | |
297 } | |
298 if (DEBUG==1) | |
299 { | |
300 printf("time_ID is %ld\n",*time_ID); | |
301 printf("name_string is %s\n",name_string); | |
302 } | |
303 | |
304 //start computing # of rows | |
305 | |
306 while ((ch = fgetc(f_src))!= EOF ) | |
307 { | |
308 if (ch == '\n') | |
309 { | |
310 ++num_rows; | |
311 } | |
312 prev = ch; | |
313 } | |
314 if (prev!='\n') | |
315 ++num_rows; | |
316 | |
317 if (num_rows<50) | |
318 { | |
319 fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010 | |
320 exit(0); | |
321 } | |
322 | |
323 *file_Len=num_rows; | |
324 *num_dm=num_columns; | |
325 | |
326 printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm); | |
327 } | |
328 | |
329 | |
330 | |
331 /************************************* Read the source file into uncomp_data **************************************/ | |
332 void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, long time_ID) | |
333 { | |
334 //long time_pass=0; //to mark whether the time_ID has been passed | |
335 long index=0; | |
336 | |
337 long i=0; | |
338 long j=0; | |
339 long t=0; | |
340 | |
341 char src[LINE_LEN]; | |
342 char xc[LINE_LEN/10]; | |
343 | |
344 src[0]='\0'; | |
345 fgets(src,LINE_LEN, f_src); //skip the first line about parameter names | |
346 | |
347 while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1 | |
348 { | |
349 src[0]='\0'; | |
350 fgets(src,LINE_LEN,f_src); | |
351 i=0; | |
352 //time_pass=0; | |
353 | |
354 //if (time_ID==-1) //there is no time_ID | |
355 // { | |
356 for (t=0;t<num_dm;t++) | |
357 { | |
358 xc[0]='\0'; | |
359 j=0; | |
360 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) | |
361 { | |
362 xc[j]=src[i]; | |
363 i++; | |
364 j++; | |
365 } | |
366 | |
367 xc[j]='\0'; | |
368 i++; | |
369 | |
370 uncomp_data[index][t]=atof(xc); | |
371 } | |
372 // } | |
373 /*else | |
374 { | |
375 for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns | |
376 { | |
377 xc[0]='\0'; | |
378 j=0; | |
379 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) | |
380 { | |
381 xc[j]=src[i]; | |
382 i++; | |
383 j++; | |
384 } | |
385 | |
386 xc[j]='\0'; | |
387 i++; | |
388 | |
389 if (t==time_ID) | |
390 { | |
391 time_pass=1; | |
392 continue; | |
393 } | |
394 | |
395 if (time_pass) | |
396 uncomp_data[index][t-1]=atof(xc); | |
397 else | |
398 uncomp_data[index][t]=atof(xc); | |
399 } | |
400 }*/ //commented by Yu Qian on Aug 31, 2010 as we do not want to check time_ID anymore | |
401 index++; | |
402 //fprintf(fout_ID,"%s",src); | |
403 } //end of while | |
404 | |
405 if (DEBUG == 1) | |
406 { | |
407 printf("the last line of the source data is:\n"); | |
408 for (j=0;j<num_dm;j++) | |
409 printf("%f ",uncomp_data[index-1][j]); | |
410 printf("\n"); | |
411 } | |
412 } | |
413 | |
414 | |
415 /**************************************** Normalization ******************************************/ | |
416 void tran(double **orig_data, long file_Len, long num_dm, long norm_used, double **matrix_to_cluster) | |
417 { | |
418 long i=0; | |
419 long j=0; | |
420 | |
421 double biggest=0; | |
422 double smallest=MAX_VALUE; | |
423 | |
424 double *aver; //average of each column | |
425 double *std; //standard deviation of each column | |
426 | |
427 aver=(double*)malloc(sizeof(double)*file_Len); | |
428 memset(aver,0,sizeof(double)*file_Len); | |
429 | |
430 std=(double*)malloc(sizeof(double)*file_Len); | |
431 memset(std,0,sizeof(double)*file_Len); | |
432 | |
433 if (norm_used==2) //z-score normalization | |
434 { | |
435 for (j=0;j<num_dm;j++) | |
436 { | |
437 aver[j]=0; | |
438 for (i=0;i<file_Len;i++) | |
439 aver[j]=aver[j]+orig_data[i][j]; | |
440 aver[j]=aver[j]/(double)file_Len; | |
441 | |
442 std[j]=0; | |
443 for (i=0;i<file_Len;i++) | |
444 std[j]=std[j]+((orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j])); | |
445 std[j]=sqrt(std[j]/(double)file_Len); | |
446 | |
447 for (i=0;i<file_Len;i++) | |
448 matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization | |
449 } | |
450 } | |
451 | |
452 if (norm_used==1) //0-1 min-max normalization | |
453 { | |
454 for (j=0;j<num_dm;j++) | |
455 { | |
456 biggest=0; | |
457 smallest=MAX_VALUE; | |
458 for (i=0;i<file_Len;i++) | |
459 { | |
460 if (orig_data[i][j]>biggest) | |
461 biggest=orig_data[i][j]; | |
462 if (orig_data[i][j]<smallest) | |
463 smallest=orig_data[i][j]; | |
464 } | |
465 | |
466 for (i=0;i<file_Len;i++) | |
467 { | |
468 if (biggest==smallest) | |
469 matrix_to_cluster[i][j]=biggest; | |
470 else | |
471 matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest); | |
472 } | |
473 } | |
474 } | |
475 | |
476 if (norm_used==0) //no normalization | |
477 { | |
478 for (i=0;i<file_Len;i++) | |
479 for (j=0;j<num_dm;j++) | |
480 matrix_to_cluster[i][j]=orig_data[i][j]; | |
481 } | |
482 | |
483 } | |
484 | |
485 | |
486 | |
487 /********************************************** RadixSort *******************************************/ | |
488 /* Perform a radix sort using each dimension from the original data as a radix. | |
489 * Outputs: | |
490 * sorted_seq -- a permutation vector mapping the ordered list onto the original data. | |
491 * (sorted_seq[i] -> index in the original data of the ith element of the ordered list) | |
492 * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct | |
493 * of the sorting procedure. | |
494 * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed | |
495 * by grid_ID) | |
496 */ | |
497 | |
498 void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID) | |
499 { | |
500 long i=0; | |
501 long length=0; | |
502 long start=0; | |
503 long prev_ID=0; | |
504 long curr_ID=0; | |
505 | |
506 long j=0; | |
507 long t=0; | |
508 long p=0; | |
509 long loc=0; | |
510 long temp=0; | |
511 long equal=0; | |
512 | |
513 long *count; //count[i]=j means there are j numbers having value i at the processing digit | |
514 long *index; //index[i]=j means the starting position of grid i is j | |
515 long *cp; //current position | |
516 long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions) | |
517 long *seq; //temporary sequence | |
518 | |
519 count=(long*)malloc(sizeof(long)*num_bin); | |
520 memset(count,0,sizeof(long)*num_bin); | |
521 | |
522 cp=(long*)malloc(sizeof(long)*num_bin); | |
523 memset(cp,0,sizeof(long)*num_bin); | |
524 | |
525 index=(long*)malloc(sizeof(long)*num_bin); // initialized below | |
526 | |
527 seq=(long*)malloc(sizeof(long)*file_Len); | |
528 memset(seq,0,sizeof(long)*file_Len); | |
529 | |
530 mark=(long*)malloc(sizeof(long)*file_Len); | |
531 memset(mark,0,sizeof(long)*file_Len); | |
532 | |
533 for (i=0;i<file_Len;i++) | |
534 { | |
535 sorted_seq[i]=i; | |
536 mark[i]=0; | |
537 seq[i]=0; | |
538 } | |
539 for (i=0;i<num_bin;i++) | |
540 { | |
541 index[i]=0; | |
542 cp[i]=0; | |
543 count[i]=0; | |
544 } | |
545 | |
546 for (j=0;j<num_dm;j++) | |
547 { | |
548 if (j==0) //compute the initial values of mark | |
549 { | |
550 for (i=0;i<file_Len;i++) | |
551 count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension | |
552 | |
553 index[0] = 0; | |
554 for (i=0;i<num_bin-1;i++) | |
555 { | |
556 index[i+1]=index[i]+count[i]; //index[k]=x means k segment starts at x (in the ordered list) | |
557 if ((index[i+1]>0) && (index[i+1]<=file_Len)) | |
558 { | |
559 mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list | |
560 } | |
561 else | |
562 { | |
563 printf("out of myboundary for mark at index[i+1]-1.\n"); | |
564 } | |
565 } | |
566 mark[file_Len-1]=1; | |
567 | |
568 for (i=0;i<file_Len;i++) | |
569 { | |
570 /* Build a permutation vector for the partially ordered data. Store the PV in sorted_seq */ | |
571 loc=position[i][j]; | |
572 temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j | |
573 sorted_seq[temp]=i; //sorted_seq[i]=temp is also another way to sort | |
574 cp[loc]++; | |
575 } | |
576 } | |
577 else | |
578 { | |
579 //reset count, index, loc, temp, cp, start, and length | |
580 length=0; | |
581 loc=0; | |
582 temp=0; | |
583 start=0; | |
584 for (p=0;p<num_bin;p++) | |
585 { | |
586 cp[p]=0; | |
587 count[p]=0; | |
588 index[p]=0; | |
589 } | |
590 | |
591 for (i=0;i<file_Len;i++) | |
592 { | |
593 long iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order. | |
594 if (mark[i]!=1) | |
595 { | |
596 /* Count the number of items in each bin of | |
597 dimension j, BUT we are going to reset at the end | |
598 of each "part". Thus, the total effect is to do | |
599 a sort by bin on the jth dimension for each group | |
600 of data that has been identical for the | |
601 dimensions processed up to this point. This is | |
602 the standard radix sort procedure, but doing it | |
603 this way saves us having to allocate buckets to | |
604 hold the data in each group of "identical-so-far" | |
605 elements. */ | |
606 count[position[iperm][j]]++; //count[position[i][j]]++; | |
607 length++; // This is the total length of the part, irrespective of the value of the jth component | |
608 // (actually, less one, since we don't increment for the final element below) | |
609 } | |
610 if (mark[i]==1) | |
611 { | |
612 //length++; | |
613 count[position[iperm][j]]++;//count[position[i][j]]++; //the current point must be counted in | |
614 start=i-length; //this part starts from start to i: [start,i] | |
615 /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part. | |
616 This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively, | |
617 plus we could multi-thread over the parts. (Hmmm...) | |
618 */ | |
619 index[0] = start; // Let index give the offset within the whole ordered list. | |
620 for (t=0;t<num_bin-1;t++) | |
621 { | |
622 index[t+1]=index[t]+count[t]; | |
623 | |
624 if ((index[t+1]<=file_Len) && (index[t+1]>0)) | |
625 { | |
626 mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix. | |
627 } | |
628 | |
629 } | |
630 mark[i]=1; | |
631 | |
632 /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i | |
633 the PV will be completely updated for the partial ordering up to the current radix. */ | |
634 for (t=start;t<=i;t++) | |
635 { | |
636 loc=position[sorted_seq[t]][j];//loc=position[t][j]; | |
637 temp=index[loc]+cp[loc]; | |
638 if ((temp<file_Len) && (temp>=0)) | |
639 { | |
640 // seq is a temporary because we have to maintain the old PV until we have finished this step. | |
641 seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort | |
642 cp[loc]++; | |
643 } | |
644 else | |
645 { | |
646 printf("out of myboundary for seq at temp.\n"); | |
647 } | |
648 } | |
649 | |
650 for (t=start;t<=i;t++) | |
651 { | |
652 // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through | |
653 // entry i in the ordered list. | |
654 if ((t>=0) && (t<file_Len)) | |
655 sorted_seq[t]=seq[t]; | |
656 else | |
657 printf("out of myboundary for seq and sorted_seq at t.\n"); | |
658 } | |
659 //reset count, index, seq, length, and cp | |
660 length=0; | |
661 loc=0; | |
662 temp=0; | |
663 for (p=0;p<num_bin;p++) | |
664 { | |
665 cp[p]=0; | |
666 count[p]=0; | |
667 index[p]=0; | |
668 } | |
669 } | |
670 }//end for i | |
671 }//end else | |
672 }//end for j | |
673 | |
674 /* sorted_seq[] now contains the ordered list for all radices. mark[] gives the boundaries between groups of elements that are | |
675 identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */ | |
676 | |
677 for (i=0;i<file_Len;i++) | |
678 grid_ID[i]=0; //in case the initial value hasn't been assigned | |
679 *num_nonempty=1; //starting from 1! | |
680 | |
681 /* assign the "grid" identifiers for all of the data. A grid will be what we were calling a "part" above. We will number them | |
682 serially and tag the *unordered* data with the grid IDs. We will also count the number of populated grids (in general there will | |
683 be many possible combinations of bin values that simply never occur) */ | |
684 | |
685 for (i=1;i<file_Len;i++) | |
686 { | |
687 equal=1; | |
688 prev_ID=sorted_seq[i-1]; | |
689 curr_ID=sorted_seq[i]; | |
690 for (j=0;j<num_dm;j++) | |
691 { | |
692 if (position[prev_ID][j]!=position[curr_ID][j]) | |
693 { | |
694 equal=0; //not equal | |
695 break; | |
696 } | |
697 } | |
698 | |
699 if (equal) | |
700 { | |
701 grid_ID[curr_ID]=grid_ID[prev_ID]; | |
702 } | |
703 else | |
704 { | |
705 *num_nonempty=*num_nonempty+1; | |
706 grid_ID[curr_ID]=grid_ID[prev_ID]+1; | |
707 } | |
708 //all_grid_vol[grid_ID[curr_ID]]++; | |
709 } | |
710 | |
711 //free memory | |
712 free(count); | |
713 free(index); | |
714 free(cp); | |
715 free(seq); | |
716 free(mark); | |
717 | |
718 } | |
719 | |
720 /********************************************** Compute Position of Events ************************************************/ | |
721 void compute_position(double **data_in, long file_Len, long num_dm, long num_bin, long **position, double *interval) | |
722 { | |
723 /* What we are really doing here is binning the data, with the bins | |
724 spanning the range of the data and number of bins = num_bin */ | |
725 long i=0; | |
726 long j=0; | |
727 | |
728 double *small; //small[j] is the smallest value within dimension j | |
729 double *big; //big[j] is the biggest value within dimension j | |
730 | |
731 small=(double*)malloc(sizeof(double)*num_dm); | |
732 memset(small,0,sizeof(double)*num_dm); | |
733 | |
734 big=(double*)malloc(sizeof(double)*num_dm); | |
735 memset(big,0,sizeof(double)*num_dm); | |
736 | |
737 | |
738 for (j=0;j<num_dm;j++) | |
739 { | |
740 big[j]=MAX_VALUE*(-1); | |
741 small[j]=MAX_VALUE; | |
742 for (i=0;i<file_Len;i++) | |
743 { | |
744 if (data_in[i][j]>big[j]) | |
745 big[j]=data_in[i][j]; | |
746 | |
747 if (data_in[i][j]<small[j]) | |
748 small[j]=data_in[i][j]; | |
749 } | |
750 | |
751 interval[j]=(big[j]-small[j])/(double)num_bin; //interval is computed using the biggest value and smallest value instead of the channel limit | |
752 /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */ | |
753 } | |
754 | |
755 for (j=0;j<num_dm;j++) | |
756 { | |
757 for (i=0;i<file_Len;i++) | |
758 { | |
759 if (data_in[i][j]>=big[j]) | |
760 position[i][j]=num_bin-1; | |
761 else | |
762 { | |
763 position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j | |
764 if ((position[i][j]>=num_bin) || (position[i][j]<0)) | |
765 { | |
766 //printf("position mis-computed in density analysis!\n"); | |
767 //exit(0); | |
768 fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010 | |
769 exit(0); | |
770 } | |
771 } | |
772 } | |
773 } | |
774 | |
775 | |
776 free(small); | |
777 free(big); | |
778 } | |
779 | |
780 /********************************************** select_bin to select the number of bins **********************************/ | |
781 //num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty); | |
782 /* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned | |
783 * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we | |
784 * maximize a merit function, then return the results obtained using the optimal number of bins. | |
785 * | |
786 * Outputs: | |
787 * position -- binned data values | |
788 * sorted_seq -- permutation vector mapping the ordered list to the original data | |
789 * all_grid_ID -- grid to which each data element was assigned. | |
790 * num_nonempty -- number of distinct values assumed by all_grid_ID | |
791 * interval -- bin width for each data dimension | |
792 * return value -- the number of bins selected. | |
793 */ | |
794 | |
795 long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq, | |
796 long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin) | |
797 { | |
798 | |
799 long num_bin=0; | |
800 long select_num_bin=0; | |
801 long m=0; | |
802 long n=0; | |
803 | |
804 long i=0; | |
805 long bin_scope=0; | |
806 long temp_num_nonempty=0; | |
807 | |
808 long *temp_grid_ID; | |
809 long *temp_sorted_seq; | |
810 long **temp_position; | |
811 | |
812 //sorted_seq[i]=j means the event j ranks i | |
813 | |
814 double temp_index=0; | |
815 double *bin_index; | |
816 double *temp_interval; | |
817 | |
818 | |
819 temp_grid_ID=(long *)malloc(sizeof(long)*file_Len); | |
820 memset(temp_grid_ID,0,sizeof(long)*file_Len); | |
821 | |
822 temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len); | |
823 memset(temp_sorted_seq,0,sizeof(long)*file_Len); | |
824 | |
825 temp_position=(long **)malloc(sizeof(long*)*file_Len); | |
826 memset(temp_position,0,sizeof(long*)*file_Len); | |
827 for (m=0;m<file_Len;m++) | |
828 { | |
829 temp_position[m]=(long*)malloc(sizeof(long)*num_dm); | |
830 memset(temp_position[m],0,sizeof(long)*num_dm); | |
831 } | |
832 | |
833 temp_interval=(double*)malloc(sizeof(double)*num_dm); | |
834 memset(temp_interval,0,sizeof(double)*num_dm); | |
835 | |
836 bin_scope=max_grid-min_grid+1; | |
837 bin_index=(double *)malloc(sizeof(double)*bin_scope); | |
838 memset(bin_index,0,sizeof(double)*bin_scope); | |
839 | |
840 i=0; | |
841 | |
842 for (num_bin=min_grid;num_bin<=max_grid;num_bin++) | |
843 { | |
844 | |
845 temp_num_nonempty=0; | |
846 /* compute_position bins the data into num_bin bins. Each | |
847 dimension is binned independently. | |
848 | |
849 Outputs: | |
850 temp_position[i][j] -- bin for the jth component of data element i. | |
851 temp_interval[j] -- bin-width for the jth component | |
852 */ | |
853 compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval); | |
854 radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID); | |
855 | |
856 /* our figure of merit is the number of non-empty grids divided by number of bins per dimension. | |
857 We declare victory when we have found a local maximum */ | |
858 bin_index[i]=((double)temp_num_nonempty)/((double)num_bin); | |
859 if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95) | |
860 break; | |
861 if ((bin_index[i]<temp_index) && (user_num_bin==0)) | |
862 break; | |
863 if ((user_num_bin==num_bin-1) && (user_num_bin!=0)) | |
864 break; | |
865 | |
866 /* Since we have accepted this trial bin, copy all the temporary results into | |
867 the output buffers */ | |
868 memcpy(all_grid_ID,temp_grid_ID,sizeof(long)*file_Len); | |
869 memcpy(sorted_seq,temp_sorted_seq,sizeof(long)*file_Len); | |
870 memcpy(interval,temp_interval,sizeof(double)*num_dm); | |
871 | |
872 for (m=0;m<file_Len;m++) | |
873 for (n=0;n<num_dm;n++) | |
874 position[m][n]=temp_position[m][n]; | |
875 | |
876 temp_index=bin_index[i]; | |
877 select_num_bin=num_bin; | |
878 num_nonempty[0]=temp_num_nonempty; | |
879 i++; | |
880 } | |
881 | |
882 if ((select_num_bin<min_grid) || (select_num_bin>max_grid)) | |
883 { | |
884 fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010 | |
885 exit(0); | |
886 } | |
887 | |
888 if (temp_index==0) | |
889 { | |
890 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 | |
891 exit(0); | |
892 } | |
893 | |
894 free(temp_grid_ID); | |
895 free(temp_sorted_seq); | |
896 free(bin_index); | |
897 free(temp_interval); | |
898 | |
899 for (m=0;m<file_Len;m++) | |
900 free(temp_position[m]); | |
901 free(temp_position); | |
902 | |
903 return select_num_bin; | |
904 } | |
905 | |
906 /************************************* Select dense grids **********************************/ | |
907 // compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol | |
908 // den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse); | |
909 /* | |
910 * Prune away grids that are insufficiently "dense" (i.e., contain too few data items) | |
911 * | |
912 * Outputs: | |
913 * num_dense_grids -- number of dense grids | |
914 * num_dense_events -- total number of data items in all dense grids | |
915 * dense_grid_reverse -- mapping from list of all grids to list of dense grids. | |
916 * return value -- density cutoff for separating dense from non-dense grids. | |
917 */ | |
918 | |
919 int select_dense(long file_Len, long *all_grid_ID, long num_nonempty, long *num_dense_grids, long *num_dense_events, long *dense_grid_reverse, int den_t_event) | |
920 { | |
921 | |
922 | |
923 long i=0; | |
924 long vol_ID=0; | |
925 long biggest_size=0; //biggest grid_size, to define grid_size_index | |
926 long biggest_index=0; | |
927 //long actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid | |
928 //long num_remain=0; //number of remaining grids with different density thresholds | |
929 long temp_num_dense_grids=0; | |
930 long temp_num_dense_events=0; | |
931 | |
932 long *grid_size_index; | |
933 long *all_grid_vol; | |
934 long *grid_density_index; | |
935 | |
936 //double den_average=0; | |
937 // double avr_index=0; | |
938 | |
939 | |
940 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
941 // Compute all_grid_vol | |
942 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
943 all_grid_vol=(long *)malloc(sizeof(long)*num_nonempty); | |
944 memset(all_grid_vol,0,sizeof(long)*num_nonempty); | |
945 | |
946 /* Grid "volume" is just the number of data contained in the grid. */ | |
947 for (i=0;i<file_Len;i++) | |
948 { | |
949 vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]]; | |
950 all_grid_vol[vol_ID]++; //all_grid_vol[i]=j means grid i has j points | |
951 } | |
952 | |
953 | |
954 | |
955 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
956 // Compute grid_size_index (histogram of grid sizes) | |
957 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
958 | |
959 for (i=0;i<num_nonempty;i++) | |
960 { | |
961 if (biggest_size<all_grid_vol[i]) | |
962 { | |
963 biggest_size=all_grid_vol[i]; | |
964 } | |
965 } | |
966 | |
967 if (biggest_size<3) | |
968 { | |
969 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 | |
970 exit(0); | |
971 } | |
972 | |
973 | |
974 grid_size_index=(long*)malloc(sizeof(long)*biggest_size); | |
975 memset(grid_size_index,0,sizeof(long)*biggest_size); | |
976 | |
977 for (i=0;i<num_nonempty;i++) | |
978 { | |
979 grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1 | |
980 } | |
981 | |
982 | |
983 | |
984 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
985 // Compute den_cutoff | |
986 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
987 | |
988 grid_density_index=(long *)malloc(sizeof(long)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3 | |
989 memset(grid_density_index,0,sizeof(long)*(biggest_size-2)); | |
990 | |
991 if (den_t_event==0) | |
992 { | |
993 biggest_index=0; | |
994 | |
995 for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined | |
996 { | |
997 grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]); | |
998 if (biggest_index<grid_density_index[i-1]) | |
999 { | |
1000 biggest_index=grid_density_index[i-1]; | |
1001 den_t_event=i+1; | |
1002 } | |
1003 } | |
1004 } | |
1005 | |
1006 if (den_t_event==0) //if biggest_size==3 | |
1007 { | |
1008 fprintf(stderr,"the densest hyperregion has only 3 events, smaller than the user-specified value. Therefore the density threshold is automatically changed to 3.\n"); | |
1009 den_t_event=3; | |
1010 } | |
1011 | |
1012 for (i=0;i<num_nonempty;i++) | |
1013 if (all_grid_vol[i]>=den_t_event) | |
1014 temp_num_dense_grids++; | |
1015 | |
1016 if (temp_num_dense_grids<=1) | |
1017 { | |
1018 //exit(0); | |
1019 //printf("a too high density threshold is set! Please decrease your density threshold.\n"); | |
1020 fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010 | |
1021 exit(0); | |
1022 } | |
1023 | |
1024 if (temp_num_dense_grids>=100000) | |
1025 { | |
1026 //modified on July 23, 2010 | |
1027 //printf("a too low density threshold is set! Please increase your density threshold.\n"); | |
1028 //exit(0); | |
1029 fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010 | |
1030 exit(0); | |
1031 } | |
1032 | |
1033 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1034 // Compute dense_grid_reverse | |
1035 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1036 temp_num_dense_grids=0; | |
1037 temp_num_dense_events=0; | |
1038 for (i=0;i<num_nonempty;i++) | |
1039 { | |
1040 dense_grid_reverse[i]=-1; | |
1041 | |
1042 if (all_grid_vol[i]>=den_t_event) | |
1043 { | |
1044 dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids. | |
1045 temp_num_dense_grids++; | |
1046 temp_num_dense_events+=all_grid_vol[i]; | |
1047 } | |
1048 } | |
1049 | |
1050 num_dense_grids[0]=temp_num_dense_grids; | |
1051 num_dense_events[0]=temp_num_dense_events; | |
1052 | |
1053 free(grid_size_index); | |
1054 free(all_grid_vol); | |
1055 | |
1056 return den_t_event; | |
1057 } | |
1058 | |
1059 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1060 // Compute densegridID_To_gridevent and eventID_To_denseventID | |
1061 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1062 // grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); | |
1063 /* | |
1064 * Filter the data so that only the data belonging to dense grids is left | |
1065 * | |
1066 * Output: | |
1067 * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids. | |
1068 * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids. | |
1069 * | |
1070 */ | |
1071 | |
1072 void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent) | |
1073 { | |
1074 long i=0; | |
1075 long dense_grid_ID=0; | |
1076 long dense_event_ID=0; | |
1077 | |
1078 for (i=0;i<file_Len;i++) | |
1079 { | |
1080 dense_grid_ID=dense_grid_reverse[all_grid_ID[i]]; | |
1081 eventID_To_denseventID[i]=-1; | |
1082 if (dense_grid_ID!=-1) //for point (i) belonging to dense grids | |
1083 { | |
1084 eventID_To_denseventID[i]=dense_event_ID; | |
1085 dense_event_ID++; | |
1086 | |
1087 if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected | |
1088 densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point | |
1089 } | |
1090 } | |
1091 | |
1092 | |
1093 } | |
1094 | |
1095 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1096 // Compute dense_grid_seq | |
1097 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1098 // generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq); | |
1099 /* Construct a table of binned data values for each dense grid. | |
1100 * | |
1101 * Output: | |
1102 * | |
1103 * dense_grid_seq -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values) | |
1104 */ | |
1105 | |
1106 void generate_grid_seq(long num_dm, long num_dense_grids, long *densegridID_To_gridevent, long **position, long **dense_grid_seq) | |
1107 { | |
1108 | |
1109 long i=0; | |
1110 long j=0; | |
1111 long ReventID=0; //representative event ID of the dense grid | |
1112 | |
1113 for (i=0;i<num_dense_grids;i++) | |
1114 { | |
1115 ReventID = densegridID_To_gridevent[i]; | |
1116 | |
1117 for (j=0;j<num_dm;j++) | |
1118 dense_grid_seq[i][j]=position[ReventID][j]; | |
1119 } | |
1120 } | |
1121 | |
1122 //compare two vectors | |
1123 long compare_value(long num_dm, long *search_value, long *seq_value) | |
1124 { | |
1125 long i=0; | |
1126 | |
1127 for (i=0;i<num_dm;i++) | |
1128 { | |
1129 if (search_value[i]<seq_value[i]) | |
1130 return 1; | |
1131 if (search_value[i]>seq_value[i]) | |
1132 return -1; | |
1133 if (search_value[i]==seq_value[i]) | |
1134 continue; | |
1135 } | |
1136 return 0; | |
1137 } | |
1138 | |
1139 //binary search the dense_grid_seq to return the dense grid ID if it exists | |
1140 long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq) | |
1141 { | |
1142 | |
1143 long low=0; | |
1144 long high=0; | |
1145 long mid=0; | |
1146 long comp_result=0; | |
1147 long match=0; | |
1148 //long found=0; | |
1149 | |
1150 low = 0; | |
1151 high = num_dense_grids-1; | |
1152 | |
1153 while (low <= high) | |
1154 { | |
1155 mid = (long)((low + high)/2); | |
1156 | |
1157 comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]); | |
1158 | |
1159 | |
1160 switch(comp_result) | |
1161 { | |
1162 case 1: | |
1163 high=mid-1; | |
1164 break; | |
1165 case -1: | |
1166 low=mid+1; | |
1167 break; | |
1168 case 0: | |
1169 match=1; | |
1170 break; | |
1171 } | |
1172 if (match==1) | |
1173 break; | |
1174 } | |
1175 | |
1176 | |
1177 | |
1178 if (match==1) | |
1179 { | |
1180 return mid; | |
1181 } | |
1182 else | |
1183 return -1; | |
1184 } | |
1185 | |
1186 | |
1187 /********************************************** Computing Centers Using IDs **********************************************/ | |
1188 | |
1189 void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center) | |
1190 { | |
1191 long i=0; | |
1192 long j=0; | |
1193 long ID=0; | |
1194 long eventID=0; | |
1195 long *size_c; | |
1196 | |
1197 | |
1198 | |
1199 size_c=(long *)malloc(sizeof(long)*num_clust); | |
1200 memset(size_c,0,sizeof(long)*num_clust); | |
1201 | |
1202 for (i=0;i<num_clust;i++) | |
1203 for (j=0;j<num_dm;j++) | |
1204 population_center[i][j]=0; | |
1205 | |
1206 for (i=0;i<file_Len;i++) | |
1207 { | |
1208 eventID=eventID_To_denseventID[i]; | |
1209 | |
1210 if (eventID!=-1) //only events in dense grids count | |
1211 { | |
1212 ID=cluster_ID[eventID]; | |
1213 | |
1214 if (ID==-1) | |
1215 { | |
1216 //printf("ID==-1! in ID2Center\n"); | |
1217 //exit(0); | |
1218 fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010 | |
1219 exit(0); | |
1220 } | |
1221 | |
1222 for (j=0;j<num_dm;j++) | |
1223 population_center[ID][j]=population_center[ID][j]+data_in[i][j]; | |
1224 | |
1225 size_c[ID]++; | |
1226 } | |
1227 } | |
1228 | |
1229 | |
1230 for (i=0;i<num_clust;i++) | |
1231 { | |
1232 for (j=0;j<num_dm;j++) | |
1233 if (size_c[i]!=0) | |
1234 population_center[i][j]=(population_center[i][j]/(double)(size_c[i])); | |
1235 else | |
1236 { | |
1237 population_center[i][j]=0; | |
1238 printf("size_c[%ld]=0 in ID2center\n",i); | |
1239 } | |
1240 } | |
1241 | |
1242 free(size_c); | |
1243 | |
1244 } | |
1245 | |
1246 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1247 // Compute Population Center with all events | |
1248 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1249 void ID2Center_all(double **data_in, long file_Len, long num_dm, long num_clust, long *cluster_ID, double **population_center) | |
1250 { | |
1251 long i=0; | |
1252 long j=0; | |
1253 long ID=0; | |
1254 long *size_c; | |
1255 | |
1256 | |
1257 | |
1258 size_c=(long *)malloc(sizeof(long)*num_clust); | |
1259 memset(size_c,0,sizeof(long)*num_clust); | |
1260 | |
1261 for (i=0;i<num_clust;i++) | |
1262 for (j=0;j<num_dm;j++) | |
1263 population_center[i][j]=0; | |
1264 | |
1265 | |
1266 for (i=0;i<file_Len;i++) | |
1267 { | |
1268 ID=cluster_ID[i]; | |
1269 | |
1270 if (ID==-1) | |
1271 { | |
1272 //printf("ID==-1! in ID2Center_all\n"); | |
1273 //exit(0); | |
1274 fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010 | |
1275 exit(0); | |
1276 } | |
1277 | |
1278 for (j=0;j<num_dm;j++) | |
1279 population_center[ID][j]=population_center[ID][j]+data_in[i][j]; | |
1280 | |
1281 size_c[ID]++; | |
1282 } | |
1283 | |
1284 | |
1285 for (i=0;i<num_clust;i++) | |
1286 { | |
1287 for (j=0;j<num_dm;j++) | |
1288 if (size_c[i]!=0) | |
1289 population_center[i][j]=(population_center[i][j]/(double)(size_c[i])); | |
1290 else | |
1291 { | |
1292 population_center[i][j]=0; | |
1293 printf("size_c[%ld]=0 in ID2center_all\n",i); | |
1294 } | |
1295 } | |
1296 | |
1297 | |
1298 free(size_c); | |
1299 | |
1300 } | |
1301 | |
1302 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1303 // Merge neighboring grids to clusters | |
1304 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1305 | |
1306 long merge_grids(double **normalized_data, double *interval, long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, | |
1307 long *dense_grid_reverse, long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID, | |
1308 long *cluster_ID, long *grid_ID, long *grid_clusterID) | |
1309 { | |
1310 | |
1311 | |
1312 long i=0; | |
1313 long j=0; | |
1314 long t=0; | |
1315 long p=0; | |
1316 long num_clust=0; | |
1317 long ReventID=0; | |
1318 long denseID=0; | |
1319 long neighbor_ID=0; | |
1320 //long temp_grid=0; | |
1321 | |
1322 long *grid_value; | |
1323 long *search_value; | |
1324 | |
1325 long **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node | |
1326 | |
1327 double real_dist=0; | |
1328 double **norm_grid_center; | |
1329 | |
1330 norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids); | |
1331 memset(norm_grid_center,0,sizeof(double*)*num_dense_grids); | |
1332 | |
1333 for (i=0;i<num_dense_grids;i++) | |
1334 { | |
1335 norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm); | |
1336 memset(norm_grid_center[i],0,sizeof(double)*num_dm); | |
1337 } | |
1338 | |
1339 for (i=0;i<file_Len;i++) | |
1340 { | |
1341 denseID=eventID_To_denseventID[i]; | |
1342 if (denseID!=-1) //only dense events can enter | |
1343 { | |
1344 grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]]; | |
1345 | |
1346 if (grid_ID[denseID]==-1) | |
1347 { | |
1348 fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n"); | |
1349 exit(0); | |
1350 } | |
1351 } | |
1352 } | |
1353 | |
1354 | |
1355 /* Find centroid (in the normalized data) for each dense grid */ | |
1356 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center); | |
1357 | |
1358 //printf("pass the grid ID2 center\n"); | |
1359 | |
1360 | |
1361 graph_of_grid=(long **)malloc(sizeof(long*)*num_dense_grids); | |
1362 memset(graph_of_grid,0,sizeof(long*)*num_dense_grids); | |
1363 for (i=0;i<num_dense_grids;i++) | |
1364 { | |
1365 graph_of_grid[i]=(long *)malloc(sizeof(long)*num_dm); | |
1366 memset(graph_of_grid[i],0,sizeof(long)*num_dm); | |
1367 | |
1368 | |
1369 for (j=0;j<num_dm;j++) | |
1370 graph_of_grid[i][j]=-1; | |
1371 } | |
1372 | |
1373 grid_value=(long *)malloc(sizeof(long)*num_dm); | |
1374 memset(grid_value,0,sizeof(long)*num_dm); | |
1375 | |
1376 search_value=(long *)malloc(sizeof(long)*num_dm); | |
1377 memset(search_value,0,sizeof(long)*num_dm); | |
1378 | |
1379 | |
1380 for (i=0;i<num_dense_grids;i++) | |
1381 { | |
1382 ReventID=densegridID_To_gridevent[i]; | |
1383 | |
1384 for (j=0;j<num_dm;j++) | |
1385 { | |
1386 grid_value[j]=position[ReventID][j]; | |
1387 | |
1388 } | |
1389 | |
1390 | |
1391 /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in | |
1392 the chosen dimension. If the neighbor's centroid is not too far away, add it to this grid's neighbor | |
1393 list. */ | |
1394 for (t=0;t<num_dm;t++) | |
1395 { | |
1396 for (p=0;p<num_dm;p++) | |
1397 search_value[p]=grid_value[p]; | |
1398 | |
1399 if (grid_value[t]==num_bin-1) | |
1400 continue; | |
1401 | |
1402 search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side | |
1403 | |
1404 neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq); | |
1405 | |
1406 if (neighbor_ID!=-1) | |
1407 { | |
1408 real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t]; | |
1409 | |
1410 if (real_dist<0) | |
1411 real_dist=-real_dist; | |
1412 | |
1413 if (real_dist<2*interval[t]) //by using 2*interval, this switch is void | |
1414 graph_of_grid[i][t]=neighbor_ID; | |
1415 } | |
1416 } | |
1417 grid_clusterID[i]=i; //initialize grid_clusterID | |
1418 } | |
1419 | |
1420 | |
1421 //graph constructed | |
1422 //DFS-based search begins | |
1423 | |
1424 /* Use a depth-first search to construct a list of connected subgraphs (= "clusters"). | |
1425 Note our graph as we have constructed it is a DAG, so we can use that to our advantage | |
1426 in our search. */ | |
1427 // num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID); | |
1428 num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID); | |
1429 | |
1430 | |
1431 //computes grid_ID and cluster_ID | |
1432 for (i=0;i<file_Len;i++) | |
1433 { | |
1434 denseID=eventID_To_denseventID[i]; | |
1435 if (denseID!=-1) //only dense events can enter | |
1436 { | |
1437 cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]]; | |
1438 //if (cluster_ID[denseID]==-1) | |
1439 // printf("catch you!\n"); | |
1440 } | |
1441 } | |
1442 | |
1443 free(search_value); | |
1444 free(grid_value); | |
1445 | |
1446 for (i=0;i<num_dense_grids;i++) | |
1447 { | |
1448 free(graph_of_grid[i]); | |
1449 free(norm_grid_center[i]); | |
1450 } | |
1451 free(graph_of_grid); | |
1452 free(norm_grid_center); | |
1453 | |
1454 return num_clust; | |
1455 } | |
1456 | |
1457 /********************************************* Merge Clusters to Populations *******************************************/ | |
1458 | |
1459 long merge_clusters(long num_clust, long num_dm, double *interval, double **cluster_center, long *cluster_populationID, long max_num_pop) | |
1460 { | |
1461 long num_population=0; | |
1462 long temp_num_population=0; | |
1463 | |
1464 long i=0; | |
1465 long j=0; | |
1466 long t=0; | |
1467 long merge=0; | |
1468 long smid=0; | |
1469 long bgid=0; | |
1470 double merge_dist=1.1; | |
1471 | |
1472 long *map_ID; | |
1473 | |
1474 double diff=0; | |
1475 | |
1476 map_ID=(long*)malloc(sizeof(long)*num_clust); | |
1477 memset(map_ID,0,sizeof(long)*num_clust); | |
1478 | |
1479 for (i=0;i<num_clust;i++) | |
1480 { | |
1481 cluster_populationID[i]=i; | |
1482 map_ID[i]=i; | |
1483 } | |
1484 | |
1485 | |
1486 while (((num_population>max_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1))) | |
1487 { | |
1488 | |
1489 if (num_population<=1) | |
1490 merge_dist=merge_dist-0.1; | |
1491 | |
1492 if (num_population>max_num_pop) | |
1493 merge_dist=merge_dist+0.1; | |
1494 | |
1495 | |
1496 for (i=0;i<num_clust;i++) | |
1497 cluster_populationID[i]=i; | |
1498 | |
1499 for (i=0;i<num_clust-1;i++) | |
1500 { | |
1501 for (j=i+1;j<num_clust;j++) | |
1502 { | |
1503 merge=1; | |
1504 | |
1505 for (t=0;t<num_dm;t++) | |
1506 { | |
1507 diff=cluster_center[i][t]-cluster_center[j][t]; | |
1508 | |
1509 if (diff<0) | |
1510 diff=-diff; | |
1511 if (diff>(merge_dist*interval[t])) | |
1512 merge=0; | |
1513 } | |
1514 | |
1515 if ((merge) && (cluster_populationID[i]!=cluster_populationID[j])) | |
1516 { | |
1517 if (cluster_populationID[i]<cluster_populationID[j]) //they could not be equal | |
1518 { | |
1519 smid = cluster_populationID[i]; | |
1520 bgid = cluster_populationID[j]; | |
1521 } | |
1522 else | |
1523 { | |
1524 smid = cluster_populationID[j]; | |
1525 bgid = cluster_populationID[i]; | |
1526 } | |
1527 for (t=0;t<num_clust;t++) | |
1528 { | |
1529 if (cluster_populationID[t]==bgid) | |
1530 cluster_populationID[t]=smid; | |
1531 } | |
1532 } | |
1533 } | |
1534 } | |
1535 | |
1536 | |
1537 | |
1538 for (i=0;i<num_clust;i++) | |
1539 map_ID[i]=-1; | |
1540 | |
1541 for (i=0;i<num_clust;i++) | |
1542 map_ID[cluster_populationID[i]]=1; | |
1543 | |
1544 num_population=0; | |
1545 for (i=0;i<num_clust;i++) | |
1546 { | |
1547 if (map_ID[i]==1) | |
1548 { | |
1549 map_ID[i]=num_population; | |
1550 num_population++; | |
1551 } | |
1552 } | |
1553 | |
1554 if ((temp_num_population>max_num_pop) && (num_population==1)) | |
1555 break; | |
1556 else | |
1557 temp_num_population=num_population; | |
1558 | |
1559 if (num_clust<=1) | |
1560 break; | |
1561 } //end while | |
1562 | |
1563 for (i=0;i<num_clust;i++) | |
1564 cluster_populationID[i]=map_ID[cluster_populationID[i]]; | |
1565 | |
1566 free(map_ID); | |
1567 | |
1568 return num_population; | |
1569 } | |
1570 | |
1571 /////////////////////////////////////////////////////// | |
1572 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1573 double kmeans(double **Matrix, long k, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center) | |
1574 { | |
1575 | |
1576 long i=0; | |
1577 long j=0; | |
1578 long t=0; | |
1579 long random=0; | |
1580 long random1=0; | |
1581 long random2=0; | |
1582 long times=0; | |
1583 long dist_used=0; //0 is Euclidean and 1 is Pearson | |
1584 long random_init=0; //0: not use random seeds; | |
1585 long real_Len=0; | |
1586 long skipped=0; | |
1587 | |
1588 long *num; //num[i]=t means the ith cluster has t points | |
1589 | |
1590 double vvv=1.0; // the biggest variation; | |
1591 double distance=0.0; | |
1592 double xv=0.0; | |
1593 double variation=0.0; | |
1594 | |
1595 double mean_dx=0; | |
1596 double mean_dy=0; | |
1597 double sum_var=0; | |
1598 double dx=0; | |
1599 double dy=0; | |
1600 double sd_x=0; | |
1601 double sd_y=0; | |
1602 double diff=0; | |
1603 double distortion=0; | |
1604 double sd=0; //standard deviation | |
1605 double shortest_distance; | |
1606 | |
1607 double *temp_center; | |
1608 | |
1609 double **sum; | |
1610 | |
1611 temp_center = (double *)malloc(sizeof(double)*num_dm); | |
1612 memset(temp_center,0,sizeof(double)*num_dm); | |
1613 | |
1614 if (random_init) | |
1615 { | |
1616 for (i=0;i<k;i++) | |
1617 { | |
1618 random1=rand()*rand(); | |
1619 random2=abs((random1%5)+1); | |
1620 for (t=0;t<random2;t++) | |
1621 random2=random2*rand()+rand(); | |
1622 | |
1623 random=abs(random2%file_Len); | |
1624 for (j=0;j<num_dm;j++) | |
1625 center[i][j]=Matrix[random][j]; | |
1626 } | |
1627 } | |
1628 | |
1629 | |
1630 num = (long *)malloc(sizeof(long)*k); | |
1631 memset(num,0,sizeof(long)*k); | |
1632 | |
1633 sum = (double **)malloc(sizeof(double*)*k); | |
1634 memset(sum,0,sizeof(double*)*k); | |
1635 for (i=0;i<k;i++) | |
1636 { | |
1637 sum[i] = (double *)malloc(sizeof(double)*num_dm); | |
1638 memset(sum[i],0,sizeof(double)*num_dm); | |
1639 } | |
1640 | |
1641 | |
1642 times=0; | |
1643 real_Len=0; | |
1644 | |
1645 while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1))) | |
1646 { | |
1647 for (i=0;i<k;i++) | |
1648 { | |
1649 num[i]=0; | |
1650 for (j=0;j<num_dm;j++) | |
1651 sum[i][j]=0.0; | |
1652 } | |
1653 | |
1654 for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j] | |
1655 { | |
1656 skipped = 0; | |
1657 shortest_distance=MAX_VALUE; | |
1658 for (j=0;j<k;j++) //for each center j | |
1659 { | |
1660 distance=0.0; | |
1661 | |
1662 if (dist_used==0) //Euclidean distance | |
1663 { | |
1664 for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions | |
1665 { | |
1666 | |
1667 diff=center[j][t]-Matrix[i][t]; | |
1668 | |
1669 diff=diff*diff; | |
1670 //this K-means is only used for dimension selection, and therefore for 1-dimensional data only, no need to use cube distance | |
1671 | |
1672 distance = distance+diff; //here we have a weight for each dimension | |
1673 } | |
1674 } | |
1675 else //pearson correlation | |
1676 { | |
1677 mean_dx=0.0; | |
1678 mean_dy=0.0; | |
1679 sum_var=0.0; | |
1680 dx=0.0; | |
1681 dy=0.0; | |
1682 sd_x=0.0; | |
1683 sd_y=0.0; | |
1684 for (t=0;t<num_dm;t++) | |
1685 { | |
1686 mean_dx+=center[j][t]; | |
1687 mean_dy+=Matrix[i][t]; | |
1688 } | |
1689 mean_dx=mean_dx/(double)num_dm; | |
1690 mean_dy=mean_dy/(double)num_dm; | |
1691 | |
1692 for (t=0;t<num_dm;t++) | |
1693 { | |
1694 dx=center[j][t]-mean_dx; | |
1695 dy=Matrix[i][t]-mean_dy; | |
1696 sum_var+=dx*dy; | |
1697 sd_x+=dx*dx; | |
1698 sd_y+=dy*dy; | |
1699 } | |
1700 if (sqrt(sd_x*sd_y)==0) | |
1701 distance = 1.0; | |
1702 else | |
1703 distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2; | |
1704 //printf("distance=%f\n",distance); | |
1705 } //pearson correlation ends | |
1706 | |
1707 if ((distance<shortest_distance) && (skipped == 0)) | |
1708 { | |
1709 shortest_distance=distance; | |
1710 shortest_id[i]=j; | |
1711 } | |
1712 }//end for j | |
1713 real_Len++; | |
1714 num[shortest_id[i]]=num[shortest_id[i]]+1; | |
1715 for (t=0;t<num_dm;t++) | |
1716 sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t]; | |
1717 }//end for i | |
1718 /* recompute the centers */ | |
1719 //compute_mean(group); | |
1720 vvv=0.0; | |
1721 for (j=0;j<k;j++) | |
1722 { | |
1723 memcpy(temp_center,center[j],sizeof(double)*num_dm); | |
1724 variation=0.0; | |
1725 if (num[j]!=0) | |
1726 { | |
1727 for (t=0;t<num_dm;t++) | |
1728 { | |
1729 center[j][t]=sum[j][t]/(double)num[j]; | |
1730 xv=(temp_center[t]-center[j][t]); | |
1731 variation=variation+xv*xv; | |
1732 } | |
1733 } | |
1734 | |
1735 if (variation>vvv) | |
1736 vvv=variation; //vvv is the biggest variation among the k clusters; | |
1737 } | |
1738 //compute_variation; | |
1739 times++; | |
1740 } //end for while | |
1741 | |
1742 | |
1743 | |
1744 | |
1745 free(num); | |
1746 | |
1747 for (i=0;i<k;i++) | |
1748 free(sum[i]); | |
1749 free(sum); | |
1750 free(temp_center); | |
1751 | |
1752 | |
1753 return distortion; | |
1754 | |
1755 } | |
1756 | |
1757 ////////////////////////////////////////////////////// | |
1758 /*************************** Show *****************************/ | |
1759 void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string) | |
1760 { | |
1761 int situ1=0; | |
1762 int situ2=0; | |
1763 | |
1764 long i=0; | |
1765 long id=0; | |
1766 long j=0; | |
1767 long info_id=0; | |
1768 long nearest_id=0; | |
1769 long insert=0; | |
1770 long temp=0; | |
1771 long m=0; | |
1772 long n=0; | |
1773 long t=0; | |
1774 | |
1775 long *size_c; | |
1776 | |
1777 | |
1778 | |
1779 long **size_mybound_1; | |
1780 long **size_mybound_2; | |
1781 long **size_mybound_3; | |
1782 long **size_mybound_0; | |
1783 | |
1784 double interval=0.0; | |
1785 | |
1786 double *big; | |
1787 double *small; | |
1788 | |
1789 | |
1790 double **center; | |
1791 double **mybound; | |
1792 | |
1793 long **prof; //prof[i][j]=1 means population i is + at parameter j | |
1794 | |
1795 FILE *fpcnt_id; //proportion id | |
1796 //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data | |
1797 FILE *fprof_id; //profile_id | |
1798 | |
1799 big=(double *)malloc(sizeof(double)*num_dm); | |
1800 memset(big,0,sizeof(double)*num_dm); | |
1801 | |
1802 small=(double *)malloc(sizeof(double)*num_dm); | |
1803 memset(small,0,sizeof(double)*num_dm); | |
1804 | |
1805 for (i=0;i<num_dm;i++) | |
1806 { | |
1807 big[i]=0.0; | |
1808 small[i]=(double)MAX_VALUE; | |
1809 } | |
1810 | |
1811 | |
1812 size_c=(long *)malloc(sizeof(long)*k); | |
1813 memset(size_c,0,sizeof(long)*k); | |
1814 | |
1815 center=(double**)malloc(sizeof(double*)*k); | |
1816 memset(center,0,sizeof(double*)*k); | |
1817 for (i=0;i<k;i++) | |
1818 { | |
1819 center[i]=(double*)malloc(sizeof(double)*num_dm); | |
1820 memset(center[i],0,sizeof(double)*num_dm); | |
1821 } | |
1822 | |
1823 mybound=(double**)malloc(sizeof(double*)*num_dm); | |
1824 memset(mybound,0,sizeof(double*)*num_dm); | |
1825 for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories | |
1826 { | |
1827 mybound[i]=(double*)malloc(sizeof(double)*3); | |
1828 memset(mybound[i],0,sizeof(double)*3); | |
1829 } | |
1830 | |
1831 prof=(long **)malloc(sizeof(long*)*k); | |
1832 memset(prof,0,sizeof(long*)*k); | |
1833 for (i=0;i<k;i++) | |
1834 { | |
1835 prof[i]=(long *)malloc(sizeof(long)*num_dm); | |
1836 memset(prof[i],0,sizeof(long)*num_dm); | |
1837 } | |
1838 | |
1839 | |
1840 for (i=0;i<file_Len;i++) | |
1841 { | |
1842 id=cluster_id[i]; | |
1843 for (j=0;j<num_dm;j++) | |
1844 { | |
1845 center[id][j]=center[id][j]+Matrix[i][j]; | |
1846 if (big[j]<Matrix[i][j]) | |
1847 big[j]=Matrix[i][j]; | |
1848 if (small[j]>Matrix[i][j]) | |
1849 small[j]=Matrix[i][j]; | |
1850 } | |
1851 | |
1852 size_c[id]++; | |
1853 } | |
1854 | |
1855 for (i=0;i<k;i++) | |
1856 for (j=0;j<num_dm;j++) | |
1857 { | |
1858 if (size_c[i]!=0) | |
1859 center[i][j]=(center[i][j]/(double)(size_c[i])); | |
1860 else | |
1861 center[i][j]=0; | |
1862 } | |
1863 | |
1864 for (j=0;j<num_dm;j++) | |
1865 { | |
1866 interval=((big[j]-small[j])/4.0); | |
1867 //printf("interval[%d] is %f\n",j,interval); | |
1868 for (i=0;i<3;i++) | |
1869 mybound[j][i]=small[j]+((double)(i+1)*interval); | |
1870 } | |
1871 | |
1872 | |
1873 size_mybound_0=(long **)malloc(sizeof(long*)*k); | |
1874 memset(size_mybound_0,0,sizeof(long*)*k); | |
1875 | |
1876 for (i=0;i<k;i++) | |
1877 { | |
1878 size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm); | |
1879 memset(size_mybound_0[i],0,sizeof(long)*num_dm); | |
1880 } | |
1881 | |
1882 size_mybound_1=(long **)malloc(sizeof(long*)*k); | |
1883 memset(size_mybound_1,0,sizeof(long*)*k); | |
1884 | |
1885 for (i=0;i<k;i++) | |
1886 { | |
1887 size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm); | |
1888 memset(size_mybound_1[i],0,sizeof(long)*num_dm); | |
1889 } | |
1890 | |
1891 size_mybound_2=(long **)malloc(sizeof(long*)*k); | |
1892 memset(size_mybound_2,0,sizeof(long*)*k); | |
1893 | |
1894 for (i=0;i<k;i++) | |
1895 { | |
1896 size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm); | |
1897 memset(size_mybound_2[i],0,sizeof(long)*num_dm); | |
1898 } | |
1899 | |
1900 size_mybound_3=(long **)malloc(sizeof(long*)*k); | |
1901 memset(size_mybound_3,0,sizeof(long*)*k); | |
1902 | |
1903 for (i=0;i<k;i++) | |
1904 { | |
1905 size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm); | |
1906 memset(size_mybound_3[i],0,sizeof(long)*num_dm); | |
1907 } | |
1908 | |
1909 for (i=0;i<file_Len;i++) | |
1910 for (j=0;j<num_dm;j++) | |
1911 { | |
1912 if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded | |
1913 size_mybound_0[cluster_id[i]][j]++; | |
1914 else | |
1915 { | |
1916 if (Matrix[i][j]<mybound[j][1]) | |
1917 size_mybound_1[cluster_id[i]][j]++; | |
1918 else | |
1919 { | |
1920 if (Matrix[i][j]<mybound[j][2]) | |
1921 size_mybound_2[cluster_id[i]][j]++; | |
1922 else | |
1923 //if (Matrix[i][j]!=big[j]) //the biggest values excluded | |
1924 size_mybound_3[cluster_id[i]][j]++; | |
1925 } | |
1926 | |
1927 } | |
1928 } | |
1929 | |
1930 fprof_id=fopen("profile.txt","w"); | |
1931 fprintf(fprof_id,"Population_ID\t"); | |
1932 fprintf(fprof_id,"%s\n",name_string); | |
1933 | |
1934 for (i=0;i<k;i++) | |
1935 { | |
1936 fprintf(fprof_id,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009 | |
1937 for (j=0;j<num_dm;j++) | |
1938 { | |
1939 | |
1940 if (size_mybound_0[i][j]>size_mybound_1[i][j]) | |
1941 situ1=0; | |
1942 else | |
1943 situ1=1; | |
1944 if (size_mybound_2[i][j]>size_mybound_3[i][j]) | |
1945 situ2=2; | |
1946 else | |
1947 situ2=3; | |
1948 | |
1949 if ((situ1==0) && (situ2==2)) | |
1950 { | |
1951 if (size_mybound_0[i][j]>size_mybound_2[i][j]) | |
1952 prof[i][j]=0; | |
1953 else | |
1954 prof[i][j]=2; | |
1955 } | |
1956 if ((situ1==0) && (situ2==3)) | |
1957 { | |
1958 if (size_mybound_0[i][j]>size_mybound_3[i][j]) | |
1959 prof[i][j]=0; | |
1960 else | |
1961 prof[i][j]=3; | |
1962 } | |
1963 if ((situ1==1) && (situ2==2)) | |
1964 { | |
1965 if (size_mybound_1[i][j]>size_mybound_2[i][j]) | |
1966 prof[i][j]=1; | |
1967 else | |
1968 prof[i][j]=2; | |
1969 } | |
1970 if ((situ1==1) && (situ2==3)) | |
1971 { | |
1972 if (size_mybound_1[i][j]>size_mybound_3[i][j]) | |
1973 prof[i][j]=1; | |
1974 else | |
1975 prof[i][j]=3; | |
1976 } | |
1977 | |
1978 //begin to output profile | |
1979 if (j==num_dm-1) | |
1980 { | |
1981 if (prof[i][j]==0) | |
1982 fprintf(fprof_id,"1\n"); | |
1983 if (prof[i][j]==1) | |
1984 fprintf(fprof_id,"2\n"); | |
1985 if (prof[i][j]==2) | |
1986 fprintf(fprof_id,"3\n"); | |
1987 if (prof[i][j]==3) | |
1988 fprintf(fprof_id,"4\n"); | |
1989 } | |
1990 else | |
1991 { | |
1992 if (prof[i][j]==0) | |
1993 fprintf(fprof_id,"1\t"); | |
1994 if (prof[i][j]==1) | |
1995 fprintf(fprof_id,"2\t"); | |
1996 if (prof[i][j]==2) | |
1997 fprintf(fprof_id,"3\t"); | |
1998 if (prof[i][j]==3) | |
1999 fprintf(fprof_id,"4\t"); | |
2000 } | |
2001 } | |
2002 } | |
2003 fclose(fprof_id); | |
2004 | |
2005 /////////////////////////////////////////////////////////// | |
2006 | |
2007 | |
2008 fpcnt_id=fopen("percentage.txt","w"); | |
2009 fprintf(fpcnt_id,"Population_ID\tPercentage\n"); | |
2010 | |
2011 for (t=0;t<k;t++) | |
2012 { | |
2013 fprintf(fpcnt_id,"%ld\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len); //t changed to t+1 to start from 1 instead of 0: April 16, 2009 | |
2014 } | |
2015 fclose(fpcnt_id); | |
2016 | |
2017 free(big); | |
2018 free(small); | |
2019 free(size_c); | |
2020 | |
2021 for (i=0;i<k;i++) | |
2022 { | |
2023 free(center[i]); | |
2024 free(prof[i]); | |
2025 free(size_mybound_0[i]); | |
2026 free(size_mybound_1[i]); | |
2027 free(size_mybound_2[i]); | |
2028 free(size_mybound_3[i]); | |
2029 } | |
2030 free(center); | |
2031 free(prof); | |
2032 free(size_mybound_0); | |
2033 free(size_mybound_1); | |
2034 free(size_mybound_2); | |
2035 free(size_mybound_3); | |
2036 | |
2037 for (i=0;i<num_dm;i++) | |
2038 free(mybound[i]); | |
2039 free(mybound); | |
2040 | |
2041 } | |
2042 /////////////////////////////////////////////////////////////////////////// | |
2043 | |
2044 double get_avg_dist(double *population_center, long smaller_pop_ID, long larger_pop_ID, long *population_ID, long num_population, long file_Len, | |
2045 long num_dm, double **normalized_data, long d1, long d2, long d3, long *size_c) | |
2046 { | |
2047 long k=0; | |
2048 long temp=0; | |
2049 long i=0; | |
2050 long j=0; | |
2051 long t=0; | |
2052 long current_biggest=0; | |
2053 | |
2054 | |
2055 double total_dist=0.0; | |
2056 double distance=0.0; | |
2057 double dist=0.0; | |
2058 double biggest_distance=0.0; | |
2059 | |
2060 long *nearest_neighbors; | |
2061 double *nearest_distance; | |
2062 | |
2063 k=min(size_c[smaller_pop_ID],size_c[larger_pop_ID]); | |
2064 | |
2065 if (k>0) | |
2066 { | |
2067 k=(long)sqrt((double)k); | |
2068 if (num_dm<=3) | |
2069 k=(long)(2.7183*(double)k); //e*k for low-dimensional space, added Nov. 4, 2010 | |
2070 //printf("the k-nearest k is %d\n",k); | |
2071 } | |
2072 else | |
2073 { | |
2074 printf("error in k\n"); | |
2075 exit(0); | |
2076 } | |
2077 | |
2078 nearest_neighbors=(long *)malloc(sizeof(long)*k); | |
2079 memset(nearest_neighbors,0,sizeof(long)*k); | |
2080 | |
2081 nearest_distance=(double *)malloc(sizeof(double)*k); | |
2082 memset(nearest_distance,0,sizeof(double)*k); | |
2083 | |
2084 temp=0; | |
2085 | |
2086 for (i=0;i<file_Len;i++) | |
2087 { | |
2088 if ((population_ID[i]==smaller_pop_ID) || (population_ID[i]==larger_pop_ID)) //the event belongs to the pair of populations | |
2089 { | |
2090 distance=0.0; | |
2091 for (t=0;t<num_dm;t++) | |
2092 { | |
2093 if ((t!=d1) && (t!=d2) && (t!=d3)) | |
2094 continue; | |
2095 else | |
2096 { | |
2097 dist=population_center[t]-normalized_data[i][t]; | |
2098 distance=distance+dist*dist; | |
2099 } | |
2100 } | |
2101 | |
2102 if (temp<k) | |
2103 { | |
2104 nearest_neighbors[temp]=i; | |
2105 nearest_distance[temp]=distance; | |
2106 temp++; | |
2107 } | |
2108 else | |
2109 { | |
2110 biggest_distance=0.0; | |
2111 for (j=0;j<k;j++) | |
2112 { | |
2113 if (nearest_distance[j]>biggest_distance) | |
2114 { | |
2115 biggest_distance=nearest_distance[j]; | |
2116 current_biggest=j; | |
2117 } | |
2118 } | |
2119 | |
2120 if (biggest_distance>distance) | |
2121 { | |
2122 nearest_neighbors[current_biggest]=i; | |
2123 nearest_distance[current_biggest]=distance; | |
2124 } | |
2125 } | |
2126 } | |
2127 } | |
2128 | |
2129 for (i=0;i<k;i++) | |
2130 total_dist=total_dist+nearest_distance[i]; | |
2131 | |
2132 total_dist=total_dist/(double)k; | |
2133 | |
2134 free(nearest_distance); | |
2135 free(nearest_neighbors); | |
2136 | |
2137 return total_dist; | |
2138 } | |
2139 | |
2140 | |
2141 /******************************************************** Main Function **************************************************/ | |
2142 //for normalized data, there are five variables: | |
2143 //cluster_ID | |
2144 //population_center | |
2145 //grid_clusterID | |
2146 //grid_ID | |
2147 //grid_Center | |
2148 | |
2149 //the same five variables exist for the original data | |
2150 //however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data | |
2151 //also, data + cluster_ID -> population_center | |
2152 //data + grid_ID -> grid_Center | |
2153 | |
2154 /* what is the final output */ | |
2155 //the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data | |
2156 //Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters) | |
2157 | |
2158 /* what information should select_bin output */ | |
2159 //the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids | |
2160 //is unknown, therefore I must use a prescreening to output | |
2161 //how many bins I should use | |
2162 //the number of dense grids | |
2163 //total number of events in the dense grids | |
2164 | |
2165 /* basic procedure of main function */ | |
2166 //1. read raw file and normalize the raw file | |
2167 //2. select_bin | |
2168 //3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. | |
2169 //4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID. | |
2170 //5. release normalized data; allocate memory for grid_Center and population_center | |
2171 //6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers | |
2172 | |
2173 int main (int argc, char **argv) | |
2174 { | |
2175 //inputs | |
2176 FILE *f_src; //source file pointer | |
2177 | |
2178 FILE *f_out; //coordinates | |
2179 FILE *f_cid; //population-ID of events | |
2180 FILE *f_ctr; //centroids of populations | |
2181 FILE *f_results; //coordinates file event and population column | |
2182 FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity | |
2183 FILE *f_parameters; //number of bins and density calculated by | |
2184 //the algorithm. Used to update the database | |
2185 FILE *f_properties; //Properties file used by Image generation software | |
2186 | |
2187 //char tmpbuf[128]; | |
2188 | |
2189 char para_name_string[LINE_LEN]; | |
2190 | |
2191 long time_ID=-1; | |
2192 long num_bin=0; //the bins I will use on each dimension | |
2193 long num_pop=0; | |
2194 long file_Len=0; //number of events | |
2195 long num_dm=0; | |
2196 long num_clust=0; | |
2197 long num_dense_events=0; | |
2198 long num_dense_grids=0; | |
2199 long num_nonempty=0; | |
2200 long num_population=0; | |
2201 long num_real_pop=0; | |
2202 long keep_merge=0; | |
2203 long num_checked_range=0; | |
2204 | |
2205 //below are read from configuration file | |
2206 long i=0; | |
2207 long j=0; | |
2208 long p=0; | |
2209 long t=0; | |
2210 long q=0; | |
2211 long temp_i=0; | |
2212 long temp_j=0; | |
2213 long first_i=0; | |
2214 long first_j=0; | |
2215 long d1=0; | |
2216 long d2=0; | |
2217 long d3=0; | |
2218 long d_d=0; | |
2219 long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d | |
2220 long index_id=0; | |
2221 long num_computed_population=0; | |
2222 long tot_size=0; | |
2223 | |
2224 int den_t_event=0; | |
2225 | |
2226 double distance=0.0; | |
2227 double dist=0.0; | |
2228 double current_smallest_dist=0; | |
2229 double max_d_dist=0.0; | |
2230 double Ei=0.0; | |
2231 double Ej=0.0; | |
2232 double E1=0.0; | |
2233 double E2=0.0; | |
2234 double E3=0.0; | |
2235 double Ep=0.0; | |
2236 double temp=0.0; | |
2237 double tmp=0.0; | |
2238 | |
2239 long *grid_clusterID; //(dense)gridID to clusterID | |
2240 long *grid_ID; //(dense)eventID to gridID | |
2241 long *cluster_ID; //(dense)eventID to clusterID | |
2242 long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k | |
2243 long *all_grid_ID; //gridID for all events | |
2244 long *densegridID_To_gridevent; | |
2245 long *sorted_seq; | |
2246 long *dense_grid_reverse; | |
2247 long *population_ID; //denseeventID to populationID | |
2248 long *cluster_populationID; //clusterID to populationID | |
2249 long *grid_populationID; //gridID to populationID | |
2250 long *all_population_ID; //populationID of event | |
2251 long *size_c; | |
2252 long *size_p_2; | |
2253 long *partit; | |
2254 long *size_p; | |
2255 long *temp_size_j; | |
2256 long *all_computed_pop_ID; | |
2257 | |
2258 | |
2259 long **position; | |
2260 long **dense_grid_seq; | |
2261 long **temp_population_ID; | |
2262 | |
2263 double *interval; | |
2264 double *center_1; | |
2265 double *center_2; | |
2266 double *center_3; | |
2267 double *aver; | |
2268 double *std; | |
2269 double *center_p_1; | |
2270 double *center_p_2; | |
2271 double *center_p_3; | |
2272 | |
2273 double **population_center; //population centroids in the raw/original data | |
2274 double **cluster_center; //cluster centroids in the raw/original data | |
2275 double **new_population_center; | |
2276 double **temp_population_center; | |
2277 double **min_pop; | |
2278 double **max_pop; | |
2279 | |
2280 double **input_data; | |
2281 double **normalized_data; | |
2282 | |
2283 double **real_pop_center; | |
2284 double **distance_pop; | |
2285 | |
2286 double ***ind_pop; | |
2287 double ***ind_pop_3; | |
2288 | |
2289 int min = 999999; | |
2290 int max = 0; | |
2291 | |
2292 printf( "Starting time:\t\t\t\t"); | |
2293 fflush(stdout); | |
2294 system("/bin/date"); | |
2295 | |
2296 /* | |
2297 * Windows Version | |
2298 _strtime( tmpbuf ); | |
2299 printf( "Starting time:\t\t\t\t%s\n", tmpbuf ); | |
2300 _strdate( tmpbuf ); | |
2301 printf( "Starting date:\t\t\t\t%s\n", tmpbuf ); | |
2302 */ | |
2303 | |
2304 ///////////////////////////////////////////////////////////// | |
2305 | |
2306 if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6)) | |
2307 { | |
2308 fprintf(stderr,"Incorrect number of input parameters!\n"); | |
2309 fprintf(stderr,"usage:\n"); | |
2310 fprintf(stderr,"basic mode: flock data_file\n"); | |
2311 fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n"); | |
2312 fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n"); | |
2313 fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n"); | |
2314 fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n"); | |
2315 exit(0); | |
2316 } | |
2317 | |
2318 f_src=fopen(argv[1],"r"); | |
2319 | |
2320 if (argc==3) | |
2321 { | |
2322 max_num_pop=atoi(argv[2]); | |
2323 printf("num of maximum pop is %ld\n",max_num_pop); | |
2324 } | |
2325 | |
2326 if (argc==4) | |
2327 { | |
2328 num_bin=atoi(argv[2]); | |
2329 printf("num_bin is %ld\n",num_bin); | |
2330 | |
2331 den_t_event=atoi(argv[3]); | |
2332 printf("density_index is %d\n",den_t_event); | |
2333 } | |
2334 | |
2335 if (argc==5) | |
2336 { | |
2337 num_bin=atoi(argv[2]); | |
2338 printf("num_bin is %ld\n",num_bin); | |
2339 | |
2340 den_t_event=atoi(argv[3]); | |
2341 printf("density_index is %d\n",den_t_event); | |
2342 | |
2343 num_pop=atoi(argv[4]); | |
2344 printf("num of pop is %ld\n",num_pop); | |
2345 } | |
2346 | |
2347 if (argc==6) | |
2348 { | |
2349 num_bin=atoi(argv[2]); | |
2350 printf("num_bin is %ld\n",num_bin); | |
2351 | |
2352 den_t_event=atoi(argv[3]); | |
2353 printf("density_index is %d\n",den_t_event); | |
2354 | |
2355 num_pop=atoi(argv[4]); | |
2356 printf("num of pop is %ld\n",num_pop); | |
2357 | |
2358 max_num_pop=atoi(argv[5]); | |
2359 printf("num of pop is %ld\n",max_num_pop); | |
2360 } | |
2361 | |
2362 | |
2363 if (num_pop==1) | |
2364 { | |
2365 printf("it is not allowed to specify only one population\n"); | |
2366 exit(0); | |
2367 } | |
2368 | |
2369 | |
2370 | |
2371 getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters | |
2372 | |
2373 | |
2374 if (max_num_pop==0) | |
2375 { | |
2376 max_num_pop=(long)pow(2,num_dm); | |
2377 if (max_num_pop>MAX_POP_NUM) | |
2378 max_num_pop=MAX_POP_NUM; | |
2379 } | |
2380 | |
2381 /************************************************* Read the data *****************************************************/ | |
2382 | |
2383 rewind(f_src); //reset the file pointer | |
2384 | |
2385 input_data = (double **)malloc(sizeof(double*)*file_Len); | |
2386 memset(input_data,0,sizeof(double*)*file_Len); | |
2387 for (i=0;i<file_Len;i++) | |
2388 { | |
2389 input_data[i]=(double *)malloc(sizeof(double)*num_dm); | |
2390 memset(input_data[i],0,sizeof(double)*num_dm); | |
2391 } | |
2392 | |
2393 readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data; | |
2394 | |
2395 fclose(f_src); | |
2396 | |
2397 normalized_data=(double **)malloc(sizeof(double*)*file_Len); | |
2398 memset(normalized_data,0,sizeof(double*)*file_Len); | |
2399 for (i=0;i<file_Len;i++) | |
2400 { | |
2401 normalized_data[i]=(double *)malloc(sizeof(double)*num_dm); | |
2402 memset(normalized_data[i],0,sizeof(double)*num_dm); | |
2403 } | |
2404 | |
2405 tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data); | |
2406 | |
2407 | |
2408 position=(long **)malloc(sizeof(long*)*file_Len); | |
2409 memset(position,0,sizeof(long*)*file_Len); | |
2410 for (i=0;i<file_Len;i++) | |
2411 { | |
2412 position[i]=(long*)malloc(sizeof(long)*num_dm); | |
2413 memset(position[i],0,sizeof(long)*num_dm); | |
2414 } | |
2415 | |
2416 all_grid_ID=(long *)malloc(sizeof(long)*file_Len); | |
2417 memset(all_grid_ID,0,sizeof(long)*file_Len); | |
2418 | |
2419 sorted_seq=(long*)malloc(sizeof(long)*file_Len); | |
2420 memset(sorted_seq,0,sizeof(long)*file_Len); | |
2421 | |
2422 interval=(double*)malloc(sizeof(double)*num_dm); | |
2423 memset(interval,0,sizeof(double)*num_dm); | |
2424 | |
2425 /************************************************* select_bin *************************************************/ | |
2426 | |
2427 if (num_bin>=1) //num_bin has been selected by user | |
2428 { | |
2429 select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); | |
2430 printf("user set bin number is %ld\n",num_bin); | |
2431 } | |
2432 else //num_bin has not been selected by user | |
2433 { | |
2434 num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); | |
2435 printf("selected bin number is %ld\n",num_bin); | |
2436 } | |
2437 printf("number of non-empty grids is %ld\n",num_nonempty); | |
2438 | |
2439 | |
2440 | |
2441 /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */ | |
2442 free(sorted_seq); | |
2443 | |
2444 | |
2445 dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty); | |
2446 memset(dense_grid_reverse,0,sizeof(long)*num_nonempty); | |
2447 | |
2448 /************************************************* select_dense **********************************************/ | |
2449 | |
2450 if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it | |
2451 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); | |
2452 else | |
2453 { | |
2454 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); | |
2455 printf("automated selected density threshold is %d\n",den_t_event); | |
2456 } | |
2457 | |
2458 printf("Number of dense grids is %ld\n",num_dense_grids); | |
2459 | |
2460 densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids); | |
2461 memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids); | |
2462 | |
2463 for (i=0;i<num_dense_grids;i++) | |
2464 densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1 | |
2465 | |
2466 | |
2467 eventID_To_denseventID=(long *)malloc(sizeof(long)*file_Len); | |
2468 memset(eventID_To_denseventID,0,sizeof(long)*file_Len); //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k | |
2469 | |
2470 | |
2471 grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); | |
2472 | |
2473 | |
2474 dense_grid_seq=(long **)malloc(sizeof(long*)*num_dense_grids); | |
2475 memset(dense_grid_seq,0,sizeof(long*)*num_dense_grids); | |
2476 for (i=0;i<num_dense_grids;i++) | |
2477 { | |
2478 dense_grid_seq[i]=(long *)malloc(sizeof(long)*num_dm); | |
2479 memset(dense_grid_seq[i],0,sizeof(long)*num_dm); | |
2480 } | |
2481 | |
2482 | |
2483 /* Look up the binned data values for each dense grid */ | |
2484 generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq); | |
2485 | |
2486 | |
2487 /************************************************* allocate memory *********************************************/ | |
2488 | |
2489 grid_clusterID=(long *)malloc(sizeof(long)*num_dense_grids); | |
2490 memset(grid_clusterID,0,sizeof(long)*num_dense_grids); | |
2491 | |
2492 grid_ID=(long *)malloc(sizeof(long)*num_dense_events); | |
2493 memset(grid_ID,0,sizeof(long)*num_dense_events); | |
2494 | |
2495 cluster_ID=(long *)malloc(sizeof(long)*num_dense_events); | |
2496 memset(cluster_ID,0,sizeof(long)*num_dense_events); | |
2497 | |
2498 | |
2499 /*********************************************** merge_grids ***********************************************/ | |
2500 //long merge_grids(long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, long *dense_grid_rank, long *dense_grid_reverse, | |
2501 // long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID, | |
2502 // long *cluster_ID, long *grid_ID, long *grid_clusterID) | |
2503 | |
2504 num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID); | |
2505 | |
2506 printf("computed number of groups is %ld\n",num_clust); | |
2507 | |
2508 | |
2509 /************************************** release unnecessary memory and allocate memory and compute centers **********************************/ | |
2510 | |
2511 | |
2512 for (i=0;i<file_Len;i++) | |
2513 free(position[i]); | |
2514 free(position); | |
2515 | |
2516 for (i=0;i<num_dense_grids;i++) | |
2517 free(dense_grid_seq[i]); | |
2518 free(dense_grid_seq); | |
2519 | |
2520 free(dense_grid_reverse); | |
2521 | |
2522 free(densegridID_To_gridevent); | |
2523 free(all_grid_ID); | |
2524 | |
2525 // cluster_center //////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2526 | |
2527 cluster_center=(double **)malloc(sizeof(double*)*num_clust); | |
2528 memset(cluster_center,0,sizeof(double*)*num_clust); | |
2529 for (i=0;i<num_clust;i++) | |
2530 { | |
2531 cluster_center[i]=(double*)malloc(sizeof(double)*num_dm); | |
2532 memset(cluster_center[i],0,sizeof(double)*num_dm); | |
2533 } | |
2534 | |
2535 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data | |
2536 | |
2537 //printf("pass the first ID2center\n"); | |
2538 | |
2539 /*** population_ID and grid_populationID **/ | |
2540 | |
2541 cluster_populationID=(long*)malloc(sizeof(long)*num_clust); | |
2542 memset(cluster_populationID,0,sizeof(long)*num_clust); | |
2543 | |
2544 grid_populationID=(long*)malloc(sizeof(long)*num_dense_grids); | |
2545 memset(grid_populationID,0,sizeof(long)*num_dense_grids); | |
2546 | |
2547 population_ID=(long*)malloc(sizeof(long)*num_dense_events); | |
2548 memset(population_ID,0,sizeof(long)*num_dense_events); | |
2549 | |
2550 num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID, max_num_pop); | |
2551 | |
2552 | |
2553 | |
2554 for (i=0;i<num_clust;i++) | |
2555 free(cluster_center[i]); | |
2556 free(cluster_center); | |
2557 | |
2558 free(interval); | |
2559 | |
2560 for (i=0;i<num_dense_grids;i++) | |
2561 { | |
2562 grid_populationID[i]=cluster_populationID[grid_clusterID[i]]; | |
2563 } | |
2564 | |
2565 for (i=0;i<num_dense_events;i++) | |
2566 { | |
2567 population_ID[i]=cluster_populationID[cluster_ID[i]]; | |
2568 } | |
2569 | |
2570 printf("Number of merged groups before second-run partitioning is %ld\n",num_population); | |
2571 | |
2572 | |
2573 | |
2574 // population_center ///////////////////////////////////////////////////////////////////////////////////////////////////// | |
2575 | |
2576 | |
2577 population_center=(double **)malloc(sizeof(double*)*num_population); | |
2578 memset(population_center,0,sizeof(double*)*num_population); | |
2579 for (i=0;i<num_population;i++) | |
2580 { | |
2581 population_center[i]=(double*)malloc(sizeof(double)*num_dm); | |
2582 memset(population_center[i],0,sizeof(double)*num_dm); | |
2583 } | |
2584 | |
2585 | |
2586 | |
2587 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data | |
2588 | |
2589 | |
2590 // show //////////////////////////////////////////////////////////////////////////////// | |
2591 all_population_ID=(long*)malloc(sizeof(long)*file_Len); | |
2592 memset(all_population_ID,0,sizeof(long)*file_Len); | |
2593 | |
2594 kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center); | |
2595 | |
2596 for (i=0;i<num_population;i++) | |
2597 free(population_center[i]); | |
2598 | |
2599 free(population_center); | |
2600 | |
2601 ////// there needs to be another step to further partition the data | |
2602 | |
2603 | |
2604 center_p_1=(double *)malloc(sizeof(double)*3); | |
2605 memset(center_p_1,0,sizeof(double)*3); | |
2606 | |
2607 center_p_2=(double *)malloc(sizeof(double)*3); | |
2608 memset(center_p_2,0,sizeof(double)*3); | |
2609 | |
2610 center_p_3=(double *)malloc(sizeof(double)*3); | |
2611 memset(center_p_3,0,sizeof(double)*3); | |
2612 | |
2613 size_p=(long *)malloc(sizeof(long)*num_population); | |
2614 memset(size_p,0,sizeof(long)*num_population); | |
2615 | |
2616 temp_size_j=(long *)malloc(sizeof(long)*num_population); | |
2617 memset(temp_size_j,0,sizeof(long)*num_population); | |
2618 | |
2619 all_computed_pop_ID=(long *)malloc(sizeof(long)*file_Len); | |
2620 memset(all_computed_pop_ID,0,sizeof(long)*file_Len); | |
2621 | |
2622 for (i=0;i<file_Len;i++) | |
2623 { | |
2624 size_p[all_population_ID[i]]++; | |
2625 all_computed_pop_ID[i]=all_population_ID[i]; | |
2626 } | |
2627 | |
2628 ind_pop=(double***)malloc(sizeof(double**)*num_population); | |
2629 memset(ind_pop,0,sizeof(double**)*num_population); | |
2630 | |
2631 ind_pop_3=(double***)malloc(sizeof(double**)*num_population); | |
2632 memset(ind_pop_3,0,sizeof(double**)*num_population); | |
2633 | |
2634 for (i=0;i<num_population;i++) | |
2635 { | |
2636 ind_pop[i]=(double**)malloc(sizeof(double*)*size_p[i]); | |
2637 memset(ind_pop[i],0,sizeof(double*)*size_p[i]); | |
2638 | |
2639 ind_pop_3[i]=(double**)malloc(sizeof(double*)*size_p[i]); | |
2640 memset(ind_pop_3[i],0,sizeof(double*)*size_p[i]); | |
2641 | |
2642 for (j=0;j<size_p[i];j++) | |
2643 { | |
2644 ind_pop[i][j]=(double*)malloc(sizeof(double)*num_dm); | |
2645 memset(ind_pop[i][j],0,sizeof(double)*num_dm); | |
2646 | |
2647 ind_pop_3[i][j]=(double*)malloc(sizeof(double)*3); | |
2648 memset(ind_pop_3[i][j],0,sizeof(double)*3); | |
2649 } | |
2650 temp_size_j[i]=0; | |
2651 } | |
2652 | |
2653 for (i=0;i<file_Len;i++) //to generate ind_pop[i] | |
2654 { | |
2655 index_id=all_population_ID[i]; | |
2656 | |
2657 j=temp_size_j[index_id]; | |
2658 | |
2659 for (t=0;t<num_dm;t++) | |
2660 { | |
2661 ind_pop[index_id][j][t]=input_data[i][t]; | |
2662 } | |
2663 temp_size_j[index_id]++; | |
2664 } | |
2665 | |
2666 aver=(double*)malloc(sizeof(double)*num_dm); | |
2667 memset(aver,0,sizeof(double)*num_dm); | |
2668 | |
2669 std=(double*)malloc(sizeof(double)*num_dm); | |
2670 memset(std,0,sizeof(double)*num_dm); | |
2671 | |
2672 temp_population_center=(double**)malloc(sizeof(double*)*2); | |
2673 memset(temp_population_center,0,sizeof(double*)*2); | |
2674 for (i=0;i<2;i++) | |
2675 { | |
2676 temp_population_center[i]=(double*)malloc(sizeof(double)*3); | |
2677 memset(temp_population_center[i],0,sizeof(double)*3); | |
2678 } | |
2679 | |
2680 size_p_2=(long*)malloc(sizeof(long)*2); | |
2681 memset(size_p_2,0,sizeof(long)*2); | |
2682 | |
2683 partit=(long*)malloc(sizeof(long)*num_population); | |
2684 memset(partit,0,sizeof(long)*num_population); | |
2685 | |
2686 num_computed_population=num_population; | |
2687 | |
2688 temp_population_ID=(long**)malloc(sizeof(long*)*num_population); | |
2689 memset(temp_population_ID,0,sizeof(long*)*num_population); | |
2690 | |
2691 for (i=0;i<num_population;i++) | |
2692 { | |
2693 temp_population_ID[i]=(long*)malloc(sizeof(long)*size_p[i]); | |
2694 memset(temp_population_ID[i],0,sizeof(long)*size_p[i]); | |
2695 } | |
2696 | |
2697 //printf("num_population=%d\n",num_population); | |
2698 | |
2699 for (i=0;i<num_population;i++) | |
2700 { | |
2701 partit[i]=0; | |
2702 tran(ind_pop[i], size_p[i], num_dm, 1, ind_pop[i]);//0-1 normalize ind_pop[i] | |
2703 | |
2704 //find the 3 dimensions with the largest std for ind_pop[i] | |
2705 d1=-1; | |
2706 d2=-1; | |
2707 d3=-1; | |
2708 | |
2709 for (t=0;t<num_dm;t++) | |
2710 { | |
2711 aver[t]=0; | |
2712 for (j=0;j<size_p[i];j++) | |
2713 { | |
2714 aver[t]=aver[t]+ind_pop[i][j][t]; | |
2715 } | |
2716 aver[t]=aver[t]/(double)size_p[i]; | |
2717 | |
2718 std[t]=0; | |
2719 for (j=0;j<size_p[i];j++) | |
2720 { | |
2721 std[t]=std[t]+((ind_pop[i][j][t]-aver[t])*(ind_pop[i][j][t]-aver[t])); | |
2722 } | |
2723 std[t]=sqrt(std[t]/(double)size_p[i]); | |
2724 } | |
2725 | |
2726 | |
2727 | |
2728 for (j=0;j<3;j++) | |
2729 { | |
2730 max_d_dist=0; | |
2731 for (t=0;t<num_dm;t++) | |
2732 { | |
2733 if ((t!=d1) && (t!=d2)) | |
2734 { | |
2735 dist=std[t]; | |
2736 } | |
2737 else | |
2738 dist=-1; | |
2739 | |
2740 if (dist>max_d_dist) | |
2741 { | |
2742 max_d_dist=dist; | |
2743 d_d=t; | |
2744 } | |
2745 } | |
2746 | |
2747 if (j==0) | |
2748 d1=d_d; | |
2749 if (j==1) | |
2750 d2=d_d; | |
2751 if (j==2) | |
2752 d3=d_d; | |
2753 } | |
2754 | |
2755 | |
2756 for (t=0;t<size_p[i];t++) | |
2757 { | |
2758 ind_pop_3[i][t][0]=ind_pop[i][t][d1]; | |
2759 ind_pop_3[i][t][1]=ind_pop[i][t][d2]; | |
2760 ind_pop_3[i][t][2]=ind_pop[i][t][d3]; | |
2761 } | |
2762 | |
2763 | |
2764 temp_population_center[0][0]=(aver[d1])/2.0; | |
2765 | |
2766 temp_population_center[0][1]=aver[d2]; | |
2767 temp_population_center[0][2]=aver[d3]; | |
2768 | |
2769 temp_population_center[1][0]=(aver[d1]+1.0)/2.0; | |
2770 | |
2771 temp_population_center[1][1]=aver[d2]; | |
2772 temp_population_center[1][2]=aver[d3]; | |
2773 | |
2774 | |
2775 | |
2776 //run K-means with K=2 | |
2777 kmeans(ind_pop_3[i], 2, KMEANS_TERM, size_p[i], 3, temp_population_ID[i], temp_population_center); | |
2778 | |
2779 for (j=0;j<2;j++) | |
2780 size_p_2[j]=0; | |
2781 | |
2782 for (j=0;j<size_p[i];j++) | |
2783 size_p_2[temp_population_ID[i][j]]++; | |
2784 | |
2785 for (j=0;j<2;j++) | |
2786 if (size_p_2[j]==0) | |
2787 printf("size_p_2=0 at i=%ld\n",i); | |
2788 | |
2789 | |
2790 //check whether the 2 parts should be merged | |
2791 | |
2792 | |
2793 Ei=get_avg_dist(temp_population_center[0], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2); | |
2794 Ej=get_avg_dist(temp_population_center[1], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2); | |
2795 | |
2796 for (t=0;t<3;t++) | |
2797 { | |
2798 center_p_1[t]=temp_population_center[0][t]*0.25+temp_population_center[1][t]*0.75; | |
2799 center_p_2[t]=temp_population_center[0][t]*0.5+temp_population_center[1][t]*0.5; | |
2800 center_p_3[t]=temp_population_center[0][t]*0.75+temp_population_center[1][t]*0.25; | |
2801 } | |
2802 | |
2803 E1=get_avg_dist(center_p_1, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2); | |
2804 | |
2805 E2=get_avg_dist(center_p_2, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2); | |
2806 | |
2807 E3=get_avg_dist(center_p_3, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2); | |
2808 | |
2809 //printf("i=%d;E1=%f;E2=%f;E3=%f\n",i,E1,E2,E3); | |
2810 | |
2811 if (E1<E2) | |
2812 Ep=E2; | |
2813 else | |
2814 Ep=E1; | |
2815 | |
2816 Ep=max(Ep,E3); //Ep is the most sparse area | |
2817 | |
2818 | |
2819 | |
2820 if ((Ep>Ei) && (Ep>Ej)) //the two parts should be partitioned | |
2821 { | |
2822 partit[i]=num_computed_population; | |
2823 num_computed_population++; | |
2824 } | |
2825 | |
2826 } //end for (i=0;i<num_population;i++) | |
2827 | |
2828 | |
2829 for (i=0;i<num_population;i++) | |
2830 temp_size_j[i]=0; | |
2831 | |
2832 for (i=0;i<file_Len;i++) | |
2833 { | |
2834 index_id=all_population_ID[i]; | |
2835 if (partit[index_id]>0) | |
2836 { | |
2837 j=temp_size_j[index_id]; | |
2838 if (temp_population_ID[index_id][j]==1) | |
2839 all_computed_pop_ID[i]=partit[index_id]; | |
2840 | |
2841 temp_size_j[index_id]++; | |
2842 } | |
2843 } | |
2844 | |
2845 printf("Number of groups after partitioning is %ld\n", num_computed_population); | |
2846 | |
2847 | |
2848 free(size_p_2); | |
2849 | |
2850 free(aver); | |
2851 free(std); | |
2852 free(partit); | |
2853 | |
2854 free(center_p_1); | |
2855 free(center_p_2); | |
2856 free(center_p_3); | |
2857 | |
2858 | |
2859 | |
2860 for (i=0;i<num_population;i++) | |
2861 free(temp_population_ID[i]); | |
2862 free(temp_population_ID); | |
2863 | |
2864 for (i=0;i<num_population;i++) | |
2865 { | |
2866 for (j=0;j<size_p[i];j++) | |
2867 { | |
2868 free(ind_pop[i][j]); | |
2869 free(ind_pop_3[i][j]); | |
2870 } | |
2871 } | |
2872 | |
2873 | |
2874 | |
2875 for (i=0;i<num_population;i++) | |
2876 { | |
2877 free(ind_pop[i]); | |
2878 free(ind_pop_3[i]); | |
2879 } | |
2880 free(ind_pop); | |
2881 free(ind_pop_3); | |
2882 | |
2883 | |
2884 | |
2885 | |
2886 for (i=0;i<2;i++) | |
2887 free(temp_population_center[i]); | |
2888 | |
2889 free(temp_population_center); | |
2890 | |
2891 free(temp_size_j); | |
2892 free(size_p); | |
2893 | |
2894 | |
2895 | |
2896 //update the IDs, Centers, and # of populations | |
2897 for (i=0;i<file_Len;i++) | |
2898 { | |
2899 all_population_ID[i]=all_computed_pop_ID[i]; | |
2900 if (all_population_ID[i]>=num_computed_population) | |
2901 printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]); | |
2902 } | |
2903 | |
2904 num_population=num_computed_population; | |
2905 | |
2906 free(all_computed_pop_ID); | |
2907 //end partitioning | |
2908 // since the num_population has changed, population_center needs to be redefined as below | |
2909 | |
2910 population_center=(double **)malloc(sizeof(double*)*num_population); | |
2911 memset(population_center,0,sizeof(double*)*num_population); | |
2912 for (i=0;i<num_population;i++) | |
2913 { | |
2914 population_center[i]=(double*)malloc(sizeof(double)*num_dm); | |
2915 memset(population_center[i],0,sizeof(double)*num_dm); | |
2916 } | |
2917 | |
2918 | |
2919 ID2Center_all(normalized_data,file_Len,num_dm,num_population,all_population_ID,population_center); | |
2920 | |
2921 | |
2922 | |
2923 ////// end of further partitioning | |
2924 | |
2925 //////////////////////////////////////////////////////////// | |
2926 // to further merge populations to avoid overpartitioning | |
2927 // Added June 20, 2010 | |
2928 //////////////////////////////////////////////////////////// | |
2929 | |
2930 | |
2931 | |
2932 num_real_pop=num_population; | |
2933 keep_merge=1; | |
2934 | |
2935 if (num_pop>num_real_pop) | |
2936 { | |
2937 fprintf(stderr,"number of populations specified too large\n"); | |
2938 exit(0); | |
2939 } | |
2940 | |
2941 center_1=(double *)malloc(sizeof(double)*num_dm); | |
2942 memset(center_1,0,sizeof(double)*num_dm); | |
2943 | |
2944 center_2=(double *)malloc(sizeof(double)*num_dm); | |
2945 memset(center_2,0,sizeof(double)*num_dm); | |
2946 | |
2947 center_3=(double *)malloc(sizeof(double)*num_dm); | |
2948 memset(center_3,0,sizeof(double)*num_dm); | |
2949 | |
2950 | |
2951 | |
2952 while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2))) | |
2953 { | |
2954 | |
2955 keep_merge=0; //so, if no entering merge function, while will exit when num_pop==0 | |
2956 | |
2957 real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop); | |
2958 memset(real_pop_center,0,sizeof(double*)*num_real_pop); | |
2959 for (i=0;i<num_real_pop;i++) | |
2960 { | |
2961 real_pop_center[i]=(double*)malloc(sizeof(double)*num_dm); | |
2962 memset(real_pop_center[i],0,sizeof(double)*num_dm); | |
2963 } | |
2964 | |
2965 min_pop=(double **)malloc(sizeof(double*)*num_real_pop); | |
2966 memset(min_pop,0,sizeof(double*)*num_real_pop); | |
2967 for (i=0;i<num_real_pop;i++) | |
2968 { | |
2969 min_pop[i]=(double*)malloc(sizeof(double)*num_dm); | |
2970 memset(min_pop[i],0,sizeof(double)*num_dm); | |
2971 } | |
2972 | |
2973 max_pop=(double **)malloc(sizeof(double*)*num_real_pop); | |
2974 memset(max_pop,0,sizeof(double*)*num_real_pop); | |
2975 for (i=0;i<num_real_pop;i++) | |
2976 { | |
2977 max_pop[i]=(double*)malloc(sizeof(double)*num_dm); | |
2978 memset(max_pop[i],0,sizeof(double)*num_dm); | |
2979 } | |
2980 | |
2981 if (num_real_pop==num_population) | |
2982 { | |
2983 for (i=0;i<num_real_pop;i++) | |
2984 for (j=0;j<num_dm;j++) | |
2985 real_pop_center[i][j]=population_center[i][j]; | |
2986 } | |
2987 else | |
2988 { | |
2989 ID2Center_all(normalized_data,file_Len,num_dm,num_real_pop,all_population_ID,real_pop_center); | |
2990 } | |
2991 | |
2992 for (i=0;i<num_real_pop;i++) | |
2993 { | |
2994 for (j=0;j<num_dm;j++) | |
2995 { | |
2996 min_pop[i][j]=MAX_VALUE; | |
2997 | |
2998 max_pop[i][j]=0; | |
2999 } | |
3000 } | |
3001 | |
3002 for (i=0;i<file_Len;i++) | |
3003 { | |
3004 index_id=all_population_ID[i]; | |
3005 for (j=0;j<num_dm;j++) | |
3006 { | |
3007 if (normalized_data[i][j]<min_pop[index_id][j]) | |
3008 min_pop[index_id][j]=normalized_data[i][j]; | |
3009 | |
3010 if (normalized_data[i][j]>max_pop[index_id][j]) | |
3011 max_pop[index_id][j]=normalized_data[i][j]; | |
3012 } | |
3013 } | |
3014 | |
3015 /* for (i=0;i<num_real_pop;i++) | |
3016 { | |
3017 for (j=0;j<num_dm;j++) | |
3018 { | |
3019 printf("min_pop is %f\t",min_pop[i][j]); | |
3020 //printf("max_pop is %f\n",max_pop[i][j]); | |
3021 } | |
3022 printf("\n"); | |
3023 } | |
3024 */ | |
3025 | |
3026 | |
3027 distance_pop=(double **)malloc(sizeof(double*)*num_real_pop); | |
3028 memset(distance_pop,0,sizeof(double*)*num_real_pop); | |
3029 for (i=0;i<num_real_pop;i++) | |
3030 { | |
3031 distance_pop[i]=(double*)malloc(sizeof(double)*num_real_pop); | |
3032 memset(distance_pop[i],0,sizeof(double)*num_real_pop); | |
3033 } | |
3034 | |
3035 for (i=0;i<num_real_pop-1;i++) | |
3036 { | |
3037 for (j=i+1;j<num_real_pop;j++) | |
3038 { | |
3039 distance=0; | |
3040 | |
3041 for (t=0;t<num_dm;t++) | |
3042 { | |
3043 dist=real_pop_center[i][t]-real_pop_center[j][t]; | |
3044 distance=distance+dist*dist; | |
3045 } | |
3046 distance_pop[i][j]=distance; | |
3047 distance_pop[j][i]=distance; | |
3048 } | |
3049 } | |
3050 | |
3051 if (num_real_pop>2) | |
3052 num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances | |
3053 else | |
3054 num_checked_range=1; | |
3055 | |
3056 size_c=(long *)malloc(sizeof(long)*num_real_pop); | |
3057 memset(size_c,0,sizeof(long)*num_real_pop); | |
3058 | |
3059 for (i=0;i<file_Len;i++) | |
3060 size_c[all_population_ID[i]]++; | |
3061 | |
3062 for (p=0;p<num_checked_range;p++) | |
3063 { | |
3064 current_smallest_dist=MAX_VALUE; | |
3065 | |
3066 for (i=0;i<num_real_pop-1;i++) | |
3067 { | |
3068 for (j=i+1;j<num_real_pop;j++) | |
3069 { | |
3070 if (distance_pop[i][j]<current_smallest_dist) | |
3071 { | |
3072 current_smallest_dist=distance_pop[i][j]; | |
3073 | |
3074 temp_i=i; | |
3075 temp_j=j; | |
3076 } | |
3077 } | |
3078 } | |
3079 | |
3080 if (p==0) | |
3081 { | |
3082 first_i=temp_i; | |
3083 first_j=temp_j; | |
3084 } | |
3085 | |
3086 distance_pop[temp_i][temp_j]=MAX_VALUE+1; //make sure this pair won't be selected next time | |
3087 | |
3088 //normalize and calculate std for temp_i and temp_j | |
3089 | |
3090 //pick the largest 3 dimensions on standard deviation | |
3091 d1=-1; | |
3092 d2=-1; | |
3093 d3=-1; | |
3094 | |
3095 for (i=0;i<3;i++) | |
3096 { | |
3097 max_d_dist=0; | |
3098 for (j=0;j<num_dm;j++) | |
3099 { | |
3100 if ((j!=d1) && (j!=d2)) | |
3101 { | |
3102 temp=min(min_pop[temp_i][j],min_pop[temp_j][j]); | |
3103 tmp=max(max_pop[temp_i][j],max_pop[temp_j][j]); | |
3104 if (tmp<=temp) | |
3105 { | |
3106 //printf("min_pop[%d][%d]=%f \t min_pop[%d][%d]=%f \t max_pop[%d][%d]=%f \t max_pop[%d][%d]=%f\n",temp_i,j,min_pop[temp_i][j],temp_j, j, min_pop[temp_j][j],temp_i,j,max_pop[temp_i][j],temp_j,j,max_pop[temp_j][j]); | |
3107 dist=0; | |
3108 } | |
3109 else | |
3110 dist=(real_pop_center[temp_i][j]-real_pop_center[temp_j][j])/(tmp-temp); | |
3111 if (dist<0) | |
3112 dist=-dist; | |
3113 } | |
3114 else | |
3115 dist=-1; | |
3116 | |
3117 if (dist>max_d_dist) | |
3118 { | |
3119 max_d_dist=dist; | |
3120 d_d=j; | |
3121 } | |
3122 } | |
3123 | |
3124 if (i==0) | |
3125 d1=d_d; | |
3126 if (i==1) | |
3127 d2=d_d; | |
3128 if (i==2) | |
3129 d3=d_d; | |
3130 } | |
3131 | |
3132 //printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3); | |
3133 | |
3134 Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); | |
3135 Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); | |
3136 | |
3137 for (t=0;t<num_dm;t++) | |
3138 { | |
3139 center_1[t]=real_pop_center[temp_i][t]*0.25+real_pop_center[temp_j][t]*0.75; | |
3140 center_2[t]=real_pop_center[temp_i][t]*0.5+real_pop_center[temp_j][t]*0.5; | |
3141 center_3[t]=real_pop_center[temp_i][t]*0.75+real_pop_center[temp_j][t]*0.25; | |
3142 } | |
3143 | |
3144 E1=get_avg_dist(center_1, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); | |
3145 | |
3146 E2=get_avg_dist(center_2, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); | |
3147 | |
3148 E3=get_avg_dist(center_3, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); | |
3149 | |
3150 if (E1<E2) | |
3151 Ep=E2; | |
3152 else | |
3153 Ep=E1; | |
3154 | |
3155 Ep=max(Ep,E3); //Ep is the most sparse area | |
3156 | |
3157 Ep=Ep*E_T; | |
3158 //printf("Ep=%f;Ei=%f;Ej=%f\n",Ep,Ei,Ej); | |
3159 | |
3160 if ((Ep<=Ei) || (Ep<=Ej))//if the most sparse area between i and j are still denser than one of them, temp_i and temp_j should be merged | |
3161 { | |
3162 keep_merge=1; | |
3163 break; | |
3164 }//end if (Ep) | |
3165 } //end for p | |
3166 | |
3167 //printf("keep_merge=%d\n",keep_merge); | |
3168 | |
3169 for (i=0;i<num_real_pop;i++) | |
3170 { | |
3171 free(real_pop_center[i]); | |
3172 free(distance_pop[i]); | |
3173 free(min_pop[i]); | |
3174 free(max_pop[i]); | |
3175 } | |
3176 free(real_pop_center); | |
3177 free(min_pop); | |
3178 free(max_pop); | |
3179 free(distance_pop); | |
3180 free(size_c); | |
3181 | |
3182 //printf("temp_i=%d;temp_j=%d\n",temp_i,temp_j); | |
3183 | |
3184 if (keep_merge) //found one within p loop | |
3185 { | |
3186 for (i=0;i<file_Len;i++) | |
3187 { | |
3188 if (all_population_ID[i]>temp_j) | |
3189 { | |
3190 all_population_ID[i]=all_population_ID[i]-1; | |
3191 } | |
3192 else | |
3193 { | |
3194 if (all_population_ID[i]==temp_j) | |
3195 { | |
3196 all_population_ID[i]=temp_i; | |
3197 } | |
3198 } | |
3199 } | |
3200 | |
3201 num_real_pop--; | |
3202 } | |
3203 else | |
3204 { | |
3205 if ((num_pop!=0) && (num_pop<num_real_pop)) //not reach the specified num of pop | |
3206 { | |
3207 for (i=0;i<file_Len;i++) | |
3208 { | |
3209 if (all_population_ID[i]>first_j) | |
3210 { | |
3211 all_population_ID[i]=all_population_ID[i]-1; | |
3212 } | |
3213 else | |
3214 { | |
3215 if (all_population_ID[i]==first_j) | |
3216 { | |
3217 all_population_ID[i]=first_i; | |
3218 } | |
3219 } | |
3220 } | |
3221 | |
3222 num_real_pop--; | |
3223 | |
3224 | |
3225 | |
3226 } | |
3227 } | |
3228 //printf("num_real_pop is %d\n",num_real_pop); | |
3229 | |
3230 } //end of while | |
3231 | |
3232 | |
3233 | |
3234 free(center_1); | |
3235 free(center_2); | |
3236 free(center_3); | |
3237 printf("Final number of populations is %ld\n",num_real_pop); | |
3238 | |
3239 new_population_center=(double **)malloc(sizeof(double*)*num_real_pop); | |
3240 memset(new_population_center,0,sizeof(double*)*num_real_pop); | |
3241 for (i=0;i<num_real_pop;i++) | |
3242 { | |
3243 new_population_center[i]=(double*)malloc(sizeof(double)*num_dm); | |
3244 memset(new_population_center[i],0,sizeof(double)*num_dm); | |
3245 } | |
3246 | |
3247 | |
3248 /////////////////////////////////////////// | |
3249 //End of population mapping | |
3250 /////////////////////////////////////////// | |
3251 | |
3252 show(input_data, all_population_ID, file_Len, num_real_pop, num_dm, para_name_string); | |
3253 | |
3254 ID2Center_all(input_data,file_Len,num_dm,num_real_pop,all_population_ID,new_population_center); | |
3255 | |
3256 | |
3257 f_cid=fopen("population_id.txt","w"); | |
3258 f_ctr=fopen("population_center.txt","w"); | |
3259 f_out=fopen("coordinates.txt","w"); | |
3260 f_results=fopen("flock_results.txt","w"); | |
3261 | |
3262 /* | |
3263 f_parameters=fopen("parameters.txt","w"); | |
3264 fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin); | |
3265 fprintf(f_parameters,"Density\t%f\n",aver_index); | |
3266 fclose(f_parameters); | |
3267 */ | |
3268 | |
3269 for (i=0;i<file_Len;i++) | |
3270 fprintf(f_cid,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 | |
3271 | |
3272 /* | |
3273 * New to check for min/max to add to parameters.txt | |
3274 * | |
3275 */ | |
3276 | |
3277 fprintf(f_out,"%s\n",para_name_string); | |
3278 //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string); | |
3279 fprintf(f_results,"%s\tPopulation\n",para_name_string); | |
3280 for (i=0;i<file_Len;i++) | |
3281 { | |
3282 for (j=0;j<num_dm;j++) | |
3283 { | |
3284 if (input_data[i][j] < min) { | |
3285 min = (int)input_data[i][j]; | |
3286 } | |
3287 if (input_data[i][j] > max) { | |
3288 max = (int)input_data[i][j]; | |
3289 } | |
3290 if (j==num_dm-1) | |
3291 { | |
3292 fprintf(f_out,"%d\n",(int)input_data[i][j]); | |
3293 fprintf(f_results,"%d\t",(int)input_data[i][j]); | |
3294 } | |
3295 else | |
3296 { | |
3297 fprintf(f_out,"%d\t",(int)input_data[i][j]); | |
3298 fprintf(f_results,"%d\t",(int)input_data[i][j]); | |
3299 } | |
3300 } | |
3301 //fprintf(f_results,"%ld\t",i + 1); | |
3302 fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 | |
3303 } | |
3304 | |
3305 /* | |
3306 f_parameters=fopen("parameters.txt","w"); | |
3307 fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin); | |
3308 fprintf(f_parameters,"Density\t%d\n",den_t_event); | |
3309 fprintf(f_parameters,"Min\t%d\n",min); | |
3310 fprintf(f_parameters,"Max\t%d\n",max); | |
3311 fclose(f_parameters); | |
3312 */ | |
3313 | |
3314 f_properties=fopen("fcs.properties","w"); | |
3315 fprintf(f_properties,"Bins=%ld\n",num_bin); | |
3316 fprintf(f_properties,"Density=%d\n",den_t_event); | |
3317 fprintf(f_properties,"Min=%d\n",min); | |
3318 fprintf(f_properties,"Max=%d\n",max); | |
3319 fprintf(f_properties,"Populations=%ld\n",num_real_pop); | |
3320 fprintf(f_properties,"Events=%ld\n",file_Len); | |
3321 fprintf(f_properties,"Markers=%ld\n",num_dm); | |
3322 fclose(f_properties); | |
3323 | |
3324 for (i=0;i<num_real_pop;i++) { | |
3325 /* Add if we want to include population id in the output | |
3326 */ | |
3327 fprintf(f_ctr,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009 | |
3328 | |
3329 for (j=0;j<num_dm;j++) { | |
3330 if (j==num_dm-1) | |
3331 fprintf(f_ctr,"%.0f\n",new_population_center[i][j]); | |
3332 else | |
3333 fprintf(f_ctr,"%.0f\t",new_population_center[i][j]); | |
3334 } | |
3335 } | |
3336 | |
3337 //added April 16, 2009 | |
3338 f_mfi=fopen("MFI.txt","w"); | |
3339 | |
3340 for (i=0;i<num_real_pop;i++) | |
3341 { | |
3342 fprintf(f_mfi,"%ld\t",i+1); | |
3343 | |
3344 for (j=0;j<num_dm;j++) | |
3345 { | |
3346 if (j==num_dm-1) | |
3347 fprintf(f_mfi,"%.0f\n",new_population_center[i][j]); | |
3348 else | |
3349 fprintf(f_mfi,"%.0f\t",new_population_center[i][j]); | |
3350 } | |
3351 } | |
3352 fclose(f_mfi); | |
3353 | |
3354 //ended April 16, 2009 | |
3355 | |
3356 fclose(f_cid); | |
3357 fclose(f_ctr); | |
3358 fclose(f_out); | |
3359 fclose(f_results); | |
3360 | |
3361 | |
3362 for (i=0;i<num_population;i++) | |
3363 free(population_center[i]); | |
3364 | |
3365 free(population_center); | |
3366 | |
3367 for (i=0;i<num_real_pop;i++) | |
3368 free(new_population_center[i]); | |
3369 | |
3370 free(new_population_center); | |
3371 | |
3372 for (i=0;i<file_Len;i++) | |
3373 free(normalized_data[i]); | |
3374 free(normalized_data); | |
3375 | |
3376 free(grid_populationID); | |
3377 | |
3378 free(cluster_populationID); | |
3379 free(grid_clusterID); | |
3380 free(cluster_ID); | |
3381 | |
3382 for (i=0;i<file_Len;i++) | |
3383 free(input_data[i]); | |
3384 free(input_data); | |
3385 | |
3386 free(grid_ID); | |
3387 free(population_ID); | |
3388 free(all_population_ID); | |
3389 free(eventID_To_denseventID); | |
3390 | |
3391 /////////////////////////////////////////////////////////// | |
3392 printf("Ending time:\t\t\t\t"); | |
3393 fflush(stdout); | |
3394 system("/bin/date"); | |
3395 | |
3396 /* | |
3397 * Windows version | |
3398 _strtime( tmpbuf ); | |
3399 printf( "Ending time:\t\t\t\t%s\n", tmpbuf ); | |
3400 _strdate( tmpbuf ); | |
3401 printf( "Ending date:\t\t\t\t%s\n", tmpbuf ); | |
3402 */ | |
3403 | |
3404 return 0; | |
3405 } |