1
|
1 ///////////
|
|
2 // Changes made:
|
|
3 // 1. Added another parameter: number of populations
|
|
4 // 2. Added a hierarchical merging step based on density change between centroids of two hyper-regions
|
|
5 // 3. Picked the longest dimensions between the population centroids to judge whether the two parts should be merged
|
|
6 // 4. Removed checking time parameter
|
|
7 // 5. Output error to stderr
|
|
8 // 6. Fixed the bug of density threshold always = 3
|
|
9 // 7. Added another error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR
|
|
10 // 8. Fixed a bug for 2D data by using K=e*K
|
|
11 // 9. Added some header files, may not be necessary
|
|
12 // 10. Added a lower bound (at least two) for number of populations
|
|
13 /***************************************************************************************************************************************
|
|
14
|
|
15 FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann)
|
|
16
|
|
17 Author: (Max) Yu Qian, Ph.D.
|
|
18
|
|
19 Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
|
|
20
|
|
21 Development: November 2005 ~ Forever
|
|
22
|
|
23 Status: July 2010: Release 2.0
|
|
24
|
|
25 Usage: flock data_file
|
|
26 Note: the input file format must be channel values and the delimiter between two values must be a tab.
|
|
27
|
|
28
|
|
29
|
|
30 ****************************************************************************************************************************************/
|
|
31
|
|
32 #include <time.h>
|
|
33 #include <stdio.h>
|
|
34 #include <stdlib.h>
|
|
35 #include <math.h>
|
|
36 #include <string.h>
|
|
37 #include <sys/stat.h>
|
|
38 #include <unistd.h>
|
|
39 #include <assert.h>
|
|
40
|
|
41
|
|
42
|
|
43 #define DEBUG 0
|
|
44 #define LINE_LEN 1024
|
|
45 #define FILE_NAME_LEN 128
|
|
46 #define PARA_NAME_LEN 64
|
|
47 #define MAX_VALUE 1000000000
|
|
48 #define MIN_GRID 6
|
|
49 #define MAX_GRID 50
|
|
50 #define E_T 1.0
|
|
51
|
|
52 #define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max
|
|
53 #define KMEANS_TERM 10
|
|
54 #define MAX_POP_NUM 128
|
|
55
|
|
56 #ifndef max
|
|
57 #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
|
|
58 #endif
|
|
59
|
|
60 #ifndef min
|
|
61 #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
|
|
62 #endif
|
|
63
|
|
64 static long **Gr=0;
|
|
65 static long *gcID = 0; /* grid cluster IDs */
|
|
66 static long *cluster_count=0; /* count of nodes per cluster */
|
|
67 static long ndense=0;
|
|
68 static long ndim=0;
|
|
69 /* cid changes between depth-first searches, but is constant within a
|
|
70 single search, so it goes here. */
|
|
71 static long cid=0;
|
|
72 /* Do a depth-first search for a single connected component in graph
|
|
73 * G. Start from node, tag the nodes found with cid, and record
|
|
74 * the tags in grid_clusterID. Also, record the node count in
|
|
75 * cluster_count. If we find a node that has already been assigned to
|
|
76 * a cluster, that means we're merging two clusters, so zero out the
|
|
77 * old cid's node count.
|
|
78 *
|
|
79 * Note that our graph is constructed as a DAG, so we can be
|
|
80 * guaranteed to terminate without checking for cycles.
|
|
81 *
|
|
82 * Note2: this function can potentially recurse to depth = ndense.
|
|
83 * Plan stack size accordingly.
|
|
84 *
|
|
85 * Output:
|
|
86 *
|
|
87 * grid_clusterID[] -- array where we tag the nodes.
|
|
88 * cluster_count[] -- count of the number of nodes per cluster.
|
|
89 */
|
|
90 static void merge_cluster(long from, long into)
|
|
91 {
|
|
92 int i;
|
|
93
|
|
94 for(i=0; i<ndense; ++i)
|
|
95 if(gcID[i] == from)
|
|
96 gcID[i] = into;
|
|
97 }
|
|
98
|
|
99 void depth_first(long node)
|
|
100 {
|
|
101 long i;
|
|
102
|
|
103 if(gcID[node] == cid) // we're guaranteed no cycles, but it is possible to reach a node
|
|
104 return; // through two different paths in the same cluster. This early
|
|
105 // return saves us some unneeded work.
|
|
106
|
|
107 /* Check to see if we are merging a cluster */
|
|
108 if(gcID[node] >= 0) {
|
|
109 /* We are, so zero the count for the old cluster. */
|
|
110 cluster_count[ gcID[node] ] = 0;
|
|
111 merge_cluster(gcID[node], cid);
|
|
112 return;
|
|
113 }
|
|
114
|
|
115 /* Update for this node */
|
|
116 gcID[node] = cid;
|
|
117 cluster_count[cid]++;
|
|
118
|
|
119 /* Recursively search the child nodes */
|
|
120 for(i=0; i<ndim; ++i)
|
|
121 if(Gr[node][i] >= 0) /* This is a child node */
|
|
122 depth_first(Gr[node][i]);
|
|
123 }
|
|
124
|
|
125 void bail(const char *msg)
|
|
126 {
|
|
127 fprintf(stderr,"%s",msg);
|
|
128 exit(0);
|
|
129 }
|
|
130
|
|
131
|
|
132 static void check_clusters(long *gcID, int ndense)
|
|
133 {
|
|
134 int i;
|
|
135
|
|
136 for(i=0; i<ndense; ++i)
|
|
137 if(gcID[i] < 0) {
|
|
138 fprintf(stderr,"faulty cluster id at i= %d\n",i);
|
|
139 exit(0);
|
|
140 }
|
|
141 }
|
|
142
|
|
143
|
|
144
|
|
145 long find_connected(long **G, long n_dense_grids, long num_dm, long *grid_clusterID)
|
|
146 {
|
|
147 long nclust=0; /* number of clusters found */
|
|
148 long i;
|
|
149 long *subfac;
|
|
150 long clustid=0;
|
|
151 int subval=0,nempty=0;
|
|
152
|
|
153 int sz = n_dense_grids*sizeof(long);
|
|
154 cluster_count = malloc(sz);
|
|
155 if(!cluster_count)
|
|
156 bail("find_connected: Unable to allocate %zd bytes.\n");
|
|
157 memset(cluster_count,0,sz);
|
|
158
|
|
159 /* set up the statics that will be used in the DFS */
|
|
160 Gr=G;
|
|
161 gcID = grid_clusterID;
|
|
162 ndense = n_dense_grids;
|
|
163 ndim = num_dm;
|
|
164
|
|
165 for(i=0;i<ndense;++i)
|
|
166 grid_clusterID[i] = -1;
|
|
167
|
|
168 for(i=0;i<ndense;++i) {
|
|
169 if(grid_clusterID[i] < 0) { /* grid hasn't been assigned yet */
|
|
170 cid = nclust++;
|
|
171 depth_first(i);
|
|
172 }
|
|
173 }
|
|
174
|
|
175
|
|
176
|
|
177 /* At this point we probably have some clusters that are empty due to merging.
|
|
178 We want to compact the cluster numbering to eliminate the empty clusters. */
|
|
179
|
|
180 subfac = malloc(sz);
|
|
181 if(!subfac)
|
|
182 bail("find_connected: Unable to allocate %zd bytes.\n");
|
|
183 subval=0;
|
|
184 nempty=0;
|
|
185
|
|
186 /* cluster #i needs to have its ID decremented by 1 for each empty cluster with
|
|
187 ID < i. Precaclulate the decrements in this loop: */
|
|
188 for(i=0;i<nclust;++i) {
|
|
189 //clustid = grid_clusterID[i];
|
|
190 if(cluster_count[i] == 0) {
|
|
191 subval++;
|
|
192 nempty++;
|
|
193 }
|
|
194 subfac[i] = subval;
|
|
195 }
|
|
196
|
|
197 //printf("nempty is %d\n",nempty);
|
|
198
|
|
199 /* Now apply the decrements to all of the dense grids */
|
|
200 for(i=0;i<ndense;++i) {
|
|
201 clustid = grid_clusterID[i];
|
|
202 grid_clusterID[i] -= subfac[clustid];
|
|
203 }
|
|
204
|
|
205
|
|
206
|
|
207 /* correct the number of clusters found */
|
|
208 nclust -= nempty;
|
|
209
|
|
210 //printf("nclust is %d\n",nclust);
|
|
211
|
|
212 return nclust;
|
|
213 }
|
|
214
|
|
215 /************************************* Read basic info of the source file **************************************/
|
|
216 /************************************* Read basic info of the source file **************************************/
|
|
217 void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, long *time_ID)
|
|
218 {
|
|
219 char src[LINE_LEN];
|
|
220 char current_name[64];
|
|
221 char prv;
|
|
222
|
|
223 long num_rows=0;
|
|
224 long num_columns=0;
|
|
225 long ch='\n';
|
|
226 long prev='\n';
|
|
227 long time_pos=0;
|
|
228 long i=0;
|
|
229 long j=0;
|
|
230 int sw=0;
|
|
231
|
|
232 src[0]='\0';
|
|
233 fgets(src, LINE_LEN, f_src);
|
|
234
|
|
235 name_string[0]='\0';
|
|
236 current_name[0]='\0';
|
|
237 prv='\n';
|
|
238
|
|
239 while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters
|
|
240 i++;
|
|
241
|
|
242 while ((src[i]!='\0') && (src[i]!='\n')) //repeat until the end of the line
|
|
243 {
|
|
244 current_name[j]=src[i];
|
|
245
|
|
246 if ((src[i]=='\t') && (prv!='\t')) //a complete word
|
|
247 {
|
|
248 current_name[j]='\0';
|
|
249
|
|
250 if (0!=strcmp(current_name,"Time"))
|
|
251 {
|
|
252 num_columns++; //num_columns does not inlcude the column of Time
|
|
253 time_pos++;
|
|
254 if (sw) {
|
|
255 strcat(name_string,"\t");
|
|
256 }
|
|
257 strcat(name_string,current_name);
|
|
258 sw = 1;
|
|
259 }
|
|
260 else
|
|
261 {
|
|
262 *time_ID=time_pos;
|
|
263 }
|
|
264
|
|
265 current_name[0]='\0';
|
|
266 j=0;
|
|
267 }
|
|
268
|
|
269 if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
|
|
270 {
|
|
271 current_name[0]='\0';
|
|
272 j=0;
|
|
273 }
|
|
274
|
|
275 if (src[i]!='\t')
|
|
276 j++;
|
|
277
|
|
278 prv=src[i];
|
|
279 i++;
|
|
280 }
|
|
281
|
|
282 //name_string[j]='\0';
|
|
283 if (prv!='\t') //the last one hasn't been retrieved
|
|
284 {
|
|
285 current_name[j]='\0';
|
|
286 if (0!=strcmp(current_name,"Time"))
|
|
287 {
|
|
288 num_columns++;
|
|
289 strcat(name_string,"\t");
|
|
290 strcat(name_string,current_name);
|
|
291 time_pos++;
|
|
292 }
|
|
293 else
|
|
294 {
|
|
295 *time_ID=time_pos;
|
|
296 }
|
|
297 }
|
|
298 if (DEBUG==1)
|
|
299 {
|
|
300 printf("time_ID is %ld\n",*time_ID);
|
|
301 printf("name_string is %s\n",name_string);
|
|
302 }
|
|
303
|
|
304 //start computing # of rows
|
|
305
|
|
306 while ((ch = fgetc(f_src))!= EOF )
|
|
307 {
|
|
308 if (ch == '\n')
|
|
309 {
|
|
310 ++num_rows;
|
|
311 }
|
|
312 prev = ch;
|
|
313 }
|
|
314 if (prev!='\n')
|
|
315 ++num_rows;
|
|
316
|
|
317 if (num_rows<50)
|
|
318 {
|
|
319 fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010
|
|
320 exit(0);
|
|
321 }
|
|
322
|
|
323 *file_Len=num_rows;
|
|
324 *num_dm=num_columns;
|
|
325
|
|
326 printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm);
|
|
327 }
|
|
328
|
|
329
|
|
330
|
|
331 /************************************* Read the source file into uncomp_data **************************************/
|
|
332 void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, long time_ID)
|
|
333 {
|
|
334 //long time_pass=0; //to mark whether the time_ID has been passed
|
|
335 long index=0;
|
|
336
|
|
337 long i=0;
|
|
338 long j=0;
|
|
339 long t=0;
|
|
340
|
|
341 char src[LINE_LEN];
|
|
342 char xc[LINE_LEN/10];
|
|
343
|
|
344 src[0]='\0';
|
|
345 fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
|
|
346
|
|
347 while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
|
|
348 {
|
|
349 src[0]='\0';
|
|
350 fgets(src,LINE_LEN,f_src);
|
|
351 i=0;
|
|
352 //time_pass=0;
|
|
353
|
|
354 //if (time_ID==-1) //there is no time_ID
|
|
355 // {
|
|
356 for (t=0;t<num_dm;t++)
|
|
357 {
|
|
358 xc[0]='\0';
|
|
359 j=0;
|
|
360 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
|
|
361 {
|
|
362 xc[j]=src[i];
|
|
363 i++;
|
|
364 j++;
|
|
365 }
|
|
366
|
|
367 xc[j]='\0';
|
|
368 i++;
|
|
369
|
|
370 uncomp_data[index][t]=atof(xc);
|
|
371 }
|
|
372 // }
|
|
373 /*else
|
|
374 {
|
|
375 for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
|
|
376 {
|
|
377 xc[0]='\0';
|
|
378 j=0;
|
|
379 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
|
|
380 {
|
|
381 xc[j]=src[i];
|
|
382 i++;
|
|
383 j++;
|
|
384 }
|
|
385
|
|
386 xc[j]='\0';
|
|
387 i++;
|
|
388
|
|
389 if (t==time_ID)
|
|
390 {
|
|
391 time_pass=1;
|
|
392 continue;
|
|
393 }
|
|
394
|
|
395 if (time_pass)
|
|
396 uncomp_data[index][t-1]=atof(xc);
|
|
397 else
|
|
398 uncomp_data[index][t]=atof(xc);
|
|
399 }
|
|
400 }*/ //commented by Yu Qian on Aug 31, 2010 as we do not want to check time_ID anymore
|
|
401 index++;
|
|
402 //fprintf(fout_ID,"%s",src);
|
|
403 } //end of while
|
|
404
|
|
405 if (DEBUG == 1)
|
|
406 {
|
|
407 printf("the last line of the source data is:\n");
|
|
408 for (j=0;j<num_dm;j++)
|
|
409 printf("%f ",uncomp_data[index-1][j]);
|
|
410 printf("\n");
|
|
411 }
|
|
412 }
|
|
413
|
|
414
|
|
415 /**************************************** Normalization ******************************************/
|
|
416 void tran(double **orig_data, long file_Len, long num_dm, long norm_used, double **matrix_to_cluster)
|
|
417 {
|
|
418 long i=0;
|
|
419 long j=0;
|
|
420
|
|
421 double biggest=0;
|
|
422 double smallest=MAX_VALUE;
|
|
423
|
|
424 double *aver; //average of each column
|
|
425 double *std; //standard deviation of each column
|
|
426
|
|
427 aver=(double*)malloc(sizeof(double)*file_Len);
|
|
428 memset(aver,0,sizeof(double)*file_Len);
|
|
429
|
|
430 std=(double*)malloc(sizeof(double)*file_Len);
|
|
431 memset(std,0,sizeof(double)*file_Len);
|
|
432
|
|
433 if (norm_used==2) //z-score normalization
|
|
434 {
|
|
435 for (j=0;j<num_dm;j++)
|
|
436 {
|
|
437 aver[j]=0;
|
|
438 for (i=0;i<file_Len;i++)
|
|
439 aver[j]=aver[j]+orig_data[i][j];
|
|
440 aver[j]=aver[j]/(double)file_Len;
|
|
441
|
|
442 std[j]=0;
|
|
443 for (i=0;i<file_Len;i++)
|
|
444 std[j]=std[j]+((orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]));
|
|
445 std[j]=sqrt(std[j]/(double)file_Len);
|
|
446
|
|
447 for (i=0;i<file_Len;i++)
|
|
448 matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization
|
|
449 }
|
|
450 }
|
|
451
|
|
452 if (norm_used==1) //0-1 min-max normalization
|
|
453 {
|
|
454 for (j=0;j<num_dm;j++)
|
|
455 {
|
|
456 biggest=0;
|
|
457 smallest=MAX_VALUE;
|
|
458 for (i=0;i<file_Len;i++)
|
|
459 {
|
|
460 if (orig_data[i][j]>biggest)
|
|
461 biggest=orig_data[i][j];
|
|
462 if (orig_data[i][j]<smallest)
|
|
463 smallest=orig_data[i][j];
|
|
464 }
|
|
465
|
|
466 for (i=0;i<file_Len;i++)
|
|
467 {
|
|
468 if (biggest==smallest)
|
|
469 matrix_to_cluster[i][j]=biggest;
|
|
470 else
|
|
471 matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
|
|
472 }
|
|
473 }
|
|
474 }
|
|
475
|
|
476 if (norm_used==0) //no normalization
|
|
477 {
|
|
478 for (i=0;i<file_Len;i++)
|
|
479 for (j=0;j<num_dm;j++)
|
|
480 matrix_to_cluster[i][j]=orig_data[i][j];
|
|
481 }
|
|
482
|
|
483 }
|
|
484
|
|
485
|
|
486
|
|
487 /********************************************** RadixSort *******************************************/
|
|
488 /* Perform a radix sort using each dimension from the original data as a radix.
|
|
489 * Outputs:
|
|
490 * sorted_seq -- a permutation vector mapping the ordered list onto the original data.
|
|
491 * (sorted_seq[i] -> index in the original data of the ith element of the ordered list)
|
|
492 * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct
|
|
493 * of the sorting procedure.
|
|
494 * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
|
|
495 * by grid_ID)
|
|
496 */
|
|
497
|
|
498 void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID)
|
|
499 {
|
|
500 long i=0;
|
|
501 long length=0;
|
|
502 long start=0;
|
|
503 long prev_ID=0;
|
|
504 long curr_ID=0;
|
|
505
|
|
506 long j=0;
|
|
507 long t=0;
|
|
508 long p=0;
|
|
509 long loc=0;
|
|
510 long temp=0;
|
|
511 long equal=0;
|
|
512
|
|
513 long *count; //count[i]=j means there are j numbers having value i at the processing digit
|
|
514 long *index; //index[i]=j means the starting position of grid i is j
|
|
515 long *cp; //current position
|
|
516 long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
|
|
517 long *seq; //temporary sequence
|
|
518
|
|
519 count=(long*)malloc(sizeof(long)*num_bin);
|
|
520 memset(count,0,sizeof(long)*num_bin);
|
|
521
|
|
522 cp=(long*)malloc(sizeof(long)*num_bin);
|
|
523 memset(cp,0,sizeof(long)*num_bin);
|
|
524
|
|
525 index=(long*)malloc(sizeof(long)*num_bin); // initialized below
|
|
526
|
|
527 seq=(long*)malloc(sizeof(long)*file_Len);
|
|
528 memset(seq,0,sizeof(long)*file_Len);
|
|
529
|
|
530 mark=(long*)malloc(sizeof(long)*file_Len);
|
|
531 memset(mark,0,sizeof(long)*file_Len);
|
|
532
|
|
533 for (i=0;i<file_Len;i++)
|
|
534 {
|
|
535 sorted_seq[i]=i;
|
|
536 mark[i]=0;
|
|
537 seq[i]=0;
|
|
538 }
|
|
539 for (i=0;i<num_bin;i++)
|
|
540 {
|
|
541 index[i]=0;
|
|
542 cp[i]=0;
|
|
543 count[i]=0;
|
|
544 }
|
|
545
|
|
546 for (j=0;j<num_dm;j++)
|
|
547 {
|
|
548 if (j==0) //compute the initial values of mark
|
|
549 {
|
|
550 for (i=0;i<file_Len;i++)
|
|
551 count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension
|
|
552
|
|
553 index[0] = 0;
|
|
554 for (i=0;i<num_bin-1;i++)
|
|
555 {
|
|
556 index[i+1]=index[i]+count[i]; //index[k]=x means k segment starts at x (in the ordered list)
|
|
557 if ((index[i+1]>0) && (index[i+1]<=file_Len))
|
|
558 {
|
|
559 mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
|
|
560 }
|
|
561 else
|
|
562 {
|
|
563 printf("out of myboundary for mark at index[i+1]-1.\n");
|
|
564 }
|
|
565 }
|
|
566 mark[file_Len-1]=1;
|
|
567
|
|
568 for (i=0;i<file_Len;i++)
|
|
569 {
|
|
570 /* Build a permutation vector for the partially ordered data. Store the PV in sorted_seq */
|
|
571 loc=position[i][j];
|
|
572 temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j
|
|
573 sorted_seq[temp]=i; //sorted_seq[i]=temp is also another way to sort
|
|
574 cp[loc]++;
|
|
575 }
|
|
576 }
|
|
577 else
|
|
578 {
|
|
579 //reset count, index, loc, temp, cp, start, and length
|
|
580 length=0;
|
|
581 loc=0;
|
|
582 temp=0;
|
|
583 start=0;
|
|
584 for (p=0;p<num_bin;p++)
|
|
585 {
|
|
586 cp[p]=0;
|
|
587 count[p]=0;
|
|
588 index[p]=0;
|
|
589 }
|
|
590
|
|
591 for (i=0;i<file_Len;i++)
|
|
592 {
|
|
593 long iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order.
|
|
594 if (mark[i]!=1)
|
|
595 {
|
|
596 /* Count the number of items in each bin of
|
|
597 dimension j, BUT we are going to reset at the end
|
|
598 of each "part". Thus, the total effect is to do
|
|
599 a sort by bin on the jth dimension for each group
|
|
600 of data that has been identical for the
|
|
601 dimensions processed up to this point. This is
|
|
602 the standard radix sort procedure, but doing it
|
|
603 this way saves us having to allocate buckets to
|
|
604 hold the data in each group of "identical-so-far"
|
|
605 elements. */
|
|
606 count[position[iperm][j]]++; //count[position[i][j]]++;
|
|
607 length++; // This is the total length of the part, irrespective of the value of the jth component
|
|
608 // (actually, less one, since we don't increment for the final element below)
|
|
609 }
|
|
610 if (mark[i]==1)
|
|
611 {
|
|
612 //length++;
|
|
613 count[position[iperm][j]]++;//count[position[i][j]]++; //the current point must be counted in
|
|
614 start=i-length; //this part starts from start to i: [start,i]
|
|
615 /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part.
|
|
616 This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively,
|
|
617 plus we could multi-thread over the parts. (Hmmm...)
|
|
618 */
|
|
619 index[0] = start; // Let index give the offset within the whole ordered list.
|
|
620 for (t=0;t<num_bin-1;t++)
|
|
621 {
|
|
622 index[t+1]=index[t]+count[t];
|
|
623
|
|
624 if ((index[t+1]<=file_Len) && (index[t+1]>0))
|
|
625 {
|
|
626 mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
|
|
627 }
|
|
628
|
|
629 }
|
|
630 mark[i]=1;
|
|
631
|
|
632 /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i
|
|
633 the PV will be completely updated for the partial ordering up to the current radix. */
|
|
634 for (t=start;t<=i;t++)
|
|
635 {
|
|
636 loc=position[sorted_seq[t]][j];//loc=position[t][j];
|
|
637 temp=index[loc]+cp[loc];
|
|
638 if ((temp<file_Len) && (temp>=0))
|
|
639 {
|
|
640 // seq is a temporary because we have to maintain the old PV until we have finished this step.
|
|
641 seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort
|
|
642 cp[loc]++;
|
|
643 }
|
|
644 else
|
|
645 {
|
|
646 printf("out of myboundary for seq at temp.\n");
|
|
647 }
|
|
648 }
|
|
649
|
|
650 for (t=start;t<=i;t++)
|
|
651 {
|
|
652 // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through
|
|
653 // entry i in the ordered list.
|
|
654 if ((t>=0) && (t<file_Len))
|
|
655 sorted_seq[t]=seq[t];
|
|
656 else
|
|
657 printf("out of myboundary for seq and sorted_seq at t.\n");
|
|
658 }
|
|
659 //reset count, index, seq, length, and cp
|
|
660 length=0;
|
|
661 loc=0;
|
|
662 temp=0;
|
|
663 for (p=0;p<num_bin;p++)
|
|
664 {
|
|
665 cp[p]=0;
|
|
666 count[p]=0;
|
|
667 index[p]=0;
|
|
668 }
|
|
669 }
|
|
670 }//end for i
|
|
671 }//end else
|
|
672 }//end for j
|
|
673
|
|
674 /* sorted_seq[] now contains the ordered list for all radices. mark[] gives the boundaries between groups of elements that are
|
|
675 identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */
|
|
676
|
|
677 for (i=0;i<file_Len;i++)
|
|
678 grid_ID[i]=0; //in case the initial value hasn't been assigned
|
|
679 *num_nonempty=1; //starting from 1!
|
|
680
|
|
681 /* assign the "grid" identifiers for all of the data. A grid will be what we were calling a "part" above. We will number them
|
|
682 serially and tag the *unordered* data with the grid IDs. We will also count the number of populated grids (in general there will
|
|
683 be many possible combinations of bin values that simply never occur) */
|
|
684
|
|
685 for (i=1;i<file_Len;i++)
|
|
686 {
|
|
687 equal=1;
|
|
688 prev_ID=sorted_seq[i-1];
|
|
689 curr_ID=sorted_seq[i];
|
|
690 for (j=0;j<num_dm;j++)
|
|
691 {
|
|
692 if (position[prev_ID][j]!=position[curr_ID][j])
|
|
693 {
|
|
694 equal=0; //not equal
|
|
695 break;
|
|
696 }
|
|
697 }
|
|
698
|
|
699 if (equal)
|
|
700 {
|
|
701 grid_ID[curr_ID]=grid_ID[prev_ID];
|
|
702 }
|
|
703 else
|
|
704 {
|
|
705 *num_nonempty=*num_nonempty+1;
|
|
706 grid_ID[curr_ID]=grid_ID[prev_ID]+1;
|
|
707 }
|
|
708 //all_grid_vol[grid_ID[curr_ID]]++;
|
|
709 }
|
|
710
|
|
711 //free memory
|
|
712 free(count);
|
|
713 free(index);
|
|
714 free(cp);
|
|
715 free(seq);
|
|
716 free(mark);
|
|
717
|
|
718 }
|
|
719
|
|
720 /********************************************** Compute Position of Events ************************************************/
|
|
721 void compute_position(double **data_in, long file_Len, long num_dm, long num_bin, long **position, double *interval)
|
|
722 {
|
|
723 /* What we are really doing here is binning the data, with the bins
|
|
724 spanning the range of the data and number of bins = num_bin */
|
|
725 long i=0;
|
|
726 long j=0;
|
|
727
|
|
728 double *small; //small[j] is the smallest value within dimension j
|
|
729 double *big; //big[j] is the biggest value within dimension j
|
|
730
|
|
731 small=(double*)malloc(sizeof(double)*num_dm);
|
|
732 memset(small,0,sizeof(double)*num_dm);
|
|
733
|
|
734 big=(double*)malloc(sizeof(double)*num_dm);
|
|
735 memset(big,0,sizeof(double)*num_dm);
|
|
736
|
|
737
|
|
738 for (j=0;j<num_dm;j++)
|
|
739 {
|
|
740 big[j]=MAX_VALUE*(-1);
|
|
741 small[j]=MAX_VALUE;
|
|
742 for (i=0;i<file_Len;i++)
|
|
743 {
|
|
744 if (data_in[i][j]>big[j])
|
|
745 big[j]=data_in[i][j];
|
|
746
|
|
747 if (data_in[i][j]<small[j])
|
|
748 small[j]=data_in[i][j];
|
|
749 }
|
|
750
|
|
751 interval[j]=(big[j]-small[j])/(double)num_bin; //interval is computed using the biggest value and smallest value instead of the channel limit
|
|
752 /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */
|
|
753 }
|
|
754
|
|
755 for (j=0;j<num_dm;j++)
|
|
756 {
|
|
757 for (i=0;i<file_Len;i++)
|
|
758 {
|
|
759 if (data_in[i][j]>=big[j])
|
|
760 position[i][j]=num_bin-1;
|
|
761 else
|
|
762 {
|
|
763 position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
|
|
764 if ((position[i][j]>=num_bin) || (position[i][j]<0))
|
|
765 {
|
|
766 //printf("position mis-computed in density analysis!\n");
|
|
767 //exit(0);
|
|
768 fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
|
|
769 exit(0);
|
|
770 }
|
|
771 }
|
|
772 }
|
|
773 }
|
|
774
|
|
775
|
|
776 free(small);
|
|
777 free(big);
|
|
778 }
|
|
779
|
|
780 /********************************************** select_bin to select the number of bins **********************************/
|
|
781 //num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
|
|
782 /* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned
|
|
783 * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we
|
|
784 * maximize a merit function, then return the results obtained using the optimal number of bins.
|
|
785 *
|
|
786 * Outputs:
|
|
787 * position -- binned data values
|
|
788 * sorted_seq -- permutation vector mapping the ordered list to the original data
|
|
789 * all_grid_ID -- grid to which each data element was assigned.
|
|
790 * num_nonempty -- number of distinct values assumed by all_grid_ID
|
|
791 * interval -- bin width for each data dimension
|
|
792 * return value -- the number of bins selected.
|
|
793 */
|
|
794
|
|
795 long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq,
|
|
796 long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin)
|
|
797 {
|
|
798
|
|
799 long num_bin=0;
|
|
800 long select_num_bin=0;
|
|
801 long m=0;
|
|
802 long n=0;
|
|
803
|
|
804 long i=0;
|
|
805 long bin_scope=0;
|
|
806 long temp_num_nonempty=0;
|
|
807
|
|
808 long *temp_grid_ID;
|
|
809 long *temp_sorted_seq;
|
|
810 long **temp_position;
|
|
811
|
|
812 //sorted_seq[i]=j means the event j ranks i
|
|
813
|
|
814 double temp_index=0;
|
|
815 double *bin_index;
|
|
816 double *temp_interval;
|
|
817
|
|
818
|
|
819 temp_grid_ID=(long *)malloc(sizeof(long)*file_Len);
|
|
820 memset(temp_grid_ID,0,sizeof(long)*file_Len);
|
|
821
|
|
822 temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len);
|
|
823 memset(temp_sorted_seq,0,sizeof(long)*file_Len);
|
|
824
|
|
825 temp_position=(long **)malloc(sizeof(long*)*file_Len);
|
|
826 memset(temp_position,0,sizeof(long*)*file_Len);
|
|
827 for (m=0;m<file_Len;m++)
|
|
828 {
|
|
829 temp_position[m]=(long*)malloc(sizeof(long)*num_dm);
|
|
830 memset(temp_position[m],0,sizeof(long)*num_dm);
|
|
831 }
|
|
832
|
|
833 temp_interval=(double*)malloc(sizeof(double)*num_dm);
|
|
834 memset(temp_interval,0,sizeof(double)*num_dm);
|
|
835
|
|
836 bin_scope=max_grid-min_grid+1;
|
|
837 bin_index=(double *)malloc(sizeof(double)*bin_scope);
|
|
838 memset(bin_index,0,sizeof(double)*bin_scope);
|
|
839
|
|
840 i=0;
|
|
841
|
|
842 for (num_bin=min_grid;num_bin<=max_grid;num_bin++)
|
|
843 {
|
|
844
|
|
845 temp_num_nonempty=0;
|
|
846 /* compute_position bins the data into num_bin bins. Each
|
|
847 dimension is binned independently.
|
|
848
|
|
849 Outputs:
|
|
850 temp_position[i][j] -- bin for the jth component of data element i.
|
|
851 temp_interval[j] -- bin-width for the jth component
|
|
852 */
|
|
853 compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval);
|
|
854 radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID);
|
|
855
|
|
856 /* our figure of merit is the number of non-empty grids divided by number of bins per dimension.
|
|
857 We declare victory when we have found a local maximum */
|
|
858 bin_index[i]=((double)temp_num_nonempty)/((double)num_bin);
|
|
859 if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95)
|
|
860 break;
|
|
861 if ((bin_index[i]<temp_index) && (user_num_bin==0))
|
|
862 break;
|
|
863 if ((user_num_bin==num_bin-1) && (user_num_bin!=0))
|
|
864 break;
|
|
865
|
|
866 /* Since we have accepted this trial bin, copy all the temporary results into
|
|
867 the output buffers */
|
|
868 memcpy(all_grid_ID,temp_grid_ID,sizeof(long)*file_Len);
|
|
869 memcpy(sorted_seq,temp_sorted_seq,sizeof(long)*file_Len);
|
|
870 memcpy(interval,temp_interval,sizeof(double)*num_dm);
|
|
871
|
|
872 for (m=0;m<file_Len;m++)
|
|
873 for (n=0;n<num_dm;n++)
|
|
874 position[m][n]=temp_position[m][n];
|
|
875
|
|
876 temp_index=bin_index[i];
|
|
877 select_num_bin=num_bin;
|
|
878 num_nonempty[0]=temp_num_nonempty;
|
|
879 i++;
|
|
880 }
|
|
881
|
|
882 if ((select_num_bin<min_grid) || (select_num_bin>max_grid))
|
|
883 {
|
|
884 fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010
|
|
885 exit(0);
|
|
886 }
|
|
887
|
|
888 if (temp_index==0)
|
|
889 {
|
|
890 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
|
|
891 exit(0);
|
|
892 }
|
|
893
|
|
894 free(temp_grid_ID);
|
|
895 free(temp_sorted_seq);
|
|
896 free(bin_index);
|
|
897 free(temp_interval);
|
|
898
|
|
899 for (m=0;m<file_Len;m++)
|
|
900 free(temp_position[m]);
|
|
901 free(temp_position);
|
|
902
|
|
903 return select_num_bin;
|
|
904 }
|
|
905
|
|
906 /************************************* Select dense grids **********************************/
|
|
907 // compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol
|
|
908 // den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse);
|
|
909 /*
|
|
910 * Prune away grids that are insufficiently "dense" (i.e., contain too few data items)
|
|
911 *
|
|
912 * Outputs:
|
|
913 * num_dense_grids -- number of dense grids
|
|
914 * num_dense_events -- total number of data items in all dense grids
|
|
915 * dense_grid_reverse -- mapping from list of all grids to list of dense grids.
|
|
916 * return value -- density cutoff for separating dense from non-dense grids.
|
|
917 */
|
|
918
|
|
919 int select_dense(long file_Len, long *all_grid_ID, long num_nonempty, long *num_dense_grids, long *num_dense_events, long *dense_grid_reverse, int den_t_event)
|
|
920 {
|
|
921
|
|
922
|
|
923 long i=0;
|
|
924 long vol_ID=0;
|
|
925 long biggest_size=0; //biggest grid_size, to define grid_size_index
|
|
926 long biggest_index=0;
|
|
927 //long actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid
|
|
928 //long num_remain=0; //number of remaining grids with different density thresholds
|
|
929 long temp_num_dense_grids=0;
|
|
930 long temp_num_dense_events=0;
|
|
931
|
|
932 long *grid_size_index;
|
|
933 long *all_grid_vol;
|
|
934 long *grid_density_index;
|
|
935
|
|
936 //double den_average=0;
|
|
937 // double avr_index=0;
|
|
938
|
|
939
|
|
940 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
941 // Compute all_grid_vol
|
|
942 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
943 all_grid_vol=(long *)malloc(sizeof(long)*num_nonempty);
|
|
944 memset(all_grid_vol,0,sizeof(long)*num_nonempty);
|
|
945
|
|
946 /* Grid "volume" is just the number of data contained in the grid. */
|
|
947 for (i=0;i<file_Len;i++)
|
|
948 {
|
|
949 vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]];
|
|
950 all_grid_vol[vol_ID]++; //all_grid_vol[i]=j means grid i has j points
|
|
951 }
|
|
952
|
|
953
|
|
954
|
|
955 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
956 // Compute grid_size_index (histogram of grid sizes)
|
|
957 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
958
|
|
959 for (i=0;i<num_nonempty;i++)
|
|
960 {
|
|
961 if (biggest_size<all_grid_vol[i])
|
|
962 {
|
|
963 biggest_size=all_grid_vol[i];
|
|
964 }
|
|
965 }
|
|
966
|
|
967 if (biggest_size<3)
|
|
968 {
|
|
969 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
|
|
970 exit(0);
|
|
971 }
|
|
972
|
|
973
|
|
974 grid_size_index=(long*)malloc(sizeof(long)*biggest_size);
|
|
975 memset(grid_size_index,0,sizeof(long)*biggest_size);
|
|
976
|
|
977 for (i=0;i<num_nonempty;i++)
|
|
978 {
|
|
979 grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1
|
|
980 }
|
|
981
|
|
982
|
|
983
|
|
984 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
985 // Compute den_cutoff
|
|
986 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
987
|
|
988 grid_density_index=(long *)malloc(sizeof(long)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3
|
|
989 memset(grid_density_index,0,sizeof(long)*(biggest_size-2));
|
|
990
|
|
991 if (den_t_event==0)
|
|
992 {
|
|
993 biggest_index=0;
|
|
994
|
|
995 for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined
|
|
996 {
|
|
997 grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]);
|
|
998 if (biggest_index<grid_density_index[i-1])
|
|
999 {
|
|
1000 biggest_index=grid_density_index[i-1];
|
|
1001 den_t_event=i+1;
|
|
1002 }
|
|
1003 }
|
|
1004 }
|
|
1005
|
|
1006 if (den_t_event==0) //if biggest_size==3
|
|
1007 {
|
|
1008 fprintf(stderr,"the densest hyperregion has only 3 events, smaller than the user-specified value. Therefore the density threshold is automatically changed to 3.\n");
|
|
1009 den_t_event=3;
|
|
1010 }
|
|
1011
|
|
1012 for (i=0;i<num_nonempty;i++)
|
|
1013 if (all_grid_vol[i]>=den_t_event)
|
|
1014 temp_num_dense_grids++;
|
|
1015
|
|
1016 if (temp_num_dense_grids<=1)
|
|
1017 {
|
|
1018 //exit(0);
|
|
1019 //printf("a too high density threshold is set! Please decrease your density threshold.\n");
|
|
1020 fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010
|
|
1021 exit(0);
|
|
1022 }
|
|
1023
|
|
1024 if (temp_num_dense_grids>=100000)
|
|
1025 {
|
|
1026 //modified on July 23, 2010
|
|
1027 //printf("a too low density threshold is set! Please increase your density threshold.\n");
|
|
1028 //exit(0);
|
|
1029 fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010
|
|
1030 exit(0);
|
|
1031 }
|
|
1032
|
|
1033 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1034 // Compute dense_grid_reverse
|
|
1035 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1036 temp_num_dense_grids=0;
|
|
1037 temp_num_dense_events=0;
|
|
1038 for (i=0;i<num_nonempty;i++)
|
|
1039 {
|
|
1040 dense_grid_reverse[i]=-1;
|
|
1041
|
|
1042 if (all_grid_vol[i]>=den_t_event)
|
|
1043 {
|
|
1044 dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
|
|
1045 temp_num_dense_grids++;
|
|
1046 temp_num_dense_events+=all_grid_vol[i];
|
|
1047 }
|
|
1048 }
|
|
1049
|
|
1050 num_dense_grids[0]=temp_num_dense_grids;
|
|
1051 num_dense_events[0]=temp_num_dense_events;
|
|
1052
|
|
1053 free(grid_size_index);
|
|
1054 free(all_grid_vol);
|
|
1055
|
|
1056 return den_t_event;
|
|
1057 }
|
|
1058
|
|
1059 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1060 // Compute densegridID_To_gridevent and eventID_To_denseventID
|
|
1061 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1062 // grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
|
|
1063 /*
|
|
1064 * Filter the data so that only the data belonging to dense grids is left
|
|
1065 *
|
|
1066 * Output:
|
|
1067 * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids.
|
|
1068 * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids.
|
|
1069 *
|
|
1070 */
|
|
1071
|
|
1072 void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent)
|
|
1073 {
|
|
1074 long i=0;
|
|
1075 long dense_grid_ID=0;
|
|
1076 long dense_event_ID=0;
|
|
1077
|
|
1078 for (i=0;i<file_Len;i++)
|
|
1079 {
|
|
1080 dense_grid_ID=dense_grid_reverse[all_grid_ID[i]];
|
|
1081 eventID_To_denseventID[i]=-1;
|
|
1082 if (dense_grid_ID!=-1) //for point (i) belonging to dense grids
|
|
1083 {
|
|
1084 eventID_To_denseventID[i]=dense_event_ID;
|
|
1085 dense_event_ID++;
|
|
1086
|
|
1087 if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected
|
|
1088 densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point
|
|
1089 }
|
|
1090 }
|
|
1091
|
|
1092
|
|
1093 }
|
|
1094
|
|
1095 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1096 // Compute dense_grid_seq
|
|
1097 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1098 // generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq);
|
|
1099 /* Construct a table of binned data values for each dense grid.
|
|
1100 *
|
|
1101 * Output:
|
|
1102 *
|
|
1103 * dense_grid_seq -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values)
|
|
1104 */
|
|
1105
|
|
1106 void generate_grid_seq(long num_dm, long num_dense_grids, long *densegridID_To_gridevent, long **position, long **dense_grid_seq)
|
|
1107 {
|
|
1108
|
|
1109 long i=0;
|
|
1110 long j=0;
|
|
1111 long ReventID=0; //representative event ID of the dense grid
|
|
1112
|
|
1113 for (i=0;i<num_dense_grids;i++)
|
|
1114 {
|
|
1115 ReventID = densegridID_To_gridevent[i];
|
|
1116
|
|
1117 for (j=0;j<num_dm;j++)
|
|
1118 dense_grid_seq[i][j]=position[ReventID][j];
|
|
1119 }
|
|
1120 }
|
|
1121
|
|
1122 //compare two vectors
|
|
1123 long compare_value(long num_dm, long *search_value, long *seq_value)
|
|
1124 {
|
|
1125 long i=0;
|
|
1126
|
|
1127 for (i=0;i<num_dm;i++)
|
|
1128 {
|
|
1129 if (search_value[i]<seq_value[i])
|
|
1130 return 1;
|
|
1131 if (search_value[i]>seq_value[i])
|
|
1132 return -1;
|
|
1133 if (search_value[i]==seq_value[i])
|
|
1134 continue;
|
|
1135 }
|
|
1136 return 0;
|
|
1137 }
|
|
1138
|
|
1139 //binary search the dense_grid_seq to return the dense grid ID if it exists
|
|
1140 long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq)
|
|
1141 {
|
|
1142
|
|
1143 long low=0;
|
|
1144 long high=0;
|
|
1145 long mid=0;
|
|
1146 long comp_result=0;
|
|
1147 long match=0;
|
|
1148 //long found=0;
|
|
1149
|
|
1150 low = 0;
|
|
1151 high = num_dense_grids-1;
|
|
1152
|
|
1153 while (low <= high)
|
|
1154 {
|
|
1155 mid = (long)((low + high)/2);
|
|
1156
|
|
1157 comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
|
|
1158
|
|
1159
|
|
1160 switch(comp_result)
|
|
1161 {
|
|
1162 case 1:
|
|
1163 high=mid-1;
|
|
1164 break;
|
|
1165 case -1:
|
|
1166 low=mid+1;
|
|
1167 break;
|
|
1168 case 0:
|
|
1169 match=1;
|
|
1170 break;
|
|
1171 }
|
|
1172 if (match==1)
|
|
1173 break;
|
|
1174 }
|
|
1175
|
|
1176
|
|
1177
|
|
1178 if (match==1)
|
|
1179 {
|
|
1180 return mid;
|
|
1181 }
|
|
1182 else
|
|
1183 return -1;
|
|
1184 }
|
|
1185
|
|
1186
|
|
1187 /********************************************** Computing Centers Using IDs **********************************************/
|
|
1188
|
|
1189 void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center)
|
|
1190 {
|
|
1191 long i=0;
|
|
1192 long j=0;
|
|
1193 long ID=0;
|
|
1194 long eventID=0;
|
|
1195 long *size_c;
|
|
1196
|
|
1197
|
|
1198
|
|
1199 size_c=(long *)malloc(sizeof(long)*num_clust);
|
|
1200 memset(size_c,0,sizeof(long)*num_clust);
|
|
1201
|
|
1202 for (i=0;i<num_clust;i++)
|
|
1203 for (j=0;j<num_dm;j++)
|
|
1204 population_center[i][j]=0;
|
|
1205
|
|
1206 for (i=0;i<file_Len;i++)
|
|
1207 {
|
|
1208 eventID=eventID_To_denseventID[i];
|
|
1209
|
|
1210 if (eventID!=-1) //only events in dense grids count
|
|
1211 {
|
|
1212 ID=cluster_ID[eventID];
|
|
1213
|
|
1214 if (ID==-1)
|
|
1215 {
|
|
1216 //printf("ID==-1! in ID2Center\n");
|
|
1217 //exit(0);
|
|
1218 fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010
|
|
1219 exit(0);
|
|
1220 }
|
|
1221
|
|
1222 for (j=0;j<num_dm;j++)
|
|
1223 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
|
|
1224
|
|
1225 size_c[ID]++;
|
|
1226 }
|
|
1227 }
|
|
1228
|
|
1229
|
|
1230 for (i=0;i<num_clust;i++)
|
|
1231 {
|
|
1232 for (j=0;j<num_dm;j++)
|
|
1233 if (size_c[i]!=0)
|
|
1234 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
|
|
1235 else
|
|
1236 {
|
|
1237 population_center[i][j]=0;
|
|
1238 printf("size_c[%ld]=0 in ID2center\n",i);
|
|
1239 }
|
|
1240 }
|
|
1241
|
|
1242 free(size_c);
|
|
1243
|
|
1244 }
|
|
1245
|
|
1246 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1247 // Compute Population Center with all events
|
|
1248 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1249 void ID2Center_all(double **data_in, long file_Len, long num_dm, long num_clust, long *cluster_ID, double **population_center)
|
|
1250 {
|
|
1251 long i=0;
|
|
1252 long j=0;
|
|
1253 long ID=0;
|
|
1254 long *size_c;
|
|
1255
|
|
1256
|
|
1257
|
|
1258 size_c=(long *)malloc(sizeof(long)*num_clust);
|
|
1259 memset(size_c,0,sizeof(long)*num_clust);
|
|
1260
|
|
1261 for (i=0;i<num_clust;i++)
|
|
1262 for (j=0;j<num_dm;j++)
|
|
1263 population_center[i][j]=0;
|
|
1264
|
|
1265
|
|
1266 for (i=0;i<file_Len;i++)
|
|
1267 {
|
|
1268 ID=cluster_ID[i];
|
|
1269
|
|
1270 if (ID==-1)
|
|
1271 {
|
|
1272 //printf("ID==-1! in ID2Center_all\n");
|
|
1273 //exit(0);
|
|
1274 fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010
|
|
1275 exit(0);
|
|
1276 }
|
|
1277
|
|
1278 for (j=0;j<num_dm;j++)
|
|
1279 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
|
|
1280
|
|
1281 size_c[ID]++;
|
|
1282 }
|
|
1283
|
|
1284
|
|
1285 for (i=0;i<num_clust;i++)
|
|
1286 {
|
|
1287 for (j=0;j<num_dm;j++)
|
|
1288 if (size_c[i]!=0)
|
|
1289 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
|
|
1290 else
|
|
1291 {
|
|
1292 population_center[i][j]=0;
|
|
1293 printf("size_c[%ld]=0 in ID2center_all\n",i);
|
|
1294 }
|
|
1295 }
|
|
1296
|
|
1297
|
|
1298 free(size_c);
|
|
1299
|
|
1300 }
|
|
1301
|
|
1302 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1303 // Merge neighboring grids to clusters
|
|
1304 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1305
|
|
1306 long merge_grids(double **normalized_data, double *interval, long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids,
|
|
1307 long *dense_grid_reverse, long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
|
|
1308 long *cluster_ID, long *grid_ID, long *grid_clusterID)
|
|
1309 {
|
|
1310
|
|
1311
|
|
1312 long i=0;
|
|
1313 long j=0;
|
|
1314 long t=0;
|
|
1315 long p=0;
|
|
1316 long num_clust=0;
|
|
1317 long ReventID=0;
|
|
1318 long denseID=0;
|
|
1319 long neighbor_ID=0;
|
|
1320 //long temp_grid=0;
|
|
1321
|
|
1322 long *grid_value;
|
|
1323 long *search_value;
|
|
1324
|
|
1325 long **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node
|
|
1326
|
|
1327 double real_dist=0;
|
|
1328 double **norm_grid_center;
|
|
1329
|
|
1330 norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids);
|
|
1331 memset(norm_grid_center,0,sizeof(double*)*num_dense_grids);
|
|
1332
|
|
1333 for (i=0;i<num_dense_grids;i++)
|
|
1334 {
|
|
1335 norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
1336 memset(norm_grid_center[i],0,sizeof(double)*num_dm);
|
|
1337 }
|
|
1338
|
|
1339 for (i=0;i<file_Len;i++)
|
|
1340 {
|
|
1341 denseID=eventID_To_denseventID[i];
|
|
1342 if (denseID!=-1) //only dense events can enter
|
|
1343 {
|
|
1344 grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]];
|
|
1345
|
|
1346 if (grid_ID[denseID]==-1)
|
|
1347 {
|
|
1348 fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n");
|
|
1349 exit(0);
|
|
1350 }
|
|
1351 }
|
|
1352 }
|
|
1353
|
|
1354
|
|
1355 /* Find centroid (in the normalized data) for each dense grid */
|
|
1356 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center);
|
|
1357
|
|
1358 //printf("pass the grid ID2 center\n");
|
|
1359
|
|
1360
|
|
1361 graph_of_grid=(long **)malloc(sizeof(long*)*num_dense_grids);
|
|
1362 memset(graph_of_grid,0,sizeof(long*)*num_dense_grids);
|
|
1363 for (i=0;i<num_dense_grids;i++)
|
|
1364 {
|
|
1365 graph_of_grid[i]=(long *)malloc(sizeof(long)*num_dm);
|
|
1366 memset(graph_of_grid[i],0,sizeof(long)*num_dm);
|
|
1367
|
|
1368
|
|
1369 for (j=0;j<num_dm;j++)
|
|
1370 graph_of_grid[i][j]=-1;
|
|
1371 }
|
|
1372
|
|
1373 grid_value=(long *)malloc(sizeof(long)*num_dm);
|
|
1374 memset(grid_value,0,sizeof(long)*num_dm);
|
|
1375
|
|
1376 search_value=(long *)malloc(sizeof(long)*num_dm);
|
|
1377 memset(search_value,0,sizeof(long)*num_dm);
|
|
1378
|
|
1379
|
|
1380 for (i=0;i<num_dense_grids;i++)
|
|
1381 {
|
|
1382 ReventID=densegridID_To_gridevent[i];
|
|
1383
|
|
1384 for (j=0;j<num_dm;j++)
|
|
1385 {
|
|
1386 grid_value[j]=position[ReventID][j];
|
|
1387
|
|
1388 }
|
|
1389
|
|
1390
|
|
1391 /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in
|
|
1392 the chosen dimension. If the neighbor's centroid is not too far away, add it to this grid's neighbor
|
|
1393 list. */
|
|
1394 for (t=0;t<num_dm;t++)
|
|
1395 {
|
|
1396 for (p=0;p<num_dm;p++)
|
|
1397 search_value[p]=grid_value[p];
|
|
1398
|
|
1399 if (grid_value[t]==num_bin-1)
|
|
1400 continue;
|
|
1401
|
|
1402 search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side
|
|
1403
|
|
1404 neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq);
|
|
1405
|
|
1406 if (neighbor_ID!=-1)
|
|
1407 {
|
|
1408 real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t];
|
|
1409
|
|
1410 if (real_dist<0)
|
|
1411 real_dist=-real_dist;
|
|
1412
|
|
1413 if (real_dist<2*interval[t]) //by using 2*interval, this switch is void
|
|
1414 graph_of_grid[i][t]=neighbor_ID;
|
|
1415 }
|
|
1416 }
|
|
1417 grid_clusterID[i]=i; //initialize grid_clusterID
|
|
1418 }
|
|
1419
|
|
1420
|
|
1421 //graph constructed
|
|
1422 //DFS-based search begins
|
|
1423
|
|
1424 /* Use a depth-first search to construct a list of connected subgraphs (= "clusters").
|
|
1425 Note our graph as we have constructed it is a DAG, so we can use that to our advantage
|
|
1426 in our search. */
|
|
1427 // num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID);
|
|
1428 num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID);
|
|
1429
|
|
1430
|
|
1431 //computes grid_ID and cluster_ID
|
|
1432 for (i=0;i<file_Len;i++)
|
|
1433 {
|
|
1434 denseID=eventID_To_denseventID[i];
|
|
1435 if (denseID!=-1) //only dense events can enter
|
|
1436 {
|
|
1437 cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]];
|
|
1438 //if (cluster_ID[denseID]==-1)
|
|
1439 // printf("catch you!\n");
|
|
1440 }
|
|
1441 }
|
|
1442
|
|
1443 free(search_value);
|
|
1444 free(grid_value);
|
|
1445
|
|
1446 for (i=0;i<num_dense_grids;i++)
|
|
1447 {
|
|
1448 free(graph_of_grid[i]);
|
|
1449 free(norm_grid_center[i]);
|
|
1450 }
|
|
1451 free(graph_of_grid);
|
|
1452 free(norm_grid_center);
|
|
1453
|
|
1454 return num_clust;
|
|
1455 }
|
|
1456
|
|
1457 /********************************************* Merge Clusters to Populations *******************************************/
|
|
1458
|
|
1459 long merge_clusters(long num_clust, long num_dm, double *interval, double **cluster_center, long *cluster_populationID, long max_num_pop)
|
|
1460 {
|
|
1461 long num_population=0;
|
|
1462 long temp_num_population=0;
|
|
1463
|
|
1464 long i=0;
|
|
1465 long j=0;
|
|
1466 long t=0;
|
|
1467 long merge=0;
|
|
1468 long smid=0;
|
|
1469 long bgid=0;
|
|
1470 double merge_dist=1.1;
|
|
1471
|
|
1472 long *map_ID;
|
|
1473
|
|
1474 double diff=0;
|
|
1475
|
|
1476 map_ID=(long*)malloc(sizeof(long)*num_clust);
|
|
1477 memset(map_ID,0,sizeof(long)*num_clust);
|
|
1478
|
|
1479 for (i=0;i<num_clust;i++)
|
|
1480 {
|
|
1481 cluster_populationID[i]=i;
|
|
1482 map_ID[i]=i;
|
|
1483 }
|
|
1484
|
|
1485
|
|
1486 while (((num_population>max_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1)))
|
|
1487 {
|
|
1488
|
|
1489 if (num_population<=1)
|
|
1490 merge_dist=merge_dist-0.1;
|
|
1491
|
|
1492 if (num_population>max_num_pop)
|
|
1493 merge_dist=merge_dist+0.1;
|
|
1494
|
|
1495
|
|
1496 for (i=0;i<num_clust;i++)
|
|
1497 cluster_populationID[i]=i;
|
|
1498
|
|
1499 for (i=0;i<num_clust-1;i++)
|
|
1500 {
|
|
1501 for (j=i+1;j<num_clust;j++)
|
|
1502 {
|
|
1503 merge=1;
|
|
1504
|
|
1505 for (t=0;t<num_dm;t++)
|
|
1506 {
|
|
1507 diff=cluster_center[i][t]-cluster_center[j][t];
|
|
1508
|
|
1509 if (diff<0)
|
|
1510 diff=-diff;
|
|
1511 if (diff>(merge_dist*interval[t]))
|
|
1512 merge=0;
|
|
1513 }
|
|
1514
|
|
1515 if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
|
|
1516 {
|
|
1517 if (cluster_populationID[i]<cluster_populationID[j]) //they could not be equal
|
|
1518 {
|
|
1519 smid = cluster_populationID[i];
|
|
1520 bgid = cluster_populationID[j];
|
|
1521 }
|
|
1522 else
|
|
1523 {
|
|
1524 smid = cluster_populationID[j];
|
|
1525 bgid = cluster_populationID[i];
|
|
1526 }
|
|
1527 for (t=0;t<num_clust;t++)
|
|
1528 {
|
|
1529 if (cluster_populationID[t]==bgid)
|
|
1530 cluster_populationID[t]=smid;
|
|
1531 }
|
|
1532 }
|
|
1533 }
|
|
1534 }
|
|
1535
|
|
1536
|
|
1537
|
|
1538 for (i=0;i<num_clust;i++)
|
|
1539 map_ID[i]=-1;
|
|
1540
|
|
1541 for (i=0;i<num_clust;i++)
|
|
1542 map_ID[cluster_populationID[i]]=1;
|
|
1543
|
|
1544 num_population=0;
|
|
1545 for (i=0;i<num_clust;i++)
|
|
1546 {
|
|
1547 if (map_ID[i]==1)
|
|
1548 {
|
|
1549 map_ID[i]=num_population;
|
|
1550 num_population++;
|
|
1551 }
|
|
1552 }
|
|
1553
|
|
1554 if ((temp_num_population>max_num_pop) && (num_population==1))
|
|
1555 break;
|
|
1556 else
|
|
1557 temp_num_population=num_population;
|
|
1558
|
|
1559 if (num_clust<=1)
|
|
1560 break;
|
|
1561 } //end while
|
|
1562
|
|
1563 for (i=0;i<num_clust;i++)
|
|
1564 cluster_populationID[i]=map_ID[cluster_populationID[i]];
|
|
1565
|
|
1566 free(map_ID);
|
|
1567
|
|
1568 return num_population;
|
|
1569 }
|
|
1570
|
|
1571 ///////////////////////////////////////////////////////
|
|
1572 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1573 double kmeans(double **Matrix, long k, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center)
|
|
1574 {
|
|
1575
|
|
1576 long i=0;
|
|
1577 long j=0;
|
|
1578 long t=0;
|
|
1579 long random=0;
|
|
1580 long random1=0;
|
|
1581 long random2=0;
|
|
1582 long times=0;
|
|
1583 long dist_used=0; //0 is Euclidean and 1 is Pearson
|
|
1584 long random_init=0; //0: not use random seeds;
|
|
1585 long real_Len=0;
|
|
1586 long skipped=0;
|
|
1587
|
|
1588 long *num; //num[i]=t means the ith cluster has t points
|
|
1589
|
|
1590 double vvv=1.0; // the biggest variation;
|
|
1591 double distance=0.0;
|
|
1592 double xv=0.0;
|
|
1593 double variation=0.0;
|
|
1594
|
|
1595 double mean_dx=0;
|
|
1596 double mean_dy=0;
|
|
1597 double sum_var=0;
|
|
1598 double dx=0;
|
|
1599 double dy=0;
|
|
1600 double sd_x=0;
|
|
1601 double sd_y=0;
|
|
1602 double diff=0;
|
|
1603 double distortion=0;
|
|
1604 double sd=0; //standard deviation
|
|
1605 double shortest_distance;
|
|
1606
|
|
1607 double *temp_center;
|
|
1608
|
|
1609 double **sum;
|
|
1610
|
|
1611 temp_center = (double *)malloc(sizeof(double)*num_dm);
|
|
1612 memset(temp_center,0,sizeof(double)*num_dm);
|
|
1613
|
|
1614 if (random_init)
|
|
1615 {
|
|
1616 for (i=0;i<k;i++)
|
|
1617 {
|
|
1618 random1=rand()*rand();
|
|
1619 random2=abs((random1%5)+1);
|
|
1620 for (t=0;t<random2;t++)
|
|
1621 random2=random2*rand()+rand();
|
|
1622
|
|
1623 random=abs(random2%file_Len);
|
|
1624 for (j=0;j<num_dm;j++)
|
|
1625 center[i][j]=Matrix[random][j];
|
|
1626 }
|
|
1627 }
|
|
1628
|
|
1629
|
|
1630 num = (long *)malloc(sizeof(long)*k);
|
|
1631 memset(num,0,sizeof(long)*k);
|
|
1632
|
|
1633 sum = (double **)malloc(sizeof(double*)*k);
|
|
1634 memset(sum,0,sizeof(double*)*k);
|
|
1635 for (i=0;i<k;i++)
|
|
1636 {
|
|
1637 sum[i] = (double *)malloc(sizeof(double)*num_dm);
|
|
1638 memset(sum[i],0,sizeof(double)*num_dm);
|
|
1639 }
|
|
1640
|
|
1641
|
|
1642 times=0;
|
|
1643 real_Len=0;
|
|
1644
|
|
1645 while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1)))
|
|
1646 {
|
|
1647 for (i=0;i<k;i++)
|
|
1648 {
|
|
1649 num[i]=0;
|
|
1650 for (j=0;j<num_dm;j++)
|
|
1651 sum[i][j]=0.0;
|
|
1652 }
|
|
1653
|
|
1654 for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j]
|
|
1655 {
|
|
1656 skipped = 0;
|
|
1657 shortest_distance=MAX_VALUE;
|
|
1658 for (j=0;j<k;j++) //for each center j
|
|
1659 {
|
|
1660 distance=0.0;
|
|
1661
|
|
1662 if (dist_used==0) //Euclidean distance
|
|
1663 {
|
|
1664 for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions
|
|
1665 {
|
|
1666
|
|
1667 diff=center[j][t]-Matrix[i][t];
|
|
1668
|
|
1669 diff=diff*diff;
|
|
1670 //this K-means is only used for dimension selection, and therefore for 1-dimensional data only, no need to use cube distance
|
|
1671
|
|
1672 distance = distance+diff; //here we have a weight for each dimension
|
|
1673 }
|
|
1674 }
|
|
1675 else //pearson correlation
|
|
1676 {
|
|
1677 mean_dx=0.0;
|
|
1678 mean_dy=0.0;
|
|
1679 sum_var=0.0;
|
|
1680 dx=0.0;
|
|
1681 dy=0.0;
|
|
1682 sd_x=0.0;
|
|
1683 sd_y=0.0;
|
|
1684 for (t=0;t<num_dm;t++)
|
|
1685 {
|
|
1686 mean_dx+=center[j][t];
|
|
1687 mean_dy+=Matrix[i][t];
|
|
1688 }
|
|
1689 mean_dx=mean_dx/(double)num_dm;
|
|
1690 mean_dy=mean_dy/(double)num_dm;
|
|
1691
|
|
1692 for (t=0;t<num_dm;t++)
|
|
1693 {
|
|
1694 dx=center[j][t]-mean_dx;
|
|
1695 dy=Matrix[i][t]-mean_dy;
|
|
1696 sum_var+=dx*dy;
|
|
1697 sd_x+=dx*dx;
|
|
1698 sd_y+=dy*dy;
|
|
1699 }
|
|
1700 if (sqrt(sd_x*sd_y)==0)
|
|
1701 distance = 1.0;
|
|
1702 else
|
|
1703 distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
|
|
1704 //printf("distance=%f\n",distance);
|
|
1705 } //pearson correlation ends
|
|
1706
|
|
1707 if ((distance<shortest_distance) && (skipped == 0))
|
|
1708 {
|
|
1709 shortest_distance=distance;
|
|
1710 shortest_id[i]=j;
|
|
1711 }
|
|
1712 }//end for j
|
|
1713 real_Len++;
|
|
1714 num[shortest_id[i]]=num[shortest_id[i]]+1;
|
|
1715 for (t=0;t<num_dm;t++)
|
|
1716 sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];
|
|
1717 }//end for i
|
|
1718 /* recompute the centers */
|
|
1719 //compute_mean(group);
|
|
1720 vvv=0.0;
|
|
1721 for (j=0;j<k;j++)
|
|
1722 {
|
|
1723 memcpy(temp_center,center[j],sizeof(double)*num_dm);
|
|
1724 variation=0.0;
|
|
1725 if (num[j]!=0)
|
|
1726 {
|
|
1727 for (t=0;t<num_dm;t++)
|
|
1728 {
|
|
1729 center[j][t]=sum[j][t]/(double)num[j];
|
|
1730 xv=(temp_center[t]-center[j][t]);
|
|
1731 variation=variation+xv*xv;
|
|
1732 }
|
|
1733 }
|
|
1734
|
|
1735 if (variation>vvv)
|
|
1736 vvv=variation; //vvv is the biggest variation among the k clusters;
|
|
1737 }
|
|
1738 //compute_variation;
|
|
1739 times++;
|
|
1740 } //end for while
|
|
1741
|
|
1742
|
|
1743
|
|
1744
|
|
1745 free(num);
|
|
1746
|
|
1747 for (i=0;i<k;i++)
|
|
1748 free(sum[i]);
|
|
1749 free(sum);
|
|
1750 free(temp_center);
|
|
1751
|
|
1752
|
|
1753 return distortion;
|
|
1754
|
|
1755 }
|
|
1756
|
|
1757 //////////////////////////////////////////////////////
|
|
1758 /*************************** Show *****************************/
|
|
1759 void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string)
|
|
1760 {
|
|
1761 int situ1=0;
|
|
1762 int situ2=0;
|
|
1763
|
|
1764 long i=0;
|
|
1765 long id=0;
|
|
1766 long j=0;
|
|
1767 long info_id=0;
|
|
1768 long nearest_id=0;
|
|
1769 long insert=0;
|
|
1770 long temp=0;
|
|
1771 long m=0;
|
|
1772 long n=0;
|
|
1773 long t=0;
|
|
1774
|
|
1775 long *size_c;
|
|
1776
|
|
1777
|
|
1778
|
|
1779 long **size_mybound_1;
|
|
1780 long **size_mybound_2;
|
|
1781 long **size_mybound_3;
|
|
1782 long **size_mybound_0;
|
|
1783
|
|
1784 double interval=0.0;
|
|
1785
|
|
1786 double *big;
|
|
1787 double *small;
|
|
1788
|
|
1789
|
|
1790 double **center;
|
|
1791 double **mybound;
|
|
1792
|
|
1793 long **prof; //prof[i][j]=1 means population i is + at parameter j
|
|
1794
|
|
1795 FILE *fpcnt_id; //proportion id
|
|
1796 //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
|
|
1797 FILE *fprof_id; //profile_id
|
|
1798
|
|
1799 big=(double *)malloc(sizeof(double)*num_dm);
|
|
1800 memset(big,0,sizeof(double)*num_dm);
|
|
1801
|
|
1802 small=(double *)malloc(sizeof(double)*num_dm);
|
|
1803 memset(small,0,sizeof(double)*num_dm);
|
|
1804
|
|
1805 for (i=0;i<num_dm;i++)
|
|
1806 {
|
|
1807 big[i]=0.0;
|
|
1808 small[i]=(double)MAX_VALUE;
|
|
1809 }
|
|
1810
|
|
1811
|
|
1812 size_c=(long *)malloc(sizeof(long)*k);
|
|
1813 memset(size_c,0,sizeof(long)*k);
|
|
1814
|
|
1815 center=(double**)malloc(sizeof(double*)*k);
|
|
1816 memset(center,0,sizeof(double*)*k);
|
|
1817 for (i=0;i<k;i++)
|
|
1818 {
|
|
1819 center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
1820 memset(center[i],0,sizeof(double)*num_dm);
|
|
1821 }
|
|
1822
|
|
1823 mybound=(double**)malloc(sizeof(double*)*num_dm);
|
|
1824 memset(mybound,0,sizeof(double*)*num_dm);
|
|
1825 for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
|
|
1826 {
|
|
1827 mybound[i]=(double*)malloc(sizeof(double)*3);
|
|
1828 memset(mybound[i],0,sizeof(double)*3);
|
|
1829 }
|
|
1830
|
|
1831 prof=(long **)malloc(sizeof(long*)*k);
|
|
1832 memset(prof,0,sizeof(long*)*k);
|
|
1833 for (i=0;i<k;i++)
|
|
1834 {
|
|
1835 prof[i]=(long *)malloc(sizeof(long)*num_dm);
|
|
1836 memset(prof[i],0,sizeof(long)*num_dm);
|
|
1837 }
|
|
1838
|
|
1839
|
|
1840 for (i=0;i<file_Len;i++)
|
|
1841 {
|
|
1842 id=cluster_id[i];
|
|
1843 for (j=0;j<num_dm;j++)
|
|
1844 {
|
|
1845 center[id][j]=center[id][j]+Matrix[i][j];
|
|
1846 if (big[j]<Matrix[i][j])
|
|
1847 big[j]=Matrix[i][j];
|
|
1848 if (small[j]>Matrix[i][j])
|
|
1849 small[j]=Matrix[i][j];
|
|
1850 }
|
|
1851
|
|
1852 size_c[id]++;
|
|
1853 }
|
|
1854
|
|
1855 for (i=0;i<k;i++)
|
|
1856 for (j=0;j<num_dm;j++)
|
|
1857 {
|
|
1858 if (size_c[i]!=0)
|
|
1859 center[i][j]=(center[i][j]/(double)(size_c[i]));
|
|
1860 else
|
|
1861 center[i][j]=0;
|
|
1862 }
|
|
1863
|
|
1864 for (j=0;j<num_dm;j++)
|
|
1865 {
|
|
1866 interval=((big[j]-small[j])/4.0);
|
|
1867 //printf("interval[%d] is %f\n",j,interval);
|
|
1868 for (i=0;i<3;i++)
|
|
1869 mybound[j][i]=small[j]+((double)(i+1)*interval);
|
|
1870 }
|
|
1871
|
|
1872
|
|
1873 size_mybound_0=(long **)malloc(sizeof(long*)*k);
|
|
1874 memset(size_mybound_0,0,sizeof(long*)*k);
|
|
1875
|
|
1876 for (i=0;i<k;i++)
|
|
1877 {
|
|
1878 size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
1879 memset(size_mybound_0[i],0,sizeof(long)*num_dm);
|
|
1880 }
|
|
1881
|
|
1882 size_mybound_1=(long **)malloc(sizeof(long*)*k);
|
|
1883 memset(size_mybound_1,0,sizeof(long*)*k);
|
|
1884
|
|
1885 for (i=0;i<k;i++)
|
|
1886 {
|
|
1887 size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
1888 memset(size_mybound_1[i],0,sizeof(long)*num_dm);
|
|
1889 }
|
|
1890
|
|
1891 size_mybound_2=(long **)malloc(sizeof(long*)*k);
|
|
1892 memset(size_mybound_2,0,sizeof(long*)*k);
|
|
1893
|
|
1894 for (i=0;i<k;i++)
|
|
1895 {
|
|
1896 size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
1897 memset(size_mybound_2[i],0,sizeof(long)*num_dm);
|
|
1898 }
|
|
1899
|
|
1900 size_mybound_3=(long **)malloc(sizeof(long*)*k);
|
|
1901 memset(size_mybound_3,0,sizeof(long*)*k);
|
|
1902
|
|
1903 for (i=0;i<k;i++)
|
|
1904 {
|
|
1905 size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
1906 memset(size_mybound_3[i],0,sizeof(long)*num_dm);
|
|
1907 }
|
|
1908
|
|
1909 for (i=0;i<file_Len;i++)
|
|
1910 for (j=0;j<num_dm;j++)
|
|
1911 {
|
|
1912 if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
|
|
1913 size_mybound_0[cluster_id[i]][j]++;
|
|
1914 else
|
|
1915 {
|
|
1916 if (Matrix[i][j]<mybound[j][1])
|
|
1917 size_mybound_1[cluster_id[i]][j]++;
|
|
1918 else
|
|
1919 {
|
|
1920 if (Matrix[i][j]<mybound[j][2])
|
|
1921 size_mybound_2[cluster_id[i]][j]++;
|
|
1922 else
|
|
1923 //if (Matrix[i][j]!=big[j]) //the biggest values excluded
|
|
1924 size_mybound_3[cluster_id[i]][j]++;
|
|
1925 }
|
|
1926
|
|
1927 }
|
|
1928 }
|
|
1929
|
|
1930 fprof_id=fopen("profile.txt","w");
|
|
1931 fprintf(fprof_id,"Population_ID\t");
|
|
1932 fprintf(fprof_id,"%s\n",name_string);
|
|
1933
|
|
1934 for (i=0;i<k;i++)
|
|
1935 {
|
|
1936 fprintf(fprof_id,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
|
|
1937 for (j=0;j<num_dm;j++)
|
|
1938 {
|
|
1939
|
|
1940 if (size_mybound_0[i][j]>size_mybound_1[i][j])
|
|
1941 situ1=0;
|
|
1942 else
|
|
1943 situ1=1;
|
|
1944 if (size_mybound_2[i][j]>size_mybound_3[i][j])
|
|
1945 situ2=2;
|
|
1946 else
|
|
1947 situ2=3;
|
|
1948
|
|
1949 if ((situ1==0) && (situ2==2))
|
|
1950 {
|
|
1951 if (size_mybound_0[i][j]>size_mybound_2[i][j])
|
|
1952 prof[i][j]=0;
|
|
1953 else
|
|
1954 prof[i][j]=2;
|
|
1955 }
|
|
1956 if ((situ1==0) && (situ2==3))
|
|
1957 {
|
|
1958 if (size_mybound_0[i][j]>size_mybound_3[i][j])
|
|
1959 prof[i][j]=0;
|
|
1960 else
|
|
1961 prof[i][j]=3;
|
|
1962 }
|
|
1963 if ((situ1==1) && (situ2==2))
|
|
1964 {
|
|
1965 if (size_mybound_1[i][j]>size_mybound_2[i][j])
|
|
1966 prof[i][j]=1;
|
|
1967 else
|
|
1968 prof[i][j]=2;
|
|
1969 }
|
|
1970 if ((situ1==1) && (situ2==3))
|
|
1971 {
|
|
1972 if (size_mybound_1[i][j]>size_mybound_3[i][j])
|
|
1973 prof[i][j]=1;
|
|
1974 else
|
|
1975 prof[i][j]=3;
|
|
1976 }
|
|
1977
|
|
1978 //begin to output profile
|
|
1979 if (j==num_dm-1)
|
|
1980 {
|
|
1981 if (prof[i][j]==0)
|
|
1982 fprintf(fprof_id,"1\n");
|
|
1983 if (prof[i][j]==1)
|
|
1984 fprintf(fprof_id,"2\n");
|
|
1985 if (prof[i][j]==2)
|
|
1986 fprintf(fprof_id,"3\n");
|
|
1987 if (prof[i][j]==3)
|
|
1988 fprintf(fprof_id,"4\n");
|
|
1989 }
|
|
1990 else
|
|
1991 {
|
|
1992 if (prof[i][j]==0)
|
|
1993 fprintf(fprof_id,"1\t");
|
|
1994 if (prof[i][j]==1)
|
|
1995 fprintf(fprof_id,"2\t");
|
|
1996 if (prof[i][j]==2)
|
|
1997 fprintf(fprof_id,"3\t");
|
|
1998 if (prof[i][j]==3)
|
|
1999 fprintf(fprof_id,"4\t");
|
|
2000 }
|
|
2001 }
|
|
2002 }
|
|
2003 fclose(fprof_id);
|
|
2004
|
|
2005 ///////////////////////////////////////////////////////////
|
|
2006
|
|
2007
|
|
2008 fpcnt_id=fopen("percentage.txt","w");
|
|
2009 fprintf(fpcnt_id,"Population_ID\tPercentage\n");
|
|
2010
|
|
2011 for (t=0;t<k;t++)
|
|
2012 {
|
|
2013 fprintf(fpcnt_id,"%ld\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len); //t changed to t+1 to start from 1 instead of 0: April 16, 2009
|
|
2014 }
|
|
2015 fclose(fpcnt_id);
|
|
2016
|
|
2017 free(big);
|
|
2018 free(small);
|
|
2019 free(size_c);
|
|
2020
|
|
2021 for (i=0;i<k;i++)
|
|
2022 {
|
|
2023 free(center[i]);
|
|
2024 free(prof[i]);
|
|
2025 free(size_mybound_0[i]);
|
|
2026 free(size_mybound_1[i]);
|
|
2027 free(size_mybound_2[i]);
|
|
2028 free(size_mybound_3[i]);
|
|
2029 }
|
|
2030 free(center);
|
|
2031 free(prof);
|
|
2032 free(size_mybound_0);
|
|
2033 free(size_mybound_1);
|
|
2034 free(size_mybound_2);
|
|
2035 free(size_mybound_3);
|
|
2036
|
|
2037 for (i=0;i<num_dm;i++)
|
|
2038 free(mybound[i]);
|
|
2039 free(mybound);
|
|
2040
|
|
2041 }
|
|
2042 ///////////////////////////////////////////////////////////////////////////
|
|
2043
|
|
2044 double get_avg_dist(double *population_center, long smaller_pop_ID, long larger_pop_ID, long *population_ID, long num_population, long file_Len,
|
|
2045 long num_dm, double **normalized_data, long d1, long d2, long d3, long *size_c)
|
|
2046 {
|
|
2047 long k=0;
|
|
2048 long temp=0;
|
|
2049 long i=0;
|
|
2050 long j=0;
|
|
2051 long t=0;
|
|
2052 long current_biggest=0;
|
|
2053
|
|
2054
|
|
2055 double total_dist=0.0;
|
|
2056 double distance=0.0;
|
|
2057 double dist=0.0;
|
|
2058 double biggest_distance=0.0;
|
|
2059
|
|
2060 long *nearest_neighbors;
|
|
2061 double *nearest_distance;
|
|
2062
|
|
2063 k=min(size_c[smaller_pop_ID],size_c[larger_pop_ID]);
|
|
2064
|
|
2065 if (k>0)
|
|
2066 {
|
|
2067 k=(long)sqrt((double)k);
|
|
2068 if (num_dm<=3)
|
|
2069 k=(long)(2.7183*(double)k); //e*k for low-dimensional space, added Nov. 4, 2010
|
|
2070 //printf("the k-nearest k is %d\n",k);
|
|
2071 }
|
|
2072 else
|
|
2073 {
|
|
2074 printf("error in k\n");
|
|
2075 exit(0);
|
|
2076 }
|
|
2077
|
|
2078 nearest_neighbors=(long *)malloc(sizeof(long)*k);
|
|
2079 memset(nearest_neighbors,0,sizeof(long)*k);
|
|
2080
|
|
2081 nearest_distance=(double *)malloc(sizeof(double)*k);
|
|
2082 memset(nearest_distance,0,sizeof(double)*k);
|
|
2083
|
|
2084 temp=0;
|
|
2085
|
|
2086 for (i=0;i<file_Len;i++)
|
|
2087 {
|
|
2088 if ((population_ID[i]==smaller_pop_ID) || (population_ID[i]==larger_pop_ID)) //the event belongs to the pair of populations
|
|
2089 {
|
|
2090 distance=0.0;
|
|
2091 for (t=0;t<num_dm;t++)
|
|
2092 {
|
|
2093 if ((t!=d1) && (t!=d2) && (t!=d3))
|
|
2094 continue;
|
|
2095 else
|
|
2096 {
|
|
2097 dist=population_center[t]-normalized_data[i][t];
|
|
2098 distance=distance+dist*dist;
|
|
2099 }
|
|
2100 }
|
|
2101
|
|
2102 if (temp<k)
|
|
2103 {
|
|
2104 nearest_neighbors[temp]=i;
|
|
2105 nearest_distance[temp]=distance;
|
|
2106 temp++;
|
|
2107 }
|
|
2108 else
|
|
2109 {
|
|
2110 biggest_distance=0.0;
|
|
2111 for (j=0;j<k;j++)
|
|
2112 {
|
|
2113 if (nearest_distance[j]>biggest_distance)
|
|
2114 {
|
|
2115 biggest_distance=nearest_distance[j];
|
|
2116 current_biggest=j;
|
|
2117 }
|
|
2118 }
|
|
2119
|
|
2120 if (biggest_distance>distance)
|
|
2121 {
|
|
2122 nearest_neighbors[current_biggest]=i;
|
|
2123 nearest_distance[current_biggest]=distance;
|
|
2124 }
|
|
2125 }
|
|
2126 }
|
|
2127 }
|
|
2128
|
|
2129 for (i=0;i<k;i++)
|
|
2130 total_dist=total_dist+nearest_distance[i];
|
|
2131
|
|
2132 total_dist=total_dist/(double)k;
|
|
2133
|
|
2134 free(nearest_distance);
|
|
2135 free(nearest_neighbors);
|
|
2136
|
|
2137 return total_dist;
|
|
2138 }
|
|
2139
|
|
2140
|
|
2141 /******************************************************** Main Function **************************************************/
|
|
2142 //for normalized data, there are five variables:
|
|
2143 //cluster_ID
|
|
2144 //population_center
|
|
2145 //grid_clusterID
|
|
2146 //grid_ID
|
|
2147 //grid_Center
|
|
2148
|
|
2149 //the same five variables exist for the original data
|
|
2150 //however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data
|
|
2151 //also, data + cluster_ID -> population_center
|
|
2152 //data + grid_ID -> grid_Center
|
|
2153
|
|
2154 /* what is the final output */
|
|
2155 //the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
|
|
2156 //Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)
|
|
2157
|
|
2158 /* what information should select_bin output */
|
|
2159 //the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
|
|
2160 //is unknown, therefore I must use a prescreening to output
|
|
2161 //how many bins I should use
|
|
2162 //the number of dense grids
|
|
2163 //total number of events in the dense grids
|
|
2164
|
|
2165 /* basic procedure of main function */
|
|
2166 //1. read raw file and normalize the raw file
|
|
2167 //2. select_bin
|
|
2168 //3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID.
|
|
2169 //4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
|
|
2170 //5. release normalized data; allocate memory for grid_Center and population_center
|
|
2171 //6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers
|
|
2172
|
|
2173 int main (int argc, char **argv)
|
|
2174 {
|
|
2175 //inputs
|
|
2176 FILE *f_src; //source file pointer
|
|
2177
|
|
2178 FILE *f_out; //coordinates
|
|
2179 FILE *f_cid; //population-ID of events
|
|
2180 FILE *f_ctr; //centroids of populations
|
|
2181 FILE *f_results; //coordinates file event and population column
|
|
2182 FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
|
|
2183 FILE *f_parameters; //number of bins and density calculated by
|
|
2184 //the algorithm. Used to update the database
|
|
2185 FILE *f_properties; //Properties file used by Image generation software
|
|
2186
|
|
2187 //char tmpbuf[128];
|
|
2188
|
|
2189 char para_name_string[LINE_LEN];
|
|
2190
|
|
2191 long time_ID=-1;
|
|
2192 long num_bin=0; //the bins I will use on each dimension
|
|
2193 long num_pop=0;
|
|
2194 long file_Len=0; //number of events
|
|
2195 long num_dm=0;
|
|
2196 long num_clust=0;
|
|
2197 long num_dense_events=0;
|
|
2198 long num_dense_grids=0;
|
|
2199 long num_nonempty=0;
|
|
2200 long num_population=0;
|
|
2201 long num_real_pop=0;
|
|
2202 long keep_merge=0;
|
|
2203 long num_checked_range=0;
|
|
2204
|
|
2205 //below are read from configuration file
|
|
2206 long i=0;
|
|
2207 long j=0;
|
|
2208 long p=0;
|
|
2209 long t=0;
|
|
2210 long q=0;
|
|
2211 long temp_i=0;
|
|
2212 long temp_j=0;
|
|
2213 long first_i=0;
|
|
2214 long first_j=0;
|
|
2215 long d1=0;
|
|
2216 long d2=0;
|
|
2217 long d3=0;
|
|
2218 long d_d=0;
|
|
2219 long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d
|
|
2220 long index_id=0;
|
|
2221 long num_computed_population=0;
|
|
2222 long tot_size=0;
|
|
2223
|
|
2224 int den_t_event=0;
|
|
2225
|
|
2226 double distance=0.0;
|
|
2227 double dist=0.0;
|
|
2228 double current_smallest_dist=0;
|
|
2229 double max_d_dist=0.0;
|
|
2230 double Ei=0.0;
|
|
2231 double Ej=0.0;
|
|
2232 double E1=0.0;
|
|
2233 double E2=0.0;
|
|
2234 double E3=0.0;
|
|
2235 double Ep=0.0;
|
|
2236 double temp=0.0;
|
|
2237 double tmp=0.0;
|
|
2238
|
|
2239 long *grid_clusterID; //(dense)gridID to clusterID
|
|
2240 long *grid_ID; //(dense)eventID to gridID
|
|
2241 long *cluster_ID; //(dense)eventID to clusterID
|
|
2242 long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
|
|
2243 long *all_grid_ID; //gridID for all events
|
|
2244 long *densegridID_To_gridevent;
|
|
2245 long *sorted_seq;
|
|
2246 long *dense_grid_reverse;
|
|
2247 long *population_ID; //denseeventID to populationID
|
|
2248 long *cluster_populationID; //clusterID to populationID
|
|
2249 long *grid_populationID; //gridID to populationID
|
|
2250 long *all_population_ID; //populationID of event
|
|
2251 long *size_c;
|
|
2252 long *size_p_2;
|
|
2253 long *partit;
|
|
2254 long *size_p;
|
|
2255 long *temp_size_j;
|
|
2256 long *all_computed_pop_ID;
|
|
2257
|
|
2258
|
|
2259 long **position;
|
|
2260 long **dense_grid_seq;
|
|
2261 long **temp_population_ID;
|
|
2262
|
|
2263 double *interval;
|
|
2264 double *center_1;
|
|
2265 double *center_2;
|
|
2266 double *center_3;
|
|
2267 double *aver;
|
|
2268 double *std;
|
|
2269 double *center_p_1;
|
|
2270 double *center_p_2;
|
|
2271 double *center_p_3;
|
|
2272
|
|
2273 double **population_center; //population centroids in the raw/original data
|
|
2274 double **cluster_center; //cluster centroids in the raw/original data
|
|
2275 double **new_population_center;
|
|
2276 double **temp_population_center;
|
|
2277 double **min_pop;
|
|
2278 double **max_pop;
|
|
2279
|
|
2280 double **input_data;
|
|
2281 double **normalized_data;
|
|
2282
|
|
2283 double **real_pop_center;
|
|
2284 double **distance_pop;
|
|
2285
|
|
2286 double ***ind_pop;
|
|
2287 double ***ind_pop_3;
|
|
2288
|
|
2289 int min = 999999;
|
|
2290 int max = 0;
|
|
2291
|
|
2292 printf( "Starting time:\t\t\t\t");
|
|
2293 fflush(stdout);
|
|
2294 system("/bin/date");
|
|
2295
|
|
2296 /*
|
|
2297 * Windows Version
|
|
2298 _strtime( tmpbuf );
|
|
2299 printf( "Starting time:\t\t\t\t%s\n", tmpbuf );
|
|
2300 _strdate( tmpbuf );
|
|
2301 printf( "Starting date:\t\t\t\t%s\n", tmpbuf );
|
|
2302 */
|
|
2303
|
|
2304 /////////////////////////////////////////////////////////////
|
|
2305
|
|
2306 if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6))
|
|
2307 {
|
|
2308 fprintf(stderr,"Incorrect number of input parameters!\n");
|
|
2309 fprintf(stderr,"usage:\n");
|
|
2310 fprintf(stderr,"basic mode: flock data_file\n");
|
|
2311 fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n");
|
|
2312 fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n");
|
|
2313 fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n");
|
|
2314 fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n");
|
|
2315 exit(0);
|
|
2316 }
|
|
2317
|
|
2318 f_src=fopen(argv[1],"r");
|
|
2319
|
|
2320 if (argc==3)
|
|
2321 {
|
|
2322 max_num_pop=atoi(argv[2]);
|
|
2323 printf("num of maximum pop is %ld\n",max_num_pop);
|
|
2324 }
|
|
2325
|
|
2326 if (argc==4)
|
|
2327 {
|
|
2328 num_bin=atoi(argv[2]);
|
|
2329 printf("num_bin is %ld\n",num_bin);
|
|
2330
|
|
2331 den_t_event=atoi(argv[3]);
|
|
2332 printf("density_index is %d\n",den_t_event);
|
|
2333 }
|
|
2334
|
|
2335 if (argc==5)
|
|
2336 {
|
|
2337 num_bin=atoi(argv[2]);
|
|
2338 printf("num_bin is %ld\n",num_bin);
|
|
2339
|
|
2340 den_t_event=atoi(argv[3]);
|
|
2341 printf("density_index is %d\n",den_t_event);
|
|
2342
|
|
2343 num_pop=atoi(argv[4]);
|
|
2344 printf("num of pop is %ld\n",num_pop);
|
|
2345 }
|
|
2346
|
|
2347 if (argc==6)
|
|
2348 {
|
|
2349 num_bin=atoi(argv[2]);
|
|
2350 printf("num_bin is %ld\n",num_bin);
|
|
2351
|
|
2352 den_t_event=atoi(argv[3]);
|
|
2353 printf("density_index is %d\n",den_t_event);
|
|
2354
|
|
2355 num_pop=atoi(argv[4]);
|
|
2356 printf("num of pop is %ld\n",num_pop);
|
|
2357
|
|
2358 max_num_pop=atoi(argv[5]);
|
|
2359 printf("num of pop is %ld\n",max_num_pop);
|
|
2360 }
|
|
2361
|
|
2362
|
|
2363 if (num_pop==1)
|
|
2364 {
|
|
2365 printf("it is not allowed to specify only one population\n");
|
|
2366 exit(0);
|
|
2367 }
|
|
2368
|
|
2369
|
|
2370
|
|
2371 getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters
|
|
2372
|
|
2373
|
|
2374 if (max_num_pop==0)
|
|
2375 {
|
|
2376 max_num_pop=(long)pow(2,num_dm);
|
|
2377 if (max_num_pop>MAX_POP_NUM)
|
|
2378 max_num_pop=MAX_POP_NUM;
|
|
2379 }
|
|
2380
|
|
2381 /************************************************* Read the data *****************************************************/
|
|
2382
|
|
2383 rewind(f_src); //reset the file pointer
|
|
2384
|
|
2385 input_data = (double **)malloc(sizeof(double*)*file_Len);
|
|
2386 memset(input_data,0,sizeof(double*)*file_Len);
|
|
2387 for (i=0;i<file_Len;i++)
|
|
2388 {
|
|
2389 input_data[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
2390 memset(input_data[i],0,sizeof(double)*num_dm);
|
|
2391 }
|
|
2392
|
|
2393 readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data;
|
|
2394
|
|
2395 fclose(f_src);
|
|
2396
|
|
2397 normalized_data=(double **)malloc(sizeof(double*)*file_Len);
|
|
2398 memset(normalized_data,0,sizeof(double*)*file_Len);
|
|
2399 for (i=0;i<file_Len;i++)
|
|
2400 {
|
|
2401 normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
2402 memset(normalized_data[i],0,sizeof(double)*num_dm);
|
|
2403 }
|
|
2404
|
|
2405 tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data);
|
|
2406
|
|
2407
|
|
2408 position=(long **)malloc(sizeof(long*)*file_Len);
|
|
2409 memset(position,0,sizeof(long*)*file_Len);
|
|
2410 for (i=0;i<file_Len;i++)
|
|
2411 {
|
|
2412 position[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
2413 memset(position[i],0,sizeof(long)*num_dm);
|
|
2414 }
|
|
2415
|
|
2416 all_grid_ID=(long *)malloc(sizeof(long)*file_Len);
|
|
2417 memset(all_grid_ID,0,sizeof(long)*file_Len);
|
|
2418
|
|
2419 sorted_seq=(long*)malloc(sizeof(long)*file_Len);
|
|
2420 memset(sorted_seq,0,sizeof(long)*file_Len);
|
|
2421
|
|
2422 interval=(double*)malloc(sizeof(double)*num_dm);
|
|
2423 memset(interval,0,sizeof(double)*num_dm);
|
|
2424
|
|
2425 /************************************************* select_bin *************************************************/
|
|
2426
|
|
2427 if (num_bin>=1) //num_bin has been selected by user
|
|
2428 {
|
|
2429 select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
|
|
2430 printf("user set bin number is %ld\n",num_bin);
|
|
2431 }
|
|
2432 else //num_bin has not been selected by user
|
|
2433 {
|
|
2434 num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
|
|
2435 printf("selected bin number is %ld\n",num_bin);
|
|
2436 }
|
|
2437 printf("number of non-empty grids is %ld\n",num_nonempty);
|
|
2438
|
|
2439
|
|
2440
|
|
2441 /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
|
|
2442 free(sorted_seq);
|
|
2443
|
|
2444
|
|
2445 dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty);
|
|
2446 memset(dense_grid_reverse,0,sizeof(long)*num_nonempty);
|
|
2447
|
|
2448 /************************************************* select_dense **********************************************/
|
|
2449
|
|
2450 if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
|
|
2451 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
|
|
2452 else
|
|
2453 {
|
|
2454 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
|
|
2455 printf("automated selected density threshold is %d\n",den_t_event);
|
|
2456 }
|
|
2457
|
|
2458 printf("Number of dense grids is %ld\n",num_dense_grids);
|
|
2459
|
|
2460 densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids);
|
|
2461 memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids);
|
|
2462
|
|
2463 for (i=0;i<num_dense_grids;i++)
|
|
2464 densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1
|
|
2465
|
|
2466
|
|
2467 eventID_To_denseventID=(long *)malloc(sizeof(long)*file_Len);
|
|
2468 memset(eventID_To_denseventID,0,sizeof(long)*file_Len); //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
|
|
2469
|
|
2470
|
|
2471 grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
|
|
2472
|
|
2473
|
|
2474 dense_grid_seq=(long **)malloc(sizeof(long*)*num_dense_grids);
|
|
2475 memset(dense_grid_seq,0,sizeof(long*)*num_dense_grids);
|
|
2476 for (i=0;i<num_dense_grids;i++)
|
|
2477 {
|
|
2478 dense_grid_seq[i]=(long *)malloc(sizeof(long)*num_dm);
|
|
2479 memset(dense_grid_seq[i],0,sizeof(long)*num_dm);
|
|
2480 }
|
|
2481
|
|
2482
|
|
2483 /* Look up the binned data values for each dense grid */
|
|
2484 generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq);
|
|
2485
|
|
2486
|
|
2487 /************************************************* allocate memory *********************************************/
|
|
2488
|
|
2489 grid_clusterID=(long *)malloc(sizeof(long)*num_dense_grids);
|
|
2490 memset(grid_clusterID,0,sizeof(long)*num_dense_grids);
|
|
2491
|
|
2492 grid_ID=(long *)malloc(sizeof(long)*num_dense_events);
|
|
2493 memset(grid_ID,0,sizeof(long)*num_dense_events);
|
|
2494
|
|
2495 cluster_ID=(long *)malloc(sizeof(long)*num_dense_events);
|
|
2496 memset(cluster_ID,0,sizeof(long)*num_dense_events);
|
|
2497
|
|
2498
|
|
2499 /*********************************************** merge_grids ***********************************************/
|
|
2500 //long merge_grids(long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, long *dense_grid_rank, long *dense_grid_reverse,
|
|
2501 // long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
|
|
2502 // long *cluster_ID, long *grid_ID, long *grid_clusterID)
|
|
2503
|
|
2504 num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID);
|
|
2505
|
|
2506 printf("computed number of groups is %ld\n",num_clust);
|
|
2507
|
|
2508
|
|
2509 /************************************** release unnecessary memory and allocate memory and compute centers **********************************/
|
|
2510
|
|
2511
|
|
2512 for (i=0;i<file_Len;i++)
|
|
2513 free(position[i]);
|
|
2514 free(position);
|
|
2515
|
|
2516 for (i=0;i<num_dense_grids;i++)
|
|
2517 free(dense_grid_seq[i]);
|
|
2518 free(dense_grid_seq);
|
|
2519
|
|
2520 free(dense_grid_reverse);
|
|
2521
|
|
2522 free(densegridID_To_gridevent);
|
|
2523 free(all_grid_ID);
|
|
2524
|
|
2525 // cluster_center ////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
2526
|
|
2527 cluster_center=(double **)malloc(sizeof(double*)*num_clust);
|
|
2528 memset(cluster_center,0,sizeof(double*)*num_clust);
|
|
2529 for (i=0;i<num_clust;i++)
|
|
2530 {
|
|
2531 cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2532 memset(cluster_center[i],0,sizeof(double)*num_dm);
|
|
2533 }
|
|
2534
|
|
2535 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data
|
|
2536
|
|
2537 //printf("pass the first ID2center\n");
|
|
2538
|
|
2539 /*** population_ID and grid_populationID **/
|
|
2540
|
|
2541 cluster_populationID=(long*)malloc(sizeof(long)*num_clust);
|
|
2542 memset(cluster_populationID,0,sizeof(long)*num_clust);
|
|
2543
|
|
2544 grid_populationID=(long*)malloc(sizeof(long)*num_dense_grids);
|
|
2545 memset(grid_populationID,0,sizeof(long)*num_dense_grids);
|
|
2546
|
|
2547 population_ID=(long*)malloc(sizeof(long)*num_dense_events);
|
|
2548 memset(population_ID,0,sizeof(long)*num_dense_events);
|
|
2549
|
|
2550 num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID, max_num_pop);
|
|
2551
|
|
2552
|
|
2553
|
|
2554 for (i=0;i<num_clust;i++)
|
|
2555 free(cluster_center[i]);
|
|
2556 free(cluster_center);
|
|
2557
|
|
2558 free(interval);
|
|
2559
|
|
2560 for (i=0;i<num_dense_grids;i++)
|
|
2561 {
|
|
2562 grid_populationID[i]=cluster_populationID[grid_clusterID[i]];
|
|
2563 }
|
|
2564
|
|
2565 for (i=0;i<num_dense_events;i++)
|
|
2566 {
|
|
2567 population_ID[i]=cluster_populationID[cluster_ID[i]];
|
|
2568 }
|
|
2569
|
|
2570 printf("Number of merged groups before second-run partitioning is %ld\n",num_population);
|
|
2571
|
|
2572
|
|
2573
|
|
2574 // population_center /////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
2575
|
|
2576
|
|
2577 population_center=(double **)malloc(sizeof(double*)*num_population);
|
|
2578 memset(population_center,0,sizeof(double*)*num_population);
|
|
2579 for (i=0;i<num_population;i++)
|
|
2580 {
|
|
2581 population_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2582 memset(population_center[i],0,sizeof(double)*num_dm);
|
|
2583 }
|
|
2584
|
|
2585
|
|
2586
|
|
2587 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data
|
|
2588
|
|
2589
|
|
2590 // show ////////////////////////////////////////////////////////////////////////////////
|
|
2591 all_population_ID=(long*)malloc(sizeof(long)*file_Len);
|
|
2592 memset(all_population_ID,0,sizeof(long)*file_Len);
|
|
2593
|
|
2594 kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center);
|
|
2595
|
|
2596 for (i=0;i<num_population;i++)
|
|
2597 free(population_center[i]);
|
|
2598
|
|
2599 free(population_center);
|
|
2600
|
|
2601 ////// there needs to be another step to further partition the data
|
|
2602
|
|
2603
|
|
2604 center_p_1=(double *)malloc(sizeof(double)*3);
|
|
2605 memset(center_p_1,0,sizeof(double)*3);
|
|
2606
|
|
2607 center_p_2=(double *)malloc(sizeof(double)*3);
|
|
2608 memset(center_p_2,0,sizeof(double)*3);
|
|
2609
|
|
2610 center_p_3=(double *)malloc(sizeof(double)*3);
|
|
2611 memset(center_p_3,0,sizeof(double)*3);
|
|
2612
|
|
2613 size_p=(long *)malloc(sizeof(long)*num_population);
|
|
2614 memset(size_p,0,sizeof(long)*num_population);
|
|
2615
|
|
2616 temp_size_j=(long *)malloc(sizeof(long)*num_population);
|
|
2617 memset(temp_size_j,0,sizeof(long)*num_population);
|
|
2618
|
|
2619 all_computed_pop_ID=(long *)malloc(sizeof(long)*file_Len);
|
|
2620 memset(all_computed_pop_ID,0,sizeof(long)*file_Len);
|
|
2621
|
|
2622 for (i=0;i<file_Len;i++)
|
|
2623 {
|
|
2624 size_p[all_population_ID[i]]++;
|
|
2625 all_computed_pop_ID[i]=all_population_ID[i];
|
|
2626 }
|
|
2627
|
|
2628 ind_pop=(double***)malloc(sizeof(double**)*num_population);
|
|
2629 memset(ind_pop,0,sizeof(double**)*num_population);
|
|
2630
|
|
2631 ind_pop_3=(double***)malloc(sizeof(double**)*num_population);
|
|
2632 memset(ind_pop_3,0,sizeof(double**)*num_population);
|
|
2633
|
|
2634 for (i=0;i<num_population;i++)
|
|
2635 {
|
|
2636 ind_pop[i]=(double**)malloc(sizeof(double*)*size_p[i]);
|
|
2637 memset(ind_pop[i],0,sizeof(double*)*size_p[i]);
|
|
2638
|
|
2639 ind_pop_3[i]=(double**)malloc(sizeof(double*)*size_p[i]);
|
|
2640 memset(ind_pop_3[i],0,sizeof(double*)*size_p[i]);
|
|
2641
|
|
2642 for (j=0;j<size_p[i];j++)
|
|
2643 {
|
|
2644 ind_pop[i][j]=(double*)malloc(sizeof(double)*num_dm);
|
|
2645 memset(ind_pop[i][j],0,sizeof(double)*num_dm);
|
|
2646
|
|
2647 ind_pop_3[i][j]=(double*)malloc(sizeof(double)*3);
|
|
2648 memset(ind_pop_3[i][j],0,sizeof(double)*3);
|
|
2649 }
|
|
2650 temp_size_j[i]=0;
|
|
2651 }
|
|
2652
|
|
2653 for (i=0;i<file_Len;i++) //to generate ind_pop[i]
|
|
2654 {
|
|
2655 index_id=all_population_ID[i];
|
|
2656
|
|
2657 j=temp_size_j[index_id];
|
|
2658
|
|
2659 for (t=0;t<num_dm;t++)
|
|
2660 {
|
|
2661 ind_pop[index_id][j][t]=input_data[i][t];
|
|
2662 }
|
|
2663 temp_size_j[index_id]++;
|
|
2664 }
|
|
2665
|
|
2666 aver=(double*)malloc(sizeof(double)*num_dm);
|
|
2667 memset(aver,0,sizeof(double)*num_dm);
|
|
2668
|
|
2669 std=(double*)malloc(sizeof(double)*num_dm);
|
|
2670 memset(std,0,sizeof(double)*num_dm);
|
|
2671
|
|
2672 temp_population_center=(double**)malloc(sizeof(double*)*2);
|
|
2673 memset(temp_population_center,0,sizeof(double*)*2);
|
|
2674 for (i=0;i<2;i++)
|
|
2675 {
|
|
2676 temp_population_center[i]=(double*)malloc(sizeof(double)*3);
|
|
2677 memset(temp_population_center[i],0,sizeof(double)*3);
|
|
2678 }
|
|
2679
|
|
2680 size_p_2=(long*)malloc(sizeof(long)*2);
|
|
2681 memset(size_p_2,0,sizeof(long)*2);
|
|
2682
|
|
2683 partit=(long*)malloc(sizeof(long)*num_population);
|
|
2684 memset(partit,0,sizeof(long)*num_population);
|
|
2685
|
|
2686 num_computed_population=num_population;
|
|
2687
|
|
2688 temp_population_ID=(long**)malloc(sizeof(long*)*num_population);
|
|
2689 memset(temp_population_ID,0,sizeof(long*)*num_population);
|
|
2690
|
|
2691 for (i=0;i<num_population;i++)
|
|
2692 {
|
|
2693 temp_population_ID[i]=(long*)malloc(sizeof(long)*size_p[i]);
|
|
2694 memset(temp_population_ID[i],0,sizeof(long)*size_p[i]);
|
|
2695 }
|
|
2696
|
|
2697 //printf("num_population=%d\n",num_population);
|
|
2698
|
|
2699 for (i=0;i<num_population;i++)
|
|
2700 {
|
|
2701 partit[i]=0;
|
|
2702 tran(ind_pop[i], size_p[i], num_dm, 1, ind_pop[i]);//0-1 normalize ind_pop[i]
|
|
2703
|
|
2704 //find the 3 dimensions with the largest std for ind_pop[i]
|
|
2705 d1=-1;
|
|
2706 d2=-1;
|
|
2707 d3=-1;
|
|
2708
|
|
2709 for (t=0;t<num_dm;t++)
|
|
2710 {
|
|
2711 aver[t]=0;
|
|
2712 for (j=0;j<size_p[i];j++)
|
|
2713 {
|
|
2714 aver[t]=aver[t]+ind_pop[i][j][t];
|
|
2715 }
|
|
2716 aver[t]=aver[t]/(double)size_p[i];
|
|
2717
|
|
2718 std[t]=0;
|
|
2719 for (j=0;j<size_p[i];j++)
|
|
2720 {
|
|
2721 std[t]=std[t]+((ind_pop[i][j][t]-aver[t])*(ind_pop[i][j][t]-aver[t]));
|
|
2722 }
|
|
2723 std[t]=sqrt(std[t]/(double)size_p[i]);
|
|
2724 }
|
|
2725
|
|
2726
|
|
2727
|
|
2728 for (j=0;j<3;j++)
|
|
2729 {
|
|
2730 max_d_dist=0;
|
|
2731 for (t=0;t<num_dm;t++)
|
|
2732 {
|
|
2733 if ((t!=d1) && (t!=d2))
|
|
2734 {
|
|
2735 dist=std[t];
|
|
2736 }
|
|
2737 else
|
|
2738 dist=-1;
|
|
2739
|
|
2740 if (dist>max_d_dist)
|
|
2741 {
|
|
2742 max_d_dist=dist;
|
|
2743 d_d=t;
|
|
2744 }
|
|
2745 }
|
|
2746
|
|
2747 if (j==0)
|
|
2748 d1=d_d;
|
|
2749 if (j==1)
|
|
2750 d2=d_d;
|
|
2751 if (j==2)
|
|
2752 d3=d_d;
|
|
2753 }
|
|
2754
|
|
2755
|
|
2756 for (t=0;t<size_p[i];t++)
|
|
2757 {
|
|
2758 ind_pop_3[i][t][0]=ind_pop[i][t][d1];
|
|
2759 ind_pop_3[i][t][1]=ind_pop[i][t][d2];
|
|
2760 ind_pop_3[i][t][2]=ind_pop[i][t][d3];
|
|
2761 }
|
|
2762
|
|
2763
|
|
2764 temp_population_center[0][0]=(aver[d1])/2.0;
|
|
2765
|
|
2766 temp_population_center[0][1]=aver[d2];
|
|
2767 temp_population_center[0][2]=aver[d3];
|
|
2768
|
|
2769 temp_population_center[1][0]=(aver[d1]+1.0)/2.0;
|
|
2770
|
|
2771 temp_population_center[1][1]=aver[d2];
|
|
2772 temp_population_center[1][2]=aver[d3];
|
|
2773
|
|
2774
|
|
2775
|
|
2776 //run K-means with K=2
|
|
2777 kmeans(ind_pop_3[i], 2, KMEANS_TERM, size_p[i], 3, temp_population_ID[i], temp_population_center);
|
|
2778
|
|
2779 for (j=0;j<2;j++)
|
|
2780 size_p_2[j]=0;
|
|
2781
|
|
2782 for (j=0;j<size_p[i];j++)
|
|
2783 size_p_2[temp_population_ID[i][j]]++;
|
|
2784
|
|
2785 for (j=0;j<2;j++)
|
|
2786 if (size_p_2[j]==0)
|
|
2787 printf("size_p_2=0 at i=%ld\n",i);
|
|
2788
|
|
2789
|
|
2790 //check whether the 2 parts should be merged
|
|
2791
|
|
2792
|
|
2793 Ei=get_avg_dist(temp_population_center[0], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
|
|
2794 Ej=get_avg_dist(temp_population_center[1], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
|
|
2795
|
|
2796 for (t=0;t<3;t++)
|
|
2797 {
|
|
2798 center_p_1[t]=temp_population_center[0][t]*0.25+temp_population_center[1][t]*0.75;
|
|
2799 center_p_2[t]=temp_population_center[0][t]*0.5+temp_population_center[1][t]*0.5;
|
|
2800 center_p_3[t]=temp_population_center[0][t]*0.75+temp_population_center[1][t]*0.25;
|
|
2801 }
|
|
2802
|
|
2803 E1=get_avg_dist(center_p_1, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
|
|
2804
|
|
2805 E2=get_avg_dist(center_p_2, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
|
|
2806
|
|
2807 E3=get_avg_dist(center_p_3, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
|
|
2808
|
|
2809 //printf("i=%d;E1=%f;E2=%f;E3=%f\n",i,E1,E2,E3);
|
|
2810
|
|
2811 if (E1<E2)
|
|
2812 Ep=E2;
|
|
2813 else
|
|
2814 Ep=E1;
|
|
2815
|
|
2816 Ep=max(Ep,E3); //Ep is the most sparse area
|
|
2817
|
|
2818
|
|
2819
|
|
2820 if ((Ep>Ei) && (Ep>Ej)) //the two parts should be partitioned
|
|
2821 {
|
|
2822 partit[i]=num_computed_population;
|
|
2823 num_computed_population++;
|
|
2824 }
|
|
2825
|
|
2826 } //end for (i=0;i<num_population;i++)
|
|
2827
|
|
2828
|
|
2829 for (i=0;i<num_population;i++)
|
|
2830 temp_size_j[i]=0;
|
|
2831
|
|
2832 for (i=0;i<file_Len;i++)
|
|
2833 {
|
|
2834 index_id=all_population_ID[i];
|
|
2835 if (partit[index_id]>0)
|
|
2836 {
|
|
2837 j=temp_size_j[index_id];
|
|
2838 if (temp_population_ID[index_id][j]==1)
|
|
2839 all_computed_pop_ID[i]=partit[index_id];
|
|
2840
|
|
2841 temp_size_j[index_id]++;
|
|
2842 }
|
|
2843 }
|
|
2844
|
|
2845 printf("Number of groups after partitioning is %ld\n", num_computed_population);
|
|
2846
|
|
2847
|
|
2848 free(size_p_2);
|
|
2849
|
|
2850 free(aver);
|
|
2851 free(std);
|
|
2852 free(partit);
|
|
2853
|
|
2854 free(center_p_1);
|
|
2855 free(center_p_2);
|
|
2856 free(center_p_3);
|
|
2857
|
|
2858
|
|
2859
|
|
2860 for (i=0;i<num_population;i++)
|
|
2861 free(temp_population_ID[i]);
|
|
2862 free(temp_population_ID);
|
|
2863
|
|
2864 for (i=0;i<num_population;i++)
|
|
2865 {
|
|
2866 for (j=0;j<size_p[i];j++)
|
|
2867 {
|
|
2868 free(ind_pop[i][j]);
|
|
2869 free(ind_pop_3[i][j]);
|
|
2870 }
|
|
2871 }
|
|
2872
|
|
2873
|
|
2874
|
|
2875 for (i=0;i<num_population;i++)
|
|
2876 {
|
|
2877 free(ind_pop[i]);
|
|
2878 free(ind_pop_3[i]);
|
|
2879 }
|
|
2880 free(ind_pop);
|
|
2881 free(ind_pop_3);
|
|
2882
|
|
2883
|
|
2884
|
|
2885
|
|
2886 for (i=0;i<2;i++)
|
|
2887 free(temp_population_center[i]);
|
|
2888
|
|
2889 free(temp_population_center);
|
|
2890
|
|
2891 free(temp_size_j);
|
|
2892 free(size_p);
|
|
2893
|
|
2894
|
|
2895
|
|
2896 //update the IDs, Centers, and # of populations
|
|
2897 for (i=0;i<file_Len;i++)
|
|
2898 {
|
|
2899 all_population_ID[i]=all_computed_pop_ID[i];
|
|
2900 if (all_population_ID[i]>=num_computed_population)
|
|
2901 printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]);
|
|
2902 }
|
|
2903
|
|
2904 num_population=num_computed_population;
|
|
2905
|
|
2906 free(all_computed_pop_ID);
|
|
2907 //end partitioning
|
|
2908 // since the num_population has changed, population_center needs to be redefined as below
|
|
2909
|
|
2910 population_center=(double **)malloc(sizeof(double*)*num_population);
|
|
2911 memset(population_center,0,sizeof(double*)*num_population);
|
|
2912 for (i=0;i<num_population;i++)
|
|
2913 {
|
|
2914 population_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2915 memset(population_center[i],0,sizeof(double)*num_dm);
|
|
2916 }
|
|
2917
|
|
2918
|
|
2919 ID2Center_all(normalized_data,file_Len,num_dm,num_population,all_population_ID,population_center);
|
|
2920
|
|
2921
|
|
2922
|
|
2923 ////// end of further partitioning
|
|
2924
|
|
2925 ////////////////////////////////////////////////////////////
|
|
2926 // to further merge populations to avoid overpartitioning
|
|
2927 // Added June 20, 2010
|
|
2928 ////////////////////////////////////////////////////////////
|
|
2929
|
|
2930
|
|
2931
|
|
2932 num_real_pop=num_population;
|
|
2933 keep_merge=1;
|
|
2934
|
|
2935 if (num_pop>num_real_pop)
|
|
2936 {
|
|
2937 fprintf(stderr,"number of populations specified too large\n");
|
|
2938 exit(0);
|
|
2939 }
|
|
2940
|
|
2941 center_1=(double *)malloc(sizeof(double)*num_dm);
|
|
2942 memset(center_1,0,sizeof(double)*num_dm);
|
|
2943
|
|
2944 center_2=(double *)malloc(sizeof(double)*num_dm);
|
|
2945 memset(center_2,0,sizeof(double)*num_dm);
|
|
2946
|
|
2947 center_3=(double *)malloc(sizeof(double)*num_dm);
|
|
2948 memset(center_3,0,sizeof(double)*num_dm);
|
|
2949
|
|
2950
|
|
2951
|
|
2952 while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2)))
|
|
2953 {
|
|
2954
|
|
2955 keep_merge=0; //so, if no entering merge function, while will exit when num_pop==0
|
|
2956
|
|
2957 real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop);
|
|
2958 memset(real_pop_center,0,sizeof(double*)*num_real_pop);
|
|
2959 for (i=0;i<num_real_pop;i++)
|
|
2960 {
|
|
2961 real_pop_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2962 memset(real_pop_center[i],0,sizeof(double)*num_dm);
|
|
2963 }
|
|
2964
|
|
2965 min_pop=(double **)malloc(sizeof(double*)*num_real_pop);
|
|
2966 memset(min_pop,0,sizeof(double*)*num_real_pop);
|
|
2967 for (i=0;i<num_real_pop;i++)
|
|
2968 {
|
|
2969 min_pop[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2970 memset(min_pop[i],0,sizeof(double)*num_dm);
|
|
2971 }
|
|
2972
|
|
2973 max_pop=(double **)malloc(sizeof(double*)*num_real_pop);
|
|
2974 memset(max_pop,0,sizeof(double*)*num_real_pop);
|
|
2975 for (i=0;i<num_real_pop;i++)
|
|
2976 {
|
|
2977 max_pop[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2978 memset(max_pop[i],0,sizeof(double)*num_dm);
|
|
2979 }
|
|
2980
|
|
2981 if (num_real_pop==num_population)
|
|
2982 {
|
|
2983 for (i=0;i<num_real_pop;i++)
|
|
2984 for (j=0;j<num_dm;j++)
|
|
2985 real_pop_center[i][j]=population_center[i][j];
|
|
2986 }
|
|
2987 else
|
|
2988 {
|
|
2989 ID2Center_all(normalized_data,file_Len,num_dm,num_real_pop,all_population_ID,real_pop_center);
|
|
2990 }
|
|
2991
|
|
2992 for (i=0;i<num_real_pop;i++)
|
|
2993 {
|
|
2994 for (j=0;j<num_dm;j++)
|
|
2995 {
|
|
2996 min_pop[i][j]=MAX_VALUE;
|
|
2997
|
|
2998 max_pop[i][j]=0;
|
|
2999 }
|
|
3000 }
|
|
3001
|
|
3002 for (i=0;i<file_Len;i++)
|
|
3003 {
|
|
3004 index_id=all_population_ID[i];
|
|
3005 for (j=0;j<num_dm;j++)
|
|
3006 {
|
|
3007 if (normalized_data[i][j]<min_pop[index_id][j])
|
|
3008 min_pop[index_id][j]=normalized_data[i][j];
|
|
3009
|
|
3010 if (normalized_data[i][j]>max_pop[index_id][j])
|
|
3011 max_pop[index_id][j]=normalized_data[i][j];
|
|
3012 }
|
|
3013 }
|
|
3014
|
|
3015 /* for (i=0;i<num_real_pop;i++)
|
|
3016 {
|
|
3017 for (j=0;j<num_dm;j++)
|
|
3018 {
|
|
3019 printf("min_pop is %f\t",min_pop[i][j]);
|
|
3020 //printf("max_pop is %f\n",max_pop[i][j]);
|
|
3021 }
|
|
3022 printf("\n");
|
|
3023 }
|
|
3024 */
|
|
3025
|
|
3026
|
|
3027 distance_pop=(double **)malloc(sizeof(double*)*num_real_pop);
|
|
3028 memset(distance_pop,0,sizeof(double*)*num_real_pop);
|
|
3029 for (i=0;i<num_real_pop;i++)
|
|
3030 {
|
|
3031 distance_pop[i]=(double*)malloc(sizeof(double)*num_real_pop);
|
|
3032 memset(distance_pop[i],0,sizeof(double)*num_real_pop);
|
|
3033 }
|
|
3034
|
|
3035 for (i=0;i<num_real_pop-1;i++)
|
|
3036 {
|
|
3037 for (j=i+1;j<num_real_pop;j++)
|
|
3038 {
|
|
3039 distance=0;
|
|
3040
|
|
3041 for (t=0;t<num_dm;t++)
|
|
3042 {
|
|
3043 dist=real_pop_center[i][t]-real_pop_center[j][t];
|
|
3044 distance=distance+dist*dist;
|
|
3045 }
|
|
3046 distance_pop[i][j]=distance;
|
|
3047 distance_pop[j][i]=distance;
|
|
3048 }
|
|
3049 }
|
|
3050
|
|
3051 if (num_real_pop>2)
|
|
3052 num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances
|
|
3053 else
|
|
3054 num_checked_range=1;
|
|
3055
|
|
3056 size_c=(long *)malloc(sizeof(long)*num_real_pop);
|
|
3057 memset(size_c,0,sizeof(long)*num_real_pop);
|
|
3058
|
|
3059 for (i=0;i<file_Len;i++)
|
|
3060 size_c[all_population_ID[i]]++;
|
|
3061
|
|
3062 for (p=0;p<num_checked_range;p++)
|
|
3063 {
|
|
3064 current_smallest_dist=MAX_VALUE;
|
|
3065
|
|
3066 for (i=0;i<num_real_pop-1;i++)
|
|
3067 {
|
|
3068 for (j=i+1;j<num_real_pop;j++)
|
|
3069 {
|
|
3070 if (distance_pop[i][j]<current_smallest_dist)
|
|
3071 {
|
|
3072 current_smallest_dist=distance_pop[i][j];
|
|
3073
|
|
3074 temp_i=i;
|
|
3075 temp_j=j;
|
|
3076 }
|
|
3077 }
|
|
3078 }
|
|
3079
|
|
3080 if (p==0)
|
|
3081 {
|
|
3082 first_i=temp_i;
|
|
3083 first_j=temp_j;
|
|
3084 }
|
|
3085
|
|
3086 distance_pop[temp_i][temp_j]=MAX_VALUE+1; //make sure this pair won't be selected next time
|
|
3087
|
|
3088 //normalize and calculate std for temp_i and temp_j
|
|
3089
|
|
3090 //pick the largest 3 dimensions on standard deviation
|
|
3091 d1=-1;
|
|
3092 d2=-1;
|
|
3093 d3=-1;
|
|
3094
|
|
3095 for (i=0;i<3;i++)
|
|
3096 {
|
|
3097 max_d_dist=0;
|
|
3098 for (j=0;j<num_dm;j++)
|
|
3099 {
|
|
3100 if ((j!=d1) && (j!=d2))
|
|
3101 {
|
|
3102 temp=min(min_pop[temp_i][j],min_pop[temp_j][j]);
|
|
3103 tmp=max(max_pop[temp_i][j],max_pop[temp_j][j]);
|
|
3104 if (tmp<=temp)
|
|
3105 {
|
|
3106 //printf("min_pop[%d][%d]=%f \t min_pop[%d][%d]=%f \t max_pop[%d][%d]=%f \t max_pop[%d][%d]=%f\n",temp_i,j,min_pop[temp_i][j],temp_j, j, min_pop[temp_j][j],temp_i,j,max_pop[temp_i][j],temp_j,j,max_pop[temp_j][j]);
|
|
3107 dist=0;
|
|
3108 }
|
|
3109 else
|
|
3110 dist=(real_pop_center[temp_i][j]-real_pop_center[temp_j][j])/(tmp-temp);
|
|
3111 if (dist<0)
|
|
3112 dist=-dist;
|
|
3113 }
|
|
3114 else
|
|
3115 dist=-1;
|
|
3116
|
|
3117 if (dist>max_d_dist)
|
|
3118 {
|
|
3119 max_d_dist=dist;
|
|
3120 d_d=j;
|
|
3121 }
|
|
3122 }
|
|
3123
|
|
3124 if (i==0)
|
|
3125 d1=d_d;
|
|
3126 if (i==1)
|
|
3127 d2=d_d;
|
|
3128 if (i==2)
|
|
3129 d3=d_d;
|
|
3130 }
|
|
3131
|
|
3132 //printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3);
|
|
3133
|
|
3134 Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
|
|
3135 Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
|
|
3136
|
|
3137 for (t=0;t<num_dm;t++)
|
|
3138 {
|
|
3139 center_1[t]=real_pop_center[temp_i][t]*0.25+real_pop_center[temp_j][t]*0.75;
|
|
3140 center_2[t]=real_pop_center[temp_i][t]*0.5+real_pop_center[temp_j][t]*0.5;
|
|
3141 center_3[t]=real_pop_center[temp_i][t]*0.75+real_pop_center[temp_j][t]*0.25;
|
|
3142 }
|
|
3143
|
|
3144 E1=get_avg_dist(center_1, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
|
|
3145
|
|
3146 E2=get_avg_dist(center_2, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
|
|
3147
|
|
3148 E3=get_avg_dist(center_3, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
|
|
3149
|
|
3150 if (E1<E2)
|
|
3151 Ep=E2;
|
|
3152 else
|
|
3153 Ep=E1;
|
|
3154
|
|
3155 Ep=max(Ep,E3); //Ep is the most sparse area
|
|
3156
|
|
3157 Ep=Ep*E_T;
|
|
3158 //printf("Ep=%f;Ei=%f;Ej=%f\n",Ep,Ei,Ej);
|
|
3159
|
|
3160 if ((Ep<=Ei) || (Ep<=Ej))//if the most sparse area between i and j are still denser than one of them, temp_i and temp_j should be merged
|
|
3161 {
|
|
3162 keep_merge=1;
|
|
3163 break;
|
|
3164 }//end if (Ep)
|
|
3165 } //end for p
|
|
3166
|
|
3167 //printf("keep_merge=%d\n",keep_merge);
|
|
3168
|
|
3169 for (i=0;i<num_real_pop;i++)
|
|
3170 {
|
|
3171 free(real_pop_center[i]);
|
|
3172 free(distance_pop[i]);
|
|
3173 free(min_pop[i]);
|
|
3174 free(max_pop[i]);
|
|
3175 }
|
|
3176 free(real_pop_center);
|
|
3177 free(min_pop);
|
|
3178 free(max_pop);
|
|
3179 free(distance_pop);
|
|
3180 free(size_c);
|
|
3181
|
|
3182 //printf("temp_i=%d;temp_j=%d\n",temp_i,temp_j);
|
|
3183
|
|
3184 if (keep_merge) //found one within p loop
|
|
3185 {
|
|
3186 for (i=0;i<file_Len;i++)
|
|
3187 {
|
|
3188 if (all_population_ID[i]>temp_j)
|
|
3189 {
|
|
3190 all_population_ID[i]=all_population_ID[i]-1;
|
|
3191 }
|
|
3192 else
|
|
3193 {
|
|
3194 if (all_population_ID[i]==temp_j)
|
|
3195 {
|
|
3196 all_population_ID[i]=temp_i;
|
|
3197 }
|
|
3198 }
|
|
3199 }
|
|
3200
|
|
3201 num_real_pop--;
|
|
3202 }
|
|
3203 else
|
|
3204 {
|
|
3205 if ((num_pop!=0) && (num_pop<num_real_pop)) //not reach the specified num of pop
|
|
3206 {
|
|
3207 for (i=0;i<file_Len;i++)
|
|
3208 {
|
|
3209 if (all_population_ID[i]>first_j)
|
|
3210 {
|
|
3211 all_population_ID[i]=all_population_ID[i]-1;
|
|
3212 }
|
|
3213 else
|
|
3214 {
|
|
3215 if (all_population_ID[i]==first_j)
|
|
3216 {
|
|
3217 all_population_ID[i]=first_i;
|
|
3218 }
|
|
3219 }
|
|
3220 }
|
|
3221
|
|
3222 num_real_pop--;
|
|
3223
|
|
3224
|
|
3225
|
|
3226 }
|
|
3227 }
|
|
3228 //printf("num_real_pop is %d\n",num_real_pop);
|
|
3229
|
|
3230 } //end of while
|
|
3231
|
|
3232
|
|
3233
|
|
3234 free(center_1);
|
|
3235 free(center_2);
|
|
3236 free(center_3);
|
|
3237 printf("Final number of populations is %ld\n",num_real_pop);
|
|
3238
|
|
3239 new_population_center=(double **)malloc(sizeof(double*)*num_real_pop);
|
|
3240 memset(new_population_center,0,sizeof(double*)*num_real_pop);
|
|
3241 for (i=0;i<num_real_pop;i++)
|
|
3242 {
|
|
3243 new_population_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
3244 memset(new_population_center[i],0,sizeof(double)*num_dm);
|
|
3245 }
|
|
3246
|
|
3247
|
|
3248 ///////////////////////////////////////////
|
|
3249 //End of population mapping
|
|
3250 ///////////////////////////////////////////
|
|
3251
|
|
3252 show(input_data, all_population_ID, file_Len, num_real_pop, num_dm, para_name_string);
|
|
3253
|
|
3254 ID2Center_all(input_data,file_Len,num_dm,num_real_pop,all_population_ID,new_population_center);
|
|
3255
|
|
3256
|
|
3257 f_cid=fopen("population_id.txt","w");
|
|
3258 f_ctr=fopen("population_center.txt","w");
|
|
3259 f_out=fopen("coordinates.txt","w");
|
|
3260 f_results=fopen("flock_results.txt","w");
|
|
3261
|
|
3262 /*
|
|
3263 f_parameters=fopen("parameters.txt","w");
|
|
3264 fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
|
|
3265 fprintf(f_parameters,"Density\t%f\n",aver_index);
|
|
3266 fclose(f_parameters);
|
|
3267 */
|
|
3268
|
|
3269 for (i=0;i<file_Len;i++)
|
|
3270 fprintf(f_cid,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
|
|
3271
|
|
3272 /*
|
|
3273 * New to check for min/max to add to parameters.txt
|
|
3274 *
|
|
3275 */
|
|
3276
|
|
3277 fprintf(f_out,"%s\n",para_name_string);
|
|
3278 //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string);
|
|
3279 fprintf(f_results,"%s\tPopulation\n",para_name_string);
|
|
3280 for (i=0;i<file_Len;i++)
|
|
3281 {
|
|
3282 for (j=0;j<num_dm;j++)
|
|
3283 {
|
|
3284 if (input_data[i][j] < min) {
|
|
3285 min = (int)input_data[i][j];
|
|
3286 }
|
|
3287 if (input_data[i][j] > max) {
|
|
3288 max = (int)input_data[i][j];
|
|
3289 }
|
|
3290 if (j==num_dm-1)
|
|
3291 {
|
|
3292 fprintf(f_out,"%d\n",(int)input_data[i][j]);
|
|
3293 fprintf(f_results,"%d\t",(int)input_data[i][j]);
|
|
3294 }
|
|
3295 else
|
|
3296 {
|
|
3297 fprintf(f_out,"%d\t",(int)input_data[i][j]);
|
|
3298 fprintf(f_results,"%d\t",(int)input_data[i][j]);
|
|
3299 }
|
|
3300 }
|
|
3301 //fprintf(f_results,"%ld\t",i + 1);
|
|
3302 fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
|
|
3303 }
|
|
3304
|
|
3305 /*
|
|
3306 f_parameters=fopen("parameters.txt","w");
|
|
3307 fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin);
|
|
3308 fprintf(f_parameters,"Density\t%d\n",den_t_event);
|
|
3309 fprintf(f_parameters,"Min\t%d\n",min);
|
|
3310 fprintf(f_parameters,"Max\t%d\n",max);
|
|
3311 fclose(f_parameters);
|
|
3312 */
|
|
3313
|
|
3314 f_properties=fopen("fcs.properties","w");
|
|
3315 fprintf(f_properties,"Bins=%ld\n",num_bin);
|
|
3316 fprintf(f_properties,"Density=%d\n",den_t_event);
|
|
3317 fprintf(f_properties,"Min=%d\n",min);
|
|
3318 fprintf(f_properties,"Max=%d\n",max);
|
|
3319 fprintf(f_properties,"Populations=%ld\n",num_real_pop);
|
|
3320 fprintf(f_properties,"Events=%ld\n",file_Len);
|
|
3321 fprintf(f_properties,"Markers=%ld\n",num_dm);
|
|
3322 fclose(f_properties);
|
|
3323
|
|
3324 for (i=0;i<num_real_pop;i++) {
|
|
3325 /* Add if we want to include population id in the output
|
|
3326 */
|
|
3327 fprintf(f_ctr,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
|
|
3328
|
|
3329 for (j=0;j<num_dm;j++) {
|
|
3330 if (j==num_dm-1)
|
|
3331 fprintf(f_ctr,"%.0f\n",new_population_center[i][j]);
|
|
3332 else
|
|
3333 fprintf(f_ctr,"%.0f\t",new_population_center[i][j]);
|
|
3334 }
|
|
3335 }
|
|
3336
|
|
3337 //added April 16, 2009
|
|
3338 f_mfi=fopen("MFI.txt","w");
|
|
3339
|
|
3340 for (i=0;i<num_real_pop;i++)
|
|
3341 {
|
|
3342 fprintf(f_mfi,"%ld\t",i+1);
|
|
3343
|
|
3344 for (j=0;j<num_dm;j++)
|
|
3345 {
|
|
3346 if (j==num_dm-1)
|
|
3347 fprintf(f_mfi,"%.0f\n",new_population_center[i][j]);
|
|
3348 else
|
|
3349 fprintf(f_mfi,"%.0f\t",new_population_center[i][j]);
|
|
3350 }
|
|
3351 }
|
|
3352 fclose(f_mfi);
|
|
3353
|
|
3354 //ended April 16, 2009
|
|
3355
|
|
3356 fclose(f_cid);
|
|
3357 fclose(f_ctr);
|
|
3358 fclose(f_out);
|
|
3359 fclose(f_results);
|
|
3360
|
|
3361
|
|
3362 for (i=0;i<num_population;i++)
|
|
3363 free(population_center[i]);
|
|
3364
|
|
3365 free(population_center);
|
|
3366
|
|
3367 for (i=0;i<num_real_pop;i++)
|
|
3368 free(new_population_center[i]);
|
|
3369
|
|
3370 free(new_population_center);
|
|
3371
|
|
3372 for (i=0;i<file_Len;i++)
|
|
3373 free(normalized_data[i]);
|
|
3374 free(normalized_data);
|
|
3375
|
|
3376 free(grid_populationID);
|
|
3377
|
|
3378 free(cluster_populationID);
|
|
3379 free(grid_clusterID);
|
|
3380 free(cluster_ID);
|
|
3381
|
|
3382 for (i=0;i<file_Len;i++)
|
|
3383 free(input_data[i]);
|
|
3384 free(input_data);
|
|
3385
|
|
3386 free(grid_ID);
|
|
3387 free(population_ID);
|
|
3388 free(all_population_ID);
|
|
3389 free(eventID_To_denseventID);
|
|
3390
|
|
3391 ///////////////////////////////////////////////////////////
|
|
3392 printf("Ending time:\t\t\t\t");
|
|
3393 fflush(stdout);
|
|
3394 system("/bin/date");
|
|
3395
|
|
3396 /*
|
|
3397 * Windows version
|
|
3398 _strtime( tmpbuf );
|
|
3399 printf( "Ending time:\t\t\t\t%s\n", tmpbuf );
|
|
3400 _strdate( tmpbuf );
|
|
3401 printf( "Ending date:\t\t\t\t%s\n", tmpbuf );
|
|
3402 */
|
|
3403
|
|
3404 return 0;
|
|
3405 }
|