1
|
1 /*****************************************************************************
|
|
2
|
|
3 FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann)
|
|
4
|
|
5 Author: (Max) Yu Qian, Ph.D.
|
|
6
|
|
7 Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
|
|
8
|
|
9 Development: November 2005 ~ Forever
|
|
10
|
|
11 Algorithm Status: May 2007: Release 1.0
|
|
12
|
|
13 Usage: flock data_file
|
|
14 Note: the input file format must be channel values and the delimiter between two values must be a tab.
|
|
15
|
|
16 Changes made July 23, 2010: made errors to STDERR
|
|
17 Changes made Nov 4, 2010: added one more error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR;
|
|
18 MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got
|
|
19
|
|
20 ******************************************************************************/
|
|
21 #include <time.h>
|
|
22 #include <stdio.h>
|
|
23 #include <stdlib.h>
|
|
24 #include <math.h>
|
|
25 #include <string.h>
|
|
26 #include <sys/stat.h>
|
|
27 #include <unistd.h>
|
|
28 #include <assert.h>
|
|
29
|
|
30
|
|
31 #define DEBUG 0
|
|
32 #define LINE_LEN 1024
|
|
33 #define FILE_NAME_LEN 128
|
|
34 #define PARA_NAME_LEN 64
|
|
35 #define MAX_VALUE 1000000000
|
|
36 #define MIN_GRID 6
|
|
37 #define MAX_GRID 50
|
|
38
|
|
39 #define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max
|
|
40 #define KMEANS_TERM 100
|
|
41 //#define MAX_NUM_POP 30
|
|
42
|
|
43
|
|
44 int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID);
|
|
45
|
|
46 /************* Read basic info of the source file ****************************/
|
|
47 void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID)
|
|
48 {
|
|
49 char src[LINE_LEN];
|
|
50 char current_name[64];
|
|
51 char prv;
|
|
52
|
|
53 int num_rows=0;
|
|
54 int num_columns=0;
|
|
55 int ch='\n';
|
|
56 int prev='\n';
|
|
57 int time_pos=0;
|
|
58 int i=0;
|
|
59 int j=0;
|
|
60 int sw=0;
|
|
61
|
|
62 src[0]='\0';
|
|
63 fgets(src, LINE_LEN, f_src);
|
|
64
|
|
65 if ((src[0]=='F') && (src[1]=='C') && (src[2]=='S'))
|
|
66 {
|
|
67 fprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\n");
|
|
68 abort();
|
|
69 }
|
|
70
|
|
71 name_string[0]='\0';
|
|
72 current_name[0]='\0';
|
|
73 prv='\n';
|
|
74
|
|
75 // skip space and tab characters
|
|
76 while ((src[i]==' ') || (src[i]=='\t'))
|
|
77 i++;
|
|
78
|
|
79 // repeat until the end of line is reached
|
|
80 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!='\r'))
|
|
81 {
|
|
82 current_name[j]=src[i];
|
|
83
|
|
84 if ((src[i]=='\t') && (prv!='\t')) //a complete word
|
|
85 {
|
|
86 current_name[j]='\0';
|
|
87
|
|
88 if (0!=strcmp(current_name,"Time"))
|
|
89 {
|
|
90 num_columns++; //num_columns does not inlcude the column of Time
|
|
91 time_pos++;
|
|
92 if (sw) {
|
|
93 strcat(name_string,"\t");
|
|
94 }
|
|
95 strcat(name_string,current_name);
|
|
96 sw = 1;
|
|
97 }
|
|
98 else
|
|
99 {
|
|
100 *time_ID=time_pos;
|
|
101 }
|
|
102
|
|
103
|
|
104 current_name[0]='\0';
|
|
105 j=0;
|
|
106 }
|
|
107
|
|
108 if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
|
|
109 {
|
|
110 current_name[0]='\0';
|
|
111 j=0;
|
|
112 }
|
|
113
|
|
114 if (src[i]!='\t')
|
|
115 j++;
|
|
116
|
|
117 prv=src[i];
|
|
118 i++;
|
|
119 }
|
|
120
|
|
121 if (prv!='\t') //the last one hasn't been retrieved
|
|
122 {
|
|
123 current_name[j]='\0';
|
|
124
|
|
125 if (0!=strcmp(current_name,"Time"))
|
|
126 {
|
|
127 num_columns++;
|
|
128 strcat(name_string,"\t");
|
|
129 strcat(name_string,current_name);
|
|
130 time_pos++;
|
|
131 }
|
|
132 else
|
|
133 {
|
|
134 *time_ID=time_pos;
|
|
135 }
|
|
136
|
|
137
|
|
138 }
|
|
139 if (DEBUG==1)
|
|
140 {
|
|
141 printf("time_ID is %d\n",*time_ID);
|
|
142 printf("name_string is %s\n",name_string);
|
|
143 }
|
|
144
|
|
145 //start computing # of rows
|
|
146
|
|
147 while ((ch = fgetc(f_src))!= EOF )
|
|
148 {
|
|
149 if (ch == '\n')
|
|
150 {
|
|
151 ++num_rows;
|
|
152 }
|
|
153 prev = ch;
|
|
154 }
|
|
155 if (prev!='\n')
|
|
156 ++num_rows;
|
|
157
|
|
158 //added on July 23, 2010
|
|
159 if (num_rows<50)
|
|
160 {
|
|
161 fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010
|
|
162 abort();
|
|
163 }
|
|
164
|
|
165 *file_Len=num_rows;
|
|
166 *num_dm=num_columns;
|
|
167
|
|
168 printf("original file size is %d; number of dimensions is %d\n", *file_Len, *num_dm);
|
|
169 }
|
|
170
|
|
171
|
|
172
|
|
173 /************************************* Read the source file into uncomp_data **************************************/
|
|
174 void readsource(FILE *f_src, int file_Len, int num_dm, double **uncomp_data, int time_ID)
|
|
175 {
|
|
176 int time_pass=0; //to mark whether the time_ID has been passed
|
|
177 int index=0;
|
|
178
|
|
179 int i=0;
|
|
180 int j=0;
|
|
181 int t=0;
|
|
182
|
|
183 char src[LINE_LEN];
|
|
184 char xc[LINE_LEN/10];
|
|
185
|
|
186 src[0]='\0';
|
|
187 fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
|
|
188
|
|
189 while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
|
|
190 {
|
|
191 src[0]='\0';
|
|
192 fgets(src,LINE_LEN,f_src);
|
|
193 i=0;
|
|
194 time_pass=0;
|
|
195
|
|
196 if (time_ID==-1)
|
|
197 {
|
|
198 for (t=0;t<num_dm;t++) //there is no time_ID
|
|
199 {
|
|
200 xc[0]='\0';
|
|
201 j=0;
|
|
202 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
|
|
203 {
|
|
204 xc[j]=src[i];
|
|
205 i++;
|
|
206 j++;
|
|
207 }
|
|
208
|
|
209 xc[j]='\0';
|
|
210 i++;
|
|
211
|
|
212 uncomp_data[index][t]=atof(xc);
|
|
213 }
|
|
214 }
|
|
215 else
|
|
216 {
|
|
217 for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
|
|
218 {
|
|
219 xc[0]='\0';
|
|
220 j=0;
|
|
221 while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
|
|
222 {
|
|
223 xc[j]=src[i];
|
|
224 i++;
|
|
225 j++;
|
|
226 }
|
|
227
|
|
228 xc[j]='\0';
|
|
229 i++;
|
|
230
|
|
231 if (t==time_ID)
|
|
232 {
|
|
233 time_pass=1;
|
|
234 continue;
|
|
235 }
|
|
236
|
|
237 if (time_pass)
|
|
238 uncomp_data[index][t-1]=atof(xc);
|
|
239 else
|
|
240 uncomp_data[index][t]=atof(xc);
|
|
241 }
|
|
242 }
|
|
243 index++;
|
|
244 //fprintf(fout_ID,"%s",src);
|
|
245 } //end of while
|
|
246
|
|
247 if (DEBUG == 1)
|
|
248 {
|
|
249 printf("the last line of the source data is:\n");
|
|
250 for (j=0;j<num_dm;j++)
|
|
251 printf("%f ",uncomp_data[index-1][j]);
|
|
252 printf("\n");
|
|
253 }
|
|
254 }
|
|
255
|
|
256
|
|
257 /**************************************** Normalization ******************************************/
|
|
258 void tran(double **orig_data, int file_Len, int num_dm, int norm_used, double **matrix_to_cluster)
|
|
259 {
|
|
260 int i=0;
|
|
261 int j=0;
|
|
262
|
|
263 double biggest=0;
|
|
264 double smallest=MAX_VALUE;
|
|
265
|
|
266 double *aver; //average of each column
|
|
267 double *std; //standard deviation of each column
|
|
268
|
|
269 aver=(double*)malloc(sizeof(double)*file_Len);
|
|
270 memset(aver,0,sizeof(double)*file_Len);
|
|
271
|
|
272 std=(double*)malloc(sizeof(double)*file_Len);
|
|
273 memset(std,0,sizeof(double)*file_Len);
|
|
274
|
|
275 if (norm_used==2) //z-score normalization
|
|
276 {
|
|
277 for (j=0;j<num_dm;j++)
|
|
278 {
|
|
279 aver[j]=0;
|
|
280 for (i=0;i<file_Len;i++)
|
|
281 aver[j]=aver[j]+orig_data[i][j];
|
|
282 aver[j]=aver[j]/(double)file_Len;
|
|
283
|
|
284 std[j]=0;
|
|
285 for (i=0;i<file_Len;i++)
|
|
286 std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]);
|
|
287 std[j]=sqrt(std[j]/(double)file_Len);
|
|
288
|
|
289 for (i=0;i<file_Len;i++)
|
|
290 matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization
|
|
291 }
|
|
292 }
|
|
293
|
|
294 if (norm_used==1) //0-1 min-max normalization
|
|
295 {
|
|
296 for (j=0;j<num_dm;j++)
|
|
297 {
|
|
298 biggest=0;
|
|
299 smallest=MAX_VALUE;
|
|
300 for (i=0;i<file_Len;i++)
|
|
301 {
|
|
302 if (orig_data[i][j]>biggest)
|
|
303 biggest=orig_data[i][j];
|
|
304 if (orig_data[i][j]<smallest)
|
|
305 smallest=orig_data[i][j];
|
|
306 }
|
|
307
|
|
308 for (i=0;i<file_Len;i++)
|
|
309 {
|
|
310 if (biggest==smallest)
|
|
311 matrix_to_cluster[i][j]=biggest;
|
|
312 else
|
|
313 matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
|
|
314 }
|
|
315 }
|
|
316 }
|
|
317
|
|
318 if (norm_used==0) //no normalization
|
|
319 {
|
|
320 for (i=0;i<file_Len;i++)
|
|
321 for (j=0;j<num_dm;j++)
|
|
322 matrix_to_cluster[i][j]=orig_data[i][j];
|
|
323 }
|
|
324
|
|
325 }
|
|
326
|
|
327
|
|
328
|
|
329 /********************************************** RadixSort *******************************************/
|
|
330 /* Perform a radix sort using each dimension from the original data as a radix.
|
|
331 * Outputs:
|
|
332 * sorted_seq -- a permutation vector mapping the ordered list onto the original data.
|
|
333 * (sorted_seq[i] -> index in the original data of the ith element of the ordered list)
|
|
334 * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct
|
|
335 * of the sorting procedure.
|
|
336 * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
|
|
337 * by grid_ID)
|
|
338 */
|
|
339
|
|
340 void radixsort_flock(int **position,int file_Len,int num_dm,int num_bin,int *sorted_seq,int *num_nonempty,int *grid_ID)
|
|
341 {
|
|
342 int i=0;
|
|
343 int length=0;
|
|
344 int start=0;
|
|
345 int prev_ID=0;
|
|
346 int curr_ID=0;
|
|
347
|
|
348 int j=0;
|
|
349 int t=0;
|
|
350 int p=0;
|
|
351 int loc=0;
|
|
352 int temp=0;
|
|
353 int equal=0;
|
|
354
|
|
355 int *count; //count[i]=j means there are j numbers having value i at the processing digit
|
|
356 int *index; //index[i]=j means the starting position of grid i is j
|
|
357 int *cp; //current position
|
|
358 int *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
|
|
359 int *seq; //temporary sequence
|
|
360
|
|
361 count=(int*)malloc(sizeof(int)*num_bin);
|
|
362 memset(count,0,sizeof(int)*num_bin);
|
|
363
|
|
364 cp=(int*)malloc(sizeof(int)*num_bin);
|
|
365 memset(cp,0,sizeof(int)*num_bin);
|
|
366
|
|
367 index=(int*)malloc(sizeof(int)*num_bin); // initialized below
|
|
368
|
|
369 seq=(int*)malloc(sizeof(int)*file_Len);
|
|
370 memset(seq,0,sizeof(int)*file_Len);
|
|
371
|
|
372 mark=(int*)malloc(sizeof(int)*file_Len);
|
|
373 memset(mark,0,sizeof(int)*file_Len);
|
|
374
|
|
375 for (i=0;i<file_Len;i++)
|
|
376 {
|
|
377 sorted_seq[i]=i;
|
|
378 mark[i]=0;
|
|
379 seq[i]=0;
|
|
380 }
|
|
381 for (i=0;i<num_bin;i++)
|
|
382 {
|
|
383 index[i]=0;
|
|
384 cp[i]=0;
|
|
385 count[i]=0;
|
|
386 }
|
|
387
|
|
388 for (j=0;j<num_dm;j++)
|
|
389 {
|
|
390 if (j==0) //compute the initial values of mark
|
|
391 {
|
|
392 for (i=0;i<file_Len;i++)
|
|
393 count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension
|
|
394
|
|
395 index[0] = 0;
|
|
396 for (i=0;i<num_bin-1;i++)
|
|
397 {
|
|
398 index[i+1]=index[i]+count[i]; //index[k]=x means k segment starts at x (in the ordered list)
|
|
399 if ((index[i+1]>0) && (index[i+1]<=file_Len))
|
|
400 {
|
|
401 mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
|
|
402 }
|
|
403 else
|
|
404 {
|
|
405 printf("out of myboundary for mark at index[i+1]-1.\n");
|
|
406 }
|
|
407 }
|
|
408 mark[file_Len-1]=1;
|
|
409
|
|
410 for (i=0;i<file_Len;i++)
|
|
411 {
|
|
412 /* Build a permutation vector for the partially ordered data. Store the PV in sorted_seq */
|
|
413 loc=position[i][j];
|
|
414 temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j
|
|
415 sorted_seq[temp]=i; //sorted_seq[i]=temp is also another way to sort
|
|
416 cp[loc]++;
|
|
417 }
|
|
418 }
|
|
419 else
|
|
420 {
|
|
421 //reset count, index, loc, temp, cp, start, and length
|
|
422 length=0;
|
|
423 loc=0;
|
|
424 temp=0;
|
|
425 start=0;
|
|
426 for (p=0;p<num_bin;p++)
|
|
427 {
|
|
428 cp[p]=0;
|
|
429 count[p]=0;
|
|
430 index[p]=0;
|
|
431 }
|
|
432
|
|
433 for (i=0;i<file_Len;i++)
|
|
434 {
|
|
435 int iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order.
|
|
436 if (mark[i]!=1)
|
|
437 {
|
|
438 /* Count the number of items in each bin of
|
|
439 dimension j, BUT we are going to reset at the end
|
|
440 of each "part". Thus, the total effect is to do
|
|
441 a sort by bin on the jth dimension for each group
|
|
442 of data that has been identical for the
|
|
443 dimensions processed up to this point. This is
|
|
444 the standard radix sort procedure, but doing it
|
|
445 this way saves us having to allocate buckets to
|
|
446 hold the data in each group of "identical-so-far"
|
|
447 elements. */
|
|
448 count[position[iperm][j]]++; //count[position[i][j]]++;
|
|
449 length++; // This is the total length of the part, irrespective of the value of the jth component
|
|
450 // (actually, less one, since we don't increment for the final element below)
|
|
451 }
|
|
452 if (mark[i]==1)
|
|
453 {
|
|
454 //length++;
|
|
455 count[position[iperm][j]]++;//count[position[i][j]]++; //the current point must be counted in
|
|
456 start=i-length; //this part starts from start to i: [start,i]
|
|
457 /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part.
|
|
458 This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively,
|
|
459 plus we could multi-thread over the parts. (Hmmm...)
|
|
460 */
|
|
461 index[0] = start; // Let index give the offset within the whole ordered list.
|
|
462 for (t=0;t<num_bin-1;t++)
|
|
463 {
|
|
464 index[t+1]=index[t]+count[t];
|
|
465
|
|
466 if ((index[t+1]<=file_Len) && (index[t+1]>0))
|
|
467 {
|
|
468 mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
|
|
469 }
|
|
470
|
|
471 }
|
|
472 mark[i]=1;
|
|
473
|
|
474 /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i
|
|
475 the PV will be completely updated for the partial ordering up to the current radix. */
|
|
476 for (t=start;t<=i;t++)
|
|
477 {
|
|
478 loc=position[sorted_seq[t]][j];//loc=position[t][j];
|
|
479 temp=index[loc]+cp[loc];
|
|
480 if ((temp<file_Len) && (temp>=0))
|
|
481 {
|
|
482 // seq is a temporary because we have to maintain the old PV until we have finished this step.
|
|
483 seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort
|
|
484 cp[loc]++;
|
|
485 }
|
|
486 else
|
|
487 {
|
|
488 printf("out of myboundary for seq at temp.\n");
|
|
489 }
|
|
490 }
|
|
491
|
|
492 for (t=start;t<=i;t++)
|
|
493 {
|
|
494 // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through
|
|
495 // entry i in the ordered list.
|
|
496 if ((t>=0) && (t<file_Len))
|
|
497 sorted_seq[t]=seq[t];
|
|
498 else
|
|
499 printf("out of myboundary for seq and sorted_seq at t.\n");
|
|
500 }
|
|
501 //reset count, index, seq, length, and cp
|
|
502 length=0;
|
|
503 loc=0;
|
|
504 temp=0;
|
|
505 for (p=0;p<num_bin;p++)
|
|
506 {
|
|
507 cp[p]=0;
|
|
508 count[p]=0;
|
|
509 index[p]=0;
|
|
510 }
|
|
511 }
|
|
512 }//end for i
|
|
513 }//end else
|
|
514 }//end for j
|
|
515
|
|
516 /* sorted_seq[] now contains the ordered list for all radices. mark[] gives the boundaries between groups of elements that are
|
|
517 identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */
|
|
518
|
|
519 for (i=0;i<file_Len;i++)
|
|
520 grid_ID[i]=0; //in case the initial value hasn't been assigned
|
|
521 *num_nonempty=1; //starting from 1!
|
|
522
|
|
523 /* assign the "grid" identifiers for all of the data. A grid will be what we were calling a "part" above. We will number them
|
|
524 serially and tag the *unordered* data with the grid IDs. We will also count the number of populated grids (in general there will
|
|
525 be many possible combinations of bin values that simply never occur) */
|
|
526
|
|
527 for (i=1;i<file_Len;i++)
|
|
528 {
|
|
529 equal=1;
|
|
530 prev_ID=sorted_seq[i-1];
|
|
531 curr_ID=sorted_seq[i];
|
|
532 for (j=0;j<num_dm;j++)
|
|
533 {
|
|
534 if (position[prev_ID][j]!=position[curr_ID][j])
|
|
535 {
|
|
536 equal=0; //not equal
|
|
537 break;
|
|
538 }
|
|
539 }
|
|
540
|
|
541 if (equal)
|
|
542 {
|
|
543 grid_ID[curr_ID]=grid_ID[prev_ID];
|
|
544 }
|
|
545 else
|
|
546 {
|
|
547 *num_nonempty=*num_nonempty+1;
|
|
548 grid_ID[curr_ID]=grid_ID[prev_ID]+1;
|
|
549 }
|
|
550 //all_grid_vol[grid_ID[curr_ID]]++;
|
|
551 }
|
|
552
|
|
553 //free memory
|
|
554 free(count);
|
|
555 free(index);
|
|
556 free(cp);
|
|
557 free(seq);
|
|
558 free(mark);
|
|
559
|
|
560 }
|
|
561
|
|
562 /********************************************** Compute Position of Events ************************************************/
|
|
563 void compute_position(double **data_in, int file_Len, int num_dm, int num_bin, int **position, double *interval)
|
|
564 {
|
|
565 /* What we are really doing here is binning the data, with the bins
|
|
566 spanning the range of the data and number of bins = num_bin */
|
|
567 int i=0;
|
|
568 int j=0;
|
|
569
|
|
570 double *small; //small[j] is the smallest value within dimension j
|
|
571 double *big; //big[j] is the biggest value within dimension j
|
|
572
|
|
573 small=(double*)malloc(sizeof(double)*num_dm);
|
|
574 memset(small,0,sizeof(double)*num_dm);
|
|
575
|
|
576 big=(double*)malloc(sizeof(double)*num_dm);
|
|
577 memset(big,0,sizeof(double)*num_dm);
|
|
578
|
|
579
|
|
580 for (j=0;j<num_dm;j++)
|
|
581 {
|
|
582 big[j]=MAX_VALUE*(-1);
|
|
583 small[j]=MAX_VALUE;
|
|
584 for (i=0;i<file_Len;i++)
|
|
585 {
|
|
586 if (data_in[i][j]>big[j])
|
|
587 big[j]=data_in[i][j];
|
|
588
|
|
589 if (data_in[i][j]<small[j])
|
|
590 small[j]=data_in[i][j];
|
|
591 }
|
|
592
|
|
593 interval[j]=(big[j]-small[j])/(double)num_bin; //interval is computed using the biggest value and smallest value instead of the channel limit
|
|
594 /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */
|
|
595 /* I don't think so: num_bin is the number of bins */
|
|
596 }
|
|
597
|
|
598 for (j=0;j<num_dm;j++)
|
|
599 {
|
|
600 for (i=0;i<file_Len;i++)
|
|
601 {
|
|
602 if (data_in[i][j]>=big[j])
|
|
603 position[i][j]=num_bin-1;
|
|
604 else
|
|
605 {
|
|
606 position[i][j]=(int)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
|
|
607 if ((position[i][j]>=num_bin) || (position[i][j]<0))
|
|
608 {
|
|
609 //printf("position mis-computed in density analysis!\n");
|
|
610 //exit(0);
|
|
611 fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
|
|
612 abort();
|
|
613
|
|
614 }
|
|
615 }
|
|
616 }
|
|
617 }
|
|
618
|
|
619
|
|
620 free(small);
|
|
621 free(big);
|
|
622 }
|
|
623
|
|
624 /********************************************** select_bin to select the number of bins **********************************/
|
|
625 //num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
|
|
626 /* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned
|
|
627 * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we
|
|
628 * maximize a merit function, then return the results obtained using the optimal number of bins.
|
|
629 *
|
|
630 * Outputs:
|
|
631 * position -- binned data values
|
|
632 * sorted_seq -- permutation vector mapping the ordered list to the original data
|
|
633 * all_grid_ID -- grid to which each data element was assigned.
|
|
634 * num_nonempty -- number of distinct values assumed by all_grid_ID
|
|
635 * interval -- bin width for each data dimension
|
|
636 * return value -- the number of bins selected.
|
|
637 */
|
|
638
|
|
639 int select_bin(double **normalized_data, int file_Len, int num_dm, int min_grid, int max_grid, int **position, int *sorted_seq,
|
|
640 int *all_grid_ID, int *num_nonempty, double *interval, int user_num_bin)
|
|
641 {
|
|
642
|
|
643 int num_bin=0;
|
|
644 int select_num_bin=0;
|
|
645 int m=0;
|
|
646 int n=0;
|
|
647
|
|
648 int i=0;
|
|
649 int bin_scope=0;
|
|
650 int temp_num_nonempty=0;
|
|
651
|
|
652 int *temp_grid_ID;
|
|
653 int *temp_sorted_seq;
|
|
654 int **temp_position;
|
|
655
|
|
656 //sorted_seq[i]=j means the event j ranks i
|
|
657
|
|
658 double temp_index=0;
|
|
659 double *bin_index;
|
|
660 double *temp_interval;
|
|
661
|
|
662
|
|
663
|
|
664 temp_grid_ID=(int *)malloc(sizeof(int)*file_Len);
|
|
665 memset(temp_grid_ID,0,sizeof(int)*file_Len);
|
|
666
|
|
667 temp_sorted_seq=(int *)malloc(sizeof(int)*file_Len);
|
|
668 memset(temp_sorted_seq,0,sizeof(int)*file_Len);
|
|
669
|
|
670 temp_position=(int **)malloc(sizeof(int*)*file_Len);
|
|
671 memset(temp_position,0,sizeof(int*)*file_Len);
|
|
672 for (m=0;m<file_Len;m++)
|
|
673 {
|
|
674 temp_position[m]=(int*)malloc(sizeof(int)*num_dm);
|
|
675 memset(temp_position[m],0,sizeof(int)*num_dm);
|
|
676 }
|
|
677
|
|
678 temp_interval=(double*)malloc(sizeof(double)*num_dm);
|
|
679 memset(temp_interval,0,sizeof(double)*num_dm);
|
|
680
|
|
681 bin_scope=max_grid-min_grid+1;
|
|
682 bin_index=(double *)malloc(sizeof(double)*bin_scope);
|
|
683 memset(bin_index,0,sizeof(double)*bin_scope);
|
|
684
|
|
685 i=0;
|
|
686
|
|
687 for (num_bin=min_grid;num_bin<=max_grid;num_bin++)
|
|
688 {
|
|
689 /* compute_position bins the data into num_bin bins. Each
|
|
690 dimension is binned independently.
|
|
691
|
|
692 Outputs:
|
|
693 temp_position[i][j] -- bin for the jth component of data element i.
|
|
694 temp_interval[j] -- bin-width for the jth component
|
|
695 */
|
|
696 compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval);
|
|
697 radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID);
|
|
698
|
|
699 /* our figure of merit is the number of non-empty grids divided by number of bins per dimension.
|
|
700 We declare victory when we have found a local maximum */
|
|
701 bin_index[i]=((double)temp_num_nonempty)/((double)num_bin);
|
|
702 if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95)
|
|
703 break;
|
|
704 if ((bin_index[i]<temp_index) && (user_num_bin==0))
|
|
705 break;
|
|
706 if ((user_num_bin==num_bin-1) && (user_num_bin!=0))
|
|
707 break;
|
|
708
|
|
709 /* Since we have accepted this trial bin, copy all the temporary results into
|
|
710 the output buffers */
|
|
711 memcpy(all_grid_ID,temp_grid_ID,sizeof(int)*file_Len);
|
|
712 memcpy(sorted_seq,temp_sorted_seq,sizeof(int)*file_Len);
|
|
713 memcpy(interval,temp_interval,sizeof(double)*num_dm);
|
|
714
|
|
715 for (m=0;m<file_Len;m++)
|
|
716 for (n=0;n<num_dm;n++)
|
|
717 position[m][n]=temp_position[m][n];
|
|
718
|
|
719 temp_index=bin_index[i];
|
|
720 select_num_bin=num_bin;
|
|
721 num_nonempty[0]=temp_num_nonempty;
|
|
722 i++;
|
|
723 }
|
|
724
|
|
725 if ((select_num_bin<min_grid) || (select_num_bin>max_grid))
|
|
726 {
|
|
727 fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 4, 2010
|
|
728 exit(0);
|
|
729 }
|
|
730
|
|
731 if (temp_index==0)
|
|
732 {
|
|
733 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
|
|
734 abort();
|
|
735 }
|
|
736
|
|
737
|
|
738 free(temp_grid_ID);
|
|
739 free(temp_sorted_seq);
|
|
740 free(bin_index);
|
|
741 free(temp_interval);
|
|
742
|
|
743 for (m=0;m<file_Len;m++)
|
|
744 free(temp_position[m]);
|
|
745 free(temp_position);
|
|
746
|
|
747 return select_num_bin;
|
|
748 }
|
|
749
|
|
750 /************************************* Select dense grids **********************************/
|
|
751 // compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol
|
|
752 // den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse);
|
|
753 /*
|
|
754 * Prune away grids that are insufficiently "dense" (i.e., contain too few data items)
|
|
755 *
|
|
756 * Outputs:
|
|
757 * num_dense_grids -- number of dense grids
|
|
758 * num_dense_events -- total number of data items in all dense grids
|
|
759 * dense_grid_reverse -- mapping from list of all grids to list of dense grids.
|
|
760 * return value -- density cutoff for separating dense from non-dense grids.
|
|
761 */
|
|
762
|
|
763 int select_dense(int file_Len, int *all_grid_ID, int num_nonempty, int *num_dense_grids, int *num_dense_events, int *dense_grid_reverse, int den_t_event)
|
|
764 {
|
|
765
|
|
766
|
|
767 int i=0;
|
|
768 int vol_ID=0;
|
|
769 int biggest_size=0; //biggest grid_size, to define grid_size_index
|
|
770 int biggest_index=0;
|
|
771 //int actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid
|
|
772 //int num_remain=0; //number of remaining grids with different density thresholds
|
|
773 int temp_num_dense_grids=0;
|
|
774 int temp_num_dense_events=0;
|
|
775
|
|
776 int *grid_size_index;
|
|
777 int *all_grid_vol;
|
|
778 int *grid_density_index;
|
|
779
|
|
780 //double den_average=0;
|
|
781 // double avr_index=0;
|
|
782
|
|
783
|
|
784 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
785 // Compute all_grid_vol
|
|
786 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
787 all_grid_vol=(int *)malloc(sizeof(int)*num_nonempty);
|
|
788 memset(all_grid_vol,0,sizeof(int)*num_nonempty);
|
|
789
|
|
790 /* Grid "volume" is just the number of data contained in the grid. */
|
|
791 for (i=0;i<file_Len;i++)
|
|
792 {
|
|
793 vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]];
|
|
794 all_grid_vol[vol_ID]++; //all_grid_vol[i]=j means grid i has j points
|
|
795 }
|
|
796
|
|
797
|
|
798
|
|
799 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
800 // Compute grid_size_index (histogram of grid sizes)
|
|
801 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
802
|
|
803 for (i=0;i<num_nonempty;i++)
|
|
804 {
|
|
805 if (biggest_size<all_grid_vol[i])
|
|
806 {
|
|
807 biggest_size=all_grid_vol[i];
|
|
808 }
|
|
809 }
|
|
810
|
|
811 //added on July 23, 2010
|
|
812 if (biggest_size<3)
|
|
813 {
|
|
814 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
|
|
815 abort();
|
|
816 }
|
|
817
|
|
818 grid_size_index=(int*)malloc(sizeof(int)*biggest_size);
|
|
819 memset(grid_size_index,0,sizeof(int)*biggest_size);
|
|
820
|
|
821 for (i=0;i<num_nonempty;i++)
|
|
822 {
|
|
823 grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1
|
|
824 }
|
|
825
|
|
826
|
|
827
|
|
828 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
829 // Compute den_cutoff
|
|
830 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
831
|
|
832 grid_density_index=(int *)malloc(sizeof(int)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3
|
|
833 memset(grid_density_index,0,sizeof(int)*(biggest_size-2));
|
|
834
|
|
835 if (den_t_event==0) //user doesn't define the density threshold
|
|
836 {
|
|
837 biggest_index=0;
|
|
838
|
|
839 for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined
|
|
840 {
|
|
841 grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]);
|
|
842 if (biggest_index<grid_density_index[i-1])
|
|
843 {
|
|
844 biggest_index=grid_density_index[i-1];
|
|
845 den_t_event=i+1;
|
|
846 }
|
|
847 }
|
|
848 }
|
|
849
|
|
850 if (den_t_event==0) //if something is wrong
|
|
851 den_t_event=3;
|
|
852
|
|
853 for (i=0;i<num_nonempty;i++)
|
|
854 if (all_grid_vol[i]>=den_t_event)
|
|
855 temp_num_dense_grids++;
|
|
856
|
|
857 if (temp_num_dense_grids<=1)
|
|
858 {
|
|
859 //modified on July 23, 2010
|
|
860 //printf("a too high density threshold is set! Please decrease your density threshold.\n");
|
|
861 //exit(0);
|
|
862 fprintf(stderr,"a too high density threshold is set! Please decrease your density threshold.\n"); //modified on July 23, 2010
|
|
863 abort();
|
|
864 }
|
|
865
|
|
866 if (temp_num_dense_grids>=100000)
|
|
867 {
|
|
868 //modified on July 23, 2010
|
|
869 //printf("a too low density threshold is set! Please increase your density threshold.\n");
|
|
870 //exit(0);
|
|
871 fprintf(stderr,"a too low density threshold is set! Please increase your density threshold.\n"); //modified on July 23, 2010
|
|
872 abort();
|
|
873 }
|
|
874
|
|
875 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
876 // Compute dense_grid_reverse
|
|
877 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
878 temp_num_dense_grids=0;
|
|
879 temp_num_dense_events=0;
|
|
880 for (i=0;i<num_nonempty;i++)
|
|
881 {
|
|
882 dense_grid_reverse[i]=-1;
|
|
883
|
|
884 if (all_grid_vol[i]>=den_t_event)
|
|
885 {
|
|
886 dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
|
|
887 temp_num_dense_grids++;
|
|
888 temp_num_dense_events+=all_grid_vol[i];
|
|
889 }
|
|
890 }
|
|
891
|
|
892 num_dense_grids[0]=temp_num_dense_grids;
|
|
893 num_dense_events[0]=temp_num_dense_events;
|
|
894
|
|
895 free(grid_size_index);
|
|
896 free(all_grid_vol);
|
|
897
|
|
898 return den_t_event;
|
|
899 }
|
|
900
|
|
901 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
902 // Compute densegridID_To_gridevent and eventID_To_denseventID
|
|
903 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
904 // grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
|
|
905 /*
|
|
906 * Filter the data so that only the data belonging to dense grids is left
|
|
907 *
|
|
908 * Output:
|
|
909 * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids.
|
|
910 * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids.
|
|
911 *
|
|
912 */
|
|
913
|
|
914 void grid_To_event(int file_Len, int *dense_grid_reverse, int *all_grid_ID, int *eventID_To_denseventID, int *densegridID_To_gridevent)
|
|
915 {
|
|
916 int i=0;
|
|
917 int dense_grid_ID=0;
|
|
918 int dense_event_ID=0;
|
|
919
|
|
920 for (i=0;i<file_Len;i++)
|
|
921 {
|
|
922 dense_grid_ID=dense_grid_reverse[all_grid_ID[i]];
|
|
923 eventID_To_denseventID[i]=-1;
|
|
924 if (dense_grid_ID!=-1) //for point (i) belonging to dense grids
|
|
925 {
|
|
926 eventID_To_denseventID[i]=dense_event_ID;
|
|
927 dense_event_ID++;
|
|
928
|
|
929 if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected
|
|
930 densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point
|
|
931 }
|
|
932 }
|
|
933
|
|
934
|
|
935 }
|
|
936
|
|
937 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
938 // Compute dense_grid_seq
|
|
939 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
940 // generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq);
|
|
941 /* Construct a table of binned data values for each dense grid.
|
|
942 *
|
|
943 * Output:
|
|
944 *
|
|
945 * dense_grid_seq -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values)
|
|
946 */
|
|
947
|
|
948 void generate_grid_seq(int num_dm, int num_dense_grids, int *densegridID_To_gridevent, int **position, int **dense_grid_seq)
|
|
949 {
|
|
950
|
|
951 int i=0;
|
|
952 int j=0;
|
|
953 int ReventID=0; //representative event ID of the dense grid
|
|
954
|
|
955 for (i=0;i<num_dense_grids;i++)
|
|
956 {
|
|
957 ReventID = densegridID_To_gridevent[i];
|
|
958
|
|
959 for (j=0;j<num_dm;j++)
|
|
960 dense_grid_seq[i][j]=position[ReventID][j];
|
|
961 }
|
|
962 }
|
|
963
|
|
964 //compare two vectors
|
|
965 int compare_value(int num_dm, int *search_value, int *seq_value)
|
|
966 {
|
|
967 int i=0;
|
|
968
|
|
969 for (i=0;i<num_dm;i++)
|
|
970 {
|
|
971 if (search_value[i]<seq_value[i])
|
|
972 return 1;
|
|
973 if (search_value[i]>seq_value[i])
|
|
974 return -1;
|
|
975 if (search_value[i]==seq_value[i])
|
|
976 continue;
|
|
977 }
|
|
978 return 0;
|
|
979 }
|
|
980
|
|
981 //binary search the dense_grid_seq to return the dense grid ID if it exists
|
|
982 int binary_search(int num_dense_grids, int num_dm, int *search_value, int **dense_grid_seq)
|
|
983 {
|
|
984
|
|
985 int low=0;
|
|
986 int high=0;
|
|
987 int mid=0;
|
|
988 int comp_result=0;
|
|
989 int match=0;
|
|
990 //int found=0;
|
|
991
|
|
992 low = 0;
|
|
993 high = num_dense_grids-1;
|
|
994
|
|
995 while (low <= high)
|
|
996 {
|
|
997 mid = (int)((low + high)/2);
|
|
998
|
|
999 comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
|
|
1000
|
|
1001
|
|
1002 switch(comp_result)
|
|
1003 {
|
|
1004 case 1:
|
|
1005 high=mid-1;
|
|
1006 break;
|
|
1007 case -1:
|
|
1008 low=mid+1;
|
|
1009 break;
|
|
1010 case 0:
|
|
1011 match=1;
|
|
1012 break;
|
|
1013 }
|
|
1014 if (match==1)
|
|
1015 break;
|
|
1016 }
|
|
1017
|
|
1018
|
|
1019
|
|
1020 if (match==1)
|
|
1021 {
|
|
1022 return mid;
|
|
1023 }
|
|
1024 else
|
|
1025 return -1;
|
|
1026 }
|
|
1027
|
|
1028
|
|
1029 /********************************************** Computing Centers Using IDs **********************************************/
|
|
1030
|
|
1031 void ID2Center(double **data_in, int file_Len, int num_dm, int *eventID_To_denseventID, int num_clust, int *cluster_ID, double **population_center)
|
|
1032 {
|
|
1033 int i=0;
|
|
1034 int j=0;
|
|
1035 int ID=0;
|
|
1036 int eventID=0;
|
|
1037 int *size_c;
|
|
1038
|
|
1039
|
|
1040
|
|
1041 size_c=(int *)malloc(sizeof(int)*num_clust);
|
|
1042 memset(size_c,0,sizeof(int)*num_clust);
|
|
1043
|
|
1044 for (i=0;i<num_clust;i++)
|
|
1045 for (j=0;j<num_dm;j++)
|
|
1046 population_center[i][j]=0;
|
|
1047
|
|
1048 for (i=0;i<file_Len;i++)
|
|
1049 {
|
|
1050 eventID=eventID_To_denseventID[i];
|
|
1051
|
|
1052 if (eventID!=-1) //only events in dense grids count
|
|
1053 {
|
|
1054 ID=cluster_ID[eventID];
|
|
1055
|
|
1056 if (ID==-1)
|
|
1057 {
|
|
1058 //modified on July 23, 2010
|
|
1059 //printf("ID==-1! in ID2Center\n");
|
|
1060 //exit(0);
|
|
1061 fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010
|
|
1062 abort();
|
|
1063 }
|
|
1064
|
|
1065 for (j=0;j<num_dm;j++)
|
|
1066 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
|
|
1067
|
|
1068 size_c[ID]++;
|
|
1069 }
|
|
1070 }
|
|
1071
|
|
1072
|
|
1073 for (i=0;i<num_clust;i++)
|
|
1074 {
|
|
1075 for (j=0;j<num_dm;j++)
|
|
1076 if (size_c[i]!=0)
|
|
1077 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
|
|
1078 else
|
|
1079 population_center[i][j]=0;
|
|
1080 }
|
|
1081
|
|
1082 free(size_c);
|
|
1083
|
|
1084 }
|
|
1085
|
|
1086 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1087 // Compute Population Center with all events
|
|
1088 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1089 void ID2Center_all(double **data_in, int file_Len, int num_dm, int num_clust, int *cluster_ID, double **population_center)
|
|
1090 {
|
|
1091 int i=0;
|
|
1092 int j=0;
|
|
1093 int ID=0;
|
|
1094 int *size_c;
|
|
1095
|
|
1096
|
|
1097
|
|
1098 size_c=(int *)malloc(sizeof(int)*num_clust);
|
|
1099 memset(size_c,0,sizeof(int)*num_clust);
|
|
1100
|
|
1101 for (i=0;i<num_clust;i++)
|
|
1102 for (j=0;j<num_dm;j++)
|
|
1103 population_center[i][j]=0;
|
|
1104
|
|
1105 for (i=0;i<file_Len;i++)
|
|
1106 {
|
|
1107 ID=cluster_ID[i];
|
|
1108
|
|
1109 if (ID==-1)
|
|
1110 {
|
|
1111 //commented on July 23, 2010
|
|
1112 //printf("ID==-1! in ID2Center_all\n");
|
|
1113 //exit(0);
|
|
1114 fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010
|
|
1115 abort();
|
|
1116 }
|
|
1117
|
|
1118 for (j=0;j<num_dm;j++)
|
|
1119 population_center[ID][j]=population_center[ID][j]+data_in[i][j];
|
|
1120
|
|
1121 size_c[ID]++;
|
|
1122 }
|
|
1123
|
|
1124
|
|
1125 for (i=0;i<num_clust;i++)
|
|
1126 {
|
|
1127 for (j=0;j<num_dm;j++)
|
|
1128 if (size_c[i]!=0)
|
|
1129 population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
|
|
1130 else
|
|
1131 population_center[i][j]=0;
|
|
1132 }
|
|
1133
|
|
1134 free(size_c);
|
|
1135
|
|
1136 }
|
|
1137
|
|
1138 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1139 // Merge neighboring grids to clusters
|
|
1140 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1141
|
|
1142 int merge_grids(double **normalized_data, double *interval, int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids,
|
|
1143 int *dense_grid_reverse, int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID,
|
|
1144 int *cluster_ID, int *grid_ID, int *grid_clusterID)
|
|
1145 {
|
|
1146
|
|
1147
|
|
1148 int i=0;
|
|
1149 int j=0;
|
|
1150 int t=0;
|
|
1151 int p=0;
|
|
1152 int num_clust=0;
|
|
1153 int ReventID=0;
|
|
1154 int denseID=0;
|
|
1155 int neighbor_ID=0;
|
|
1156 //int temp_grid=0;
|
|
1157
|
|
1158 int *grid_value;
|
|
1159 int *search_value;
|
|
1160
|
|
1161 int **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node
|
|
1162
|
|
1163 double real_dist=0;
|
|
1164 double **norm_grid_center;
|
|
1165
|
|
1166 norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids);
|
|
1167 memset(norm_grid_center,0,sizeof(double*)*num_dense_grids);
|
|
1168
|
|
1169 for (i=0;i<num_dense_grids;i++)
|
|
1170 {
|
|
1171 norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
1172 memset(norm_grid_center[i],0,sizeof(double)*num_dm);
|
|
1173 }
|
|
1174
|
|
1175 for (i=0;i<file_Len;i++)
|
|
1176 {
|
|
1177 denseID=eventID_To_denseventID[i];
|
|
1178 if (denseID!=-1) //only dense events can enter
|
|
1179 {
|
|
1180 grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]];
|
|
1181
|
|
1182 if (grid_ID[denseID]==-1)
|
|
1183 {
|
|
1184 fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n"); //modified on July 23, 2010
|
|
1185 abort();
|
|
1186 }
|
|
1187 }
|
|
1188 }
|
|
1189
|
|
1190
|
|
1191 /* Find centroid (in the normalized data) for each dense grid */
|
|
1192 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center);
|
|
1193
|
|
1194 //printf("pass the grid ID2 center\n"); //commmented on July 23, 2010
|
|
1195
|
|
1196
|
|
1197 graph_of_grid=(int **)malloc(sizeof(int*)*num_dense_grids);
|
|
1198 memset(graph_of_grid,0,sizeof(int*)*num_dense_grids);
|
|
1199 for (i=0;i<num_dense_grids;i++)
|
|
1200 {
|
|
1201 graph_of_grid[i]=(int *)malloc(sizeof(int)*num_dm);
|
|
1202 memset(graph_of_grid[i],0,sizeof(int)*num_dm);
|
|
1203
|
|
1204
|
|
1205 for (j=0;j<num_dm;j++)
|
|
1206 graph_of_grid[i][j]=-1;
|
|
1207 }
|
|
1208
|
|
1209 grid_value=(int *)malloc(sizeof(int)*num_dm);
|
|
1210 memset(grid_value,0,sizeof(int)*num_dm);
|
|
1211
|
|
1212 search_value=(int *)malloc(sizeof(int)*num_dm);
|
|
1213 memset(search_value,0,sizeof(int)*num_dm);
|
|
1214
|
|
1215
|
|
1216 for (i=0;i<num_dense_grids;i++)
|
|
1217 {
|
|
1218 ReventID=densegridID_To_gridevent[i];
|
|
1219
|
|
1220 for (j=0;j<num_dm;j++)
|
|
1221 {
|
|
1222 grid_value[j]=position[ReventID][j];
|
|
1223
|
|
1224 }
|
|
1225
|
|
1226
|
|
1227 /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in
|
|
1228 the chosen dimension. If the neighbor's centroid is not too far away, add it to this grid's neighbor
|
|
1229 list. */
|
|
1230 for (t=0;t<num_dm;t++)
|
|
1231 {
|
|
1232 for (p=0;p<num_dm;p++)
|
|
1233 search_value[p]=grid_value[p];
|
|
1234
|
|
1235 if (grid_value[t]==num_bin-1)
|
|
1236 continue;
|
|
1237
|
|
1238 search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side
|
|
1239
|
|
1240 neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq);
|
|
1241
|
|
1242 if (neighbor_ID!=-1)
|
|
1243 {
|
|
1244 real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t];
|
|
1245
|
|
1246 if (real_dist<0)
|
|
1247 real_dist=-real_dist;
|
|
1248
|
|
1249 if (real_dist<2*interval[t])
|
|
1250 graph_of_grid[i][t]=neighbor_ID;
|
|
1251 }
|
|
1252 }
|
|
1253 grid_clusterID[i]=i; //initialize grid_clusterID
|
|
1254 }
|
|
1255
|
|
1256
|
|
1257 //graph constructed
|
|
1258 //DFS-based search begins
|
|
1259
|
|
1260 /* Use a depth-first search to construct a list of connected subgraphs (= "clusters").
|
|
1261 Note our graph as we have constructed it is a DAG, so we can use that to our advantage
|
|
1262 in our search. */
|
|
1263 // num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID);
|
|
1264 num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID);
|
|
1265
|
|
1266
|
|
1267 //computes grid_ID and cluster_ID
|
|
1268 for (i=0;i<file_Len;i++)
|
|
1269 {
|
|
1270 denseID=eventID_To_denseventID[i];
|
|
1271 if (denseID!=-1) //only dense events can enter
|
|
1272 {
|
|
1273 cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]];
|
|
1274 //if (cluster_ID[denseID]==-1)
|
|
1275 // printf("catch you!\n");
|
|
1276 }
|
|
1277 }
|
|
1278
|
|
1279 free(search_value);
|
|
1280 free(grid_value);
|
|
1281
|
|
1282 for (i=0;i<num_dense_grids;i++)
|
|
1283 {
|
|
1284 free(graph_of_grid[i]);
|
|
1285 free(norm_grid_center[i]);
|
|
1286 }
|
|
1287 free(graph_of_grid);
|
|
1288 free(norm_grid_center);
|
|
1289
|
|
1290 return num_clust;
|
|
1291 }
|
|
1292
|
|
1293 /********************************************* Merge Clusters to Populations *******************************************/
|
|
1294 // This is the function future work can be on because it is about how to cluster the about 500 points accurately
|
|
1295
|
|
1296 int merge_clusters(int num_clust, int num_dm, double *interval, double **cluster_center, int *cluster_populationID, int max_num_pop)
|
|
1297 {
|
|
1298 int num_population=0;
|
|
1299 int temp_num_population=0;
|
|
1300
|
|
1301 int i=0;
|
|
1302 int j=0;
|
|
1303 int t=0;
|
|
1304 int merge=0;
|
|
1305 int smid=0;
|
|
1306 int bgid=0;
|
|
1307 double merge_dist=1.1; //initial value of merge_dist*interval should be slightly larger than the bin width
|
|
1308
|
|
1309 int *map_ID;
|
|
1310
|
|
1311 double diff=0;
|
|
1312
|
|
1313 map_ID=(int*)malloc(sizeof(int)*num_clust);
|
|
1314 memset(map_ID,0,sizeof(int)*num_clust);
|
|
1315
|
|
1316 while ((num_population>max_num_pop) || (num_population<=1))
|
|
1317 {
|
|
1318
|
|
1319 if (num_population<=1)
|
|
1320 merge_dist=merge_dist-0.1;
|
|
1321
|
|
1322 if (num_population>max_num_pop)
|
|
1323 merge_dist=merge_dist+0.1;
|
|
1324
|
|
1325
|
|
1326 for (i=0;i<num_clust;i++)
|
|
1327 cluster_populationID[i]=i;
|
|
1328
|
|
1329 for (i=0;i<num_clust-1;i++)
|
|
1330 {
|
|
1331 for (j=i+1;j<num_clust;j++)
|
|
1332 {
|
|
1333 merge=1;
|
|
1334
|
|
1335 for (t=0;t<num_dm;t++)
|
|
1336 {
|
|
1337 diff=cluster_center[i][t]-cluster_center[j][t];
|
|
1338
|
|
1339 if (diff<0)
|
|
1340 diff=-diff;
|
|
1341 if (diff>(merge_dist*interval[t]))
|
|
1342 merge=0;
|
|
1343 }
|
|
1344
|
|
1345 if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
|
|
1346 {
|
|
1347 if (cluster_populationID[i]<cluster_populationID[j]) //they could not be equal
|
|
1348 {
|
|
1349 smid = cluster_populationID[i];
|
|
1350 bgid = cluster_populationID[j];
|
|
1351 }
|
|
1352 else
|
|
1353 {
|
|
1354 smid = cluster_populationID[j];
|
|
1355 bgid = cluster_populationID[i];
|
|
1356 }
|
|
1357 for (t=0;t<num_clust;t++)
|
|
1358 {
|
|
1359 if (cluster_populationID[t]==bgid)
|
|
1360 cluster_populationID[t]=smid;
|
|
1361 }
|
|
1362 }
|
|
1363 }
|
|
1364 }
|
|
1365
|
|
1366
|
|
1367
|
|
1368 for (i=0;i<num_clust;i++)
|
|
1369 map_ID[i]=-1;
|
|
1370
|
|
1371 for (i=0;i<num_clust;i++)
|
|
1372 map_ID[cluster_populationID[i]]=1;
|
|
1373
|
|
1374 num_population=0;
|
|
1375 for (i=0;i<num_clust;i++)
|
|
1376 {
|
|
1377 if (map_ID[i]==1)
|
|
1378 {
|
|
1379 map_ID[i]=num_population;
|
|
1380 num_population++;
|
|
1381 }
|
|
1382 }
|
|
1383
|
|
1384 if ((temp_num_population>max_num_pop) && (num_population==1))
|
|
1385 break;
|
|
1386 else
|
|
1387 temp_num_population=num_population;
|
|
1388
|
|
1389 if (num_clust<=1)
|
|
1390 break;
|
|
1391 }
|
|
1392
|
|
1393 for (i=0;i<num_clust;i++)
|
|
1394 cluster_populationID[i]=map_ID[cluster_populationID[i]];
|
|
1395
|
|
1396 free(map_ID);
|
|
1397
|
|
1398 return num_population;
|
|
1399 }
|
|
1400
|
|
1401 ///////////////////////////////////////////////////////
|
|
1402 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1403 double kmeans(double **Matrix, int k, double kmean_term, int file_Len, int num_dm, int *shortest_id, double **center)
|
|
1404 {
|
|
1405
|
|
1406 int i=0;
|
|
1407 int j=0;
|
|
1408 int t=0;
|
|
1409 int random=0;
|
|
1410 int random1=0;
|
|
1411 int random2=0;
|
|
1412 int times=0;
|
|
1413 int dist_used=0; //0 is Euclidean and 1 is Pearson
|
|
1414 int random_init=0; //0: not use random seeds;
|
|
1415 int real_Len=0;
|
|
1416 int skipped=0;
|
|
1417
|
|
1418 int *num; //num[i]=t means the ith cluster has t points
|
|
1419
|
|
1420 double vvv=1.0; // the biggest variation;
|
|
1421 double distance=0.0;
|
|
1422 double xv=0.0;
|
|
1423 double variation=0.0;
|
|
1424
|
|
1425 double mean_dx=0;
|
|
1426 double mean_dy=0;
|
|
1427 double sum_var=0;
|
|
1428 double dx=0;
|
|
1429 double dy=0;
|
|
1430 double sd_x=0;
|
|
1431 double sd_y=0;
|
|
1432 double diff=0;
|
|
1433 double distortion=0;
|
|
1434
|
|
1435 double shortest_distance;
|
|
1436
|
|
1437 double *temp_center;
|
|
1438
|
|
1439 double **sum;
|
|
1440
|
|
1441 temp_center = (double *)malloc(sizeof(double)*num_dm);
|
|
1442 memset(temp_center,0,sizeof(double)*num_dm);
|
|
1443
|
|
1444 if (random_init)
|
|
1445 {
|
|
1446 for (i=0;i<k;i++)
|
|
1447 {
|
|
1448 random1=rand()*rand();
|
|
1449 random2=abs((random1%5)+1);
|
|
1450 for (t=0;t<random2;t++)
|
|
1451 random2=random2*rand()+rand();
|
|
1452
|
|
1453 random=abs(random2%file_Len);
|
|
1454 for (j=0;j<num_dm;j++)
|
|
1455 center[i][j]=Matrix[random][j];
|
|
1456 }
|
|
1457 }
|
|
1458
|
|
1459
|
|
1460 num = (int *)malloc(sizeof(int)*k);
|
|
1461 memset(num,0,sizeof(int)*k);
|
|
1462
|
|
1463 sum = (double **)malloc(sizeof(double*)*k);
|
|
1464 memset(sum,0,sizeof(double*)*k);
|
|
1465 for (i=0;i<k;i++)
|
|
1466 {
|
|
1467 sum[i] = (double *)malloc(sizeof(double)*num_dm);
|
|
1468 memset(sum[i],0,sizeof(double)*num_dm);
|
|
1469 }
|
|
1470
|
|
1471
|
|
1472 times=0;
|
|
1473 real_Len=0;
|
|
1474
|
|
1475 while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1)))
|
|
1476 {
|
|
1477 for (i=0;i<k;i++)
|
|
1478 {
|
|
1479 num[i]=0;
|
|
1480 for (j=0;j<num_dm;j++)
|
|
1481 sum[i][j]=0.0;
|
|
1482 }
|
|
1483
|
|
1484 for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j]
|
|
1485 {
|
|
1486 skipped = 0;
|
|
1487 shortest_distance=MAX_VALUE;
|
|
1488 for (j=0;j<k;j++) //for each center j
|
|
1489 {
|
|
1490 distance=0.0;
|
|
1491
|
|
1492 if (dist_used==0) //Euclidean distance
|
|
1493 {
|
|
1494 for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions
|
|
1495 {
|
|
1496
|
|
1497 diff=center[j][t]-Matrix[i][t];
|
|
1498
|
|
1499 diff=diff*diff;
|
|
1500
|
|
1501 distance = distance+diff; //here we have a weight for each dimension
|
|
1502 }
|
|
1503 }
|
|
1504 else //pearson correlation
|
|
1505 {
|
|
1506 mean_dx=0.0;
|
|
1507 mean_dy=0.0;
|
|
1508 sum_var=0.0;
|
|
1509 dx=0.0;
|
|
1510 dy=0.0;
|
|
1511 sd_x=0.0;
|
|
1512 sd_y=0.0;
|
|
1513 for (t=0;t<num_dm;t++)
|
|
1514 {
|
|
1515 mean_dx+=center[j][t];
|
|
1516 mean_dy+=Matrix[i][t];
|
|
1517 }
|
|
1518 mean_dx=mean_dx/(double)num_dm;
|
|
1519 mean_dy=mean_dy/(double)num_dm;
|
|
1520
|
|
1521 for (t=0;t<num_dm;t++)
|
|
1522 {
|
|
1523 dx=center[j][t]-mean_dx;
|
|
1524 dy=Matrix[i][t]-mean_dy;
|
|
1525 sum_var+=dx*dy;
|
|
1526 sd_x+=dx*dx;
|
|
1527 sd_y+=dy*dy;
|
|
1528 }
|
|
1529 if (sqrt(sd_x*sd_y)==0)
|
|
1530 distance = 1.0;
|
|
1531 else
|
|
1532 distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
|
|
1533 //printf("distance=%f\n",distance);
|
|
1534 } //pearson correlation ends
|
|
1535
|
|
1536 if ((distance<shortest_distance) && (skipped == 0))
|
|
1537 {
|
|
1538 shortest_distance=distance;
|
|
1539 shortest_id[i]=j;
|
|
1540 }
|
|
1541 }//end for j
|
|
1542 real_Len++;
|
|
1543 num[shortest_id[i]]=num[shortest_id[i]]+1;
|
|
1544 for (t=0;t<num_dm;t++)
|
|
1545 sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];
|
|
1546 }//end for i
|
|
1547 /* recompute the centers */
|
|
1548 //compute_mean(group);
|
|
1549 vvv=0.0;
|
|
1550 for (j=0;j<k;j++)
|
|
1551 {
|
|
1552 memcpy(temp_center,center[j],sizeof(double)*num_dm);
|
|
1553 variation=0.0;
|
|
1554 if (num[j]!=0)
|
|
1555 {
|
|
1556 for (t=0;t<num_dm;t++)
|
|
1557 {
|
|
1558 center[j][t]=sum[j][t]/(double)num[j];
|
|
1559 xv=(temp_center[t]-center[j][t]);
|
|
1560 variation=variation+xv*xv;
|
|
1561 }
|
|
1562 }
|
|
1563
|
|
1564 if (variation>vvv)
|
|
1565 vvv=variation; //vvv is the biggest variation among the k clusters;
|
|
1566 }
|
|
1567 //compute_variation;
|
|
1568 times++;
|
|
1569 } //end for while
|
|
1570
|
|
1571
|
|
1572
|
|
1573
|
|
1574 free(num);
|
|
1575
|
|
1576 for (i=0;i<k;i++)
|
|
1577 free(sum[i]);
|
|
1578 free(sum);
|
|
1579 free(temp_center);
|
|
1580
|
|
1581
|
|
1582 return distortion;
|
|
1583
|
|
1584 }
|
|
1585
|
|
1586 //////////////////////////////////////////////////////
|
|
1587 /*************************** Show *****************************/
|
|
1588 void show(double **Matrix, int *cluster_id, int file_Len, int k, int num_dm, char *name_string)
|
|
1589 {
|
|
1590 int situ1=0;
|
|
1591 int situ2=0;
|
|
1592
|
|
1593 int i=0;
|
|
1594 int id=0;
|
|
1595 int j=0;
|
|
1596 int t=0;
|
|
1597
|
|
1598 int *size_c;
|
|
1599
|
|
1600
|
|
1601
|
|
1602 int **size_mybound_1;
|
|
1603 int **size_mybound_2;
|
|
1604 int **size_mybound_3;
|
|
1605 int **size_mybound_0;
|
|
1606
|
|
1607 double interval=0.0;
|
|
1608
|
|
1609 double *big;
|
|
1610 double *small;
|
|
1611
|
|
1612
|
|
1613 double **center;
|
|
1614 double **mybound;
|
|
1615
|
|
1616 int **prof; //prof[i][j]=1 means population i is + at parameter j
|
|
1617
|
|
1618 FILE *fpcnt_id; //proportion id
|
|
1619 //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
|
|
1620 FILE *fprof_id; //profile_id
|
|
1621
|
|
1622 big=(double *)malloc(sizeof(double)*num_dm);
|
|
1623 memset(big,0,sizeof(double)*num_dm);
|
|
1624
|
|
1625 small=(double *)malloc(sizeof(double)*num_dm);
|
|
1626 memset(small,0,sizeof(double)*num_dm);
|
|
1627
|
|
1628 for (i=0;i<num_dm;i++)
|
|
1629 {
|
|
1630 big[i]=0.0;
|
|
1631 small[i]=(double)MAX_VALUE;
|
|
1632 }
|
|
1633
|
|
1634
|
|
1635 size_c=(int *)malloc(sizeof(int)*k);
|
|
1636 memset(size_c,0,sizeof(int)*k);
|
|
1637
|
|
1638 center=(double**)malloc(sizeof(double*)*k);
|
|
1639 memset(center,0,sizeof(double*)*k);
|
|
1640 for (i=0;i<k;i++)
|
|
1641 {
|
|
1642 center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
1643 memset(center[i],0,sizeof(double)*num_dm);
|
|
1644 }
|
|
1645
|
|
1646 mybound=(double**)malloc(sizeof(double*)*num_dm);
|
|
1647 memset(mybound,0,sizeof(double*)*num_dm);
|
|
1648 for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
|
|
1649 {
|
|
1650 mybound[i]=(double*)malloc(sizeof(double)*3);
|
|
1651 memset(mybound[i],0,sizeof(double)*3);
|
|
1652 }
|
|
1653
|
|
1654 prof=(int **)malloc(sizeof(int*)*k);
|
|
1655 memset(prof,0,sizeof(int*)*k);
|
|
1656 for (i=0;i<k;i++)
|
|
1657 {
|
|
1658 prof[i]=(int *)malloc(sizeof(int)*num_dm);
|
|
1659 memset(prof[i],0,sizeof(int)*num_dm);
|
|
1660 }
|
|
1661
|
|
1662
|
|
1663 for (i=0;i<file_Len;i++)
|
|
1664 {
|
|
1665 id=cluster_id[i];
|
|
1666 for (j=0;j<num_dm;j++)
|
|
1667 {
|
|
1668 center[id][j]=center[id][j]+Matrix[i][j];
|
|
1669 if (big[j]<Matrix[i][j])
|
|
1670 big[j]=Matrix[i][j];
|
|
1671 if (small[j]>Matrix[i][j])
|
|
1672 small[j]=Matrix[i][j];
|
|
1673 }
|
|
1674
|
|
1675 size_c[id]++;
|
|
1676 }
|
|
1677
|
|
1678 for (i=0;i<k;i++)
|
|
1679 for (j=0;j<num_dm;j++)
|
|
1680 {
|
|
1681 if (size_c[i]!=0)
|
|
1682 center[i][j]=(center[i][j]/(double)(size_c[i]));
|
|
1683 else
|
|
1684 center[i][j]=0;
|
|
1685 }
|
|
1686
|
|
1687 for (j=0;j<num_dm;j++)
|
|
1688 {
|
|
1689 interval=((big[j]-small[j])/4.0);
|
|
1690 //printf("interval[%d] is %f\n",j,interval);
|
|
1691 for (i=0;i<3;i++)
|
|
1692 mybound[j][i]=small[j]+((double)(i+1)*interval);
|
|
1693 }
|
|
1694
|
|
1695
|
|
1696 size_mybound_0=(int **)malloc(sizeof(int*)*k);
|
|
1697 memset(size_mybound_0,0,sizeof(int*)*k);
|
|
1698
|
|
1699 for (i=0;i<k;i++)
|
|
1700 {
|
|
1701 size_mybound_0[i]=(int*)malloc(sizeof(int)*num_dm);
|
|
1702 memset(size_mybound_0[i],0,sizeof(int)*num_dm);
|
|
1703 }
|
|
1704
|
|
1705 size_mybound_1=(int **)malloc(sizeof(int*)*k);
|
|
1706 memset(size_mybound_1,0,sizeof(int*)*k);
|
|
1707
|
|
1708 for (i=0;i<k;i++)
|
|
1709 {
|
|
1710 size_mybound_1[i]=(int*)malloc(sizeof(int)*num_dm);
|
|
1711 memset(size_mybound_1[i],0,sizeof(int)*num_dm);
|
|
1712 }
|
|
1713
|
|
1714 size_mybound_2=(int **)malloc(sizeof(int*)*k);
|
|
1715 memset(size_mybound_2,0,sizeof(int*)*k);
|
|
1716
|
|
1717 for (i=0;i<k;i++)
|
|
1718 {
|
|
1719 size_mybound_2[i]=(int*)malloc(sizeof(int)*num_dm);
|
|
1720 memset(size_mybound_2[i],0,sizeof(int)*num_dm);
|
|
1721 }
|
|
1722
|
|
1723 size_mybound_3=(int **)malloc(sizeof(int*)*k);
|
|
1724 memset(size_mybound_3,0,sizeof(int*)*k);
|
|
1725
|
|
1726 for (i=0;i<k;i++)
|
|
1727 {
|
|
1728 size_mybound_3[i]=(int*)malloc(sizeof(int)*num_dm);
|
|
1729 memset(size_mybound_3[i],0,sizeof(int)*num_dm);
|
|
1730 }
|
|
1731
|
|
1732 for (i=0;i<file_Len;i++)
|
|
1733 for (j=0;j<num_dm;j++)
|
|
1734 {
|
|
1735 if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
|
|
1736 size_mybound_0[cluster_id[i]][j]++;
|
|
1737 else
|
|
1738 {
|
|
1739 if (Matrix[i][j]<mybound[j][1])
|
|
1740 size_mybound_1[cluster_id[i]][j]++;
|
|
1741 else
|
|
1742 {
|
|
1743 if (Matrix[i][j]<mybound[j][2])
|
|
1744 size_mybound_2[cluster_id[i]][j]++;
|
|
1745 else
|
|
1746 //if (Matrix[i][j]!=big[j]) //the biggest values excluded
|
|
1747 size_mybound_3[cluster_id[i]][j]++;
|
|
1748 }
|
|
1749
|
|
1750 }
|
|
1751 }
|
|
1752
|
|
1753 fprof_id=fopen("profile.txt","w");
|
|
1754 fprintf(fprof_id,"Population_ID\t");
|
|
1755 fprintf(fprof_id,"%s\n",name_string);
|
|
1756
|
|
1757 for (i=0;i<k;i++)
|
|
1758 {
|
|
1759 fprintf(fprof_id,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
|
|
1760 for (j=0;j<num_dm;j++)
|
|
1761 {
|
|
1762
|
|
1763 if (size_mybound_0[i][j]>size_mybound_1[i][j])
|
|
1764 situ1=0;
|
|
1765 else
|
|
1766 situ1=1;
|
|
1767 if (size_mybound_2[i][j]>size_mybound_3[i][j])
|
|
1768 situ2=2;
|
|
1769 else
|
|
1770 situ2=3;
|
|
1771
|
|
1772 if ((situ1==0) && (situ2==2))
|
|
1773 {
|
|
1774 if (size_mybound_0[i][j]>size_mybound_2[i][j])
|
|
1775 prof[i][j]=0;
|
|
1776 else
|
|
1777 prof[i][j]=2;
|
|
1778 }
|
|
1779 if ((situ1==0) && (situ2==3))
|
|
1780 {
|
|
1781 if (size_mybound_0[i][j]>size_mybound_3[i][j])
|
|
1782 prof[i][j]=0;
|
|
1783 else
|
|
1784 prof[i][j]=3;
|
|
1785 }
|
|
1786 if ((situ1==1) && (situ2==2))
|
|
1787 {
|
|
1788 if (size_mybound_1[i][j]>size_mybound_2[i][j])
|
|
1789 prof[i][j]=1;
|
|
1790 else
|
|
1791 prof[i][j]=2;
|
|
1792 }
|
|
1793 if ((situ1==1) && (situ2==3))
|
|
1794 {
|
|
1795 if (size_mybound_1[i][j]>size_mybound_3[i][j])
|
|
1796 prof[i][j]=1;
|
|
1797 else
|
|
1798 prof[i][j]=3;
|
|
1799 }
|
|
1800
|
|
1801 //begin to output profile
|
|
1802 if (j==num_dm-1)
|
|
1803 {
|
|
1804 if (prof[i][j]==0)
|
|
1805 fprintf(fprof_id,"1\n");
|
|
1806 if (prof[i][j]==1)
|
|
1807 fprintf(fprof_id,"2\n");
|
|
1808 if (prof[i][j]==2)
|
|
1809 fprintf(fprof_id,"3\n");
|
|
1810 if (prof[i][j]==3)
|
|
1811 fprintf(fprof_id,"4\n");
|
|
1812 }
|
|
1813 else
|
|
1814 {
|
|
1815 if (prof[i][j]==0)
|
|
1816 fprintf(fprof_id,"1\t");
|
|
1817 if (prof[i][j]==1)
|
|
1818 fprintf(fprof_id,"2\t");
|
|
1819 if (prof[i][j]==2)
|
|
1820 fprintf(fprof_id,"3\t");
|
|
1821 if (prof[i][j]==3)
|
|
1822 fprintf(fprof_id,"4\t");
|
|
1823 }
|
|
1824 }
|
|
1825 }
|
|
1826 fclose(fprof_id);
|
|
1827
|
|
1828 ///////////////////////////////////////////////////////////
|
|
1829
|
|
1830
|
|
1831 fpcnt_id=fopen("percentage.txt","w");
|
|
1832 fprintf(fpcnt_id,"Population_ID\tPercentage\n");
|
|
1833
|
|
1834 for (t=0;t<k;t++)
|
|
1835 {
|
|
1836 fprintf(fpcnt_id,"%d\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len); //t changed to t+1 to start from 1 instead of 0: April 16, 2009
|
|
1837 }
|
|
1838 fclose(fpcnt_id);
|
|
1839
|
|
1840 free(big);
|
|
1841 free(small);
|
|
1842 free(size_c);
|
|
1843
|
|
1844 for (i=0;i<k;i++)
|
|
1845 {
|
|
1846 free(center[i]);
|
|
1847 free(prof[i]);
|
|
1848 free(size_mybound_0[i]);
|
|
1849 free(size_mybound_1[i]);
|
|
1850 free(size_mybound_2[i]);
|
|
1851 free(size_mybound_3[i]);
|
|
1852 }
|
|
1853 free(center);
|
|
1854 free(prof);
|
|
1855 free(size_mybound_0);
|
|
1856 free(size_mybound_1);
|
|
1857 free(size_mybound_2);
|
|
1858 free(size_mybound_3);
|
|
1859
|
|
1860 for (i=0;i<num_dm;i++)
|
|
1861 free(mybound[i]);
|
|
1862 free(mybound);
|
|
1863
|
|
1864 }
|
|
1865
|
|
1866
|
|
1867
|
|
1868 /******************************************************** Main Function **************************************************/
|
|
1869 //for normalized data, there are five variables:
|
|
1870 //cluster_ID
|
|
1871 //population_center
|
|
1872 //grid_clusterID
|
|
1873 //grid_ID
|
|
1874 //grid_Center
|
|
1875
|
|
1876 //the same five variables exist for the original data
|
|
1877 //however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data
|
|
1878 //also, data + cluster_ID -> population_center
|
|
1879 //data + grid_ID -> grid_Center
|
|
1880
|
|
1881 /* what is the final output */
|
|
1882 //the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
|
|
1883 //Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)
|
|
1884
|
|
1885 /* what information should select_bin output */
|
|
1886 //the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
|
|
1887 //is unknown, therefore I must use a prescreening to output
|
|
1888 //how many bins I should use
|
|
1889 //the number of dense grids
|
|
1890 //total number of events in the dense grids
|
|
1891
|
|
1892 /* basic procedure of main function */
|
|
1893 //1. read raw file and normalize the raw file
|
|
1894 //2. select_bin
|
|
1895 //3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID.
|
|
1896 //4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
|
|
1897 //5. release normalized data; allocate memory for grid_Center and population_center
|
|
1898 //6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers
|
|
1899
|
|
1900 int main (int argc, char **argv)
|
|
1901 {
|
|
1902 //inputs
|
|
1903 FILE *f_src; //source file pointer
|
|
1904
|
|
1905 FILE *f_out; //coordinates
|
|
1906 FILE *f_cid; //population-ID of events
|
|
1907 FILE *f_ctr; //centroids of populations
|
|
1908 FILE *f_results; //coordinates file event and population column
|
|
1909 FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
|
|
1910 FILE *f_parameters; //number of bins and density calculated by
|
|
1911 //the algorithm. Used to update the database
|
|
1912 FILE *f_properties; //Properties file used by Image generation software
|
|
1913
|
|
1914 char para_name_string[LINE_LEN];
|
|
1915
|
|
1916 int time_ID=-1;
|
|
1917 int num_bin=0; //the bins I will use on each dimension
|
|
1918
|
|
1919 int file_Len=0; //number of events
|
|
1920 int num_dm=0;
|
|
1921 int num_clust=0;
|
|
1922 int num_dense_events=0;
|
|
1923 int num_dense_grids=0;
|
|
1924 int num_nonempty=0;
|
|
1925 int num_population=0;
|
|
1926 //int temp=0;
|
|
1927
|
|
1928 //below are read from configuration file
|
|
1929 int i=0;
|
|
1930 int j=0;
|
|
1931 int max_num_pop=0;
|
|
1932
|
|
1933 int den_t_event=0;
|
|
1934
|
|
1935 int *grid_clusterID; //(dense)gridID to clusterID
|
|
1936 int *grid_ID; //(dense)eventID to gridID
|
|
1937 int *cluster_ID; //(dense)eventID to clusterID
|
|
1938 int *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
|
|
1939 int *all_grid_ID; //gridID for all events
|
|
1940 int *densegridID_To_gridevent;
|
|
1941 int *sorted_seq;
|
|
1942 int *dense_grid_reverse;
|
|
1943 int *population_ID; //denseeventID to populationID
|
|
1944 int *cluster_populationID; //clusterID to populationID
|
|
1945 int *grid_populationID; //gridID to populationID
|
|
1946 int *all_population_ID; //populationID of event
|
|
1947
|
|
1948 int **position;
|
|
1949 int **dense_grid_seq;
|
|
1950
|
|
1951 double *interval;
|
|
1952
|
|
1953 double **population_center; //population centroids in the raw/original data
|
|
1954 double **cluster_center; //cluster centroids in the raw/original data
|
|
1955
|
|
1956 double **input_data;
|
|
1957 double **normalized_data;
|
|
1958
|
|
1959 int min = 999999;
|
|
1960 int max = 0;
|
|
1961
|
|
1962 printf( "Starting time:\t\t\t\t");
|
|
1963 fflush(stdout);
|
|
1964 system("/bin/date");
|
|
1965 /////////////////////////////////////////////////////////////
|
|
1966
|
|
1967 if ((argc!=2) && (argc!=4) && (argc!=5))
|
|
1968 {
|
|
1969 //modified on Jul 23, 2010
|
|
1970 //printf("usage:\n");
|
|
1971 //printf("basic mode: flock data_file\n");
|
|
1972 //printf("advanced mode: flock data_file num_bin density_index\n");
|
|
1973 //exit(0);
|
|
1974
|
|
1975 fprintf(stderr,"Incorrect number of input parameters!\n"); //modified on July 23, 2010
|
|
1976 //fprintf(stderr,"basic mode: flock data_file\n"); //modified on July 23, 2010
|
|
1977 //fprintf(stderr,"advanced mode1: flock data_file num_bin density_index\n"); //modified on July 23, 2010
|
|
1978 fprintf(stderr,"advanced mode: flock data_file num_bin density_index max_num_pop\n"); //added on Dec 16, 2010 for GenePattern
|
|
1979 abort();
|
|
1980 }
|
|
1981
|
|
1982 f_src=fopen(argv[1],"r");
|
|
1983
|
|
1984 if (argc==2)
|
|
1985 {
|
|
1986 max_num_pop=30; //default value, maximum 30 clusters
|
|
1987 }
|
|
1988
|
|
1989 if (argc==4)
|
|
1990 {
|
|
1991 num_bin=atoi(argv[2]);
|
|
1992 printf("num_bin is %d\n",num_bin);
|
|
1993
|
|
1994 den_t_event=atoi(argv[3]);
|
|
1995 printf("density_index is %d\n",den_t_event);
|
|
1996
|
|
1997 max_num_pop=30;
|
|
1998
|
|
1999 if (((num_bin<6) && (num_bin!=0)) || (num_bin>29))
|
|
2000 {
|
|
2001 fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n");
|
|
2002 abort();
|
|
2003 }
|
|
2004
|
|
2005 if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99))
|
|
2006 {
|
|
2007 fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n");
|
|
2008 abort();
|
|
2009 }
|
|
2010 }
|
|
2011
|
|
2012 if (argc==5)
|
|
2013 {
|
|
2014 num_bin=atoi(argv[2]);
|
|
2015 printf("num_bin is %d\n",num_bin);
|
|
2016
|
|
2017 den_t_event=atoi(argv[3]);
|
|
2018 printf("density_index is %d\n",den_t_event);
|
|
2019
|
|
2020 max_num_pop=atoi(argv[4]);
|
|
2021 printf("max number of clusters is %d\n",max_num_pop);
|
|
2022
|
|
2023 if (((num_bin<6) && (num_bin!=0)) || (num_bin>29))
|
|
2024 {
|
|
2025 fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n");
|
|
2026 abort();
|
|
2027 }
|
|
2028
|
|
2029 if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99))
|
|
2030 {
|
|
2031 fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n");
|
|
2032 abort();
|
|
2033 }
|
|
2034
|
|
2035 if ((max_num_pop<5) || (max_num_pop>999))
|
|
2036 {
|
|
2037 fprintf(stderr,"Incorrect input range of maximum number of populations, which should be larger than 4 and smaller than 1000\n");
|
|
2038 abort();
|
|
2039 }
|
|
2040 }
|
|
2041
|
|
2042
|
|
2043 getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters
|
|
2044
|
|
2045 /************************************************* Read the data *****************************************************/
|
|
2046
|
|
2047 rewind(f_src); //reset the file pointer
|
|
2048
|
|
2049 input_data = (double **)malloc(sizeof(double*)*file_Len);
|
|
2050 memset(input_data,0,sizeof(double*)*file_Len);
|
|
2051 for (i=0;i<file_Len;i++)
|
|
2052 {
|
|
2053 input_data[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
2054 memset(input_data[i],0,sizeof(double)*num_dm);
|
|
2055 }
|
|
2056
|
|
2057 readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data;
|
|
2058
|
|
2059 fclose(f_src);
|
|
2060
|
|
2061 normalized_data=(double **)malloc(sizeof(double*)*file_Len);
|
|
2062 memset(normalized_data,0,sizeof(double*)*file_Len);
|
|
2063 for (i=0;i<file_Len;i++)
|
|
2064 {
|
|
2065 normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
2066 memset(normalized_data[i],0,sizeof(double)*num_dm);
|
|
2067 }
|
|
2068
|
|
2069 tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data);
|
|
2070
|
|
2071
|
|
2072 position=(int **)malloc(sizeof(int*)*file_Len);
|
|
2073 memset(position,0,sizeof(int*)*file_Len);
|
|
2074 for (i=0;i<file_Len;i++)
|
|
2075 {
|
|
2076 position[i]=(int*)malloc(sizeof(int)*num_dm);
|
|
2077 memset(position[i],0,sizeof(int)*num_dm);
|
|
2078 }
|
|
2079
|
|
2080 all_grid_ID=(int *)malloc(sizeof(int)*file_Len);
|
|
2081 memset(all_grid_ID,0,sizeof(int)*file_Len);
|
|
2082
|
|
2083 sorted_seq=(int*)malloc(sizeof(int)*file_Len);
|
|
2084 memset(sorted_seq,0,sizeof(int)*file_Len);
|
|
2085
|
|
2086 interval=(double*)malloc(sizeof(double)*num_dm);
|
|
2087 memset(interval,0,sizeof(double)*num_dm);
|
|
2088
|
|
2089 /************************************************* select_bin *************************************************/
|
|
2090
|
|
2091 if (num_bin>=1) //num_bin has been selected by user
|
|
2092 select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
|
|
2093 else //num_bin has not been selected by user
|
|
2094 {
|
|
2095 num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
|
|
2096 printf("selected bin number is %d\n",num_bin);
|
|
2097 }
|
|
2098 printf("number of non-empty grids is %d\n",num_nonempty);
|
|
2099
|
|
2100
|
|
2101
|
|
2102 /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
|
|
2103 free(sorted_seq);
|
|
2104
|
|
2105
|
|
2106 dense_grid_reverse=(int*)malloc(sizeof(int)*num_nonempty);
|
|
2107 memset(dense_grid_reverse,0,sizeof(int)*num_nonempty);
|
|
2108
|
|
2109 /************************************************* select_dense **********************************************/
|
|
2110
|
|
2111 if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
|
|
2112 select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
|
|
2113 else
|
|
2114 {
|
|
2115 den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
|
|
2116 printf("automated selected density threshold is %d\n",den_t_event);
|
|
2117 }
|
|
2118
|
|
2119 printf("Number of dense grids is %d\n",num_dense_grids);
|
|
2120
|
|
2121 densegridID_To_gridevent = (int *)malloc(sizeof(int)*num_dense_grids);
|
|
2122 memset(densegridID_To_gridevent,0,sizeof(int)*num_dense_grids);
|
|
2123
|
|
2124 for (i=0;i<num_dense_grids;i++)
|
|
2125 densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1
|
|
2126
|
|
2127
|
|
2128 eventID_To_denseventID=(int *)malloc(sizeof(int)*file_Len);
|
|
2129 memset(eventID_To_denseventID,0,sizeof(int)*file_Len); //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
|
|
2130
|
|
2131
|
|
2132 grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
|
|
2133
|
|
2134
|
|
2135 dense_grid_seq=(int **)malloc(sizeof(int*)*num_dense_grids);
|
|
2136 memset(dense_grid_seq,0,sizeof(int*)*num_dense_grids);
|
|
2137 for (i=0;i<num_dense_grids;i++)
|
|
2138 {
|
|
2139 dense_grid_seq[i]=(int *)malloc(sizeof(int)*num_dm);
|
|
2140 memset(dense_grid_seq[i],0,sizeof(int)*num_dm);
|
|
2141 }
|
|
2142
|
|
2143
|
|
2144 /* Look up the binned data values for each dense grid */
|
|
2145 generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq);
|
|
2146
|
|
2147
|
|
2148 /************************************************* allocate memory *********************************************/
|
|
2149
|
|
2150 grid_clusterID=(int *)malloc(sizeof(int)*num_dense_grids);
|
|
2151 memset(grid_clusterID,0,sizeof(int)*num_dense_grids);
|
|
2152
|
|
2153 grid_ID=(int *)malloc(sizeof(int)*num_dense_events);
|
|
2154 memset(grid_ID,0,sizeof(int)*num_dense_events);
|
|
2155
|
|
2156 cluster_ID=(int *)malloc(sizeof(int)*num_dense_events);
|
|
2157 memset(cluster_ID,0,sizeof(int)*num_dense_events);
|
|
2158
|
|
2159
|
|
2160 /*********************************************** merge_grids ***********************************************/
|
|
2161 //int merge_grids(int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids, int *dense_grid_rank, int *dense_grid_reverse,
|
|
2162 // int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID,
|
|
2163 // int *cluster_ID, int *grid_ID, int *grid_clusterID)
|
|
2164
|
|
2165 num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID);
|
|
2166
|
|
2167 printf("computed number of groups is %d\n",num_clust);
|
|
2168
|
|
2169
|
|
2170 /************************************** release unnecessary memory and allocate memory and compute centers **********************************/
|
|
2171
|
|
2172
|
|
2173 for (i=0;i<file_Len;i++)
|
|
2174 free(position[i]);
|
|
2175 free(position);
|
|
2176
|
|
2177 for (i=0;i<num_dense_grids;i++)
|
|
2178 free(dense_grid_seq[i]);
|
|
2179 free(dense_grid_seq);
|
|
2180
|
|
2181 free(dense_grid_reverse);
|
|
2182
|
|
2183 free(densegridID_To_gridevent);
|
|
2184 free(all_grid_ID);
|
|
2185
|
|
2186 // cluster_center ////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
2187
|
|
2188 cluster_center=(double **)malloc(sizeof(double*)*num_clust);
|
|
2189 memset(cluster_center,0,sizeof(double*)*num_clust);
|
|
2190 for (i=0;i<num_clust;i++)
|
|
2191 {
|
|
2192 cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2193 memset(cluster_center[i],0,sizeof(double)*num_dm);
|
|
2194 }
|
|
2195
|
|
2196 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data
|
|
2197
|
|
2198 //printf("pass the first ID2center\n"); //commented on July 23, 2010
|
|
2199
|
|
2200 /*** population_ID and grid_populationID **/
|
|
2201
|
|
2202 cluster_populationID=(int*)malloc(sizeof(int)*num_clust);
|
|
2203 memset(cluster_populationID,0,sizeof(int)*num_clust);
|
|
2204
|
|
2205 grid_populationID=(int*)malloc(sizeof(int)*num_dense_grids);
|
|
2206 memset(grid_populationID,0,sizeof(int)*num_dense_grids);
|
|
2207
|
|
2208 population_ID=(int*)malloc(sizeof(int)*num_dense_events);
|
|
2209 memset(population_ID,0,sizeof(int)*num_dense_events);
|
|
2210
|
|
2211 num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID,max_num_pop);
|
|
2212
|
|
2213
|
|
2214
|
|
2215 for (i=0;i<num_clust;i++)
|
|
2216 free(cluster_center[i]);
|
|
2217 free(cluster_center);
|
|
2218
|
|
2219 free(interval);
|
|
2220
|
|
2221 for (i=0;i<num_dense_grids;i++)
|
|
2222 {
|
|
2223 grid_populationID[i]=cluster_populationID[grid_clusterID[i]];
|
|
2224 }
|
|
2225
|
|
2226 for (i=0;i<num_dense_events;i++)
|
|
2227 {
|
|
2228 population_ID[i]=cluster_populationID[cluster_ID[i]];
|
|
2229 }
|
|
2230
|
|
2231 printf("computed number of populations is %d\n",num_population);
|
|
2232
|
|
2233
|
|
2234
|
|
2235 // population_center /////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
2236
|
|
2237
|
|
2238 population_center=(double **)malloc(sizeof(double*)*num_population);
|
|
2239 memset(population_center,0,sizeof(double*)*num_population);
|
|
2240 for (i=0;i<num_population;i++)
|
|
2241 {
|
|
2242 population_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
2243 memset(population_center[i],0,sizeof(double)*num_dm);
|
|
2244 }
|
|
2245
|
|
2246
|
|
2247
|
|
2248 ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data
|
|
2249
|
|
2250
|
|
2251 // show ////////////////////////////////////////////////////////////////////////////////
|
|
2252 all_population_ID=(int*)malloc(sizeof(int)*file_Len);
|
|
2253 memset(all_population_ID,0,sizeof(int)*file_Len);
|
|
2254
|
|
2255 kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center);
|
|
2256 show(input_data, all_population_ID, file_Len, num_population, num_dm, para_name_string);
|
|
2257
|
|
2258 ID2Center_all(input_data,file_Len,num_dm,num_population,all_population_ID,population_center);
|
|
2259
|
|
2260
|
|
2261 f_cid=fopen("population_id.txt","w");
|
|
2262 f_ctr=fopen("population_center.txt","w");
|
|
2263 f_out=fopen("coordinates.txt","w");
|
|
2264 f_results=fopen("flock_results.txt","w");
|
|
2265
|
|
2266 /*
|
|
2267 f_parameters=fopen("parameters.txt","w");
|
|
2268 fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
|
|
2269 fprintf(f_parameters,"Density\t%f\n",aver_index);
|
|
2270 fclose(f_parameters);
|
|
2271 */
|
|
2272
|
|
2273 for (i=0;i<file_Len;i++)
|
|
2274 fprintf(f_cid,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
|
|
2275
|
|
2276 /*
|
|
2277 * New to check for min/max to add to parameters.txt
|
|
2278 *
|
|
2279 */
|
|
2280
|
|
2281 fprintf(f_out,"%s\n",para_name_string);
|
|
2282 //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string);
|
|
2283 fprintf(f_results,"%s\tPopulation\n",para_name_string);
|
|
2284 for (i=0;i<file_Len;i++)
|
|
2285 {
|
|
2286 for (j=0;j<num_dm;j++)
|
|
2287 {
|
|
2288 if (input_data[i][j] < min) {
|
|
2289 min = (int)input_data[i][j];
|
|
2290 }
|
|
2291 if (input_data[i][j] > max) {
|
|
2292 max = (int)input_data[i][j];
|
|
2293 }
|
|
2294 if (j==num_dm-1)
|
|
2295 {
|
|
2296 fprintf(f_out,"%d\n",(int)input_data[i][j]);
|
|
2297 fprintf(f_results,"%d\t",(int)input_data[i][j]);
|
|
2298 }
|
|
2299 else
|
|
2300 {
|
|
2301 fprintf(f_out,"%d\t",(int)input_data[i][j]);
|
|
2302 fprintf(f_results,"%d\t",(int)input_data[i][j]);
|
|
2303 }
|
|
2304 }
|
|
2305 //fprintf(f_results,"%d\t",i + 1);
|
|
2306 fprintf(f_results,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
|
|
2307 }
|
|
2308
|
|
2309 /*
|
|
2310 f_parameters=fopen("parameters.txt","w");
|
|
2311 fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
|
|
2312 fprintf(f_parameters,"Density\t%d\n",den_t_event);
|
|
2313 fprintf(f_parameters,"Min\t%d\n",min);
|
|
2314 fprintf(f_parameters,"Max\t%d\n",max);
|
|
2315 fclose(f_parameters);
|
|
2316 */
|
|
2317
|
|
2318 f_properties=fopen("fcs.properties","w");
|
|
2319 fprintf(f_properties,"Bins=%d\n",num_bin);
|
|
2320 fprintf(f_properties,"Density=%d\n",den_t_event);
|
|
2321 fprintf(f_properties,"Min=%d\n",min);
|
|
2322 fprintf(f_properties,"Max=%d\n",max);
|
|
2323 fprintf(f_properties,"Populations=%d\n",num_population);
|
|
2324 fprintf(f_properties,"Events=%d\n",file_Len);
|
|
2325 fprintf(f_properties,"Markers=%d\n",num_dm);
|
|
2326 fclose(f_properties);
|
|
2327
|
|
2328
|
|
2329 for (i=0;i<num_population;i++) {
|
|
2330 /* Add if we want to include population id in the output
|
|
2331 */
|
|
2332 fprintf(f_ctr,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
|
|
2333
|
|
2334 for (j=0;j<num_dm;j++) {
|
|
2335 if (j==num_dm-1)
|
|
2336 fprintf(f_ctr,"%.0f\n",population_center[i][j]);
|
|
2337 else
|
|
2338 fprintf(f_ctr,"%.0f\t",population_center[i][j]);
|
|
2339 }
|
|
2340 }
|
|
2341
|
|
2342 //added April 16, 2009
|
|
2343 f_mfi=fopen("MFI.txt","w");
|
|
2344
|
|
2345 for (i=0;i<num_population;i++)
|
|
2346 {
|
|
2347 fprintf(f_mfi,"%d\t",i+1);
|
|
2348
|
|
2349 for (j=0;j<num_dm;j++)
|
|
2350 {
|
|
2351 if (j==num_dm-1)
|
|
2352 fprintf(f_mfi,"%.0f\n",population_center[i][j]);
|
|
2353 else
|
|
2354 fprintf(f_mfi,"%.0f\t",population_center[i][j]);
|
|
2355 }
|
|
2356 }
|
|
2357 fclose(f_mfi);
|
|
2358
|
|
2359 //ended April 16, 2009
|
|
2360
|
|
2361 fclose(f_cid);
|
|
2362 fclose(f_ctr);
|
|
2363 fclose(f_out);
|
|
2364 fclose(f_results);
|
|
2365
|
|
2366
|
|
2367 for (i=0;i<num_population;i++)
|
|
2368 {
|
|
2369 free(population_center[i]);
|
|
2370 }
|
|
2371 free(population_center);
|
|
2372
|
|
2373
|
|
2374 for (i=0;i<file_Len;i++)
|
|
2375 free(normalized_data[i]);
|
|
2376 free(normalized_data);
|
|
2377
|
|
2378 free(grid_populationID);
|
|
2379
|
|
2380 free(cluster_populationID);
|
|
2381 free(grid_clusterID);
|
|
2382 free(cluster_ID);
|
|
2383
|
|
2384 for (i=0;i<file_Len;i++)
|
|
2385 free(input_data[i]);
|
|
2386 free(input_data);
|
|
2387
|
|
2388 free(grid_ID);
|
|
2389 free(population_ID);
|
|
2390 free(all_population_ID);
|
|
2391 free(eventID_To_denseventID);
|
|
2392
|
|
2393 ///////////////////////////////////////////////////////////
|
|
2394 printf("Ending time:\t\t\t\t");
|
|
2395 fflush(stdout);
|
|
2396 system("/bin/date");
|
|
2397
|
|
2398 return 0;
|
|
2399
|
|
2400 }
|