1
|
1 /////////////////////////////////////////////////////////
|
|
2 // Cent_adjust version number and modification history
|
|
3 // ImmPort BISC project
|
|
4 // Author: Yu "Max" Qian
|
|
5 // v1.01: Oct 16, 2009
|
|
6 // Line 899 of the main function:
|
|
7 // Changed kmean_term=1 to kmean_term=2
|
|
8 //////////////////////////////////////////////////////////
|
|
9
|
|
10
|
|
11 #include <time.h>
|
|
12 #include <stdio.h>
|
|
13 #include <stdlib.h>
|
|
14 #include <math.h>
|
|
15 #include <string.h>
|
|
16
|
|
17 #define DEBUG 0
|
|
18 #define LINE_LEN 1024
|
|
19 #define FILE_NAME_LEN 128
|
|
20 #define PARA_NAME_LEN 64
|
|
21 #define MAX_VALUE 1000000000
|
|
22 #define CUBE 0
|
|
23
|
|
24
|
|
25 void getctrfileinfo(FILE *f_src_ctr, long *num_clust)
|
|
26 {
|
|
27 int ch='\n';
|
|
28 int prev='\n';
|
|
29 long num_rows=0;
|
|
30
|
|
31 while ((ch = fgetc(f_src_ctr))!= EOF )
|
|
32 {
|
|
33 if (ch == '\n')
|
|
34 {
|
|
35 ++num_rows;
|
|
36 }
|
|
37 prev = ch;
|
|
38 }
|
|
39 if (prev!='\n')
|
|
40 ++num_rows;
|
|
41
|
|
42 *num_clust=num_rows;
|
|
43 //printf("center file has %ld rows\n", *num_clust);
|
|
44 }
|
|
45
|
|
46 /************************************* Read basic info of the source file **************************************/
|
|
47 void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID)
|
|
48 {
|
|
49 char src[LINE_LEN];
|
|
50 char current_name[64];
|
|
51 char prv;
|
|
52
|
|
53 long num_rows=0;
|
|
54 long num_columns=0;
|
|
55 int ch='\n';
|
|
56 int prev='\n';
|
|
57 long time_pos=0;
|
|
58 long i=0;
|
|
59 long j=0;
|
|
60
|
|
61
|
|
62
|
|
63 src[0]='\0';
|
|
64 fgets(src, LINE_LEN, f_src);
|
|
65
|
|
66 name_string[0]='\0';
|
|
67 current_name[0]='\0';
|
|
68 prv='\n';
|
|
69
|
|
70 while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters
|
|
71 i++;
|
|
72
|
|
73 while ((src[i]!='\r') && (src[i]!='\n')) //repeat until the end of the line
|
|
74 {
|
|
75 current_name[j]=src[i];
|
|
76
|
|
77 if ((src[i]=='\t') && (prv!='\t')) //a complete word
|
|
78 {
|
|
79 current_name[j]='\0';
|
|
80
|
|
81 /*
|
|
82 * Commented out John Campbell, June 10 2010
|
|
83 * We no longer want to automatically remove Time column.
|
|
84 * This column should have been removed by column selection
|
|
85 if (0!=strcmp(current_name,"Time"))
|
|
86 {
|
|
87 num_columns++; //num_columns does not inlcude the column of Time
|
|
88 time_pos++;
|
|
89 strcat(name_string,current_name);
|
|
90 strcat(name_string,"\t");
|
|
91 }
|
|
92 else
|
|
93 {
|
|
94 *time_ID=time_pos;
|
|
95 }
|
|
96 */
|
|
97
|
|
98 num_columns++;
|
|
99 strcat(name_string,current_name);
|
|
100 strcat(name_string,"\t");
|
|
101
|
|
102 current_name[0]='\0';
|
|
103 j=0;
|
|
104 }
|
|
105
|
|
106 if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
|
|
107 {
|
|
108 current_name[0]='\0';
|
|
109 j=0;
|
|
110 }
|
|
111
|
|
112 if (src[i]!='\t')
|
|
113 j++;
|
|
114
|
|
115
|
|
116 prv=src[i];
|
|
117 i++;
|
|
118 }
|
|
119
|
|
120 if (prv!='\t') //the last one hasn't been retrieved
|
|
121 {
|
|
122 current_name[j]='\0';
|
|
123 /*
|
|
124 * Commented out John Campbell, June 10 2010
|
|
125 * We no longer want to automatically remove Time column.
|
|
126 * This column should have been removed by column selection
|
|
127 if (0!=strcmp(current_name,"Time"))
|
|
128 {
|
|
129 num_columns++;
|
|
130 strcat(name_string,current_name);
|
|
131 time_pos++;
|
|
132 }
|
|
133 else
|
|
134 {
|
|
135 *time_ID=time_pos;
|
|
136 }
|
|
137 */
|
|
138
|
|
139 num_columns++;
|
|
140 strcat(name_string,current_name);
|
|
141 }
|
|
142
|
|
143 if (DEBUG==1)
|
|
144 {
|
|
145 printf("time_ID is %d\n",*time_ID);
|
|
146 printf("name_string is %s\n",name_string);
|
|
147 }
|
|
148
|
|
149 // # of rows
|
|
150
|
|
151 while ((ch = fgetc(f_src))!= EOF )
|
|
152 {
|
|
153 if (ch == '\n')
|
|
154 {
|
|
155 ++num_rows;
|
|
156 }
|
|
157 prev = ch;
|
|
158 }
|
|
159 if (prev!='\n')
|
|
160 ++num_rows;
|
|
161
|
|
162 *file_Len=num_rows;
|
|
163 *num_dm=num_columns;
|
|
164
|
|
165 //printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm);
|
|
166 }
|
|
167
|
|
168
|
|
169
|
|
170 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
171 /************************************* Read the source file into uncomp_data **************************************/
|
|
172 void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, int time_ID)
|
|
173 {
|
|
174 long time_pass=0; //to mark whether the time_ID has been passed
|
|
175 long index=0;
|
|
176
|
|
177 long i=0;
|
|
178 long j=0;
|
|
179 long t=0;
|
|
180
|
|
181 char src[LINE_LEN];
|
|
182 char xc[LINE_LEN/10];
|
|
183
|
|
184 src[0]='\0';
|
|
185 fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
|
|
186
|
|
187 while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
|
|
188 {
|
|
189 src[0]='\0';
|
|
190 fgets(src,LINE_LEN,f_src);
|
|
191 i=0;
|
|
192 time_pass=0;
|
|
193
|
|
194 if (time_ID==-1)
|
|
195 {
|
|
196 for (t=0;t<num_dm;t++) //there is no time_ID
|
|
197 {
|
|
198 xc[0]='\0';
|
|
199 j=0;
|
|
200 while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
|
|
201 {
|
|
202 xc[j]=src[i];
|
|
203 i++;
|
|
204 j++;
|
|
205 }
|
|
206
|
|
207 xc[j]='\0';
|
|
208 i++;
|
|
209
|
|
210 uncomp_data[index][t]=atof(xc);
|
|
211 }
|
|
212 }
|
|
213 else
|
|
214 {
|
|
215 for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
|
|
216 {
|
|
217 xc[0]='\0';
|
|
218 j=0;
|
|
219 while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
|
|
220 {
|
|
221 xc[j]=src[i];
|
|
222 i++;
|
|
223 j++;
|
|
224 }
|
|
225
|
|
226 xc[j]='\0';
|
|
227 i++;
|
|
228
|
|
229 if (t==time_ID)
|
|
230 {
|
|
231 time_pass=1;
|
|
232 continue;
|
|
233 }
|
|
234
|
|
235 if (time_pass)
|
|
236 uncomp_data[index][t-1]=atof(xc);
|
|
237 else
|
|
238 uncomp_data[index][t]=atof(xc);
|
|
239 }
|
|
240 }
|
|
241 index++;
|
|
242 //fprintf(fout_ID,"%s",src);
|
|
243 } //end of while
|
|
244
|
|
245 if (DEBUG == 1)
|
|
246 {
|
|
247 printf("the last line of the source data is:\n");
|
|
248 for (j=0;j<num_dm;j++)
|
|
249 printf("%f ",uncomp_data[index-1][j]);
|
|
250 printf("\n");
|
|
251 }
|
|
252 }
|
|
253
|
|
254 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
255 void readcenter(FILE *f_src_ctr, long num_clust, long num_dm, double **cluster_center, long *IDmapping)
|
|
256 {
|
|
257 char src[LINE_LEN];
|
|
258 char xc[LINE_LEN/10];
|
|
259
|
|
260 long i=0;
|
|
261 long j=0;
|
|
262 int m=0;
|
|
263 int t=0;
|
|
264
|
|
265 for (i=0;i<num_clust;i++)
|
|
266 {
|
|
267 src[0]='\0';
|
|
268 fgets(src,LINE_LEN, f_src_ctr);
|
|
269 m=0;
|
|
270 for (j=0;j<num_dm+1;j++)
|
|
271 {
|
|
272 xc[0]='\0';
|
|
273 t=0;
|
|
274 while ((src[m]!='\r') && (src[m]!='\n') && (src[m]!=' ') && (src[m]!='\t'))
|
|
275 {
|
|
276 xc[t]=src[m];
|
|
277 m++;
|
|
278 t++;
|
|
279 }
|
|
280 xc[t]='\0';
|
|
281 m++;
|
|
282 if (j==0)
|
|
283 IDmapping[i]=atoi(xc);
|
|
284 else
|
|
285 cluster_center[i][j-1]=atof(xc);
|
|
286 //printf("cluster_center[%d][%d]=%f\n",i,j,cluster_center[i][j]);
|
|
287 }
|
|
288 }
|
|
289 }
|
|
290
|
|
291
|
|
292 /**************************************** Normalization ******************************************/
|
|
293 void tran(double **orig_data, long clean_Len, long num_dm, long norm_used, double **matrix_to_cluster)
|
|
294 {
|
|
295 long i=0;
|
|
296 long j=0;
|
|
297
|
|
298 double biggest=0;
|
|
299 double smallest=MAX_VALUE;
|
|
300
|
|
301 double *aver; //average of each column
|
|
302 double *std; //standard deviation of each column
|
|
303
|
|
304 aver=(double*)malloc(sizeof(double)*clean_Len);
|
|
305 memset(aver,0,sizeof(double)*clean_Len);
|
|
306
|
|
307 std=(double*)malloc(sizeof(double)*clean_Len);
|
|
308 memset(std,0,sizeof(double)*clean_Len);
|
|
309
|
|
310 if (norm_used==2) //z-score normalization
|
|
311 {
|
|
312 for (j=0;j<num_dm;j++)
|
|
313 {
|
|
314 aver[j]=0;
|
|
315 for (i=0;i<clean_Len;i++)
|
|
316 aver[j]=aver[j]+orig_data[i][j];
|
|
317 aver[j]=aver[j]/(double)clean_Len;
|
|
318
|
|
319 std[j]=0;
|
|
320 for (i=0;i<clean_Len;i++)
|
|
321 std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]);
|
|
322 std[j]=sqrt(std[j]/(double)clean_Len);
|
|
323
|
|
324 for (i=0;i<clean_Len;i++)
|
|
325 matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization
|
|
326 }
|
|
327 }
|
|
328
|
|
329 if (norm_used==1) //0-1 min-max normalization
|
|
330 {
|
|
331 for (j=0;j<num_dm;j++)
|
|
332 {
|
|
333 biggest=0;
|
|
334 smallest=MAX_VALUE;
|
|
335 for (i=0;i<clean_Len;i++)
|
|
336 {
|
|
337 if (orig_data[i][j]>biggest)
|
|
338 biggest=orig_data[i][j];
|
|
339 if (orig_data[i][j]<smallest)
|
|
340 smallest=orig_data[i][j];
|
|
341 }
|
|
342
|
|
343 for (i=0;i<clean_Len;i++)
|
|
344 {
|
|
345 if (biggest==smallest)
|
|
346 matrix_to_cluster[i][j]=biggest;
|
|
347 else
|
|
348 matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
|
|
349 }
|
|
350 }
|
|
351 }
|
|
352
|
|
353 if (norm_used==0) //no normalization
|
|
354 {
|
|
355 for (i=0;i<clean_Len;i++)
|
|
356 for (j=0;j<num_dm;j++)
|
|
357 matrix_to_cluster[i][j]=orig_data[i][j];
|
|
358 }
|
|
359
|
|
360
|
|
361 }
|
|
362
|
|
363 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
364 void assign_event(double **Matrix, long k, long dist_used, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center, int random_init)
|
|
365 {
|
|
366
|
|
367 long i=0;
|
|
368
|
|
369 long j=0;
|
|
370 long t=0;
|
|
371 long random=0;
|
|
372 long random1=0;
|
|
373 long random2=0;
|
|
374 long times=0;
|
|
375 long times_allowed=0;
|
|
376
|
|
377 long *num; //num[i]=t means the ith cluster has t points
|
|
378
|
|
379 double vvv=1.0; // the biggest variation;
|
|
380 double distance=0.0;
|
|
381 double xv=0.0;
|
|
382 double variation=0.0;
|
|
383 double EPS=0.0;
|
|
384 double diff=0.0;
|
|
385 double mean_dx=0;
|
|
386 double mean_dy=0;
|
|
387 double sum_var=0;
|
|
388 double dx=0;
|
|
389 double dy=0;
|
|
390 double sd_x=0;
|
|
391 double sd_y=0;
|
|
392
|
|
393 double *temp_center;
|
|
394 double *shortest_distance;
|
|
395
|
|
396 double **sum;
|
|
397
|
|
398 temp_center = (double *)malloc(sizeof(double)*num_dm);
|
|
399 memset(temp_center,0,sizeof(double)*num_dm);
|
|
400
|
|
401 /* Choosing Centers */
|
|
402 if (random_init)
|
|
403 {
|
|
404 for (i=0;i<k;i++)
|
|
405 {
|
|
406 random1=rand()*rand();
|
|
407 //srand( (unsigned)time( NULL ) );
|
|
408 random2=abs((random1%5)+1);
|
|
409 for (t=0;t<random2;t++)
|
|
410 random2=random2*rand()+rand();
|
|
411
|
|
412 random=abs(random2%file_Len);
|
|
413 //printf("random=%d\n",random);
|
|
414 for (j=0;j<num_dm;j++)
|
|
415 center[i][j]=Matrix[random][j];
|
|
416
|
|
417 }
|
|
418 }
|
|
419
|
|
420 //printf("finish random selection\n");
|
|
421 /* To compute the nearest center for every point */
|
|
422
|
|
423 shortest_distance = (double *)malloc(sizeof(double)*file_Len);
|
|
424 memset(shortest_distance,0,sizeof(double)*file_Len);
|
|
425
|
|
426 num = (long *)malloc(sizeof(long)*k);
|
|
427 memset(num,0,sizeof(long)*k);
|
|
428
|
|
429 sum = (double **)malloc(sizeof(double*)*k);
|
|
430 memset(sum,0,sizeof(double*)*k);
|
|
431 for (i=0;i<k;i++)
|
|
432 {
|
|
433 sum[i] = (double *)malloc(sizeof(double)*num_dm);
|
|
434 memset(sum[i],0,sizeof(double)*num_dm);
|
|
435 }
|
|
436
|
|
437 for (i=0;i<k;i++)
|
|
438 for (j=0;j<num_dm;j++)
|
|
439 sum[i][j]=0.0; //sum[i][j] = k means the sum of the jth dimension of all points in the ith group is k
|
|
440
|
|
441 //printf("before recursion\n");
|
|
442 if (kmean_term>=1)
|
|
443 times_allowed = (long)kmean_term;
|
|
444 else
|
|
445 EPS = kmean_term;
|
|
446
|
|
447 times=0;
|
|
448
|
|
449 while (((vvv>EPS) && (kmean_term<1)) || ((times<times_allowed) && (kmean_term>=1)))
|
|
450 {
|
|
451 for (i=0;i<k;i++)
|
|
452 {
|
|
453 num[i]=0;
|
|
454 for (j=0;j<num_dm;j++)
|
|
455 sum[i][j]=0.0;
|
|
456 }
|
|
457
|
|
458 for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j]
|
|
459 {
|
|
460 shortest_distance[i]=MAX_VALUE;
|
|
461 for (j=0;j<k;j++) //for each center j
|
|
462 {
|
|
463
|
|
464 distance=0.0;
|
|
465
|
|
466 if (dist_used==0) //Euclidean distance
|
|
467 {
|
|
468 for (t=0;t<num_dm;t++) //for each dimension
|
|
469 {
|
|
470 diff=center[j][t]-Matrix[i][t];
|
|
471 if (diff<0)
|
|
472 diff=-diff;
|
|
473
|
|
474 if (CUBE)
|
|
475 distance = distance+(diff*diff*diff);
|
|
476 else
|
|
477 distance = distance+(diff*diff);
|
|
478 }
|
|
479 }
|
|
480 else //pearson correlation
|
|
481 {
|
|
482 mean_dx=0.0;
|
|
483 mean_dy=0.0;
|
|
484 sum_var=0.0;
|
|
485 dx=0.0;
|
|
486 dy=0.0;
|
|
487 sd_x=0.0;
|
|
488 sd_y=0.0;
|
|
489 for (t=0;t<num_dm;t++)
|
|
490 {
|
|
491 mean_dx+=center[j][t];
|
|
492 mean_dy+=Matrix[i][t];
|
|
493 }
|
|
494 mean_dx=mean_dx/(double)num_dm;
|
|
495 mean_dy=mean_dy/(double)num_dm;
|
|
496 //printf("mean_dx=%f\n",mean_dx);
|
|
497
|
|
498 for (t=0;t<num_dm;t++)
|
|
499 {
|
|
500 dx=center[j][t]-mean_dx;
|
|
501 dy=Matrix[i][t]-mean_dy;
|
|
502 sum_var+=dx*dy;
|
|
503 sd_x+=dx*dx;
|
|
504 sd_y+=dy*dy;
|
|
505 }
|
|
506 if (sqrt(sd_x*sd_y)==0)
|
|
507 distance = 1.0;
|
|
508 else
|
|
509 distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
|
|
510 //printf("distance=%f\n",distance);
|
|
511 } //pearson correlation ends
|
|
512
|
|
513
|
|
514 if (distance<shortest_distance[i])
|
|
515 {
|
|
516 shortest_distance[i]=distance;
|
|
517 shortest_id[i]=j;
|
|
518 }
|
|
519 }//end for j
|
|
520 num[shortest_id[i]]=num[shortest_id[i]]+1;
|
|
521 for (t=0;t<num_dm;t++)
|
|
522 sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];
|
|
523 }//end for i
|
|
524 /* recompute the centers */
|
|
525 //compute_mean(group);
|
|
526 vvv=0.0;
|
|
527 for (j=0;j<k;j++)
|
|
528 {
|
|
529 memcpy(temp_center,center[j],sizeof(double)*num_dm);
|
|
530 variation=0.0;
|
|
531 if (num[j]!=0)
|
|
532 {
|
|
533 for (t=0;t<num_dm;t++)
|
|
534 {
|
|
535 center[j][t]=sum[j][t]/(double)num[j];
|
|
536 xv=(temp_center[t]-center[j][t]);
|
|
537 variation=variation+xv*xv;
|
|
538 }
|
|
539 }
|
|
540
|
|
541 if (variation>vvv)
|
|
542 vvv=variation; //vvv is the biggest variation among the k clusters;
|
|
543 }
|
|
544 //compute_variation;
|
|
545 times++;
|
|
546 } //end for while
|
|
547
|
|
548
|
|
549
|
|
550 free(num);
|
|
551 for (i=0;i<k;i++)
|
|
552 free(sum[i]);
|
|
553 free(sum);
|
|
554 free(temp_center);
|
|
555 free(shortest_distance);
|
|
556
|
|
557 }
|
|
558
|
|
559 //////////////////////////////////////////////////////
|
|
560 /*************************** Show *****************************/
|
|
561 void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string, long *IDmapping)
|
|
562 {
|
|
563 int situ1=0;
|
|
564 int situ2=0;
|
|
565
|
|
566 long i=0;
|
|
567 long id=0;
|
|
568 long j=0;
|
|
569 long info_id=0;
|
|
570 long nearest_id=0;
|
|
571 long insert=0;
|
|
572 long temp=0;
|
|
573 long m=0;
|
|
574 long n=0;
|
|
575 long t=0;
|
|
576
|
|
577 long *size_c;
|
|
578
|
|
579
|
|
580
|
|
581 long **size_mybound_1;
|
|
582 long **size_mybound_2;
|
|
583 long **size_mybound_3;
|
|
584 long **size_mybound_0;
|
|
585
|
|
586 double interval=0.0;
|
|
587
|
|
588 double *big;
|
|
589 double *small;
|
|
590
|
|
591
|
|
592 double **center;
|
|
593 double **mybound;
|
|
594
|
|
595 long **prof; //prof[i][j]=1 means population i is + at parameter j
|
|
596
|
|
597 FILE *fpcnt_id; //proportion id
|
|
598 //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
|
|
599 FILE *fprof_id; //profile_id
|
|
600
|
|
601 big=(double *)malloc(sizeof(double)*num_dm);
|
|
602 memset(big,0,sizeof(double)*num_dm);
|
|
603
|
|
604 small=(double *)malloc(sizeof(double)*num_dm);
|
|
605 memset(small,0,sizeof(double)*num_dm);
|
|
606
|
|
607 for (i=0;i<num_dm;i++)
|
|
608 {
|
|
609 big[i]=0.0;
|
|
610 small[i]=(double)MAX_VALUE;
|
|
611 }
|
|
612
|
|
613
|
|
614 size_c=(long *)malloc(sizeof(long)*k);
|
|
615 memset(size_c,0,sizeof(long)*k);
|
|
616
|
|
617 center=(double**)malloc(sizeof(double*)*k);
|
|
618 memset(center,0,sizeof(double*)*k);
|
|
619 for (i=0;i<k;i++)
|
|
620 {
|
|
621 center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
622 memset(center[i],0,sizeof(double)*num_dm);
|
|
623 }
|
|
624
|
|
625 mybound=(double**)malloc(sizeof(double*)*num_dm);
|
|
626 memset(mybound,0,sizeof(double*)*num_dm);
|
|
627 for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
|
|
628 {
|
|
629 mybound[i]=(double*)malloc(sizeof(double)*3);
|
|
630 memset(mybound[i],0,sizeof(double)*3);
|
|
631 }
|
|
632
|
|
633 prof=(long **)malloc(sizeof(long*)*k);
|
|
634 memset(prof,0,sizeof(long*)*k);
|
|
635 for (i=0;i<k;i++)
|
|
636 {
|
|
637 prof[i]=(long *)malloc(sizeof(long)*num_dm);
|
|
638 memset(prof[i],0,sizeof(long)*num_dm);
|
|
639 }
|
|
640
|
|
641
|
|
642 for (i=0;i<file_Len;i++)
|
|
643 {
|
|
644 id=cluster_id[i];
|
|
645 for (j=0;j<num_dm;j++)
|
|
646 {
|
|
647 center[id][j]=center[id][j]+Matrix[i][j];
|
|
648 if (big[j]<Matrix[i][j])
|
|
649 big[j]=Matrix[i][j];
|
|
650 if (small[j]>Matrix[i][j])
|
|
651 small[j]=Matrix[i][j];
|
|
652 }
|
|
653
|
|
654 size_c[id]++;
|
|
655 }
|
|
656
|
|
657 for (i=0;i<k;i++)
|
|
658 for (j=0;j<num_dm;j++)
|
|
659 {
|
|
660 if (size_c[i]!=0)
|
|
661 center[i][j]=(center[i][j]/(double)(size_c[i]));
|
|
662 else
|
|
663 center[i][j]=0;
|
|
664 }
|
|
665
|
|
666 for (j=0;j<num_dm;j++)
|
|
667 {
|
|
668 interval=((big[j]-small[j])/4.0);
|
|
669 //printf("interval[%d] is %f\n",j,interval);
|
|
670 for (i=0;i<3;i++)
|
|
671 mybound[j][i]=small[j]+((double)(i+1)*interval);
|
|
672 }
|
|
673
|
|
674
|
|
675 size_mybound_0=(long **)malloc(sizeof(long*)*k);
|
|
676 memset(size_mybound_0,0,sizeof(long*)*k);
|
|
677
|
|
678 for (i=0;i<k;i++)
|
|
679 {
|
|
680 size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
681 memset(size_mybound_0[i],0,sizeof(long)*num_dm);
|
|
682 }
|
|
683
|
|
684 size_mybound_1=(long **)malloc(sizeof(long*)*k);
|
|
685 memset(size_mybound_1,0,sizeof(long*)*k);
|
|
686
|
|
687 for (i=0;i<k;i++)
|
|
688 {
|
|
689 size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
690 memset(size_mybound_1[i],0,sizeof(long)*num_dm);
|
|
691 }
|
|
692
|
|
693 size_mybound_2=(long **)malloc(sizeof(long*)*k);
|
|
694 memset(size_mybound_2,0,sizeof(long*)*k);
|
|
695
|
|
696 for (i=0;i<k;i++)
|
|
697 {
|
|
698 size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
699 memset(size_mybound_2[i],0,sizeof(long)*num_dm);
|
|
700 }
|
|
701
|
|
702 size_mybound_3=(long **)malloc(sizeof(long*)*k);
|
|
703 memset(size_mybound_3,0,sizeof(long*)*k);
|
|
704
|
|
705 for (i=0;i<k;i++)
|
|
706 {
|
|
707 size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm);
|
|
708 memset(size_mybound_3[i],0,sizeof(long)*num_dm);
|
|
709 }
|
|
710
|
|
711 for (i=0;i<file_Len;i++)
|
|
712 for (j=0;j<num_dm;j++)
|
|
713 {
|
|
714 if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
|
|
715 size_mybound_0[cluster_id[i]][j]++;
|
|
716 else
|
|
717 {
|
|
718 if (Matrix[i][j]<mybound[j][1])
|
|
719 size_mybound_1[cluster_id[i]][j]++;
|
|
720 else
|
|
721 {
|
|
722 if (Matrix[i][j]<mybound[j][2])
|
|
723 size_mybound_2[cluster_id[i]][j]++;
|
|
724 else
|
|
725 //if (Matrix[i][j]!=big[j]) //the biggest values excluded
|
|
726 size_mybound_3[cluster_id[i]][j]++;
|
|
727 }
|
|
728
|
|
729 }
|
|
730 }
|
|
731
|
|
732 fprof_id=fopen("profile.txt","w");
|
|
733 fprintf(fprof_id,"Population_ID\t");
|
|
734 fprintf(fprof_id,"%s\n",name_string);
|
|
735
|
|
736 for (i=0;i<k;i++)
|
|
737 {
|
|
738 fprintf(fprof_id,"%ld\t",IDmapping[i]);
|
|
739 for (j=0;j<num_dm;j++)
|
|
740 {
|
|
741
|
|
742 if (size_mybound_0[i][j]>size_mybound_1[i][j])
|
|
743 situ1=0;
|
|
744 else
|
|
745 situ1=1;
|
|
746 if (size_mybound_2[i][j]>size_mybound_3[i][j])
|
|
747 situ2=2;
|
|
748 else
|
|
749 situ2=3;
|
|
750
|
|
751 if ((situ1==0) && (situ2==2))
|
|
752 {
|
|
753 if (size_mybound_0[i][j]>size_mybound_2[i][j])
|
|
754 prof[i][j]=0;
|
|
755 else
|
|
756 prof[i][j]=2;
|
|
757 }
|
|
758 if ((situ1==0) && (situ2==3))
|
|
759 {
|
|
760 if (size_mybound_0[i][j]>size_mybound_3[i][j])
|
|
761 prof[i][j]=0;
|
|
762 else
|
|
763 prof[i][j]=3;
|
|
764 }
|
|
765 if ((situ1==1) && (situ2==2))
|
|
766 {
|
|
767 if (size_mybound_1[i][j]>size_mybound_2[i][j])
|
|
768 prof[i][j]=1;
|
|
769 else
|
|
770 prof[i][j]=2;
|
|
771 }
|
|
772 if ((situ1==1) && (situ2==3))
|
|
773 {
|
|
774 if (size_mybound_1[i][j]>size_mybound_3[i][j])
|
|
775 prof[i][j]=1;
|
|
776 else
|
|
777 prof[i][j]=3;
|
|
778 }
|
|
779
|
|
780 //begin to output profile
|
|
781 if (j==num_dm-1)
|
|
782 {
|
|
783 if (prof[i][j]==0)
|
|
784 fprintf(fprof_id,"1\n");
|
|
785 if (prof[i][j]==1)
|
|
786 fprintf(fprof_id,"2\n");
|
|
787 if (prof[i][j]==2)
|
|
788 fprintf(fprof_id,"3\n");
|
|
789 if (prof[i][j]==3)
|
|
790 fprintf(fprof_id,"4\n");
|
|
791 }
|
|
792 else
|
|
793 {
|
|
794 if (prof[i][j]==0)
|
|
795 fprintf(fprof_id,"1\t");
|
|
796 if (prof[i][j]==1)
|
|
797 fprintf(fprof_id,"2\t");
|
|
798 if (prof[i][j]==2)
|
|
799 fprintf(fprof_id,"3\t");
|
|
800 if (prof[i][j]==3)
|
|
801 fprintf(fprof_id,"4\t");
|
|
802 }
|
|
803 }
|
|
804 }
|
|
805 fclose(fprof_id);
|
|
806
|
|
807 ///////////////////////////////////////////////////////////
|
|
808
|
|
809
|
|
810 fpcnt_id=fopen("percentage.txt","w");
|
|
811 fprintf(fpcnt_id,"Population_ID\tPercentage\n");
|
|
812
|
|
813 for (t=0;t<k;t++)
|
|
814 {
|
|
815 fprintf(fpcnt_id,"%ld\t%.2f\n",IDmapping[t],(double)size_c[t]*100.0/(double)file_Len);
|
|
816 }
|
|
817 fclose(fpcnt_id);
|
|
818
|
|
819 free(big);
|
|
820 free(small);
|
|
821 free(size_c);
|
|
822
|
|
823 for (i=0;i<k;i++)
|
|
824 {
|
|
825 free(center[i]);
|
|
826 free(prof[i]);
|
|
827 free(size_mybound_0[i]);
|
|
828 free(size_mybound_1[i]);
|
|
829 free(size_mybound_2[i]);
|
|
830 free(size_mybound_3[i]);
|
|
831 }
|
|
832 free(center);
|
|
833 free(prof);
|
|
834 free(size_mybound_0);
|
|
835 free(size_mybound_1);
|
|
836 free(size_mybound_2);
|
|
837 free(size_mybound_3);
|
|
838
|
|
839 for (i=0;i<num_dm;i++)
|
|
840 free(mybound[i]);
|
|
841 free(mybound);
|
|
842
|
|
843 }
|
|
844 /******************************************************** Main Function **************************************************/
|
|
845 int main (int argc, char **argv)
|
|
846 {
|
|
847 //inputs
|
|
848 FILE *f_src; //source file pointer
|
|
849 FILE *f_src_ctr; //source center file
|
|
850 //outputs
|
|
851 FILE *f_cid; //cluster-id file pointer
|
|
852 FILE *f_mfi; //added April 16, 2009
|
|
853
|
|
854
|
|
855 char name_string[LINE_LEN]; //for name use
|
|
856
|
|
857 int time_id=-1;
|
|
858
|
|
859 long file_Len=0;
|
|
860 long num_clust=0;
|
|
861 long num_dm=0;
|
|
862 long norm_used=0;
|
|
863 long dist_used=0;
|
|
864 long i=0;
|
|
865 long j=0;
|
|
866
|
|
867 long *cluster_id;
|
|
868 long *IDmapping; //this is to keep the original populationID of the center.txt
|
|
869
|
|
870 double kmean_term=0;
|
|
871
|
|
872 double **cluster_center;
|
|
873 double **orig_data;
|
|
874 double **normalized_data;
|
|
875
|
|
876 /*
|
|
877 _strtime( tmpbuf );
|
|
878 printf( "Starting time:\t\t\t\t%s\n", tmpbuf );
|
|
879 _strdate( tmpbuf );
|
|
880 printf( "Starting date:\t\t\t\t%s\n", tmpbuf );
|
|
881 */
|
|
882
|
|
883 if (argc!=3)
|
|
884 {
|
|
885 printf("usage: cent_adjust input_center input_data_file\n");
|
|
886 exit(0);
|
|
887 }
|
|
888
|
|
889
|
|
890 f_src_ctr=fopen(argv[1],"r");
|
|
891
|
|
892 //read source data
|
|
893 f_src=fopen(argv[2],"r");
|
|
894
|
|
895 getfileinfo(f_src, &file_Len, &num_dm, name_string, &time_id); //get the filelength, number of dimensions, and num/name of parameters
|
|
896
|
|
897 rewind(f_src); //reset data file pointer
|
|
898
|
|
899 orig_data = (double **)malloc(sizeof(double*)*file_Len);
|
|
900 memset(orig_data,0,sizeof(double*)*file_Len);
|
|
901 for (i=0;i<file_Len;i++)
|
|
902 {
|
|
903 orig_data[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
904 memset(orig_data[i],0,sizeof(double)*num_dm);
|
|
905 }
|
|
906
|
|
907 readsource(f_src, file_Len, num_dm, orig_data, time_id); //read the data;
|
|
908
|
|
909 fclose(f_src);
|
|
910 /////////////////////////////////////////////////////////////////////////////
|
|
911 getctrfileinfo(f_src_ctr, &num_clust); //get how many populations
|
|
912 norm_used=0;
|
|
913 dist_used=0;
|
|
914 kmean_term=2; //modified on Oct 16, 2009: changed kmean_term=1 to kmean_term=2
|
|
915
|
|
916 rewind(f_src_ctr); //reset center file pointer
|
|
917
|
|
918 //read population center
|
|
919 cluster_center=(double **)malloc(sizeof(double*)*num_clust);
|
|
920 memset(cluster_center,0,sizeof(double*)*num_clust);
|
|
921 for (i=0;i<num_clust;i++)
|
|
922 {
|
|
923 cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
|
|
924 memset(cluster_center[i],0,sizeof(double)*num_dm);
|
|
925 }
|
|
926 for (i=0;i<num_clust;i++)
|
|
927 for (j=0;j<num_dm;j++)
|
|
928 cluster_center[i][j]=0;
|
|
929
|
|
930 IDmapping=(long *)malloc(sizeof(long)*num_clust);
|
|
931 memset(IDmapping,0,sizeof(long)*num_clust);
|
|
932
|
|
933 readcenter(f_src_ctr,num_clust,num_dm,cluster_center,IDmapping); //read population center
|
|
934 fclose(f_src_ctr);
|
|
935
|
|
936 /////////////////////////////////////////////////////////////////////////////
|
|
937 normalized_data=(double **)malloc(sizeof(double*)*file_Len);
|
|
938 memset(normalized_data,0,sizeof(double*)*file_Len);
|
|
939 for (i=0;i<file_Len;i++)
|
|
940 {
|
|
941 normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
|
|
942 memset(normalized_data[i],0,sizeof(double)*num_dm);
|
|
943 }
|
|
944
|
|
945 tran(orig_data, file_Len, num_dm, norm_used, normalized_data);
|
|
946 /************************************************* Compute number of clusters *************************************************/
|
|
947
|
|
948 cluster_id=(long*)malloc(sizeof(long)*file_Len);
|
|
949 memset(cluster_id,0,sizeof(long)*file_Len);
|
|
950
|
|
951 assign_event(normalized_data,num_clust,dist_used,kmean_term,file_Len,num_dm,cluster_id,cluster_center,0);
|
|
952
|
|
953
|
|
954 //show(orig_data,cluster_id,file_Len,num_clust,num_dm,show_data,num_disp,name_string);
|
|
955 show(orig_data, cluster_id, file_Len, num_clust, num_dm, name_string, IDmapping);
|
|
956
|
|
957 f_cid=fopen("population_id.txt","w");
|
|
958
|
|
959 for (i=0;i<file_Len;i++)
|
|
960 fprintf(f_cid,"%ld\n",IDmapping[cluster_id[i]]);
|
|
961
|
|
962
|
|
963 fclose(f_cid);
|
|
964
|
|
965 //added April 16, 2009
|
|
966 f_mfi=fopen("MFI.txt","w");
|
|
967
|
|
968 for (i=0;i<num_clust;i++)
|
|
969 {
|
|
970 fprintf(f_mfi,"%ld\t",IDmapping[i]);
|
|
971
|
|
972 for (j=0;j<num_dm;j++)
|
|
973 {
|
|
974 if (j==num_dm-1)
|
|
975 fprintf(f_mfi,"%.0f\n",cluster_center[i][j]);
|
|
976 else
|
|
977 fprintf(f_mfi,"%.0f\t",cluster_center[i][j]);
|
|
978 }
|
|
979 }
|
|
980 fclose(f_mfi);
|
|
981
|
|
982 //ended April 16, 2009
|
|
983
|
|
984 for (i=0;i<num_clust;i++)
|
|
985 free(cluster_center[i]);
|
|
986 free(cluster_center);
|
|
987
|
|
988
|
|
989 /********************************************** Release memory ******************************************/
|
|
990
|
|
991 for (i=0;i<file_Len;i++)
|
|
992 {
|
|
993 free(orig_data[i]);
|
|
994 free(normalized_data[i]);
|
|
995 }
|
|
996
|
|
997 free(orig_data);
|
|
998 free(normalized_data);
|
|
999 free(cluster_id);
|
|
1000 free(IDmapping);
|
|
1001
|
|
1002 /*
|
|
1003 _strtime( tmpbuf );
|
|
1004 printf( "Ending time:\t\t\t\t%s\n", tmpbuf );
|
|
1005 _strdate( tmpbuf );
|
|
1006 printf( "Ending date:\t\t\t\t%s\n", tmpbuf );
|
|
1007 */
|
|
1008
|
|
1009 }
|