Mercurial > repos > immport-devteam > cross_sample
view src/flock1.c @ 1:7eab80f86779 draft
add FLOCK
author | immport-devteam |
---|---|
date | Mon, 27 Feb 2017 13:26:09 -0500 |
parents | |
children |
line wrap: on
line source
/***************************************************************************** FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) Author: (Max) Yu Qian, Ph.D. Copyright: Scheuermann Lab, Dept. of Pathology, UTSW Development: November 2005 ~ Forever Algorithm Status: May 2007: Release 1.0 Usage: flock data_file Note: the input file format must be channel values and the delimiter between two values must be a tab. Changes made July 23, 2010: made errors to STDERR Changes made Nov 4, 2010: added one more error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR; MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got ******************************************************************************/ #include <time.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #include <string.h> #include <sys/stat.h> #include <unistd.h> #include <assert.h> #define DEBUG 0 #define LINE_LEN 1024 #define FILE_NAME_LEN 128 #define PARA_NAME_LEN 64 #define MAX_VALUE 1000000000 #define MIN_GRID 6 #define MAX_GRID 50 #define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max #define KMEANS_TERM 100 //#define MAX_NUM_POP 30 int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); /************* Read basic info of the source file ****************************/ void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID) { char src[LINE_LEN]; char current_name[64]; char prv; int num_rows=0; int num_columns=0; int ch='\n'; int prev='\n'; int time_pos=0; int i=0; int j=0; int sw=0; src[0]='\0'; fgets(src, LINE_LEN, f_src); if ((src[0]=='F') && (src[1]=='C') && (src[2]=='S')) { fprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\n"); abort(); } name_string[0]='\0'; current_name[0]='\0'; prv='\n'; // skip space and tab characters while ((src[i]==' ') || (src[i]=='\t')) i++; // repeat until the end of line is reached while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!='\r')) { current_name[j]=src[i]; if ((src[i]=='\t') && (prv!='\t')) //a complete word { current_name[j]='\0'; if (0!=strcmp(current_name,"Time")) { num_columns++; //num_columns does not inlcude the column of Time time_pos++; if (sw) { strcat(name_string,"\t"); } strcat(name_string,current_name); sw = 1; } else { *time_ID=time_pos; } current_name[0]='\0'; j=0; } if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space { current_name[0]='\0'; j=0; } if (src[i]!='\t') j++; prv=src[i]; i++; } if (prv!='\t') //the last one hasn't been retrieved { current_name[j]='\0'; if (0!=strcmp(current_name,"Time")) { num_columns++; strcat(name_string,"\t"); strcat(name_string,current_name); time_pos++; } else { *time_ID=time_pos; } } if (DEBUG==1) { printf("time_ID is %d\n",*time_ID); printf("name_string is %s\n",name_string); } //start computing # of rows while ((ch = fgetc(f_src))!= EOF ) { if (ch == '\n') { ++num_rows; } prev = ch; } if (prev!='\n') ++num_rows; //added on July 23, 2010 if (num_rows<50) { fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010 abort(); } *file_Len=num_rows; *num_dm=num_columns; printf("original file size is %d; number of dimensions is %d\n", *file_Len, *num_dm); } /************************************* Read the source file into uncomp_data **************************************/ void readsource(FILE *f_src, int file_Len, int num_dm, double **uncomp_data, int time_ID) { int time_pass=0; //to mark whether the time_ID has been passed int index=0; int i=0; int j=0; int t=0; char src[LINE_LEN]; char xc[LINE_LEN/10]; src[0]='\0'; fgets(src,LINE_LEN, f_src); //skip the first line about parameter names while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1 { src[0]='\0'; fgets(src,LINE_LEN,f_src); i=0; time_pass=0; if (time_ID==-1) { for (t=0;t<num_dm;t++) //there is no time_ID { xc[0]='\0'; j=0; while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) { xc[j]=src[i]; i++; j++; } xc[j]='\0'; i++; uncomp_data[index][t]=atof(xc); } } else { for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns { xc[0]='\0'; j=0; while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) { xc[j]=src[i]; i++; j++; } xc[j]='\0'; i++; if (t==time_ID) { time_pass=1; continue; } if (time_pass) uncomp_data[index][t-1]=atof(xc); else uncomp_data[index][t]=atof(xc); } } index++; //fprintf(fout_ID,"%s",src); } //end of while if (DEBUG == 1) { printf("the last line of the source data is:\n"); for (j=0;j<num_dm;j++) printf("%f ",uncomp_data[index-1][j]); printf("\n"); } } /**************************************** Normalization ******************************************/ void tran(double **orig_data, int file_Len, int num_dm, int norm_used, double **matrix_to_cluster) { int i=0; int j=0; double biggest=0; double smallest=MAX_VALUE; double *aver; //average of each column double *std; //standard deviation of each column aver=(double*)malloc(sizeof(double)*file_Len); memset(aver,0,sizeof(double)*file_Len); std=(double*)malloc(sizeof(double)*file_Len); memset(std,0,sizeof(double)*file_Len); if (norm_used==2) //z-score normalization { for (j=0;j<num_dm;j++) { aver[j]=0; for (i=0;i<file_Len;i++) aver[j]=aver[j]+orig_data[i][j]; aver[j]=aver[j]/(double)file_Len; std[j]=0; for (i=0;i<file_Len;i++) std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]); std[j]=sqrt(std[j]/(double)file_Len); for (i=0;i<file_Len;i++) matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization } } if (norm_used==1) //0-1 min-max normalization { for (j=0;j<num_dm;j++) { biggest=0; smallest=MAX_VALUE; for (i=0;i<file_Len;i++) { if (orig_data[i][j]>biggest) biggest=orig_data[i][j]; if (orig_data[i][j]<smallest) smallest=orig_data[i][j]; } for (i=0;i<file_Len;i++) { if (biggest==smallest) matrix_to_cluster[i][j]=biggest; else matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest); } } } if (norm_used==0) //no normalization { for (i=0;i<file_Len;i++) for (j=0;j<num_dm;j++) matrix_to_cluster[i][j]=orig_data[i][j]; } } /********************************************** RadixSort *******************************************/ /* Perform a radix sort using each dimension from the original data as a radix. * Outputs: * sorted_seq -- a permutation vector mapping the ordered list onto the original data. * (sorted_seq[i] -> index in the original data of the ith element of the ordered list) * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct * of the sorting procedure. * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed * by grid_ID) */ void radixsort_flock(int **position,int file_Len,int num_dm,int num_bin,int *sorted_seq,int *num_nonempty,int *grid_ID) { int i=0; int length=0; int start=0; int prev_ID=0; int curr_ID=0; int j=0; int t=0; int p=0; int loc=0; int temp=0; int equal=0; int *count; //count[i]=j means there are j numbers having value i at the processing digit int *index; //index[i]=j means the starting position of grid i is j int *cp; //current position int *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions) int *seq; //temporary sequence count=(int*)malloc(sizeof(int)*num_bin); memset(count,0,sizeof(int)*num_bin); cp=(int*)malloc(sizeof(int)*num_bin); memset(cp,0,sizeof(int)*num_bin); index=(int*)malloc(sizeof(int)*num_bin); // initialized below seq=(int*)malloc(sizeof(int)*file_Len); memset(seq,0,sizeof(int)*file_Len); mark=(int*)malloc(sizeof(int)*file_Len); memset(mark,0,sizeof(int)*file_Len); for (i=0;i<file_Len;i++) { sorted_seq[i]=i; mark[i]=0; seq[i]=0; } for (i=0;i<num_bin;i++) { index[i]=0; cp[i]=0; count[i]=0; } for (j=0;j<num_dm;j++) { if (j==0) //compute the initial values of mark { for (i=0;i<file_Len;i++) count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension index[0] = 0; for (i=0;i<num_bin-1;i++) { index[i+1]=index[i]+count[i]; //index[k]=x means k segment starts at x (in the ordered list) if ((index[i+1]>0) && (index[i+1]<=file_Len)) { mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list } else { printf("out of myboundary for mark at index[i+1]-1.\n"); } } mark[file_Len-1]=1; for (i=0;i<file_Len;i++) { /* Build a permutation vector for the partially ordered data. Store the PV in sorted_seq */ loc=position[i][j]; temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j sorted_seq[temp]=i; //sorted_seq[i]=temp is also another way to sort cp[loc]++; } } else { //reset count, index, loc, temp, cp, start, and length length=0; loc=0; temp=0; start=0; for (p=0;p<num_bin;p++) { cp[p]=0; count[p]=0; index[p]=0; } for (i=0;i<file_Len;i++) { int iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order. if (mark[i]!=1) { /* Count the number of items in each bin of dimension j, BUT we are going to reset at the end of each "part". Thus, the total effect is to do a sort by bin on the jth dimension for each group of data that has been identical for the dimensions processed up to this point. This is the standard radix sort procedure, but doing it this way saves us having to allocate buckets to hold the data in each group of "identical-so-far" elements. */ count[position[iperm][j]]++; //count[position[i][j]]++; length++; // This is the total length of the part, irrespective of the value of the jth component // (actually, less one, since we don't increment for the final element below) } if (mark[i]==1) { //length++; count[position[iperm][j]]++;//count[position[i][j]]++; //the current point must be counted in start=i-length; //this part starts from start to i: [start,i] /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part. This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively, plus we could multi-thread over the parts. (Hmmm...) */ index[0] = start; // Let index give the offset within the whole ordered list. for (t=0;t<num_bin-1;t++) { index[t+1]=index[t]+count[t]; if ((index[t+1]<=file_Len) && (index[t+1]>0)) { mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix. } } mark[i]=1; /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i the PV will be completely updated for the partial ordering up to the current radix. */ for (t=start;t<=i;t++) { loc=position[sorted_seq[t]][j];//loc=position[t][j]; temp=index[loc]+cp[loc]; if ((temp<file_Len) && (temp>=0)) { // seq is a temporary because we have to maintain the old PV until we have finished this step. seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort cp[loc]++; } else { printf("out of myboundary for seq at temp.\n"); } } for (t=start;t<=i;t++) { // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through // entry i in the ordered list. if ((t>=0) && (t<file_Len)) sorted_seq[t]=seq[t]; else printf("out of myboundary for seq and sorted_seq at t.\n"); } //reset count, index, seq, length, and cp length=0; loc=0; temp=0; for (p=0;p<num_bin;p++) { cp[p]=0; count[p]=0; index[p]=0; } } }//end for i }//end else }//end for j /* sorted_seq[] now contains the ordered list for all radices. mark[] gives the boundaries between groups of elements that are identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */ for (i=0;i<file_Len;i++) grid_ID[i]=0; //in case the initial value hasn't been assigned *num_nonempty=1; //starting from 1! /* assign the "grid" identifiers for all of the data. A grid will be what we were calling a "part" above. We will number them serially and tag the *unordered* data with the grid IDs. We will also count the number of populated grids (in general there will be many possible combinations of bin values that simply never occur) */ for (i=1;i<file_Len;i++) { equal=1; prev_ID=sorted_seq[i-1]; curr_ID=sorted_seq[i]; for (j=0;j<num_dm;j++) { if (position[prev_ID][j]!=position[curr_ID][j]) { equal=0; //not equal break; } } if (equal) { grid_ID[curr_ID]=grid_ID[prev_ID]; } else { *num_nonempty=*num_nonempty+1; grid_ID[curr_ID]=grid_ID[prev_ID]+1; } //all_grid_vol[grid_ID[curr_ID]]++; } //free memory free(count); free(index); free(cp); free(seq); free(mark); } /********************************************** Compute Position of Events ************************************************/ void compute_position(double **data_in, int file_Len, int num_dm, int num_bin, int **position, double *interval) { /* What we are really doing here is binning the data, with the bins spanning the range of the data and number of bins = num_bin */ int i=0; int j=0; double *small; //small[j] is the smallest value within dimension j double *big; //big[j] is the biggest value within dimension j small=(double*)malloc(sizeof(double)*num_dm); memset(small,0,sizeof(double)*num_dm); big=(double*)malloc(sizeof(double)*num_dm); memset(big,0,sizeof(double)*num_dm); for (j=0;j<num_dm;j++) { big[j]=MAX_VALUE*(-1); small[j]=MAX_VALUE; for (i=0;i<file_Len;i++) { if (data_in[i][j]>big[j]) big[j]=data_in[i][j]; if (data_in[i][j]<small[j]) small[j]=data_in[i][j]; } interval[j]=(big[j]-small[j])/(double)num_bin; //interval is computed using the biggest value and smallest value instead of the channel limit /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */ /* I don't think so: num_bin is the number of bins */ } for (j=0;j<num_dm;j++) { for (i=0;i<file_Len;i++) { if (data_in[i][j]>=big[j]) position[i][j]=num_bin-1; else { position[i][j]=(int)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j if ((position[i][j]>=num_bin) || (position[i][j]<0)) { //printf("position mis-computed in density analysis!\n"); //exit(0); fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010 abort(); } } } } free(small); free(big); } /********************************************** select_bin to select the number of bins **********************************/ //num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty); /* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we * maximize a merit function, then return the results obtained using the optimal number of bins. * * Outputs: * position -- binned data values * sorted_seq -- permutation vector mapping the ordered list to the original data * all_grid_ID -- grid to which each data element was assigned. * num_nonempty -- number of distinct values assumed by all_grid_ID * interval -- bin width for each data dimension * return value -- the number of bins selected. */ int select_bin(double **normalized_data, int file_Len, int num_dm, int min_grid, int max_grid, int **position, int *sorted_seq, int *all_grid_ID, int *num_nonempty, double *interval, int user_num_bin) { int num_bin=0; int select_num_bin=0; int m=0; int n=0; int i=0; int bin_scope=0; int temp_num_nonempty=0; int *temp_grid_ID; int *temp_sorted_seq; int **temp_position; //sorted_seq[i]=j means the event j ranks i double temp_index=0; double *bin_index; double *temp_interval; temp_grid_ID=(int *)malloc(sizeof(int)*file_Len); memset(temp_grid_ID,0,sizeof(int)*file_Len); temp_sorted_seq=(int *)malloc(sizeof(int)*file_Len); memset(temp_sorted_seq,0,sizeof(int)*file_Len); temp_position=(int **)malloc(sizeof(int*)*file_Len); memset(temp_position,0,sizeof(int*)*file_Len); for (m=0;m<file_Len;m++) { temp_position[m]=(int*)malloc(sizeof(int)*num_dm); memset(temp_position[m],0,sizeof(int)*num_dm); } temp_interval=(double*)malloc(sizeof(double)*num_dm); memset(temp_interval,0,sizeof(double)*num_dm); bin_scope=max_grid-min_grid+1; bin_index=(double *)malloc(sizeof(double)*bin_scope); memset(bin_index,0,sizeof(double)*bin_scope); i=0; for (num_bin=min_grid;num_bin<=max_grid;num_bin++) { /* compute_position bins the data into num_bin bins. Each dimension is binned independently. Outputs: temp_position[i][j] -- bin for the jth component of data element i. temp_interval[j] -- bin-width for the jth component */ compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval); radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID); /* our figure of merit is the number of non-empty grids divided by number of bins per dimension. We declare victory when we have found a local maximum */ bin_index[i]=((double)temp_num_nonempty)/((double)num_bin); if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95) break; if ((bin_index[i]<temp_index) && (user_num_bin==0)) break; if ((user_num_bin==num_bin-1) && (user_num_bin!=0)) break; /* Since we have accepted this trial bin, copy all the temporary results into the output buffers */ memcpy(all_grid_ID,temp_grid_ID,sizeof(int)*file_Len); memcpy(sorted_seq,temp_sorted_seq,sizeof(int)*file_Len); memcpy(interval,temp_interval,sizeof(double)*num_dm); for (m=0;m<file_Len;m++) for (n=0;n<num_dm;n++) position[m][n]=temp_position[m][n]; temp_index=bin_index[i]; select_num_bin=num_bin; num_nonempty[0]=temp_num_nonempty; i++; } if ((select_num_bin<min_grid) || (select_num_bin>max_grid)) { fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 4, 2010 exit(0); } if (temp_index==0) { fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 abort(); } free(temp_grid_ID); free(temp_sorted_seq); free(bin_index); free(temp_interval); for (m=0;m<file_Len;m++) free(temp_position[m]); free(temp_position); return select_num_bin; } /************************************* Select dense grids **********************************/ // compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol // den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse); /* * Prune away grids that are insufficiently "dense" (i.e., contain too few data items) * * Outputs: * num_dense_grids -- number of dense grids * num_dense_events -- total number of data items in all dense grids * dense_grid_reverse -- mapping from list of all grids to list of dense grids. * return value -- density cutoff for separating dense from non-dense grids. */ int select_dense(int file_Len, int *all_grid_ID, int num_nonempty, int *num_dense_grids, int *num_dense_events, int *dense_grid_reverse, int den_t_event) { int i=0; int vol_ID=0; int biggest_size=0; //biggest grid_size, to define grid_size_index int biggest_index=0; //int actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid //int num_remain=0; //number of remaining grids with different density thresholds int temp_num_dense_grids=0; int temp_num_dense_events=0; int *grid_size_index; int *all_grid_vol; int *grid_density_index; //double den_average=0; // double avr_index=0; ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Compute all_grid_vol ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// all_grid_vol=(int *)malloc(sizeof(int)*num_nonempty); memset(all_grid_vol,0,sizeof(int)*num_nonempty); /* Grid "volume" is just the number of data contained in the grid. */ for (i=0;i<file_Len;i++) { vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]]; all_grid_vol[vol_ID]++; //all_grid_vol[i]=j means grid i has j points } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Compute grid_size_index (histogram of grid sizes) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// for (i=0;i<num_nonempty;i++) { if (biggest_size<all_grid_vol[i]) { biggest_size=all_grid_vol[i]; } } //added on July 23, 2010 if (biggest_size<3) { fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 abort(); } grid_size_index=(int*)malloc(sizeof(int)*biggest_size); memset(grid_size_index,0,sizeof(int)*biggest_size); for (i=0;i<num_nonempty;i++) { grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1 } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Compute den_cutoff //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// grid_density_index=(int *)malloc(sizeof(int)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3 memset(grid_density_index,0,sizeof(int)*(biggest_size-2)); if (den_t_event==0) //user doesn't define the density threshold { biggest_index=0; for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined { grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]); if (biggest_index<grid_density_index[i-1]) { biggest_index=grid_density_index[i-1]; den_t_event=i+1; } } } if (den_t_event==0) //if something is wrong den_t_event=3; for (i=0;i<num_nonempty;i++) if (all_grid_vol[i]>=den_t_event) temp_num_dense_grids++; if (temp_num_dense_grids<=1) { //modified on July 23, 2010 //printf("a too high density threshold is set! Please decrease your density threshold.\n"); //exit(0); fprintf(stderr,"a too high density threshold is set! Please decrease your density threshold.\n"); //modified on July 23, 2010 abort(); } if (temp_num_dense_grids>=100000) { //modified on July 23, 2010 //printf("a too low density threshold is set! Please increase your density threshold.\n"); //exit(0); fprintf(stderr,"a too low density threshold is set! Please increase your density threshold.\n"); //modified on July 23, 2010 abort(); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Compute dense_grid_reverse //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// temp_num_dense_grids=0; temp_num_dense_events=0; for (i=0;i<num_nonempty;i++) { dense_grid_reverse[i]=-1; if (all_grid_vol[i]>=den_t_event) { dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids. temp_num_dense_grids++; temp_num_dense_events+=all_grid_vol[i]; } } num_dense_grids[0]=temp_num_dense_grids; num_dense_events[0]=temp_num_dense_events; free(grid_size_index); free(all_grid_vol); return den_t_event; } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Compute densegridID_To_gridevent and eventID_To_denseventID ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); /* * Filter the data so that only the data belonging to dense grids is left * * Output: * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids. * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids. * */ void grid_To_event(int file_Len, int *dense_grid_reverse, int *all_grid_ID, int *eventID_To_denseventID, int *densegridID_To_gridevent) { int i=0; int dense_grid_ID=0; int dense_event_ID=0; for (i=0;i<file_Len;i++) { dense_grid_ID=dense_grid_reverse[all_grid_ID[i]]; eventID_To_denseventID[i]=-1; if (dense_grid_ID!=-1) //for point (i) belonging to dense grids { eventID_To_denseventID[i]=dense_event_ID; dense_event_ID++; if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point } } } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Compute dense_grid_seq ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq); /* Construct a table of binned data values for each dense grid. * * Output: * * dense_grid_seq -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values) */ void generate_grid_seq(int num_dm, int num_dense_grids, int *densegridID_To_gridevent, int **position, int **dense_grid_seq) { int i=0; int j=0; int ReventID=0; //representative event ID of the dense grid for (i=0;i<num_dense_grids;i++) { ReventID = densegridID_To_gridevent[i]; for (j=0;j<num_dm;j++) dense_grid_seq[i][j]=position[ReventID][j]; } } //compare two vectors int compare_value(int num_dm, int *search_value, int *seq_value) { int i=0; for (i=0;i<num_dm;i++) { if (search_value[i]<seq_value[i]) return 1; if (search_value[i]>seq_value[i]) return -1; if (search_value[i]==seq_value[i]) continue; } return 0; } //binary search the dense_grid_seq to return the dense grid ID if it exists int binary_search(int num_dense_grids, int num_dm, int *search_value, int **dense_grid_seq) { int low=0; int high=0; int mid=0; int comp_result=0; int match=0; //int found=0; low = 0; high = num_dense_grids-1; while (low <= high) { mid = (int)((low + high)/2); comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]); switch(comp_result) { case 1: high=mid-1; break; case -1: low=mid+1; break; case 0: match=1; break; } if (match==1) break; } if (match==1) { return mid; } else return -1; } /********************************************** Computing Centers Using IDs **********************************************/ void ID2Center(double **data_in, int file_Len, int num_dm, int *eventID_To_denseventID, int num_clust, int *cluster_ID, double **population_center) { int i=0; int j=0; int ID=0; int eventID=0; int *size_c; size_c=(int *)malloc(sizeof(int)*num_clust); memset(size_c,0,sizeof(int)*num_clust); for (i=0;i<num_clust;i++) for (j=0;j<num_dm;j++) population_center[i][j]=0; for (i=0;i<file_Len;i++) { eventID=eventID_To_denseventID[i]; if (eventID!=-1) //only events in dense grids count { ID=cluster_ID[eventID]; if (ID==-1) { //modified on July 23, 2010 //printf("ID==-1! in ID2Center\n"); //exit(0); fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010 abort(); } for (j=0;j<num_dm;j++) population_center[ID][j]=population_center[ID][j]+data_in[i][j]; size_c[ID]++; } } for (i=0;i<num_clust;i++) { for (j=0;j<num_dm;j++) if (size_c[i]!=0) population_center[i][j]=(population_center[i][j]/(double)(size_c[i])); else population_center[i][j]=0; } free(size_c); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Compute Population Center with all events /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void ID2Center_all(double **data_in, int file_Len, int num_dm, int num_clust, int *cluster_ID, double **population_center) { int i=0; int j=0; int ID=0; int *size_c; size_c=(int *)malloc(sizeof(int)*num_clust); memset(size_c,0,sizeof(int)*num_clust); for (i=0;i<num_clust;i++) for (j=0;j<num_dm;j++) population_center[i][j]=0; for (i=0;i<file_Len;i++) { ID=cluster_ID[i]; if (ID==-1) { //commented on July 23, 2010 //printf("ID==-1! in ID2Center_all\n"); //exit(0); fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010 abort(); } for (j=0;j<num_dm;j++) population_center[ID][j]=population_center[ID][j]+data_in[i][j]; size_c[ID]++; } for (i=0;i<num_clust;i++) { for (j=0;j<num_dm;j++) if (size_c[i]!=0) population_center[i][j]=(population_center[i][j]/(double)(size_c[i])); else population_center[i][j]=0; } free(size_c); } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Merge neighboring grids to clusters ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// int merge_grids(double **normalized_data, double *interval, int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids, int *dense_grid_reverse, int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID, int *cluster_ID, int *grid_ID, int *grid_clusterID) { int i=0; int j=0; int t=0; int p=0; int num_clust=0; int ReventID=0; int denseID=0; int neighbor_ID=0; //int temp_grid=0; int *grid_value; int *search_value; int **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node double real_dist=0; double **norm_grid_center; norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids); memset(norm_grid_center,0,sizeof(double*)*num_dense_grids); for (i=0;i<num_dense_grids;i++) { norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm); memset(norm_grid_center[i],0,sizeof(double)*num_dm); } for (i=0;i<file_Len;i++) { denseID=eventID_To_denseventID[i]; if (denseID!=-1) //only dense events can enter { grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]]; if (grid_ID[denseID]==-1) { fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n"); //modified on July 23, 2010 abort(); } } } /* Find centroid (in the normalized data) for each dense grid */ ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center); //printf("pass the grid ID2 center\n"); //commmented on July 23, 2010 graph_of_grid=(int **)malloc(sizeof(int*)*num_dense_grids); memset(graph_of_grid,0,sizeof(int*)*num_dense_grids); for (i=0;i<num_dense_grids;i++) { graph_of_grid[i]=(int *)malloc(sizeof(int)*num_dm); memset(graph_of_grid[i],0,sizeof(int)*num_dm); for (j=0;j<num_dm;j++) graph_of_grid[i][j]=-1; } grid_value=(int *)malloc(sizeof(int)*num_dm); memset(grid_value,0,sizeof(int)*num_dm); search_value=(int *)malloc(sizeof(int)*num_dm); memset(search_value,0,sizeof(int)*num_dm); for (i=0;i<num_dense_grids;i++) { ReventID=densegridID_To_gridevent[i]; for (j=0;j<num_dm;j++) { grid_value[j]=position[ReventID][j]; } /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in the chosen dimension. If the neighbor's centroid is not too far away, add it to this grid's neighbor list. */ for (t=0;t<num_dm;t++) { for (p=0;p<num_dm;p++) search_value[p]=grid_value[p]; if (grid_value[t]==num_bin-1) continue; search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq); if (neighbor_ID!=-1) { real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t]; if (real_dist<0) real_dist=-real_dist; if (real_dist<2*interval[t]) graph_of_grid[i][t]=neighbor_ID; } } grid_clusterID[i]=i; //initialize grid_clusterID } //graph constructed //DFS-based search begins /* Use a depth-first search to construct a list of connected subgraphs (= "clusters"). Note our graph as we have constructed it is a DAG, so we can use that to our advantage in our search. */ // num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID); num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID); //computes grid_ID and cluster_ID for (i=0;i<file_Len;i++) { denseID=eventID_To_denseventID[i]; if (denseID!=-1) //only dense events can enter { cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]]; //if (cluster_ID[denseID]==-1) // printf("catch you!\n"); } } free(search_value); free(grid_value); for (i=0;i<num_dense_grids;i++) { free(graph_of_grid[i]); free(norm_grid_center[i]); } free(graph_of_grid); free(norm_grid_center); return num_clust; } /********************************************* Merge Clusters to Populations *******************************************/ // This is the function future work can be on because it is about how to cluster the about 500 points accurately int merge_clusters(int num_clust, int num_dm, double *interval, double **cluster_center, int *cluster_populationID, int max_num_pop) { int num_population=0; int temp_num_population=0; int i=0; int j=0; int t=0; int merge=0; int smid=0; int bgid=0; double merge_dist=1.1; //initial value of merge_dist*interval should be slightly larger than the bin width int *map_ID; double diff=0; map_ID=(int*)malloc(sizeof(int)*num_clust); memset(map_ID,0,sizeof(int)*num_clust); while ((num_population>max_num_pop) || (num_population<=1)) { if (num_population<=1) merge_dist=merge_dist-0.1; if (num_population>max_num_pop) merge_dist=merge_dist+0.1; for (i=0;i<num_clust;i++) cluster_populationID[i]=i; for (i=0;i<num_clust-1;i++) { for (j=i+1;j<num_clust;j++) { merge=1; for (t=0;t<num_dm;t++) { diff=cluster_center[i][t]-cluster_center[j][t]; if (diff<0) diff=-diff; if (diff>(merge_dist*interval[t])) merge=0; } if ((merge) && (cluster_populationID[i]!=cluster_populationID[j])) { if (cluster_populationID[i]<cluster_populationID[j]) //they could not be equal { smid = cluster_populationID[i]; bgid = cluster_populationID[j]; } else { smid = cluster_populationID[j]; bgid = cluster_populationID[i]; } for (t=0;t<num_clust;t++) { if (cluster_populationID[t]==bgid) cluster_populationID[t]=smid; } } } } for (i=0;i<num_clust;i++) map_ID[i]=-1; for (i=0;i<num_clust;i++) map_ID[cluster_populationID[i]]=1; num_population=0; for (i=0;i<num_clust;i++) { if (map_ID[i]==1) { map_ID[i]=num_population; num_population++; } } if ((temp_num_population>max_num_pop) && (num_population==1)) break; else temp_num_population=num_population; if (num_clust<=1) break; } for (i=0;i<num_clust;i++) cluster_populationID[i]=map_ID[cluster_populationID[i]]; free(map_ID); return num_population; } /////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// double kmeans(double **Matrix, int k, double kmean_term, int file_Len, int num_dm, int *shortest_id, double **center) { int i=0; int j=0; int t=0; int random=0; int random1=0; int random2=0; int times=0; int dist_used=0; //0 is Euclidean and 1 is Pearson int random_init=0; //0: not use random seeds; int real_Len=0; int skipped=0; int *num; //num[i]=t means the ith cluster has t points double vvv=1.0; // the biggest variation; double distance=0.0; double xv=0.0; double variation=0.0; double mean_dx=0; double mean_dy=0; double sum_var=0; double dx=0; double dy=0; double sd_x=0; double sd_y=0; double diff=0; double distortion=0; double shortest_distance; double *temp_center; double **sum; temp_center = (double *)malloc(sizeof(double)*num_dm); memset(temp_center,0,sizeof(double)*num_dm); if (random_init) { for (i=0;i<k;i++) { random1=rand()*rand(); random2=abs((random1%5)+1); for (t=0;t<random2;t++) random2=random2*rand()+rand(); random=abs(random2%file_Len); for (j=0;j<num_dm;j++) center[i][j]=Matrix[random][j]; } } num = (int *)malloc(sizeof(int)*k); memset(num,0,sizeof(int)*k); sum = (double **)malloc(sizeof(double*)*k); memset(sum,0,sizeof(double*)*k); for (i=0;i<k;i++) { sum[i] = (double *)malloc(sizeof(double)*num_dm); memset(sum[i],0,sizeof(double)*num_dm); } times=0; real_Len=0; while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1))) { for (i=0;i<k;i++) { num[i]=0; for (j=0;j<num_dm;j++) sum[i][j]=0.0; } for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j] { skipped = 0; shortest_distance=MAX_VALUE; for (j=0;j<k;j++) //for each center j { distance=0.0; if (dist_used==0) //Euclidean distance { for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions { diff=center[j][t]-Matrix[i][t]; diff=diff*diff; distance = distance+diff; //here we have a weight for each dimension } } else //pearson correlation { mean_dx=0.0; mean_dy=0.0; sum_var=0.0; dx=0.0; dy=0.0; sd_x=0.0; sd_y=0.0; for (t=0;t<num_dm;t++) { mean_dx+=center[j][t]; mean_dy+=Matrix[i][t]; } mean_dx=mean_dx/(double)num_dm; mean_dy=mean_dy/(double)num_dm; for (t=0;t<num_dm;t++) { dx=center[j][t]-mean_dx; dy=Matrix[i][t]-mean_dy; sum_var+=dx*dy; sd_x+=dx*dx; sd_y+=dy*dy; } if (sqrt(sd_x*sd_y)==0) distance = 1.0; else distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2; //printf("distance=%f\n",distance); } //pearson correlation ends if ((distance<shortest_distance) && (skipped == 0)) { shortest_distance=distance; shortest_id[i]=j; } }//end for j real_Len++; num[shortest_id[i]]=num[shortest_id[i]]+1; for (t=0;t<num_dm;t++) sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t]; }//end for i /* recompute the centers */ //compute_mean(group); vvv=0.0; for (j=0;j<k;j++) { memcpy(temp_center,center[j],sizeof(double)*num_dm); variation=0.0; if (num[j]!=0) { for (t=0;t<num_dm;t++) { center[j][t]=sum[j][t]/(double)num[j]; xv=(temp_center[t]-center[j][t]); variation=variation+xv*xv; } } if (variation>vvv) vvv=variation; //vvv is the biggest variation among the k clusters; } //compute_variation; times++; } //end for while free(num); for (i=0;i<k;i++) free(sum[i]); free(sum); free(temp_center); return distortion; } ////////////////////////////////////////////////////// /*************************** Show *****************************/ void show(double **Matrix, int *cluster_id, int file_Len, int k, int num_dm, char *name_string) { int situ1=0; int situ2=0; int i=0; int id=0; int j=0; int t=0; int *size_c; int **size_mybound_1; int **size_mybound_2; int **size_mybound_3; int **size_mybound_0; double interval=0.0; double *big; double *small; double **center; double **mybound; int **prof; //prof[i][j]=1 means population i is + at parameter j FILE *fpcnt_id; //proportion id //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data FILE *fprof_id; //profile_id big=(double *)malloc(sizeof(double)*num_dm); memset(big,0,sizeof(double)*num_dm); small=(double *)malloc(sizeof(double)*num_dm); memset(small,0,sizeof(double)*num_dm); for (i=0;i<num_dm;i++) { big[i]=0.0; small[i]=(double)MAX_VALUE; } size_c=(int *)malloc(sizeof(int)*k); memset(size_c,0,sizeof(int)*k); center=(double**)malloc(sizeof(double*)*k); memset(center,0,sizeof(double*)*k); for (i=0;i<k;i++) { center[i]=(double*)malloc(sizeof(double)*num_dm); memset(center[i],0,sizeof(double)*num_dm); } mybound=(double**)malloc(sizeof(double*)*num_dm); memset(mybound,0,sizeof(double*)*num_dm); for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories { mybound[i]=(double*)malloc(sizeof(double)*3); memset(mybound[i],0,sizeof(double)*3); } prof=(int **)malloc(sizeof(int*)*k); memset(prof,0,sizeof(int*)*k); for (i=0;i<k;i++) { prof[i]=(int *)malloc(sizeof(int)*num_dm); memset(prof[i],0,sizeof(int)*num_dm); } for (i=0;i<file_Len;i++) { id=cluster_id[i]; for (j=0;j<num_dm;j++) { center[id][j]=center[id][j]+Matrix[i][j]; if (big[j]<Matrix[i][j]) big[j]=Matrix[i][j]; if (small[j]>Matrix[i][j]) small[j]=Matrix[i][j]; } size_c[id]++; } for (i=0;i<k;i++) for (j=0;j<num_dm;j++) { if (size_c[i]!=0) center[i][j]=(center[i][j]/(double)(size_c[i])); else center[i][j]=0; } for (j=0;j<num_dm;j++) { interval=((big[j]-small[j])/4.0); //printf("interval[%d] is %f\n",j,interval); for (i=0;i<3;i++) mybound[j][i]=small[j]+((double)(i+1)*interval); } size_mybound_0=(int **)malloc(sizeof(int*)*k); memset(size_mybound_0,0,sizeof(int*)*k); for (i=0;i<k;i++) { size_mybound_0[i]=(int*)malloc(sizeof(int)*num_dm); memset(size_mybound_0[i],0,sizeof(int)*num_dm); } size_mybound_1=(int **)malloc(sizeof(int*)*k); memset(size_mybound_1,0,sizeof(int*)*k); for (i=0;i<k;i++) { size_mybound_1[i]=(int*)malloc(sizeof(int)*num_dm); memset(size_mybound_1[i],0,sizeof(int)*num_dm); } size_mybound_2=(int **)malloc(sizeof(int*)*k); memset(size_mybound_2,0,sizeof(int*)*k); for (i=0;i<k;i++) { size_mybound_2[i]=(int*)malloc(sizeof(int)*num_dm); memset(size_mybound_2[i],0,sizeof(int)*num_dm); } size_mybound_3=(int **)malloc(sizeof(int*)*k); memset(size_mybound_3,0,sizeof(int*)*k); for (i=0;i<k;i++) { size_mybound_3[i]=(int*)malloc(sizeof(int)*num_dm); memset(size_mybound_3[i],0,sizeof(int)*num_dm); } for (i=0;i<file_Len;i++) for (j=0;j<num_dm;j++) { if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded size_mybound_0[cluster_id[i]][j]++; else { if (Matrix[i][j]<mybound[j][1]) size_mybound_1[cluster_id[i]][j]++; else { if (Matrix[i][j]<mybound[j][2]) size_mybound_2[cluster_id[i]][j]++; else //if (Matrix[i][j]!=big[j]) //the biggest values excluded size_mybound_3[cluster_id[i]][j]++; } } } fprof_id=fopen("profile.txt","w"); fprintf(fprof_id,"Population_ID\t"); fprintf(fprof_id,"%s\n",name_string); for (i=0;i<k;i++) { fprintf(fprof_id,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009 for (j=0;j<num_dm;j++) { if (size_mybound_0[i][j]>size_mybound_1[i][j]) situ1=0; else situ1=1; if (size_mybound_2[i][j]>size_mybound_3[i][j]) situ2=2; else situ2=3; if ((situ1==0) && (situ2==2)) { if (size_mybound_0[i][j]>size_mybound_2[i][j]) prof[i][j]=0; else prof[i][j]=2; } if ((situ1==0) && (situ2==3)) { if (size_mybound_0[i][j]>size_mybound_3[i][j]) prof[i][j]=0; else prof[i][j]=3; } if ((situ1==1) && (situ2==2)) { if (size_mybound_1[i][j]>size_mybound_2[i][j]) prof[i][j]=1; else prof[i][j]=2; } if ((situ1==1) && (situ2==3)) { if (size_mybound_1[i][j]>size_mybound_3[i][j]) prof[i][j]=1; else prof[i][j]=3; } //begin to output profile if (j==num_dm-1) { if (prof[i][j]==0) fprintf(fprof_id,"1\n"); if (prof[i][j]==1) fprintf(fprof_id,"2\n"); if (prof[i][j]==2) fprintf(fprof_id,"3\n"); if (prof[i][j]==3) fprintf(fprof_id,"4\n"); } else { if (prof[i][j]==0) fprintf(fprof_id,"1\t"); if (prof[i][j]==1) fprintf(fprof_id,"2\t"); if (prof[i][j]==2) fprintf(fprof_id,"3\t"); if (prof[i][j]==3) fprintf(fprof_id,"4\t"); } } } fclose(fprof_id); /////////////////////////////////////////////////////////// fpcnt_id=fopen("percentage.txt","w"); fprintf(fpcnt_id,"Population_ID\tPercentage\n"); for (t=0;t<k;t++) { fprintf(fpcnt_id,"%d\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len); //t changed to t+1 to start from 1 instead of 0: April 16, 2009 } fclose(fpcnt_id); free(big); free(small); free(size_c); for (i=0;i<k;i++) { free(center[i]); free(prof[i]); free(size_mybound_0[i]); free(size_mybound_1[i]); free(size_mybound_2[i]); free(size_mybound_3[i]); } free(center); free(prof); free(size_mybound_0); free(size_mybound_1); free(size_mybound_2); free(size_mybound_3); for (i=0;i<num_dm;i++) free(mybound[i]); free(mybound); } /******************************************************** Main Function **************************************************/ //for normalized data, there are five variables: //cluster_ID //population_center //grid_clusterID //grid_ID //grid_Center //the same five variables exist for the original data //however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data //also, data + cluster_ID -> population_center //data + grid_ID -> grid_Center /* what is the final output */ //the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data //Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters) /* what information should select_bin output */ //the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids //is unknown, therefore I must use a prescreening to output //how many bins I should use //the number of dense grids //total number of events in the dense grids /* basic procedure of main function */ //1. read raw file and normalize the raw file //2. select_bin //3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. //4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID. //5. release normalized data; allocate memory for grid_Center and population_center //6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers int main (int argc, char **argv) { //inputs FILE *f_src; //source file pointer FILE *f_out; //coordinates FILE *f_cid; //population-ID of events FILE *f_ctr; //centroids of populations FILE *f_results; //coordinates file event and population column FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity FILE *f_parameters; //number of bins and density calculated by //the algorithm. Used to update the database FILE *f_properties; //Properties file used by Image generation software char para_name_string[LINE_LEN]; int time_ID=-1; int num_bin=0; //the bins I will use on each dimension int file_Len=0; //number of events int num_dm=0; int num_clust=0; int num_dense_events=0; int num_dense_grids=0; int num_nonempty=0; int num_population=0; //int temp=0; //below are read from configuration file int i=0; int j=0; int max_num_pop=0; int den_t_event=0; int *grid_clusterID; //(dense)gridID to clusterID int *grid_ID; //(dense)eventID to gridID int *cluster_ID; //(dense)eventID to clusterID int *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k int *all_grid_ID; //gridID for all events int *densegridID_To_gridevent; int *sorted_seq; int *dense_grid_reverse; int *population_ID; //denseeventID to populationID int *cluster_populationID; //clusterID to populationID int *grid_populationID; //gridID to populationID int *all_population_ID; //populationID of event int **position; int **dense_grid_seq; double *interval; double **population_center; //population centroids in the raw/original data double **cluster_center; //cluster centroids in the raw/original data double **input_data; double **normalized_data; int min = 999999; int max = 0; printf( "Starting time:\t\t\t\t"); fflush(stdout); system("/bin/date"); ///////////////////////////////////////////////////////////// if ((argc!=2) && (argc!=4) && (argc!=5)) { //modified on Jul 23, 2010 //printf("usage:\n"); //printf("basic mode: flock data_file\n"); //printf("advanced mode: flock data_file num_bin density_index\n"); //exit(0); fprintf(stderr,"Incorrect number of input parameters!\n"); //modified on July 23, 2010 //fprintf(stderr,"basic mode: flock data_file\n"); //modified on July 23, 2010 //fprintf(stderr,"advanced mode1: flock data_file num_bin density_index\n"); //modified on July 23, 2010 fprintf(stderr,"advanced mode: flock data_file num_bin density_index max_num_pop\n"); //added on Dec 16, 2010 for GenePattern abort(); } f_src=fopen(argv[1],"r"); if (argc==2) { max_num_pop=30; //default value, maximum 30 clusters } if (argc==4) { num_bin=atoi(argv[2]); printf("num_bin is %d\n",num_bin); den_t_event=atoi(argv[3]); printf("density_index is %d\n",den_t_event); max_num_pop=30; if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) { fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); abort(); } if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) { fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); abort(); } } if (argc==5) { num_bin=atoi(argv[2]); printf("num_bin is %d\n",num_bin); den_t_event=atoi(argv[3]); printf("density_index is %d\n",den_t_event); max_num_pop=atoi(argv[4]); printf("max number of clusters is %d\n",max_num_pop); if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) { fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); abort(); } if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) { fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); abort(); } if ((max_num_pop<5) || (max_num_pop>999)) { fprintf(stderr,"Incorrect input range of maximum number of populations, which should be larger than 4 and smaller than 1000\n"); abort(); } } getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters /************************************************* Read the data *****************************************************/ rewind(f_src); //reset the file pointer input_data = (double **)malloc(sizeof(double*)*file_Len); memset(input_data,0,sizeof(double*)*file_Len); for (i=0;i<file_Len;i++) { input_data[i]=(double *)malloc(sizeof(double)*num_dm); memset(input_data[i],0,sizeof(double)*num_dm); } readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data; fclose(f_src); normalized_data=(double **)malloc(sizeof(double*)*file_Len); memset(normalized_data,0,sizeof(double*)*file_Len); for (i=0;i<file_Len;i++) { normalized_data[i]=(double *)malloc(sizeof(double)*num_dm); memset(normalized_data[i],0,sizeof(double)*num_dm); } tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data); position=(int **)malloc(sizeof(int*)*file_Len); memset(position,0,sizeof(int*)*file_Len); for (i=0;i<file_Len;i++) { position[i]=(int*)malloc(sizeof(int)*num_dm); memset(position[i],0,sizeof(int)*num_dm); } all_grid_ID=(int *)malloc(sizeof(int)*file_Len); memset(all_grid_ID,0,sizeof(int)*file_Len); sorted_seq=(int*)malloc(sizeof(int)*file_Len); memset(sorted_seq,0,sizeof(int)*file_Len); interval=(double*)malloc(sizeof(double)*num_dm); memset(interval,0,sizeof(double)*num_dm); /************************************************* select_bin *************************************************/ if (num_bin>=1) //num_bin has been selected by user select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); else //num_bin has not been selected by user { num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); printf("selected bin number is %d\n",num_bin); } printf("number of non-empty grids is %d\n",num_nonempty); /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */ free(sorted_seq); dense_grid_reverse=(int*)malloc(sizeof(int)*num_nonempty); memset(dense_grid_reverse,0,sizeof(int)*num_nonempty); /************************************************* select_dense **********************************************/ if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); else { den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); printf("automated selected density threshold is %d\n",den_t_event); } printf("Number of dense grids is %d\n",num_dense_grids); densegridID_To_gridevent = (int *)malloc(sizeof(int)*num_dense_grids); memset(densegridID_To_gridevent,0,sizeof(int)*num_dense_grids); for (i=0;i<num_dense_grids;i++) densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1 eventID_To_denseventID=(int *)malloc(sizeof(int)*file_Len); memset(eventID_To_denseventID,0,sizeof(int)*file_Len); //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); dense_grid_seq=(int **)malloc(sizeof(int*)*num_dense_grids); memset(dense_grid_seq,0,sizeof(int*)*num_dense_grids); for (i=0;i<num_dense_grids;i++) { dense_grid_seq[i]=(int *)malloc(sizeof(int)*num_dm); memset(dense_grid_seq[i],0,sizeof(int)*num_dm); } /* Look up the binned data values for each dense grid */ generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq); /************************************************* allocate memory *********************************************/ grid_clusterID=(int *)malloc(sizeof(int)*num_dense_grids); memset(grid_clusterID,0,sizeof(int)*num_dense_grids); grid_ID=(int *)malloc(sizeof(int)*num_dense_events); memset(grid_ID,0,sizeof(int)*num_dense_events); cluster_ID=(int *)malloc(sizeof(int)*num_dense_events); memset(cluster_ID,0,sizeof(int)*num_dense_events); /*********************************************** merge_grids ***********************************************/ //int merge_grids(int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids, int *dense_grid_rank, int *dense_grid_reverse, // int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID, // int *cluster_ID, int *grid_ID, int *grid_clusterID) num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID); printf("computed number of groups is %d\n",num_clust); /************************************** release unnecessary memory and allocate memory and compute centers **********************************/ for (i=0;i<file_Len;i++) free(position[i]); free(position); for (i=0;i<num_dense_grids;i++) free(dense_grid_seq[i]); free(dense_grid_seq); free(dense_grid_reverse); free(densegridID_To_gridevent); free(all_grid_ID); // cluster_center //////////////////////////////////////////////////////////////////////////////////////////////////////// cluster_center=(double **)malloc(sizeof(double*)*num_clust); memset(cluster_center,0,sizeof(double*)*num_clust); for (i=0;i<num_clust;i++) { cluster_center[i]=(double*)malloc(sizeof(double)*num_dm); memset(cluster_center[i],0,sizeof(double)*num_dm); } ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data //printf("pass the first ID2center\n"); //commented on July 23, 2010 /*** population_ID and grid_populationID **/ cluster_populationID=(int*)malloc(sizeof(int)*num_clust); memset(cluster_populationID,0,sizeof(int)*num_clust); grid_populationID=(int*)malloc(sizeof(int)*num_dense_grids); memset(grid_populationID,0,sizeof(int)*num_dense_grids); population_ID=(int*)malloc(sizeof(int)*num_dense_events); memset(population_ID,0,sizeof(int)*num_dense_events); num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID,max_num_pop); for (i=0;i<num_clust;i++) free(cluster_center[i]); free(cluster_center); free(interval); for (i=0;i<num_dense_grids;i++) { grid_populationID[i]=cluster_populationID[grid_clusterID[i]]; } for (i=0;i<num_dense_events;i++) { population_ID[i]=cluster_populationID[cluster_ID[i]]; } printf("computed number of populations is %d\n",num_population); // population_center ///////////////////////////////////////////////////////////////////////////////////////////////////// population_center=(double **)malloc(sizeof(double*)*num_population); memset(population_center,0,sizeof(double*)*num_population); for (i=0;i<num_population;i++) { population_center[i]=(double*)malloc(sizeof(double)*num_dm); memset(population_center[i],0,sizeof(double)*num_dm); } ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data // show //////////////////////////////////////////////////////////////////////////////// all_population_ID=(int*)malloc(sizeof(int)*file_Len); memset(all_population_ID,0,sizeof(int)*file_Len); kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center); show(input_data, all_population_ID, file_Len, num_population, num_dm, para_name_string); ID2Center_all(input_data,file_Len,num_dm,num_population,all_population_ID,population_center); f_cid=fopen("population_id.txt","w"); f_ctr=fopen("population_center.txt","w"); f_out=fopen("coordinates.txt","w"); f_results=fopen("flock_results.txt","w"); /* f_parameters=fopen("parameters.txt","w"); fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin); fprintf(f_parameters,"Density\t%f\n",aver_index); fclose(f_parameters); */ for (i=0;i<file_Len;i++) fprintf(f_cid,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 /* * New to check for min/max to add to parameters.txt * */ fprintf(f_out,"%s\n",para_name_string); //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string); fprintf(f_results,"%s\tPopulation\n",para_name_string); for (i=0;i<file_Len;i++) { for (j=0;j<num_dm;j++) { if (input_data[i][j] < min) { min = (int)input_data[i][j]; } if (input_data[i][j] > max) { max = (int)input_data[i][j]; } if (j==num_dm-1) { fprintf(f_out,"%d\n",(int)input_data[i][j]); fprintf(f_results,"%d\t",(int)input_data[i][j]); } else { fprintf(f_out,"%d\t",(int)input_data[i][j]); fprintf(f_results,"%d\t",(int)input_data[i][j]); } } //fprintf(f_results,"%d\t",i + 1); fprintf(f_results,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 } /* f_parameters=fopen("parameters.txt","w"); fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin); fprintf(f_parameters,"Density\t%d\n",den_t_event); fprintf(f_parameters,"Min\t%d\n",min); fprintf(f_parameters,"Max\t%d\n",max); fclose(f_parameters); */ f_properties=fopen("fcs.properties","w"); fprintf(f_properties,"Bins=%d\n",num_bin); fprintf(f_properties,"Density=%d\n",den_t_event); fprintf(f_properties,"Min=%d\n",min); fprintf(f_properties,"Max=%d\n",max); fprintf(f_properties,"Populations=%d\n",num_population); fprintf(f_properties,"Events=%d\n",file_Len); fprintf(f_properties,"Markers=%d\n",num_dm); fclose(f_properties); for (i=0;i<num_population;i++) { /* Add if we want to include population id in the output */ fprintf(f_ctr,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009 for (j=0;j<num_dm;j++) { if (j==num_dm-1) fprintf(f_ctr,"%.0f\n",population_center[i][j]); else fprintf(f_ctr,"%.0f\t",population_center[i][j]); } } //added April 16, 2009 f_mfi=fopen("MFI.txt","w"); for (i=0;i<num_population;i++) { fprintf(f_mfi,"%d\t",i+1); for (j=0;j<num_dm;j++) { if (j==num_dm-1) fprintf(f_mfi,"%.0f\n",population_center[i][j]); else fprintf(f_mfi,"%.0f\t",population_center[i][j]); } } fclose(f_mfi); //ended April 16, 2009 fclose(f_cid); fclose(f_ctr); fclose(f_out); fclose(f_results); for (i=0;i<num_population;i++) { free(population_center[i]); } free(population_center); for (i=0;i<file_Len;i++) free(normalized_data[i]); free(normalized_data); free(grid_populationID); free(cluster_populationID); free(grid_clusterID); free(cluster_ID); for (i=0;i<file_Len;i++) free(input_data[i]); free(input_data); free(grid_ID); free(population_ID); free(all_population_ID); free(eventID_To_denseventID); /////////////////////////////////////////////////////////// printf("Ending time:\t\t\t\t"); fflush(stdout); system("/bin/date"); return 0; }