Mercurial > repos > immport-devteam > cross_sample

diff src/flock2.c @ 1:7eab80f86779 draft
add FLOCK
author: immport-devteam
date: Mon, 27 Feb 2017 13:26:09 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/flock2.c	Mon Feb 27 13:26:09 2017 -0500
@@ -0,0 +1,3405 @@
+///////////
+// Changes made:
+// 1. Added another parameter: number of populations
+// 2. Added a hierarchical merging step based on density change between centroids of two hyper-regions
+// 3. Picked the longest dimensions between the population centroids to judge whether the two parts should be merged
+// 4. Removed checking time parameter
+// 5. Output error to stderr
+// 6. Fixed the bug of density threshold always = 3
+// 7. Added another error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR
+// 8. Fixed a bug for 2D data by using K=e*K
+// 9. Added some header files, may not be necessary
+// 10. Added a lower bound (at least two) for number of populations
+/***************************************************************************************************************************************
+	
+	FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) 
+	
+	Author: (Max) Yu Qian, Ph.D.
+	
+	Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
+	
+	Development: November 2005 ~ Forever
+
+	Status: July 2010: Release 2.0
+
+	Usage: flock data_file
+		    Note: the input file format must be channel values and the delimiter between two values must be a tab.
+
+
+    	
+****************************************************************************************************************************************/
+
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <assert.h>
+
+
+
+#define DEBUG 0
+#define LINE_LEN 1024
+#define FILE_NAME_LEN 128
+#define PARA_NAME_LEN 64
+#define MAX_VALUE 1000000000
+#define MIN_GRID 6
+#define MAX_GRID 50
+#define E_T 1.0
+
+#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max 
+#define KMEANS_TERM 10 
+#define MAX_POP_NUM 128
+
+#ifndef max
+    #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
+#endif
+
+#ifndef min
+    #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
+#endif
+
+static long **Gr=0;
+static long *gcID = 0;          /* grid cluster IDs */
+static long *cluster_count=0;   /* count of nodes per cluster */
+static long ndense=0;
+static long ndim=0;
+/* cid changes between depth-first searches, but is constant within a
+   single search, so it goes here. */
+static long cid=0;
+/* Do a depth-first search for a single connected component in graph
+ * G.  Start from node, tag the nodes found with cid, and record
+ * the tags in grid_clusterID.  Also, record the node count in
+ * cluster_count.  If we find a node that has already been assigned to
+ * a cluster, that means we're merging two clusters, so zero out the
+ * old cid's node count.
+ *
+ * Note that our graph is constructed as a DAG, so we can be
+ * guaranteed to terminate without checking for cycles.  
+ *
+ * Note2: this function can potentially recurse to depth = ndense.
+ * Plan stack size accordingly.  
+ *
+ * Output:
+ *
+ * grid_clusterID[] -- array where we tag the nodes.
+ * cluster_count[]  -- count of the number of nodes per cluster.
+ */
+static void merge_cluster(long from, long into)
+{
+  int i;
+
+  for(i=0; i<ndense; ++i)
+    if(gcID[i] == from)
+      gcID[i] = into;
+}
+   
+void depth_first(long node)
+{
+  long i;
+
+  if(gcID[node] == cid)         // we're guaranteed no cycles, but it is possible to reach a node 
+    return;                     // through two different paths in the same cluster.  This early
+                                // return saves us some unneeded work.
+
+  /* Check to see if we are merging a cluster */
+  if(gcID[node] >= 0) {
+    /* We are, so zero the count for the old cluster. */
+    cluster_count[ gcID[node] ] = 0;  
+    merge_cluster(gcID[node], cid);
+    return;
+  }
+
+  /* Update for this node */
+  gcID[node] = cid;
+  cluster_count[cid]++;
+
+  /* Recursively search the child nodes */
+  for(i=0; i<ndim; ++i)
+    if(Gr[node][i] >= 0)      /* This is a child node */
+      depth_first(Gr[node][i]);
+}
+
+void bail(const char *msg)
+{
+  fprintf(stderr,"%s",msg);
+  exit(0);
+}
+
+
+static void check_clusters(long *gcID, int ndense)
+{
+  int i;
+
+  for(i=0; i<ndense; ++i)
+    if(gcID[i] < 0) {
+      fprintf(stderr,"faulty cluster id at i= %d\n",i);
+      exit(0);
+    }
+}
+
+
+
+long find_connected(long **G, long n_dense_grids, long num_dm, long *grid_clusterID)
+{
+  long nclust=0;                  /* number of clusters found */
+  long i;
+  long *subfac;
+  long clustid=0;
+  int subval=0,nempty=0;
+  
+  int sz = n_dense_grids*sizeof(long); 
+  cluster_count = malloc(sz);
+  if(!cluster_count)
+    bail("find_connected: Unable to allocate %zd bytes.\n");
+  memset(cluster_count,0,sz);
+
+  /* set up the statics that will be used in the DFS */
+  Gr=G;
+  gcID = grid_clusterID;
+  ndense = n_dense_grids;
+  ndim = num_dm;
+  
+  for(i=0;i<ndense;++i)
+    grid_clusterID[i] = -1;
+
+  for(i=0;i<ndense;++i) {
+    if(grid_clusterID[i] < 0) {  /* grid hasn't been assigned yet */
+      cid = nclust++;
+      depth_first(i);
+    }
+  }
+
+
+  
+  /* At this point we probably have some clusters that are empty due to merging.
+     We want to compact the cluster numbering to eliminate the empty clusters. */
+
+  subfac = malloc(sz);
+  if(!subfac)
+    bail("find_connected: Unable to allocate %zd bytes.\n");
+  subval=0;
+  nempty=0;
+
+  /* cluster #i needs to have its ID decremented by 1 for each empty cluster with
+     ID < i.  Precaclulate the decrements in this loop: */
+  for(i=0;i<nclust;++i) {
+    //clustid = grid_clusterID[i];
+    if(cluster_count[i] == 0) {
+      subval++;
+      nempty++;
+    }
+    subfac[i] = subval;
+  }
+
+  //printf("nempty is %d\n",nempty);
+  
+  /* Now apply the decrements to all of the dense grids */
+  for(i=0;i<ndense;++i) {
+    clustid = grid_clusterID[i];
+    grid_clusterID[i] -= subfac[clustid];
+  }
+
+
+  
+  /* correct the number of clusters found */
+  nclust -= nempty;
+
+  //printf("nclust is %d\n",nclust);
+
+  return nclust;
+}
+
+/************************************* Read basic info of the source file **************************************/
+/************************************* Read basic info of the source file **************************************/
+void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, long *time_ID)
+{
+  char src[LINE_LEN];
+  char current_name[64];
+  char prv;
+
+  long num_rows=0;
+  long num_columns=0;
+  long ch='\n';
+  long prev='\n';
+  long time_pos=0;
+  long i=0;
+  long j=0;
+  int sw=0;
+
+  src[0]='\0';
+  fgets(src, LINE_LEN, f_src);
+
+  name_string[0]='\0';
+  current_name[0]='\0';
+  prv='\n';
+
+  while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters
+    i++;
+
+  while ((src[i]!='\0') && (src[i]!='\n')) //repeat until the end of the line
+    {
+      current_name[j]=src[i];
+		
+      if ((src[i]=='\t') && (prv!='\t')) //a complete word
+        {
+          current_name[j]='\0';
+
+          if (0!=strcmp(current_name,"Time"))
+            {
+              num_columns++; //num_columns does not inlcude the column of Time
+              time_pos++;
+              if (sw) {
+                  strcat(name_string,"\t");
+              }
+              strcat(name_string,current_name); 
+              sw = 1;
+            }
+          else
+            {
+              *time_ID=time_pos;
+            }
+
+          current_name[0]='\0';
+          j=0;			
+        }		
+		
+      if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
+        {
+          current_name[0]='\0';
+          j=0;
+        }
+		
+      if (src[i]!='\t')
+        j++;
+		
+      prv=src[i];
+      i++;
+    }
+	
+  //name_string[j]='\0';
+  if (prv!='\t') //the last one hasn't been retrieved
+    {
+      current_name[j]='\0';
+      if (0!=strcmp(current_name,"Time"))
+        {
+          num_columns++;
+          strcat(name_string,"\t");
+          strcat(name_string,current_name);
+          time_pos++;
+        }
+      else
+        {
+          *time_ID=time_pos;
+        }
+    }
+  if (DEBUG==1)
+    {
+      printf("time_ID is %ld\n",*time_ID);
+      printf("name_string is %s\n",name_string);
+    }
+
+  //start computing # of rows
+
+  while ((ch = fgetc(f_src))!= EOF )
+    {
+      if (ch == '\n')
+        {
+          ++num_rows;
+        }
+      prev = ch;
+    }
+  if (prev!='\n')
+    ++num_rows;
+
+   if (num_rows<50)
+  {
+    fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010
+	exit(0);
+  }
+	
+  *file_Len=num_rows;
+  *num_dm=num_columns; 
+
+  printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm);
+}
+
+
+
+/************************************* Read the source file into uncomp_data **************************************/
+void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, long time_ID)
+{
+  //long time_pass=0; //to mark whether the time_ID has been passed
+  long index=0;
+
+  long i=0;
+  long j=0;
+  long t=0;
+
+  char src[LINE_LEN];
+  char xc[LINE_LEN/10];
+
+  src[0]='\0';
+  fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
+
+  while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
+    {
+      src[0]='\0';	    
+      fgets(src,LINE_LEN,f_src);
+      i=0;
+      //time_pass=0;
+						
+      //if (time_ID==-1)  //there is no time_ID
+      //  {
+          for (t=0;t<num_dm;t++) 
+            {
+              xc[0]='\0';
+              j=0;
+              while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
+                {
+                  xc[j]=src[i];
+                  i++;
+                  j++;
+                }
+		
+              xc[j]='\0';	    
+              i++;
+
+              uncomp_data[index][t]=atof(xc);
+            }	
+       // }
+      /*else
+        {
+          for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
+            {
+              xc[0]='\0';
+              j=0;
+              while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
+                {
+                  xc[j]=src[i];
+                  i++;
+                  j++;
+                }
+		
+              xc[j]='\0';	    
+              i++;
+
+              if (t==time_ID)
+                {
+                  time_pass=1;
+                  continue;
+                }
+				
+              if (time_pass)
+                uncomp_data[index][t-1]=atof(xc);
+              else
+                uncomp_data[index][t]=atof(xc);
+            }
+        }*/ //commented by Yu Qian on Aug 31, 2010 as we do not want to check time_ID anymore        	
+      index++;     	
+      //fprintf(fout_ID,"%s",src);
+    } //end of while
+	
+  if (DEBUG == 1)
+    {
+      printf("the last line of the source data is:\n");
+      for (j=0;j<num_dm;j++)
+        printf("%f ",uncomp_data[index-1][j]);
+      printf("\n");
+    }
+}
+
+
+/**************************************** Normalization ******************************************/
+void tran(double **orig_data, long file_Len, long num_dm, long norm_used, double **matrix_to_cluster)
+{
+  long i=0;
+  long j=0;
+
+  double biggest=0;
+  double smallest=MAX_VALUE;
+
+  double *aver; //average of each column
+  double *std; //standard deviation of each column
+
+  aver=(double*)malloc(sizeof(double)*file_Len);
+  memset(aver,0,sizeof(double)*file_Len);
+
+  std=(double*)malloc(sizeof(double)*file_Len);
+  memset(std,0,sizeof(double)*file_Len);	
+		
+  if (norm_used==2) //z-score normalization
+    {
+      for (j=0;j<num_dm;j++)
+        {
+          aver[j]=0;
+          for (i=0;i<file_Len;i++)
+            aver[j]=aver[j]+orig_data[i][j];
+          aver[j]=aver[j]/(double)file_Len;
+
+          std[j]=0;
+          for (i=0;i<file_Len;i++)
+            std[j]=std[j]+((orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]));
+          std[j]=sqrt(std[j]/(double)file_Len);
+			
+          for (i=0;i<file_Len;i++)
+            matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j];  //z-score normalization
+        }
+    }
+
+  if (norm_used==1) //0-1 min-max normalization
+    {
+      for (j=0;j<num_dm;j++)
+        {
+          biggest=0;
+          smallest=MAX_VALUE;
+          for (i=0;i<file_Len;i++)
+            {
+              if (orig_data[i][j]>biggest)
+                biggest=orig_data[i][j];
+              if (orig_data[i][j]<smallest)
+                smallest=orig_data[i][j];
+            }
+			
+          for (i=0;i<file_Len;i++)
+            {
+              if (biggest==smallest)
+                matrix_to_cluster[i][j]=biggest;
+              else
+                matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
+            }
+        }
+    }
+
+  if (norm_used==0) //no normalization
+    {
+      for (i=0;i<file_Len;i++)
+        for (j=0;j<num_dm;j++)
+          matrix_to_cluster[i][j]=orig_data[i][j];
+    }
+
+}
+
+
+
+/********************************************** RadixSort *******************************************/
+/* Perform a radix sort using each dimension from the original data as a radix.
+ * Outputs:
+ * sorted_seq   -- a permutation vector mapping the ordered list onto the original data.
+ *                  (sorted_seq[i] -> index in the original data of the ith element of the ordered list)
+ * grid_ID      -- mapping between the original data and the "grids" (see below) found as a byproduct
+ *                  of the sorting procedure.
+ * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
+ *                  by grid_ID)
+ */
+
+void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID)
+{
+  long i=0;
+  long length=0; 
+  long start=0;
+  long prev_ID=0;
+  long curr_ID=0;
+	
+  long j=0;
+  long t=0;
+  long p=0;
+  long loc=0;
+  long temp=0;
+  long equal=0;
+	
+  long *count; //count[i]=j means there are j numbers having value i at the processing digit
+  long *index; //index[i]=j means the starting position of grid i is j
+  long *cp; //current position
+  long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
+  long *seq; //temporary sequence
+
+  count=(long*)malloc(sizeof(long)*num_bin);
+  memset(count,0,sizeof(long)*num_bin);
+
+  cp=(long*)malloc(sizeof(long)*num_bin);
+  memset(cp,0,sizeof(long)*num_bin);
+
+  index=(long*)malloc(sizeof(long)*num_bin); // initialized below
+
+  seq=(long*)malloc(sizeof(long)*file_Len);
+  memset(seq,0,sizeof(long)*file_Len);
+	
+  mark=(long*)malloc(sizeof(long)*file_Len);
+  memset(mark,0,sizeof(long)*file_Len);
+	
+  for (i=0;i<file_Len;i++)
+    {
+      sorted_seq[i]=i;
+      mark[i]=0;
+      seq[i]=0;
+    }
+  for (i=0;i<num_bin;i++)
+    {
+      index[i]=0;
+      cp[i]=0;
+      count[i]=0;
+    }
+
+  for (j=0;j<num_dm;j++)
+    {
+      if (j==0) //compute the initial values of mark
+        {
+          for (i=0;i<file_Len;i++)
+            count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension
+
+          index[0] = 0;
+          for (i=0;i<num_bin-1;i++)
+            {
+              index[i+1]=index[i]+count[i];  //index[k]=x means k segment starts at x (in the ordered list)
+              if ((index[i+1]>0) && (index[i+1]<=file_Len))
+                {
+                  mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
+                }
+              else
+                {
+                  printf("out of myboundary for mark at index[i+1]-1.\n");
+                }
+            }
+          mark[file_Len-1]=1;
+			
+          for (i=0;i<file_Len;i++)
+            {
+              /* Build a permutation vector for the partially ordered data.  Store the PV in sorted_seq */
+              loc=position[i][j];
+              temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j 
+              sorted_seq[temp]=i;  //sorted_seq[i]=temp is also another way to sort
+              cp[loc]++;
+            }
+        }
+      else
+        {
+          //reset count, index, loc, temp, cp, start, and length
+          length=0;
+          loc=0;
+          temp=0;
+          start=0;
+          for (p=0;p<num_bin;p++)
+            {
+              cp[p]=0;
+              count[p]=0;
+              index[p]=0;
+            }
+
+          for (i=0;i<file_Len;i++)
+            {
+              long iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order.
+              if (mark[i]!=1)
+                {
+                  /* Count the number of items in each bin of
+                     dimension j, BUT we are going to reset at the end
+                     of each "part".  Thus, the total effect is to do
+                     a sort by bin on the jth dimension for each group
+                     of data that has been identical for the
+                     dimensions processed up to this point.  This is
+                     the standard radix sort procedure, but doing it
+                     this way saves us having to allocate buckets to
+                     hold the data in each group of "identical-so-far"
+                     elements. */
+                  count[position[iperm][j]]++;  //count[position[i][j]]++;
+                  length++;                     // This is the total length of the part, irrespective of the value of the jth component
+                                                // (actually, less one, since we don't increment for the final element below)
+                }
+              if (mark[i]==1)
+                {
+                  //length++;
+                  count[position[iperm][j]]++;//count[position[i][j]]++;  //the current point must be counted in
+                  start=i-length; //this part starts from start to i: [start,i]
+                  /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part.
+                     This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively,
+                     plus we could multi-thread over the parts.  (Hmmm...)
+                  */
+                  index[0] = start; // Let index give the offset within the whole ordered list.
+                  for (t=0;t<num_bin-1;t++)
+                    {
+                      index[t+1]=index[t]+count[t];
+						
+                      if ((index[t+1]<=file_Len) && (index[t+1]>0))
+                        {
+                          mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
+                        }
+						
+                    }
+                  mark[i]=1;
+
+                  /* Update the permutation vector for the current part (i.e., from start to i).  By the time we finish the loop over i
+                     the PV will be completely updated for the partial ordering up to the current radix. */
+                  for (t=start;t<=i;t++)
+                    {
+                      loc=position[sorted_seq[t]][j];//loc=position[t][j];
+                      temp=index[loc]+cp[loc];
+                      if ((temp<file_Len) && (temp>=0)) 
+                        {
+                          // seq is a temporary because we have to maintain the old PV until we have finished this step.
+                          seq[temp]=sorted_seq[t];  //sorted_seq[i]=temp is also another way to sort
+                          cp[loc]++;
+                        }
+                      else
+                        {
+                          printf("out of myboundary for seq at temp.\n");
+                        }
+                    }
+
+                  for (t=start;t<=i;t++)
+                    {
+                      // copy the temporary back into sorted_seq.  sorted_seq is now updated for radix j up through
+                      // entry i in the ordered list.
+                      if ((t>=0) && (t<file_Len))
+                        sorted_seq[t]=seq[t];
+                      else
+                        printf("out of myboundary for seq and sorted_seq at t.\n");
+                    }
+                  //reset count, index, seq, length, and cp
+                  length=0;
+                  loc=0;
+                  temp=0;
+                  for (p=0;p<num_bin;p++)
+                    {
+                      cp[p]=0;
+                      count[p]=0;
+                      index[p]=0;
+                    }
+                }
+            }//end for i
+        }//end else
+    }//end for j
+
+  /* sorted_seq[] now contains the ordered list for all radices.  mark[] gives the boundaries between groups of elements that are
+     identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */
+  
+  for (i=0;i<file_Len;i++)
+    grid_ID[i]=0; //in case the initial value hasn't been assigned
+  *num_nonempty=1; //starting from 1!	
+
+  /* assign the "grid" identifiers for all of the data.  A grid will be what we were calling a "part" above.  We will number them
+     serially and tag the *unordered* data with the grid IDs.  We will also count the number of populated grids (in general there will
+     be many possible combinations of bin values that simply never occur) */
+  
+  for (i=1;i<file_Len;i++)
+    {
+      equal=1;
+      prev_ID=sorted_seq[i-1];
+      curr_ID=sorted_seq[i];
+      for (j=0;j<num_dm;j++)
+        {
+          if (position[prev_ID][j]!=position[curr_ID][j])
+            {	
+              equal=0;  //not equal
+              break;
+            }
+        }
+		
+      if (equal)
+        {
+          grid_ID[curr_ID]=grid_ID[prev_ID];
+        }
+      else
+        {
+          *num_nonempty=*num_nonempty+1;
+          grid_ID[curr_ID]=grid_ID[prev_ID]+1;
+        }
+      //all_grid_vol[grid_ID[curr_ID]]++;
+    }
+
+  //free memory
+  free(count);
+  free(index);	
+  free(cp);	
+  free(seq);
+  free(mark); 
+  
+}
+
+/********************************************** Compute Position of Events ************************************************/
+void compute_position(double **data_in, long file_Len, long num_dm, long num_bin, long **position, double *interval)
+{
+  /* What we are really doing here is binning the data, with the bins
+     spanning the range of the data and number of bins = num_bin */
+  long i=0;
+  long j=0;
+
+  double *small; //small[j] is the smallest value within dimension j
+  double *big; //big[j] is the biggest value within dimension j
+		
+  small=(double*)malloc(sizeof(double)*num_dm);
+  memset(small,0,sizeof(double)*num_dm);
+
+  big=(double*)malloc(sizeof(double)*num_dm);
+  memset(big,0,sizeof(double)*num_dm);
+	
+	
+  for (j=0;j<num_dm;j++)
+    {
+      big[j]=MAX_VALUE*(-1);
+      small[j]=MAX_VALUE;
+      for (i=0;i<file_Len;i++)
+        {
+          if (data_in[i][j]>big[j])
+            big[j]=data_in[i][j];
+
+          if (data_in[i][j]<small[j])
+            small[j]=data_in[i][j];
+        }
+		
+      interval[j]=(big[j]-small[j])/(double)num_bin;	//interval is computed using the biggest value and smallest value instead of the channel limit
+      /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */
+    }
+    
+  for (j=0;j<num_dm;j++)
+  {
+     for (i=0;i<file_Len;i++)
+     {	
+        if (data_in[i][j]>=big[j])
+           position[i][j]=num_bin-1;
+        else
+        {
+           position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
+           if ((position[i][j]>=num_bin) || (position[i][j]<0))
+           {
+               //printf("position mis-computed in density analysis!\n");
+               //exit(0);
+			   fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
+				exit(0);
+           }
+        }
+     }
+  }
+
+
+  free(small);
+  free(big);
+}
+
+/********************************************** select_bin to select the number of bins **********************************/
+//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
+/* Determine the number of bins to use in each dimension.  Additionally sort the data elements according to the binned
+ * values, and partition the data into "grids" with identical (binned) values.  We try progressively more bins until we
+ * maximize a merit function, then return the results obtained using the optimal number of bins. 
+ *
+ * Outputs:
+ * position     -- binned data values
+ * sorted_seq   -- permutation vector mapping the ordered list to the original data
+ * all_grid_ID  -- grid to which each data element was assigned.
+ * num_nonempty -- number of distinct values assumed by all_grid_ID
+ * interval     -- bin width for each data dimension
+ * return value -- the number of bins selected.
+ */
+
+long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq, 
+                long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin)
+{
+ 
+  long num_bin=0;
+  long select_num_bin=0;
+  long m=0;
+  long n=0;
+	
+  long i=0;
+  long bin_scope=0;
+  long temp_num_nonempty=0;
+
+  long *temp_grid_ID;
+  long *temp_sorted_seq;
+  long **temp_position;
+
+  //sorted_seq[i]=j means the event j ranks i
+
+  double temp_index=0;
+  double *bin_index;	
+  double *temp_interval;
+	
+	
+  temp_grid_ID=(long *)malloc(sizeof(long)*file_Len);
+  memset(temp_grid_ID,0,sizeof(long)*file_Len);
+	
+  temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len);
+  memset(temp_sorted_seq,0,sizeof(long)*file_Len);
+
+  temp_position=(long **)malloc(sizeof(long*)*file_Len);
+  memset(temp_position,0,sizeof(long*)*file_Len);
+  for (m=0;m<file_Len;m++)
+    {
+      temp_position[m]=(long*)malloc(sizeof(long)*num_dm);
+      memset(temp_position[m],0,sizeof(long)*num_dm);
+    }
+
+  temp_interval=(double*)malloc(sizeof(double)*num_dm);
+  memset(temp_interval,0,sizeof(double)*num_dm);
+
+  bin_scope=max_grid-min_grid+1;
+  bin_index=(double *)malloc(sizeof(double)*bin_scope);
+  memset(bin_index,0,sizeof(double)*bin_scope);
+
+  i=0;
+
+  for (num_bin=min_grid;num_bin<=max_grid;num_bin++)
+    {
+      
+	  temp_num_nonempty=0;
+	   /* compute_position bins the data into num_bin bins.  Each
+         dimension is binned independently.
+
+         Outputs:
+         temp_position[i][j] -- bin for the jth component of data element i.
+         temp_interval[j]    -- bin-width for the jth component
+      */
+      compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval);
+      radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID);
+
+      /* our figure of merit is the number of non-empty grids divided by number of bins per dimension.
+         We declare victory when we have found a local maximum */
+      bin_index[i]=((double)temp_num_nonempty)/((double)num_bin);
+	  if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95)
+		  break;
+      if ((bin_index[i]<temp_index) && (user_num_bin==0))
+         break;
+	  if ((user_num_bin==num_bin-1) && (user_num_bin!=0))
+		 break;
+      
+      /* Since we have accepted this trial bin, copy all the temporary results into
+         the output buffers */
+      memcpy(all_grid_ID,temp_grid_ID,sizeof(long)*file_Len);
+      memcpy(sorted_seq,temp_sorted_seq,sizeof(long)*file_Len);
+      memcpy(interval,temp_interval,sizeof(double)*num_dm);
+		
+      for (m=0;m<file_Len;m++)
+        for (n=0;n<num_dm;n++)
+          position[m][n]=temp_position[m][n];
+
+      temp_index=bin_index[i];
+      select_num_bin=num_bin;
+      num_nonempty[0]=temp_num_nonempty;
+      i++;
+    }
+
+   if ((select_num_bin<min_grid) || (select_num_bin>max_grid))
+  {
+    fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010
+	exit(0);
+  }
+	
+  if (temp_index==0)
+  {
+	 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
+	 exit(0);
+  }
+
+  free(temp_grid_ID);
+  free(temp_sorted_seq);
+  free(bin_index);
+  free(temp_interval);
+	
+  for (m=0;m<file_Len;m++)
+    free(temp_position[m]);
+  free(temp_position);
+
+  return select_num_bin; 
+}
+
+/************************************* Select dense grids **********************************/
+// compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol
+// den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse);
+/*
+ * Prune away grids that are insufficiently "dense" (i.e., contain too few data items)
+ *
+ * Outputs:
+ * num_dense_grids    -- number of dense grids
+ * num_dense_events   -- total number of data items in all dense grids
+ * dense_grid_reverse -- mapping from list of all grids to list of dense grids.
+ * return value       -- density cutoff for separating dense from non-dense grids.
+ */
+
+int select_dense(long file_Len, long *all_grid_ID, long num_nonempty, long *num_dense_grids, long *num_dense_events, long *dense_grid_reverse, int den_t_event)
+{
+  
+
+  long i=0;
+  long vol_ID=0;
+  long biggest_size=0; //biggest grid_size, to define grid_size_index
+  long biggest_index=0;
+  //long actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid
+  //long num_remain=0; //number of remaining grids with different density thresholds
+  long temp_num_dense_grids=0;
+  long temp_num_dense_events=0;
+	
+  long *grid_size_index;
+  long *all_grid_vol;
+  long *grid_density_index;
+
+  //double den_average=0;
+ // double avr_index=0;
+ 
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Compute all_grid_vol
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  all_grid_vol=(long *)malloc(sizeof(long)*num_nonempty);
+  memset(all_grid_vol,0,sizeof(long)*num_nonempty);
+
+  /* Grid "volume" is just the number of data contained in the grid. */
+  for (i=0;i<file_Len;i++)
+    {
+      vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]];
+      all_grid_vol[vol_ID]++;  //all_grid_vol[i]=j means grid i has j points
+    }
+
+ 
+	
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Compute grid_size_index (histogram of grid sizes)
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  for (i=0;i<num_nonempty;i++)
+    {
+      if (biggest_size<all_grid_vol[i])
+        {
+          biggest_size=all_grid_vol[i];
+        }
+    }
+
+   if (biggest_size<3)
+  {
+	 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
+	 exit(0);
+  }
+
+
+  grid_size_index=(long*)malloc(sizeof(long)*biggest_size);
+  memset(grid_size_index,0,sizeof(long)*biggest_size);
+	
+  for (i=0;i<num_nonempty;i++)
+    {
+      grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1
+    }
+
+
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Compute den_cutoff
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	
+  grid_density_index=(long *)malloc(sizeof(long)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3
+  memset(grid_density_index,0,sizeof(long)*(biggest_size-2));
+
+  if (den_t_event==0)
+  {
+	  biggest_index=0;
+
+	  for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined
+	  {
+		  grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]);
+		  if (biggest_index<grid_density_index[i-1])
+		  {
+			biggest_index=grid_density_index[i-1];
+			den_t_event=i+1;
+		  }
+	  }
+  }
+
+  if (den_t_event==0) //if biggest_size==3
+  {
+	 fprintf(stderr,"the densest hyperregion has only 3 events, smaller than the user-specified value. Therefore the density threshold is automatically changed to 3.\n");
+	 den_t_event=3;
+  }
+
+  for (i=0;i<num_nonempty;i++)
+	  if (all_grid_vol[i]>=den_t_event)
+		temp_num_dense_grids++;
+
+  if (temp_num_dense_grids<=1)
+  {
+	  //exit(0);
+	  //printf("a too high density threshold is set! Please decrease your density threshold.\n");
+	  fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010
+	  exit(0);
+  }
+
+   if (temp_num_dense_grids>=100000)
+  {
+	  //modified on July 23, 2010
+	  //printf("a too low density threshold is set! Please increase your density threshold.\n");
+	  //exit(0);
+	  fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010
+	  exit(0);
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Compute dense_grid_reverse
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  temp_num_dense_grids=0;
+  temp_num_dense_events=0;
+  for (i=0;i<num_nonempty;i++)
+    {
+      dense_grid_reverse[i]=-1;
+		
+      if (all_grid_vol[i]>=den_t_event) 
+        {			
+          dense_grid_reverse[i]=temp_num_dense_grids;  //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
+          temp_num_dense_grids++;
+          temp_num_dense_events+=all_grid_vol[i];						
+        }
+    }
+
+  num_dense_grids[0]=temp_num_dense_grids;
+  num_dense_events[0]=temp_num_dense_events;	
+
+  free(grid_size_index);
+  free(all_grid_vol);
+
+  return den_t_event;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Compute densegridID_To_gridevent and eventID_To_denseventID
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//	grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
+/*
+ * Filter the data so that only the data belonging to dense grids is left
+ *
+ * Output:
+ * eventID_To_denseeventID   -- mapping from original event ID to ID in list containing only events contained in dense grids.
+ * densegridID_To_gridevent  -- mapping from dense grids to prototype members of the grids.
+ *
+ */
+
+void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent)
+{
+  long i=0;
+  long dense_grid_ID=0;
+  long dense_event_ID=0;
+
+  for (i=0;i<file_Len;i++)
+    {
+      dense_grid_ID=dense_grid_reverse[all_grid_ID[i]];
+      eventID_To_denseventID[i]=-1;
+      if (dense_grid_ID!=-1) //for point (i) belonging to dense grids 
+        {			
+          eventID_To_denseventID[i]=dense_event_ID;
+          dense_event_ID++;
+		
+          if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected
+            densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point			
+        }		
+    }
+	
+	
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Compute dense_grid_seq
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//	generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq);
+/* Construct a table of binned data values for each dense grid.
+ *
+ * Output:
+ *
+ * dense_grid_seq  -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values)
+ */
+
+void generate_grid_seq(long num_dm, long num_dense_grids, long *densegridID_To_gridevent, long **position, long **dense_grid_seq)
+{  
+
+  long i=0;
+  long j=0;
+  long ReventID=0; //representative event ID of the dense grid
+
+  for (i=0;i<num_dense_grids;i++)
+    {
+      ReventID = densegridID_To_gridevent[i];
+
+      for (j=0;j<num_dm;j++)
+        dense_grid_seq[i][j]=position[ReventID][j];
+    }
+}
+
+//compare two vectors
+long compare_value(long num_dm, long *search_value, long *seq_value)
+{
+  long i=0;
+
+  for (i=0;i<num_dm;i++)
+    {
+      if (search_value[i]<seq_value[i])
+        return 1;
+      if (search_value[i]>seq_value[i])
+        return -1;
+      if (search_value[i]==seq_value[i])
+        continue;
+    }
+  return 0;
+}
+
+//binary search the dense_grid_seq to return the dense grid ID if it exists
+long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq)
+{
+
+  long low=0;
+  long high=0;
+  long mid=0;
+  long comp_result=0;
+  long match=0;
+  //long found=0;
+	
+  low = 0;
+  high = num_dense_grids-1;
+    
+  while (low <= high) 
+    {
+      mid = (long)((low + high)/2);
+	
+      comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
+	
+		
+      switch(comp_result)
+        {
+        case 1:
+          high=mid-1;
+          break;
+        case -1:
+          low=mid+1;
+          break;
+        case 0:
+          match=1;
+          break;
+        }
+      if (match==1)
+        break;
+    }
+	
+
+
+  if (match==1)
+    {
+      return mid;
+    }
+  else
+    return -1;   
+}
+
+
+/********************************************** Computing Centers Using IDs **********************************************/
+
+void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center)
+{
+  long i=0;
+  long j=0;
+  long ID=0;
+  long eventID=0;
+  long *size_c;
+
+
+
+  size_c=(long *)malloc(sizeof(long)*num_clust);
+  memset(size_c,0,sizeof(long)*num_clust);
+
+  for (i=0;i<num_clust;i++)
+    for (j=0;j<num_dm;j++)
+      population_center[i][j]=0;
+
+  for (i=0;i<file_Len;i++)
+    {
+      eventID=eventID_To_denseventID[i];
+
+      if (eventID!=-1) //only events in dense grids count
+        {
+          ID=cluster_ID[eventID];
+			
+          if (ID==-1)
+            {
+              //printf("ID==-1! in ID2Center\n");
+              //exit(0);
+			  fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010
+			  exit(0);
+            }
+
+          for (j=0;j<num_dm;j++)
+            population_center[ID][j]=population_center[ID][j]+data_in[i][j];
+			
+          size_c[ID]++;				
+        }
+    }
+	
+
+  for (i=0;i<num_clust;i++)
+    {
+      for (j=0;j<num_dm;j++)
+        if (size_c[i]!=0)
+          population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
+        else
+		{
+          population_center[i][j]=0;
+		  printf("size_c[%ld]=0 in ID2center\n",i);
+		}
+    }
+
+  free(size_c);
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//  Compute Population Center with all events
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+void ID2Center_all(double **data_in, long file_Len, long num_dm, long num_clust, long *cluster_ID, double **population_center)
+{
+  long i=0;
+  long j=0;
+  long ID=0;
+  long *size_c;
+
+
+
+  size_c=(long *)malloc(sizeof(long)*num_clust);
+  memset(size_c,0,sizeof(long)*num_clust);
+
+  for (i=0;i<num_clust;i++)
+    for (j=0;j<num_dm;j++)
+      population_center[i][j]=0;
+
+	
+  for (i=0;i<file_Len;i++)
+    {
+         ID=cluster_ID[i];
+			
+         if (ID==-1)
+         {
+            //printf("ID==-1! in ID2Center_all\n");
+            //exit(0);
+			fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010
+			exit(0);
+         }
+
+         for (j=0;j<num_dm;j++)
+           population_center[ID][j]=population_center[ID][j]+data_in[i][j];
+			
+         size_c[ID]++;        
+    }
+	
+ 
+  for (i=0;i<num_clust;i++)
+    {
+      for (j=0;j<num_dm;j++)
+        if (size_c[i]!=0)
+          population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
+        else
+		{
+          population_center[i][j]=0;
+		  printf("size_c[%ld]=0 in ID2center_all\n",i);
+		}
+    }
+
+
+  free(size_c);
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Merge neighboring grids to clusters
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+long merge_grids(double **normalized_data, double *interval, long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, 
+                 long *dense_grid_reverse, long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
+                 long *cluster_ID, long *grid_ID, long *grid_clusterID)
+{
+  
+
+  long i=0;
+  long j=0;
+  long t=0;
+  long p=0;
+  long num_clust=0;
+  long ReventID=0;
+  long denseID=0;
+  long neighbor_ID=0;
+  //long temp_grid=0;
+
+  long *grid_value;
+  long *search_value;
+	
+  long **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node
+
+  double real_dist=0;
+  double **norm_grid_center;
+	
+  norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids);
+  memset(norm_grid_center,0,sizeof(double*)*num_dense_grids);
+	
+  for (i=0;i<num_dense_grids;i++)
+    {
+      norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm);
+      memset(norm_grid_center[i],0,sizeof(double)*num_dm);
+    }
+
+  for (i=0;i<file_Len;i++)
+    {
+      denseID=eventID_To_denseventID[i];
+      if (denseID!=-1) //only dense events can enter
+        {
+          grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]];
+			
+          if (grid_ID[denseID]==-1)
+            {
+              fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n");
+              exit(0);
+            }			
+        }
+    }
+
+	
+  /* Find centroid (in the normalized data) for each dense grid */
+  ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center);	
+ 
+  //printf("pass the grid ID2 center\n");
+
+	
+  graph_of_grid=(long **)malloc(sizeof(long*)*num_dense_grids);
+  memset(graph_of_grid,0,sizeof(long*)*num_dense_grids);
+  for (i=0;i<num_dense_grids;i++)
+    {
+      graph_of_grid[i]=(long *)malloc(sizeof(long)*num_dm);
+      memset(graph_of_grid[i],0,sizeof(long)*num_dm);		
+		
+
+      for (j=0;j<num_dm;j++)
+        graph_of_grid[i][j]=-1;
+    }	
+	
+  grid_value=(long *)malloc(sizeof(long)*num_dm);
+  memset(grid_value,0,sizeof(long)*num_dm);
+
+  search_value=(long *)malloc(sizeof(long)*num_dm);
+  memset(search_value,0,sizeof(long)*num_dm);
+
+  
+  for (i=0;i<num_dense_grids;i++)
+    {
+      ReventID=densegridID_To_gridevent[i];
+
+      for (j=0;j<num_dm;j++)
+        {
+          grid_value[j]=position[ReventID][j];
+          
+        }
+     
+
+      /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in
+         the chosen dimension.  If the neighbor's centroid is not too far away, add it to this grid's neighbor
+         list. */
+      for (t=0;t<num_dm;t++)
+        {
+          for (p=0;p<num_dm;p++)
+            search_value[p]=grid_value[p];
+
+          if (grid_value[t]==num_bin-1)
+            continue;
+
+          search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side
+
+          neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq);
+			
+          if (neighbor_ID!=-1) 
+            {
+              real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t];
+	
+              if (real_dist<0)
+                real_dist=-real_dist;
+				
+              if (real_dist<2*interval[t]) //by using 2*interval, this switch is void
+                graph_of_grid[i][t]=neighbor_ID;			
+            }
+        }
+      grid_clusterID[i]=i; //initialize grid_clusterID
+    } 
+	
+	
+  //graph constructed
+  //DFS-based search begins
+
+  /* Use a depth-first search to construct a list of connected subgraphs (= "clusters").
+     Note our graph as we have constructed it is a DAG, so we can use that to our advantage
+     in our search. */
+  //  num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID);
+  num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID);
+
+	
+  //computes grid_ID and cluster_ID
+  for (i=0;i<file_Len;i++)
+    {
+      denseID=eventID_To_denseventID[i];
+      if (denseID!=-1) //only dense events can enter
+	  {
+        cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]];
+		//if (cluster_ID[denseID]==-1)
+		//	printf("catch you!\n");
+	  }
+    }
+	
+  free(search_value);
+  free(grid_value);
+
+  for (i=0;i<num_dense_grids;i++)
+    {
+      free(graph_of_grid[i]);
+      free(norm_grid_center[i]);
+    }
+  free(graph_of_grid);
+  free(norm_grid_center);
+
+  return num_clust;
+}
+
+/********************************************* Merge Clusters to Populations *******************************************/
+
+long merge_clusters(long num_clust, long num_dm, double *interval, double **cluster_center, long *cluster_populationID, long max_num_pop)
+{
+  long num_population=0;
+  long temp_num_population=0;
+
+  long i=0;
+  long j=0;
+  long t=0;
+  long merge=0;
+  long smid=0;
+  long bgid=0;
+  double merge_dist=1.1;
+
+  long *map_ID;
+
+  double diff=0;  
+
+  map_ID=(long*)malloc(sizeof(long)*num_clust);
+  memset(map_ID,0,sizeof(long)*num_clust);
+
+  for (i=0;i<num_clust;i++)
+  {
+    	cluster_populationID[i]=i;
+		map_ID[i]=i;
+  }
+    
+
+  while (((num_population>max_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1)))
+  {
+  
+	  if (num_population<=1)
+	  	  merge_dist=merge_dist-0.1;
+
+	  if (num_population>max_num_pop)
+          merge_dist=merge_dist+0.1;
+	  
+	    
+	 for (i=0;i<num_clust;i++)
+		cluster_populationID[i]=i;
+
+    for (i=0;i<num_clust-1;i++)
+    {
+      for (j=i+1;j<num_clust;j++)
+        {
+          merge=1;
+			
+          for (t=0;t<num_dm;t++)
+            {
+              diff=cluster_center[i][t]-cluster_center[j][t];
+				
+              if (diff<0)
+                diff=-diff;
+              if (diff>(merge_dist*interval[t]))
+                merge=0;
+            }
+
+          if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
+            {
+              if (cluster_populationID[i]<cluster_populationID[j])  //they could not be equal
+                {
+                  smid = cluster_populationID[i];
+                  bgid = cluster_populationID[j];
+                }
+              else
+                {
+                  smid = cluster_populationID[j];
+                  bgid = cluster_populationID[i];
+                }
+              for (t=0;t<num_clust;t++)
+                {
+                  if (cluster_populationID[t]==bgid)
+                    cluster_populationID[t]=smid;
+                }
+            }
+        }
+    }
+
+  
+
+  for (i=0;i<num_clust;i++)
+    map_ID[i]=-1;
+
+  for (i=0;i<num_clust;i++)
+    map_ID[cluster_populationID[i]]=1;
+
+  num_population=0;
+  for (i=0;i<num_clust;i++)
+    {
+      if (map_ID[i]==1)
+        {
+          map_ID[i]=num_population;
+          num_population++;
+        }
+    }
+
+  if ((temp_num_population>max_num_pop) && (num_population==1))
+	  break;
+  else
+	  temp_num_population=num_population;
+
+  if (num_clust<=1)
+	break;
+  } //end while
+
+  for (i=0;i<num_clust;i++)
+    cluster_populationID[i]=map_ID[cluster_populationID[i]];
+	
+  free(map_ID);
+
+  return num_population; 
+}
+
+///////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+double kmeans(double **Matrix, long k, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center)
+{
+	
+	long i=0;
+	long j=0;
+	long t=0;
+	long random=0;
+	long random1=0;
+	long random2=0;
+	long times=0;
+	long dist_used=0; //0 is Euclidean and 1 is Pearson
+	long random_init=0; //0: not use random seeds; 
+	long real_Len=0;
+	long skipped=0;
+	
+ 	long *num;  //num[i]=t means the ith cluster has t points
+	
+	double vvv=1.0; // the biggest variation;
+	double distance=0.0;
+	double xv=0.0;
+	double variation=0.0;
+
+	double mean_dx=0;
+	double mean_dy=0;
+	double sum_var=0;
+	double dx=0;
+	double dy=0;
+	double sd_x=0;
+	double sd_y=0;	
+	double diff=0;
+	double distortion=0;
+	double sd=0; //standard deviation
+	double shortest_distance;
+
+	double *temp_center;	
+			
+	double **sum;	
+ 
+	temp_center = (double *)malloc(sizeof(double)*num_dm);
+	memset(temp_center,0,sizeof(double)*num_dm);
+
+	if (random_init)
+	{
+		for (i=0;i<k;i++)
+		{	
+			random1=rand()*rand();
+			random2=abs((random1%5)+1);
+			for (t=0;t<random2;t++)
+				random2=random2*rand()+rand();
+	
+			random=abs(random2%file_Len);
+			for (j=0;j<num_dm;j++)
+				center[i][j]=Matrix[random][j];				
+		}
+	}
+
+
+	num = (long *)malloc(sizeof(long)*k);
+	memset(num,0,sizeof(long)*k);
+
+	sum = (double **)malloc(sizeof(double*)*k);
+	memset(sum,0,sizeof(double*)*k);
+	for (i=0;i<k;i++)
+	{
+		sum[i] = (double *)malloc(sizeof(double)*num_dm);
+		memset(sum[i],0,sizeof(double)*num_dm);
+	}
+
+
+	times=0;
+	real_Len=0;
+
+	while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1)))
+	{
+		for (i=0;i<k;i++)
+		{
+			num[i]=0;
+			for (j=0;j<num_dm;j++)
+				sum[i][j]=0.0;  
+		}
+		
+		for (i=0;i<file_Len;i++)  //for each data point i, we compute the distance between Matrix[i] and center[j]
+		{		
+			skipped = 0;
+			shortest_distance=MAX_VALUE;
+			for (j=0;j<k;j++)  //for each center j
+			{
+				distance=0.0;
+								
+				if (dist_used==0)  //Euclidean distance
+				{
+					for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions
+					{
+					
+						diff=center[j][t]-Matrix[i][t];
+					
+						diff=diff*diff;  
+						//this K-means is only used for dimension selection, and therefore for 1-dimensional data only, no need to use cube distance
+
+						distance = distance+diff; //here we have a weight for each dimension
+					}
+				}
+				else  //pearson correlation
+				{
+					mean_dx=0.0;
+					mean_dy=0.0;
+					sum_var=0.0;
+					dx=0.0;
+					dy=0.0;
+					sd_x=0.0;
+					sd_y=0.0;
+					for (t=0;t<num_dm;t++)
+					{
+						mean_dx+=center[j][t];
+						mean_dy+=Matrix[i][t];
+					}
+					mean_dx=mean_dx/(double)num_dm;
+					mean_dy=mean_dy/(double)num_dm;
+			
+					for (t=0;t<num_dm;t++)
+					{
+						dx=center[j][t]-mean_dx;
+						dy=Matrix[i][t]-mean_dy;
+						sum_var+=dx*dy;
+						sd_x+=dx*dx;
+						sd_y+=dy*dy;
+					}
+					if (sqrt(sd_x*sd_y)==0)
+						distance = 1.0;
+					else
+						distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
+					//printf("distance=%f\n",distance);			
+				}	//pearson correlation ends 				
+
+				if ((distance<shortest_distance) && (skipped == 0))
+				{
+					shortest_distance=distance;						
+					shortest_id[i]=j;  
+				}			
+			}//end for j
+				real_Len++;
+				num[shortest_id[i]]=num[shortest_id[i]]+1;
+				for (t=0;t<num_dm;t++)
+					sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];		
+		}//end for i
+	/* recompute the centers */
+	//compute_mean(group);		
+		vvv=0.0;
+		for (j=0;j<k;j++)
+		{
+			memcpy(temp_center,center[j],sizeof(double)*num_dm);
+			variation=0.0;
+			if (num[j]!=0)
+			{
+				for (t=0;t<num_dm;t++)
+				{
+					center[j][t]=sum[j][t]/(double)num[j];
+					xv=(temp_center[t]-center[j][t]);
+					variation=variation+xv*xv;
+				}
+			}
+			
+			if (variation>vvv)
+				vvv=variation;  //vvv is the biggest variation among the k clusters;			
+		}
+	//compute_variation;
+		times++;
+	} //end for while
+
+
+
+
+	free(num);
+
+	for (i=0;i<k;i++)
+		free(sum[i]);
+	free(sum);
+	free(temp_center);
+
+
+	return distortion;
+		
+}
+
+//////////////////////////////////////////////////////
+/*************************** Show *****************************/
+void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string)
+{
+	int situ1=0;
+	int situ2=0;
+
+	long i=0;
+	long id=0;
+	long j=0;
+	long info_id=0;
+	long nearest_id=0;
+	long insert=0;
+	long temp=0;
+	long m=0;
+	long n=0;
+	long t=0;
+	
+	long *size_c;
+	
+
+
+	long **size_mybound_1;
+	long **size_mybound_2;
+	long **size_mybound_3;
+	long **size_mybound_0;
+
+	double interval=0.0;
+
+	double *big;
+	double *small;
+
+
+	double **center;
+	double **mybound;
+	
+	long **prof; //prof[i][j]=1 means population i is + at parameter j
+	
+	FILE *fpcnt_id; //proportion id
+	//FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
+	FILE *fprof_id; //profile_id
+
+	big=(double *)malloc(sizeof(double)*num_dm);
+	memset(big,0,sizeof(double)*num_dm);
+
+	small=(double *)malloc(sizeof(double)*num_dm);
+	memset(small,0,sizeof(double)*num_dm);
+
+	for (i=0;i<num_dm;i++)
+	{
+		big[i]=0.0;
+		small[i]=(double)MAX_VALUE;
+	}
+	
+	
+	size_c=(long *)malloc(sizeof(long)*k);
+	memset(size_c,0,sizeof(long)*k);
+
+	center=(double**)malloc(sizeof(double*)*k);
+	memset(center,0,sizeof(double*)*k);
+	for (i=0;i<k;i++)
+	{
+		center[i]=(double*)malloc(sizeof(double)*num_dm);
+		memset(center[i],0,sizeof(double)*num_dm);
+	}
+
+	mybound=(double**)malloc(sizeof(double*)*num_dm);
+	memset(mybound,0,sizeof(double*)*num_dm);
+	for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
+	{
+		mybound[i]=(double*)malloc(sizeof(double)*3);
+		memset(mybound[i],0,sizeof(double)*3);
+	}
+
+	prof=(long **)malloc(sizeof(long*)*k);
+	memset(prof,0,sizeof(long*)*k);
+	for (i=0;i<k;i++)
+	{
+		prof[i]=(long *)malloc(sizeof(long)*num_dm);
+		memset(prof[i],0,sizeof(long)*num_dm);
+	}
+
+
+	for (i=0;i<file_Len;i++)
+	{
+		id=cluster_id[i];
+		for (j=0;j<num_dm;j++)
+		{
+			center[id][j]=center[id][j]+Matrix[i][j];
+			if (big[j]<Matrix[i][j])
+				big[j]=Matrix[i][j];
+			if (small[j]>Matrix[i][j])
+				small[j]=Matrix[i][j];
+		}
+		
+		size_c[id]++;		
+	}
+
+	for (i=0;i<k;i++)
+		for (j=0;j<num_dm;j++)
+		{			
+			if (size_c[i]!=0)
+				center[i][j]=(center[i][j]/(double)(size_c[i]));
+			else
+				center[i][j]=0;	
+		}
+
+	for (j=0;j<num_dm;j++)
+	{
+		interval=((big[j]-small[j])/4.0);
+		//printf("interval[%d] is %f\n",j,interval);
+		for (i=0;i<3;i++)
+			mybound[j][i]=small[j]+((double)(i+1)*interval);
+	}
+	
+
+	size_mybound_0=(long **)malloc(sizeof(long*)*k);
+	memset(size_mybound_0,0,sizeof(long*)*k);
+	
+	for (i=0;i<k;i++)
+	{
+		size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm);
+		memset(size_mybound_0[i],0,sizeof(long)*num_dm);		
+	}
+
+	size_mybound_1=(long **)malloc(sizeof(long*)*k);
+	memset(size_mybound_1,0,sizeof(long*)*k);
+	
+	for (i=0;i<k;i++)
+	{
+		size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm);
+		memset(size_mybound_1[i],0,sizeof(long)*num_dm);		
+	}
+
+	size_mybound_2=(long **)malloc(sizeof(long*)*k);
+	memset(size_mybound_2,0,sizeof(long*)*k);
+	
+	for (i=0;i<k;i++)
+	{
+		size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm);
+		memset(size_mybound_2[i],0,sizeof(long)*num_dm);	
+	}
+
+	size_mybound_3=(long **)malloc(sizeof(long*)*k);
+	memset(size_mybound_3,0,sizeof(long*)*k);
+
+	for (i=0;i<k;i++)
+	{
+		size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm);
+		memset(size_mybound_3[i],0,sizeof(long)*num_dm);
+	}
+	
+	for (i=0;i<file_Len;i++)
+		for (j=0;j<num_dm;j++)
+			{
+				if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
+					size_mybound_0[cluster_id[i]][j]++;
+				else
+				{
+					if (Matrix[i][j]<mybound[j][1])
+						size_mybound_1[cluster_id[i]][j]++;
+					else
+					{
+						if (Matrix[i][j]<mybound[j][2])
+							size_mybound_2[cluster_id[i]][j]++;
+						else
+							//if (Matrix[i][j]!=big[j]) //the biggest values excluded
+								size_mybound_3[cluster_id[i]][j]++;
+					}
+
+				}
+			}
+
+	fprof_id=fopen("profile.txt","w");
+	fprintf(fprof_id,"Population_ID\t");
+	fprintf(fprof_id,"%s\n",name_string);
+	
+	for (i=0;i<k;i++)
+	{
+		fprintf(fprof_id,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
+		for (j=0;j<num_dm;j++)
+		{
+			
+			if (size_mybound_0[i][j]>size_mybound_1[i][j])
+				situ1=0;
+			else
+				situ1=1;
+			if (size_mybound_2[i][j]>size_mybound_3[i][j])
+				situ2=2;
+			else
+				situ2=3;
+
+			if ((situ1==0) && (situ2==2))
+			{
+				if (size_mybound_0[i][j]>size_mybound_2[i][j])
+					prof[i][j]=0;
+				else
+					prof[i][j]=2;
+			}
+			if ((situ1==0) && (situ2==3))
+			{
+				if (size_mybound_0[i][j]>size_mybound_3[i][j])
+					prof[i][j]=0;
+				else
+					prof[i][j]=3;
+			}
+			if ((situ1==1) && (situ2==2))
+			{
+				if (size_mybound_1[i][j]>size_mybound_2[i][j])
+					prof[i][j]=1;
+				else
+					prof[i][j]=2;
+			}
+			if ((situ1==1) && (situ2==3))
+			{
+				if (size_mybound_1[i][j]>size_mybound_3[i][j])
+					prof[i][j]=1;
+				else
+					prof[i][j]=3;
+			}
+			
+			//begin to output profile
+			if (j==num_dm-1)
+			{
+				if (prof[i][j]==0)
+					fprintf(fprof_id,"1\n");
+				if (prof[i][j]==1)
+					fprintf(fprof_id,"2\n");
+				if (prof[i][j]==2)
+					fprintf(fprof_id,"3\n");
+				if (prof[i][j]==3)
+					fprintf(fprof_id,"4\n");
+			}
+			else
+			{
+				if (prof[i][j]==0)
+					fprintf(fprof_id,"1\t");
+				if (prof[i][j]==1)
+					fprintf(fprof_id,"2\t");
+				if (prof[i][j]==2)
+					fprintf(fprof_id,"3\t");
+				if (prof[i][j]==3)
+					fprintf(fprof_id,"4\t");
+			}
+		}
+	}
+	fclose(fprof_id);
+
+	///////////////////////////////////////////////////////////
+	
+
+	fpcnt_id=fopen("percentage.txt","w");
+	fprintf(fpcnt_id,"Population_ID\tPercentage\n");
+
+	for (t=0;t<k;t++)
+	{
+		fprintf(fpcnt_id,"%ld\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len);	//t changed to t+1 to start from 1 instead of 0: April 16, 2009									
+	}
+	fclose(fpcnt_id);
+
+	free(big);
+	free(small);
+	free(size_c);
+
+	for (i=0;i<k;i++)
+	{
+		free(center[i]);
+		free(prof[i]);
+		free(size_mybound_0[i]);
+		free(size_mybound_1[i]);
+		free(size_mybound_2[i]);
+		free(size_mybound_3[i]);
+	}
+	free(center);
+	free(prof);
+	free(size_mybound_0);
+	free(size_mybound_1);
+	free(size_mybound_2);
+	free(size_mybound_3);
+
+	for (i=0;i<num_dm;i++)
+		free(mybound[i]);
+	free(mybound);
+	
+}
+///////////////////////////////////////////////////////////////////////////
+
+double get_avg_dist(double *population_center, long smaller_pop_ID, long larger_pop_ID, long *population_ID, long num_population, long file_Len, 
+				   long num_dm, double **normalized_data, long d1, long d2, long d3, long *size_c)
+{
+	long k=0;
+	long temp=0;
+	long i=0;
+	long j=0;
+	long t=0;
+	long current_biggest=0;
+
+
+	double total_dist=0.0;
+	double distance=0.0;
+	double dist=0.0;
+	double biggest_distance=0.0;
+
+	long *nearest_neighbors;
+	double *nearest_distance;
+
+	k=min(size_c[smaller_pop_ID],size_c[larger_pop_ID]);
+	
+	if (k>0)
+	{
+		k=(long)sqrt((double)k);
+		if (num_dm<=3)
+			k=(long)(2.7183*(double)k);  //e*k for low-dimensional space, added Nov. 4, 2010
+		//printf("the k-nearest k is %d\n",k);
+	}
+	else
+	{
+		printf("error in k\n");
+		exit(0);
+	}
+
+	nearest_neighbors=(long *)malloc(sizeof(long)*k);
+	memset(nearest_neighbors,0,sizeof(long)*k);
+
+	nearest_distance=(double *)malloc(sizeof(double)*k);
+	memset(nearest_distance,0,sizeof(double)*k);
+
+	temp=0;
+
+	for (i=0;i<file_Len;i++)
+	{
+		if ((population_ID[i]==smaller_pop_ID) || (population_ID[i]==larger_pop_ID)) //the event belongs to the pair of populations
+		{
+			distance=0.0;
+			for (t=0;t<num_dm;t++)
+			{
+				if ((t!=d1) && (t!=d2) && (t!=d3))
+					continue;
+				else
+				{
+					dist=population_center[t]-normalized_data[i][t];
+					distance=distance+dist*dist;
+				}
+			}
+
+			if (temp<k)
+			{
+				nearest_neighbors[temp]=i;
+				nearest_distance[temp]=distance;
+				temp++;
+			}
+			else
+			{
+			    biggest_distance=0.0;
+				for (j=0;j<k;j++)
+				{
+					if (nearest_distance[j]>biggest_distance)
+					{
+						biggest_distance=nearest_distance[j];
+						current_biggest=j;
+					}
+				}
+
+				if (biggest_distance>distance)
+				{
+					nearest_neighbors[current_biggest]=i;
+					nearest_distance[current_biggest]=distance;
+				}
+			}
+		}
+	}
+
+    for (i=0;i<k;i++)
+		total_dist=total_dist+nearest_distance[i];
+
+	total_dist=total_dist/(double)k;
+
+	free(nearest_distance);
+	free(nearest_neighbors);
+	
+	return total_dist;
+}
+
+
+/******************************************************** Main Function **************************************************/
+//for normalized data, there are five variables:
+//cluster_ID
+//population_center
+//grid_clusterID
+//grid_ID
+//grid_Center
+
+//the same five variables exist for the original data
+//however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data
+//also, data + cluster_ID -> population_center
+//data + grid_ID -> grid_Center
+
+/* what is the final output */
+//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
+//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)
+
+/*  what information should select_bin output */
+//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
+//is unknown, therefore I must use a prescreening to output 
+//how many bins I should use
+//the number of dense grids
+//total number of events in the dense grids
+
+/* basic procedure of main function */ 
+//1. read raw file and normalize the raw file
+//2. select_bin
+//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. 
+//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
+//5. release normalized data; allocate memory for grid_Center and population_center
+//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers
+
+int main (int argc, char **argv)
+{
+  //inputs
+  FILE *f_src; //source file pointer
+ 
+  FILE *f_out; //coordinates
+  FILE *f_cid; //population-ID of events
+  FILE *f_ctr; //centroids of populations
+  FILE *f_results; //coordinates file event and population column
+  FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
+  FILE *f_parameters; //number of bins and density calculated by
+                      //the algorithm. Used to update the database
+  FILE *f_properties; //Properties file used by Image generation software
+
+  //char tmpbuf[128];
+
+  char para_name_string[LINE_LEN];
+
+  long time_ID=-1;
+  long num_bin=0; //the bins I will use on each dimension
+  long num_pop=0;	
+  long file_Len=0; //number of events		
+  long num_dm=0;
+  long num_clust=0;
+  long num_dense_events=0;
+  long num_dense_grids=0;
+  long num_nonempty=0;
+  long num_population=0;
+  long num_real_pop=0;
+  long keep_merge=0;
+  long num_checked_range=0;
+
+  //below are read from configuration file
+  long i=0;
+  long j=0;
+  long p=0;
+  long t=0;
+  long q=0;
+  long temp_i=0;
+  long temp_j=0;
+  long first_i=0;
+  long first_j=0;
+  long d1=0;
+  long d2=0;
+  long d3=0;
+  long d_d=0;
+  long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d
+  long index_id=0;
+  long num_computed_population=0;
+  long tot_size=0;
+
+  int den_t_event=0;
+
+  double distance=0.0;
+  double dist=0.0;
+  double current_smallest_dist=0;
+  double max_d_dist=0.0;
+  double Ei=0.0;
+  double Ej=0.0;
+  double E1=0.0;
+  double E2=0.0;
+  double E3=0.0;
+  double Ep=0.0;
+  double temp=0.0;
+  double tmp=0.0;
+
+  long *grid_clusterID; //(dense)gridID to clusterID
+  long *grid_ID; //(dense)eventID to gridID
+  long *cluster_ID; //(dense)eventID to clusterID
+  long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
+  long *all_grid_ID; //gridID for all events
+  long *densegridID_To_gridevent;
+  long *sorted_seq;
+  long *dense_grid_reverse;
+  long *population_ID; //denseeventID to populationID
+  long *cluster_populationID; //clusterID to populationID
+  long *grid_populationID; //gridID to populationID
+  long *all_population_ID; //populationID of event
+  long *size_c;
+  long *size_p_2;
+  long *partit;
+  long *size_p;
+  long *temp_size_j;
+  long *all_computed_pop_ID;
+
+
+  long **position;
+  long **dense_grid_seq;
+  long **temp_population_ID;
+
+  double *interval;
+  double *center_1;
+  double *center_2;
+  double *center_3;
+  double *aver;
+  double *std;
+  double *center_p_1;
+  double *center_p_2;
+  double *center_p_3;
+	
+  double **population_center; //population centroids in the raw/original data
+  double **cluster_center; //cluster centroids in the raw/original data
+  double **new_population_center;
+  double **temp_population_center;
+  double **min_pop;
+  double **max_pop;
+
+  double **input_data;
+  double **normalized_data;
+
+  double **real_pop_center;
+  double **distance_pop;
+
+  double ***ind_pop;
+  double ***ind_pop_3;
+		
+  int min = 999999;
+  int max = 0;
+
+  printf( "Starting time:\t\t\t\t");
+  fflush(stdout);
+  system("/bin/date");
+
+  /*
+   * Windows Version
+  _strtime( tmpbuf );
+  printf( "Starting time:\t\t\t\t%s\n", tmpbuf );
+  _strdate( tmpbuf );
+  printf( "Starting date:\t\t\t\t%s\n", tmpbuf );
+  */
+
+  /////////////////////////////////////////////////////////////
+
+  if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6))
+  {
+      fprintf(stderr,"Incorrect number of input parameters!\n"); 
+	  fprintf(stderr,"usage:\n");
+	  fprintf(stderr,"basic mode: flock data_file\n");
+	  fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n");
+	  fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n");
+	  fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n");
+	  fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n");
+      exit(0);
+  }	
+
+  f_src=fopen(argv[1],"r");
+
+  if (argc==3)
+  {
+	max_num_pop=atoi(argv[2]);
+	printf("num of maximum pop is %ld\n",max_num_pop);
+  }
+  
+  if (argc==4)
+  {
+	num_bin=atoi(argv[2]);
+	printf("num_bin is %ld\n",num_bin);
+
+	den_t_event=atoi(argv[3]);
+	printf("density_index is %d\n",den_t_event);
+  }
+
+  if (argc==5)
+  {
+	num_bin=atoi(argv[2]);
+	printf("num_bin is %ld\n",num_bin);
+
+	den_t_event=atoi(argv[3]);
+	printf("density_index is %d\n",den_t_event);
+
+	num_pop=atoi(argv[4]);
+	printf("num of pop is %ld\n",num_pop);
+  }
+
+  if (argc==6)
+  {
+	num_bin=atoi(argv[2]);
+	printf("num_bin is %ld\n",num_bin);
+
+	den_t_event=atoi(argv[3]);
+	printf("density_index is %d\n",den_t_event);
+
+	num_pop=atoi(argv[4]);
+	printf("num of pop is %ld\n",num_pop);
+
+	max_num_pop=atoi(argv[5]);
+	printf("num of pop is %ld\n",max_num_pop);
+  }
+
+
+  if (num_pop==1)
+  {
+	  printf("it is not allowed to specify only one population\n");
+	  exit(0);
+  }
+
+    
+
+  getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters
+
+  
+  if (max_num_pop==0)
+  {
+	  max_num_pop=(long)pow(2,num_dm);
+		if (max_num_pop>MAX_POP_NUM)
+			max_num_pop=MAX_POP_NUM;
+  }
+
+  /************************************************* Read the data *****************************************************/
+	
+  rewind(f_src); //reset the file pointer	
+
+  input_data = (double **)malloc(sizeof(double*)*file_Len);
+  memset(input_data,0,sizeof(double*)*file_Len);
+  for (i=0;i<file_Len;i++)
+  {
+     input_data[i]=(double *)malloc(sizeof(double)*num_dm);
+     memset(input_data[i],0,sizeof(double)*num_dm);
+  }
+	
+  readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data;
+	
+  fclose(f_src);
+
+  normalized_data=(double **)malloc(sizeof(double*)*file_Len);
+  memset(normalized_data,0,sizeof(double*)*file_Len);
+  for (i=0;i<file_Len;i++)
+    {
+      normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
+      memset(normalized_data[i],0,sizeof(double)*num_dm);
+    }
+	
+  tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data);
+	
+
+  position=(long **)malloc(sizeof(long*)*file_Len);
+  memset(position,0,sizeof(long*)*file_Len);
+  for (i=0;i<file_Len;i++)
+    {
+      position[i]=(long*)malloc(sizeof(long)*num_dm);
+      memset(position[i],0,sizeof(long)*num_dm);
+    }
+
+  all_grid_ID=(long *)malloc(sizeof(long)*file_Len);
+  memset(all_grid_ID,0,sizeof(long)*file_Len);
+
+  sorted_seq=(long*)malloc(sizeof(long)*file_Len);
+  memset(sorted_seq,0,sizeof(long)*file_Len);
+	
+  interval=(double*)malloc(sizeof(double)*num_dm);
+  memset(interval,0,sizeof(double)*num_dm);
+
+  /************************************************* select_bin *************************************************/
+	
+  if (num_bin>=1)  //num_bin has been selected by user
+  {
+  	select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
+	printf("user set bin number is %ld\n",num_bin); 
+  }
+  else  //num_bin has not been selected by user
+  {
+	num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
+	printf("selected bin number is %ld\n",num_bin);    
+  }
+  printf("number of non-empty grids is %ld\n",num_nonempty);
+		
+ 
+
+  /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
+  free(sorted_seq);
+	
+	
+  dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty);
+  memset(dense_grid_reverse,0,sizeof(long)*num_nonempty);	
+
+  /************************************************* select_dense **********************************************/
+
+  if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
+	den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
+  else
+  {
+	den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
+	printf("automated selected density threshold is %d\n",den_t_event);
+  }		
+
+  printf("Number of dense grids is %ld\n",num_dense_grids);
+
+  densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids);
+  memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids);
+
+  for (i=0;i<num_dense_grids;i++)
+    densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1
+	
+
+  eventID_To_denseventID=(long *)malloc(sizeof(long)*file_Len);
+  memset(eventID_To_denseventID,0,sizeof(long)*file_Len);     //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
+
+
+  grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
+
+	
+  dense_grid_seq=(long **)malloc(sizeof(long*)*num_dense_grids);
+  memset(dense_grid_seq,0,sizeof(long*)*num_dense_grids);
+  for (i=0;i<num_dense_grids;i++)
+    {
+      dense_grid_seq[i]=(long *)malloc(sizeof(long)*num_dm);
+      memset(dense_grid_seq[i],0,sizeof(long)*num_dm);
+    }
+
+
+  /* Look up the binned data values for each dense grid */
+  generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq); 	
+	
+	
+  /************************************************* allocate memory *********************************************/
+	
+  grid_clusterID=(long *)malloc(sizeof(long)*num_dense_grids);
+  memset(grid_clusterID,0,sizeof(long)*num_dense_grids);
+
+  grid_ID=(long *)malloc(sizeof(long)*num_dense_events);
+  memset(grid_ID,0,sizeof(long)*num_dense_events);
+
+  cluster_ID=(long *)malloc(sizeof(long)*num_dense_events);
+  memset(cluster_ID,0,sizeof(long)*num_dense_events);
+
+
+  /*********************************************** merge_grids ***********************************************/
+  //long merge_grids(long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, long *dense_grid_rank, long *dense_grid_reverse,
+  //			 long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
+  //			 long *cluster_ID, long *grid_ID, long *grid_clusterID)
+	
+  num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID);
+	
+  printf("computed number of groups is %ld\n",num_clust);	
+
+	
+  /************************************** release unnecessary memory and allocate memory and compute centers **********************************/
+	
+	
+  for (i=0;i<file_Len;i++)
+    free(position[i]);
+  free(position);
+
+  for (i=0;i<num_dense_grids;i++)
+    free(dense_grid_seq[i]);
+  free(dense_grid_seq);
+
+  free(dense_grid_reverse);
+	
+  free(densegridID_To_gridevent);
+  free(all_grid_ID);
+	
+  // cluster_center ////////////////////////////////////////////////////////////////////////////////////////////////////////
+	
+  cluster_center=(double **)malloc(sizeof(double*)*num_clust);
+  memset(cluster_center,0,sizeof(double*)*num_clust);
+  for (i=0;i<num_clust;i++)
+  {
+     cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
+     memset(cluster_center[i],0,sizeof(double)*num_dm);
+  }
+	
+  ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data
+	
+  //printf("pass the first ID2center\n");
+
+  /*** population_ID and grid_populationID **/
+	
+  cluster_populationID=(long*)malloc(sizeof(long)*num_clust);
+  memset(cluster_populationID,0,sizeof(long)*num_clust);
+
+  grid_populationID=(long*)malloc(sizeof(long)*num_dense_grids);
+  memset(grid_populationID,0,sizeof(long)*num_dense_grids);
+
+  population_ID=(long*)malloc(sizeof(long)*num_dense_events);
+  memset(population_ID,0,sizeof(long)*num_dense_events);
+
+  num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID, max_num_pop);
+
+  
+
+  for (i=0;i<num_clust;i++)
+    free(cluster_center[i]);
+  free(cluster_center);
+
+  free(interval);
+	
+  for (i=0;i<num_dense_grids;i++)
+    {
+      grid_populationID[i]=cluster_populationID[grid_clusterID[i]];
+    }
+
+  for (i=0;i<num_dense_events;i++)
+    {
+      population_ID[i]=cluster_populationID[cluster_ID[i]];
+    }
+
+  printf("Number of merged groups before second-run partitioning is %ld\n",num_population);
+
+  
+
+  // population_center /////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ 
+  population_center=(double **)malloc(sizeof(double*)*num_population);
+  memset(population_center,0,sizeof(double*)*num_population);
+  for (i=0;i<num_population;i++)
+  {
+     population_center[i]=(double*)malloc(sizeof(double)*num_dm);
+     memset(population_center[i],0,sizeof(double)*num_dm);
+  }
+
+  
+	
+  ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data
+	
+
+  // show ////////////////////////////////////////////////////////////////////////////////
+  all_population_ID=(long*)malloc(sizeof(long)*file_Len);
+  memset(all_population_ID,0,sizeof(long)*file_Len);
+
+  kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center);
+
+  for (i=0;i<num_population;i++)
+	free(population_center[i]);
+
+  free(population_center);
+  
+  ////// there needs to be another step to further partition the data
+  
+	
+	  center_p_1=(double *)malloc(sizeof(double)*3);
+	  memset(center_p_1,0,sizeof(double)*3);
+
+	  center_p_2=(double *)malloc(sizeof(double)*3);
+	  memset(center_p_2,0,sizeof(double)*3);
+
+	  center_p_3=(double *)malloc(sizeof(double)*3);
+	  memset(center_p_3,0,sizeof(double)*3);
+
+	  size_p=(long *)malloc(sizeof(long)*num_population);
+	  memset(size_p,0,sizeof(long)*num_population);
+	
+	  temp_size_j=(long *)malloc(sizeof(long)*num_population);
+	  memset(temp_size_j,0,sizeof(long)*num_population);
+
+	  all_computed_pop_ID=(long *)malloc(sizeof(long)*file_Len);
+	  memset(all_computed_pop_ID,0,sizeof(long)*file_Len);
+
+	  for (i=0;i<file_Len;i++)
+	  {
+		size_p[all_population_ID[i]]++;
+		all_computed_pop_ID[i]=all_population_ID[i];
+	  }
+
+	  ind_pop=(double***)malloc(sizeof(double**)*num_population);
+	  memset(ind_pop,0,sizeof(double**)*num_population);
+
+	  ind_pop_3=(double***)malloc(sizeof(double**)*num_population);
+	  memset(ind_pop_3,0,sizeof(double**)*num_population);
+
+	  for (i=0;i<num_population;i++)
+	  {
+		  ind_pop[i]=(double**)malloc(sizeof(double*)*size_p[i]);
+		  memset(ind_pop[i],0,sizeof(double*)*size_p[i]);
+
+		  ind_pop_3[i]=(double**)malloc(sizeof(double*)*size_p[i]);
+		  memset(ind_pop_3[i],0,sizeof(double*)*size_p[i]);
+
+		  for (j=0;j<size_p[i];j++)
+		  {
+			  ind_pop[i][j]=(double*)malloc(sizeof(double)*num_dm);
+			  memset(ind_pop[i][j],0,sizeof(double)*num_dm);
+
+			  ind_pop_3[i][j]=(double*)malloc(sizeof(double)*3);
+			  memset(ind_pop_3[i][j],0,sizeof(double)*3);
+		  }
+		  temp_size_j[i]=0;
+	  }
+
+	  for (i=0;i<file_Len;i++) //to generate ind_pop[i]
+	  {
+		  index_id=all_population_ID[i];
+		  
+		  j=temp_size_j[index_id];
+
+		  for (t=0;t<num_dm;t++)
+		  {
+			ind_pop[index_id][j][t]=input_data[i][t];
+		  }
+		  temp_size_j[index_id]++;		
+	  }
+
+	  aver=(double*)malloc(sizeof(double)*num_dm);
+	  memset(aver,0,sizeof(double)*num_dm);
+
+	  std=(double*)malloc(sizeof(double)*num_dm);
+	  memset(std,0,sizeof(double)*num_dm);
+
+	  temp_population_center=(double**)malloc(sizeof(double*)*2);
+	  memset(temp_population_center,0,sizeof(double*)*2);
+	  for (i=0;i<2;i++)
+	  {
+		temp_population_center[i]=(double*)malloc(sizeof(double)*3);
+		memset(temp_population_center[i],0,sizeof(double)*3);
+	  }
+
+	  size_p_2=(long*)malloc(sizeof(long)*2);
+	  memset(size_p_2,0,sizeof(long)*2);
+
+	  partit=(long*)malloc(sizeof(long)*num_population);
+	  memset(partit,0,sizeof(long)*num_population);
+
+	  num_computed_population=num_population;
+
+	  temp_population_ID=(long**)malloc(sizeof(long*)*num_population);
+	  memset(temp_population_ID,0,sizeof(long*)*num_population);
+
+	  for (i=0;i<num_population;i++)
+	  {
+		temp_population_ID[i]=(long*)malloc(sizeof(long)*size_p[i]);
+		memset(temp_population_ID[i],0,sizeof(long)*size_p[i]);
+	  }
+	
+	  //printf("num_population=%d\n",num_population);
+	  
+	  for (i=0;i<num_population;i++) 
+	  {
+		  partit[i]=0;
+		  tran(ind_pop[i], size_p[i], num_dm, 1, ind_pop[i]);//0-1 normalize ind_pop[i]
+          
+		  //find the 3 dimensions with the largest std for ind_pop[i]
+		  d1=-1;
+		  d2=-1;
+		  d3=-1;
+
+		  for (t=0;t<num_dm;t++)
+		  {
+			aver[t]=0;
+			for (j=0;j<size_p[i];j++)
+			{
+				aver[t]=aver[t]+ind_pop[i][j][t];
+			}
+			aver[t]=aver[t]/(double)size_p[i];
+
+			std[t]=0;
+			for (j=0;j<size_p[i];j++)
+			{
+				std[t]=std[t]+((ind_pop[i][j][t]-aver[t])*(ind_pop[i][j][t]-aver[t]));
+			}
+			std[t]=sqrt(std[t]/(double)size_p[i]);
+		  }
+
+		  
+
+		  for (j=0;j<3;j++)
+		  {
+			max_d_dist=0;
+			for (t=0;t<num_dm;t++)
+			{
+				if ((t!=d1) && (t!=d2))
+				{
+					dist=std[t];
+				}
+				else
+					dist=-1;
+
+				if (dist>max_d_dist)
+				{
+					max_d_dist=dist;
+					d_d=t;						
+				}
+			}
+
+			if (j==0)
+				d1=d_d;
+			if (j==1)
+				d2=d_d;
+			if (j==2)
+				d3=d_d;
+		  }
+		 
+		 
+		  for (t=0;t<size_p[i];t++)
+		  {
+				ind_pop_3[i][t][0]=ind_pop[i][t][d1];
+				ind_pop_3[i][t][1]=ind_pop[i][t][d2];
+				ind_pop_3[i][t][2]=ind_pop[i][t][d3];
+		  }
+		
+
+		 temp_population_center[0][0]=(aver[d1])/2.0;
+		 
+		 temp_population_center[0][1]=aver[d2];
+	     temp_population_center[0][2]=aver[d3];
+		 
+		 temp_population_center[1][0]=(aver[d1]+1.0)/2.0;
+
+		 temp_population_center[1][1]=aver[d2];
+		 temp_population_center[1][2]=aver[d3];
+		 
+		 
+
+		 //run K-means with K=2
+		 kmeans(ind_pop_3[i], 2, KMEANS_TERM, size_p[i], 3, temp_population_ID[i], temp_population_center);
+
+		 for (j=0;j<2;j++)
+			 size_p_2[j]=0;
+
+		 for (j=0;j<size_p[i];j++)
+			 size_p_2[temp_population_ID[i][j]]++;
+
+		 for (j=0;j<2;j++)
+			 if (size_p_2[j]==0)
+				 printf("size_p_2=0 at i=%ld\n",i);
+
+
+		 //check whether the 2 parts should be merged
+		
+
+		 Ei=get_avg_dist(temp_population_center[0], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
+		 Ej=get_avg_dist(temp_population_center[1], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
+
+		 for (t=0;t<3;t++)
+		 {
+			center_p_1[t]=temp_population_center[0][t]*0.25+temp_population_center[1][t]*0.75;
+			center_p_2[t]=temp_population_center[0][t]*0.5+temp_population_center[1][t]*0.5;
+			center_p_3[t]=temp_population_center[0][t]*0.75+temp_population_center[1][t]*0.25;
+		 }
+		
+		 E1=get_avg_dist(center_p_1, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
+		
+		 E2=get_avg_dist(center_p_2, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
+
+		 E3=get_avg_dist(center_p_3, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
+
+		 //printf("i=%d;E1=%f;E2=%f;E3=%f\n",i,E1,E2,E3);
+
+		 if (E1<E2)
+			Ep=E2;
+		 else
+			Ep=E1;
+			
+		 Ep=max(Ep,E3); //Ep is the most sparse area
+
+		 
+
+		 if ((Ep>Ei) && (Ep>Ej)) //the two parts should be partitioned
+		 {
+			 partit[i]=num_computed_population;
+			 num_computed_population++;			 
+		 }
+		
+	  } //end for (i=0;i<num_population;i++)
+
+	  
+	  for (i=0;i<num_population;i++)
+		  temp_size_j[i]=0;
+
+	  for (i=0;i<file_Len;i++)
+	  {
+		index_id=all_population_ID[i];
+		if (partit[index_id]>0)
+		{
+			j=temp_size_j[index_id];
+			if (temp_population_ID[index_id][j]==1)
+				all_computed_pop_ID[i]=partit[index_id];
+					
+			temp_size_j[index_id]++;
+		}
+	  }
+
+	  printf("Number of groups after partitioning is %ld\n", num_computed_population);
+
+	
+      free(size_p_2);
+	  
+	  free(aver);
+	  free(std);
+	  free(partit);
+
+	  free(center_p_1);
+	  free(center_p_2);
+	  free(center_p_3);
+
+	 
+
+	  for (i=0;i<num_population;i++)
+		  free(temp_population_ID[i]);
+	  free(temp_population_ID);
+
+	  for (i=0;i<num_population;i++)
+	  {
+		for (j=0;j<size_p[i];j++)
+		{
+			  free(ind_pop[i][j]);
+			  free(ind_pop_3[i][j]);
+		}
+	  }
+
+	 
+
+	  for (i=0;i<num_population;i++)
+	  {
+		  free(ind_pop[i]);
+		  free(ind_pop_3[i]);
+	  }
+	  free(ind_pop);
+	  free(ind_pop_3);
+
+	  
+	 
+
+	  for (i=0;i<2;i++)
+		  free(temp_population_center[i]);
+
+	  free(temp_population_center);
+
+	  free(temp_size_j);
+	  free(size_p);
+
+	 
+
+	  //update the IDs, Centers, and # of populations
+	  for (i=0;i<file_Len;i++)
+	  {
+		  all_population_ID[i]=all_computed_pop_ID[i];
+		  if (all_population_ID[i]>=num_computed_population)
+			  printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]);
+	  }
+
+	  num_population=num_computed_population;
+
+	  free(all_computed_pop_ID);
+    //end partitioning
+  // since the num_population has changed, population_center needs to be redefined as below
+
+  population_center=(double **)malloc(sizeof(double*)*num_population);
+  memset(population_center,0,sizeof(double*)*num_population);
+  for (i=0;i<num_population;i++)
+  {
+     population_center[i]=(double*)malloc(sizeof(double)*num_dm);
+     memset(population_center[i],0,sizeof(double)*num_dm);
+  }
+
+  	  
+  ID2Center_all(normalized_data,file_Len,num_dm,num_population,all_population_ID,population_center);
+  
+
+
+  ////// end of further partitioning
+
+  ////////////////////////////////////////////////////////////
+  //  to further merge populations to avoid overpartitioning
+  //  Added June 20, 2010
+  ////////////////////////////////////////////////////////////
+  
+
+  
+  num_real_pop=num_population;
+  keep_merge=1;
+
+  if (num_pop>num_real_pop)
+  {
+		fprintf(stderr,"number of populations specified too large\n");
+		exit(0);
+  }
+
+  center_1=(double *)malloc(sizeof(double)*num_dm);
+  memset(center_1,0,sizeof(double)*num_dm);
+
+  center_2=(double *)malloc(sizeof(double)*num_dm);
+  memset(center_2,0,sizeof(double)*num_dm);
+
+  center_3=(double *)malloc(sizeof(double)*num_dm);
+  memset(center_3,0,sizeof(double)*num_dm);
+
+
+    
+  while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2)))
+  {
+	  
+	keep_merge=0;  //so, if no entering merge function, while will exit when num_pop==0
+
+	real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop);
+    memset(real_pop_center,0,sizeof(double*)*num_real_pop);
+    for (i=0;i<num_real_pop;i++)
+    {
+      real_pop_center[i]=(double*)malloc(sizeof(double)*num_dm);
+      memset(real_pop_center[i],0,sizeof(double)*num_dm);
+    }
+
+	min_pop=(double **)malloc(sizeof(double*)*num_real_pop);
+    memset(min_pop,0,sizeof(double*)*num_real_pop);
+    for (i=0;i<num_real_pop;i++)
+    {
+      min_pop[i]=(double*)malloc(sizeof(double)*num_dm);
+      memset(min_pop[i],0,sizeof(double)*num_dm);
+    }
+
+	max_pop=(double **)malloc(sizeof(double*)*num_real_pop);
+    memset(max_pop,0,sizeof(double*)*num_real_pop);
+    for (i=0;i<num_real_pop;i++)
+    {
+      max_pop[i]=(double*)malloc(sizeof(double)*num_dm);
+      memset(max_pop[i],0,sizeof(double)*num_dm);
+    }
+
+	if (num_real_pop==num_population)
+	{
+		for (i=0;i<num_real_pop;i++)
+			for (j=0;j<num_dm;j++)
+				real_pop_center[i][j]=population_center[i][j];	
+	}
+	else
+	{
+		ID2Center_all(normalized_data,file_Len,num_dm,num_real_pop,all_population_ID,real_pop_center);
+	}
+
+	for (i=0;i<num_real_pop;i++)
+	{
+		for (j=0;j<num_dm;j++)
+		{
+			min_pop[i][j]=MAX_VALUE;
+			
+			max_pop[i][j]=0;
+		}			
+	}
+	
+	for (i=0;i<file_Len;i++)
+	{
+		index_id=all_population_ID[i];
+		for (j=0;j<num_dm;j++)
+		{
+			if (normalized_data[i][j]<min_pop[index_id][j])
+				min_pop[index_id][j]=normalized_data[i][j];
+			
+			if (normalized_data[i][j]>max_pop[index_id][j])
+				max_pop[index_id][j]=normalized_data[i][j];
+		}			
+	}
+
+/*	for (i=0;i<num_real_pop;i++)
+	{
+		for (j=0;j<num_dm;j++)
+		{
+			printf("min_pop is %f\t",min_pop[i][j]);
+			//printf("max_pop is %f\n",max_pop[i][j]);
+		}
+		printf("\n");
+	}
+*/
+
+
+	distance_pop=(double **)malloc(sizeof(double*)*num_real_pop);
+    memset(distance_pop,0,sizeof(double*)*num_real_pop);
+    for (i=0;i<num_real_pop;i++)
+    {
+      distance_pop[i]=(double*)malloc(sizeof(double)*num_real_pop);
+      memset(distance_pop[i],0,sizeof(double)*num_real_pop);
+    }
+
+	for (i=0;i<num_real_pop-1;i++)
+	{
+		for (j=i+1;j<num_real_pop;j++)
+		{
+		  distance=0;
+		
+		  for (t=0;t<num_dm;t++)
+		  {
+			dist=real_pop_center[i][t]-real_pop_center[j][t];
+			distance=distance+dist*dist;
+		  }
+		  distance_pop[i][j]=distance;
+		  distance_pop[j][i]=distance;	      
+		}
+	}
+ 
+	if (num_real_pop>2)
+		num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances
+	else
+		num_checked_range=1;
+
+	size_c=(long *)malloc(sizeof(long)*num_real_pop);
+    memset(size_c,0,sizeof(long)*num_real_pop);
+	
+	for (i=0;i<file_Len;i++)
+	  size_c[all_population_ID[i]]++;
+
+	for (p=0;p<num_checked_range;p++)  
+	{
+		current_smallest_dist=MAX_VALUE;
+
+		for (i=0;i<num_real_pop-1;i++)
+		{
+			for (j=i+1;j<num_real_pop;j++)
+			{
+				if (distance_pop[i][j]<current_smallest_dist)
+				{
+					current_smallest_dist=distance_pop[i][j];
+				
+					temp_i=i;
+					temp_j=j;
+				}
+			}
+		}
+
+		if (p==0)
+		{
+			first_i=temp_i;
+			first_j=temp_j;
+		}
+
+		distance_pop[temp_i][temp_j]=MAX_VALUE+1; //make sure this pair won't be selected next time
+		
+		//normalize and calculate std for temp_i and temp_j
+		
+		//pick the largest 3 dimensions on standard deviation
+		d1=-1;
+		d2=-1;
+		d3=-1;
+
+		for (i=0;i<3;i++)
+		{
+			max_d_dist=0;
+			for (j=0;j<num_dm;j++)
+			{
+				if ((j!=d1) && (j!=d2))
+				{
+					temp=min(min_pop[temp_i][j],min_pop[temp_j][j]);
+					tmp=max(max_pop[temp_i][j],max_pop[temp_j][j]);
+					if (tmp<=temp)
+					{
+						//printf("min_pop[%d][%d]=%f \t min_pop[%d][%d]=%f \t max_pop[%d][%d]=%f \t max_pop[%d][%d]=%f\n",temp_i,j,min_pop[temp_i][j],temp_j, j, min_pop[temp_j][j],temp_i,j,max_pop[temp_i][j],temp_j,j,max_pop[temp_j][j]);
+						dist=0;
+					}
+					else
+						dist=(real_pop_center[temp_i][j]-real_pop_center[temp_j][j])/(tmp-temp);
+					if (dist<0)
+						dist=-dist;
+				}
+				else
+					dist=-1;
+
+				if (dist>max_d_dist)
+				{
+					max_d_dist=dist;
+					d_d=j;						
+				}
+			}
+
+			if (i==0)
+				d1=d_d;
+			if (i==1)
+				d2=d_d;
+			if (i==2)
+				d3=d_d;
+		}
+
+		//printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3);
+
+		Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
+		Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
+
+		for (t=0;t<num_dm;t++)
+		{
+			center_1[t]=real_pop_center[temp_i][t]*0.25+real_pop_center[temp_j][t]*0.75;
+			center_2[t]=real_pop_center[temp_i][t]*0.5+real_pop_center[temp_j][t]*0.5;
+			center_3[t]=real_pop_center[temp_i][t]*0.75+real_pop_center[temp_j][t]*0.25;
+		}
+		
+		E1=get_avg_dist(center_1, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
+		
+		E2=get_avg_dist(center_2, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
+
+		E3=get_avg_dist(center_3, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
+
+		if (E1<E2)
+			Ep=E2;
+		else
+			Ep=E1;
+			
+		Ep=max(Ep,E3); //Ep is the most sparse area
+
+		Ep=Ep*E_T;
+		//printf("Ep=%f;Ei=%f;Ej=%f\n",Ep,Ei,Ej);
+
+		if ((Ep<=Ei) || (Ep<=Ej))//if the most sparse area between i and j are still denser than one of them, temp_i and temp_j should be merged
+		{
+			keep_merge=1;
+			break;		
+		}//end if (Ep)
+	} //end for p
+
+	//printf("keep_merge=%d\n",keep_merge);
+
+	for (i=0;i<num_real_pop;i++)
+	{
+		free(real_pop_center[i]);
+		free(distance_pop[i]);
+		free(min_pop[i]);
+		free(max_pop[i]);
+	}
+	free(real_pop_center);
+	free(min_pop);
+	free(max_pop);
+	free(distance_pop);
+	free(size_c);
+
+	//printf("temp_i=%d;temp_j=%d\n",temp_i,temp_j);
+
+	if (keep_merge)  //found one within p loop
+	{
+		for (i=0;i<file_Len;i++)
+		{
+			if (all_population_ID[i]>temp_j)
+			{
+				all_population_ID[i]=all_population_ID[i]-1;
+			}
+			else 
+			{
+				if (all_population_ID[i]==temp_j)
+				{
+					all_population_ID[i]=temp_i;
+				}
+			}
+		}
+
+		num_real_pop--;
+	}
+	else
+	{
+		if ((num_pop!=0) && (num_pop<num_real_pop)) //not reach the specified num of pop
+		{
+			for (i=0;i<file_Len;i++)
+			{
+				if (all_population_ID[i]>first_j)
+				{
+					all_population_ID[i]=all_population_ID[i]-1;
+				}
+				else
+				{
+					if (all_population_ID[i]==first_j)
+					{
+						all_population_ID[i]=first_i;
+					}
+				}
+			}
+
+			num_real_pop--;
+
+		
+
+		}
+	}
+		//printf("num_real_pop is %d\n",num_real_pop);
+	
+  } //end of while
+
+  
+
+  free(center_1);
+  free(center_2);
+  free(center_3);
+  printf("Final number of populations is %ld\n",num_real_pop);
+
+  new_population_center=(double **)malloc(sizeof(double*)*num_real_pop);
+  memset(new_population_center,0,sizeof(double*)*num_real_pop);
+  for (i=0;i<num_real_pop;i++)
+  {
+      new_population_center[i]=(double*)malloc(sizeof(double)*num_dm);
+      memset(new_population_center[i],0,sizeof(double)*num_dm);
+  }
+  
+ 
+  ///////////////////////////////////////////
+  //End of population mapping
+  ///////////////////////////////////////////
+
+  show(input_data, all_population_ID, file_Len, num_real_pop, num_dm, para_name_string);
+
+  ID2Center_all(input_data,file_Len,num_dm,num_real_pop,all_population_ID,new_population_center);
+  
+
+  f_cid=fopen("population_id.txt","w");
+  f_ctr=fopen("population_center.txt","w");
+  f_out=fopen("coordinates.txt","w");
+  f_results=fopen("flock_results.txt","w");
+
+/*
+  f_parameters=fopen("parameters.txt","w");
+  fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
+  fprintf(f_parameters,"Density\t%f\n",aver_index);
+  fclose(f_parameters);
+*/
+
+  for (i=0;i<file_Len;i++)
+	fprintf(f_cid,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
+
+  /*
+   * New to check for min/max to add to parameters.txt
+   *
+  */
+  
+  fprintf(f_out,"%s\n",para_name_string);
+  //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string);
+  fprintf(f_results,"%s\tPopulation\n",para_name_string);
+  for (i=0;i<file_Len;i++)
+  {
+	for (j=0;j<num_dm;j++)
+	{
+		if (input_data[i][j] < min) {
+			min = (int)input_data[i][j];
+		}
+		if (input_data[i][j] > max) {
+			max = (int)input_data[i][j];
+		}
+		if (j==num_dm-1)
+		{
+			fprintf(f_out,"%d\n",(int)input_data[i][j]);
+			fprintf(f_results,"%d\t",(int)input_data[i][j]);
+		}
+		else
+		{
+			fprintf(f_out,"%d\t",(int)input_data[i][j]);
+			fprintf(f_results,"%d\t",(int)input_data[i][j]);
+		}
+	}
+	//fprintf(f_results,"%ld\t",i + 1);
+	fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
+  }
+
+/*
+  f_parameters=fopen("parameters.txt","w");
+  fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin);
+  fprintf(f_parameters,"Density\t%d\n",den_t_event);
+  fprintf(f_parameters,"Min\t%d\n",min);
+  fprintf(f_parameters,"Max\t%d\n",max);
+  fclose(f_parameters);
+*/
+
+  f_properties=fopen("fcs.properties","w");
+  fprintf(f_properties,"Bins=%ld\n",num_bin);
+  fprintf(f_properties,"Density=%d\n",den_t_event);
+  fprintf(f_properties,"Min=%d\n",min);
+  fprintf(f_properties,"Max=%d\n",max);
+  fprintf(f_properties,"Populations=%ld\n",num_real_pop);
+  fprintf(f_properties,"Events=%ld\n",file_Len);
+  fprintf(f_properties,"Markers=%ld\n",num_dm);
+  fclose(f_properties);
+
+  for (i=0;i<num_real_pop;i++) {
+	/* Add if we want to include population id in the output
+	*/
+	fprintf(f_ctr,"%ld\t",i+1);  //i changed to i+1 to start from 1 instead of 0: April 16, 2009
+
+	for (j=0;j<num_dm;j++) {
+		if (j==num_dm-1)
+			fprintf(f_ctr,"%.0f\n",new_population_center[i][j]);
+		else
+			fprintf(f_ctr,"%.0f\t",new_population_center[i][j]);
+	}
+  }
+
+  	//added April 16, 2009
+	f_mfi=fopen("MFI.txt","w");
+
+	for (i=0;i<num_real_pop;i++)
+	{
+		fprintf(f_mfi,"%ld\t",i+1);
+
+		for (j=0;j<num_dm;j++)
+		{
+			if (j==num_dm-1)
+				fprintf(f_mfi,"%.0f\n",new_population_center[i][j]);
+			else
+				fprintf(f_mfi,"%.0f\t",new_population_center[i][j]);
+		}
+	}
+	fclose(f_mfi);
+
+	//ended April 16, 2009
+			
+  fclose(f_cid);
+  fclose(f_ctr);
+  fclose(f_out);
+  fclose(f_results);
+
+
+  for (i=0;i<num_population;i++)
+  	free(population_center[i]);
+  
+  free(population_center);
+ 
+  for (i=0;i<num_real_pop;i++)
+	  free(new_population_center[i]);
+  
+  free(new_population_center);
+
+  for (i=0;i<file_Len;i++)
+    free(normalized_data[i]);
+  free(normalized_data);	
+	
+  free(grid_populationID);
+
+  free(cluster_populationID);
+  free(grid_clusterID);
+  free(cluster_ID);
+
+  for (i=0;i<file_Len;i++)
+    free(input_data[i]);
+  free(input_data);
+
+  free(grid_ID);
+  free(population_ID);
+  free(all_population_ID);
+  free(eventID_To_denseventID);
+		
+  ///////////////////////////////////////////////////////////
+  printf("Ending time:\t\t\t\t");
+  fflush(stdout);
+  system("/bin/date");
+
+  /*
+   * Windows version
+  _strtime( tmpbuf );
+  printf( "Ending time:\t\t\t\t%s\n", tmpbuf );
+  _strdate( tmpbuf );
+  printf( "Ending date:\t\t\t\t%s\n", tmpbuf );
+ */
+  
+  return 0;
+}
author	immport-devteam
date	Mon, 27 Feb 2017 13:26:09 -0500
parents
children