view src/flock2.c @ 2:311a4f16c483 draft

Deleted selected files
author immport-devteam
date Mon, 27 Feb 2017 13:27:43 -0500
parents 7eab80f86779
children
line wrap: on
line source

///////////
// Changes made:
// 1. Added another parameter: number of populations
// 2. Added a hierarchical merging step based on density change between centroids of two hyper-regions
// 3. Picked the longest dimensions between the population centroids to judge whether the two parts should be merged
// 4. Removed checking time parameter
// 5. Output error to stderr
// 6. Fixed the bug of density threshold always = 3
// 7. Added another error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR
// 8. Fixed a bug for 2D data by using K=e*K
// 9. Added some header files, may not be necessary
// 10. Added a lower bound (at least two) for number of populations
/***************************************************************************************************************************************
	
	FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) 
	
	Author: (Max) Yu Qian, Ph.D.
	
	Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
	
	Development: November 2005 ~ Forever

	Status: July 2010: Release 2.0

	Usage: flock data_file
		    Note: the input file format must be channel values and the delimiter between two values must be a tab.


    	
****************************************************************************************************************************************/

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <assert.h>



#define DEBUG 0
#define LINE_LEN 1024
#define FILE_NAME_LEN 128
#define PARA_NAME_LEN 64
#define MAX_VALUE 1000000000
#define MIN_GRID 6
#define MAX_GRID 50
#define E_T 1.0

#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max 
#define KMEANS_TERM 10 
#define MAX_POP_NUM 128

#ifndef max
    #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
#endif

#ifndef min
    #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif

static long **Gr=0;
static long *gcID = 0;          /* grid cluster IDs */
static long *cluster_count=0;   /* count of nodes per cluster */
static long ndense=0;
static long ndim=0;
/* cid changes between depth-first searches, but is constant within a
   single search, so it goes here. */
static long cid=0;
/* Do a depth-first search for a single connected component in graph
 * G.  Start from node, tag the nodes found with cid, and record
 * the tags in grid_clusterID.  Also, record the node count in
 * cluster_count.  If we find a node that has already been assigned to
 * a cluster, that means we're merging two clusters, so zero out the
 * old cid's node count.
 *
 * Note that our graph is constructed as a DAG, so we can be
 * guaranteed to terminate without checking for cycles.  
 *
 * Note2: this function can potentially recurse to depth = ndense.
 * Plan stack size accordingly.  
 *
 * Output:
 *
 * grid_clusterID[] -- array where we tag the nodes.
 * cluster_count[]  -- count of the number of nodes per cluster.
 */
static void merge_cluster(long from, long into)
{
  int i;

  for(i=0; i<ndense; ++i)
    if(gcID[i] == from)
      gcID[i] = into;
}
   
void depth_first(long node)
{
  long i;

  if(gcID[node] == cid)         // we're guaranteed no cycles, but it is possible to reach a node 
    return;                     // through two different paths in the same cluster.  This early
                                // return saves us some unneeded work.

  /* Check to see if we are merging a cluster */
  if(gcID[node] >= 0) {
    /* We are, so zero the count for the old cluster. */
    cluster_count[ gcID[node] ] = 0;  
    merge_cluster(gcID[node], cid);
    return;
  }

  /* Update for this node */
  gcID[node] = cid;
  cluster_count[cid]++;

  /* Recursively search the child nodes */
  for(i=0; i<ndim; ++i)
    if(Gr[node][i] >= 0)      /* This is a child node */
      depth_first(Gr[node][i]);
}

void bail(const char *msg)
{
  fprintf(stderr,"%s",msg);
  exit(0);
}


static void check_clusters(long *gcID, int ndense)
{
  int i;

  for(i=0; i<ndense; ++i)
    if(gcID[i] < 0) {
      fprintf(stderr,"faulty cluster id at i= %d\n",i);
      exit(0);
    }
}



long find_connected(long **G, long n_dense_grids, long num_dm, long *grid_clusterID)
{
  long nclust=0;                  /* number of clusters found */
  long i;
  long *subfac;
  long clustid=0;
  int subval=0,nempty=0;
  
  int sz = n_dense_grids*sizeof(long); 
  cluster_count = malloc(sz);
  if(!cluster_count)
    bail("find_connected: Unable to allocate %zd bytes.\n");
  memset(cluster_count,0,sz);

  /* set up the statics that will be used in the DFS */
  Gr=G;
  gcID = grid_clusterID;
  ndense = n_dense_grids;
  ndim = num_dm;
  
  for(i=0;i<ndense;++i)
    grid_clusterID[i] = -1;

  for(i=0;i<ndense;++i) {
    if(grid_clusterID[i] < 0) {  /* grid hasn't been assigned yet */
      cid = nclust++;
      depth_first(i);
    }
  }


  
  /* At this point we probably have some clusters that are empty due to merging.
     We want to compact the cluster numbering to eliminate the empty clusters. */

  subfac = malloc(sz);
  if(!subfac)
    bail("find_connected: Unable to allocate %zd bytes.\n");
  subval=0;
  nempty=0;

  /* cluster #i needs to have its ID decremented by 1 for each empty cluster with
     ID < i.  Precaclulate the decrements in this loop: */
  for(i=0;i<nclust;++i) {
    //clustid = grid_clusterID[i];
    if(cluster_count[i] == 0) {
      subval++;
      nempty++;
    }
    subfac[i] = subval;
  }

  //printf("nempty is %d\n",nempty);
  
  /* Now apply the decrements to all of the dense grids */
  for(i=0;i<ndense;++i) {
    clustid = grid_clusterID[i];
    grid_clusterID[i] -= subfac[clustid];
  }


  
  /* correct the number of clusters found */
  nclust -= nempty;

  //printf("nclust is %d\n",nclust);

  return nclust;
}

/************************************* Read basic info of the source file **************************************/
/************************************* Read basic info of the source file **************************************/
void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, long *time_ID)
{
  char src[LINE_LEN];
  char current_name[64];
  char prv;

  long num_rows=0;
  long num_columns=0;
  long ch='\n';
  long prev='\n';
  long time_pos=0;
  long i=0;
  long j=0;
  int sw=0;

  src[0]='\0';
  fgets(src, LINE_LEN, f_src);

  name_string[0]='\0';
  current_name[0]='\0';
  prv='\n';

  while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters
    i++;

  while ((src[i]!='\0') && (src[i]!='\n')) //repeat until the end of the line
    {
      current_name[j]=src[i];
		
      if ((src[i]=='\t') && (prv!='\t')) //a complete word
        {
          current_name[j]='\0';

          if (0!=strcmp(current_name,"Time"))
            {
              num_columns++; //num_columns does not inlcude the column of Time
              time_pos++;
              if (sw) {
                  strcat(name_string,"\t");
              }
              strcat(name_string,current_name); 
              sw = 1;
            }
          else
            {
              *time_ID=time_pos;
            }

          current_name[0]='\0';
          j=0;			
        }		
		
      if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
        {
          current_name[0]='\0';
          j=0;
        }
		
      if (src[i]!='\t')
        j++;
		
      prv=src[i];
      i++;
    }
	
  //name_string[j]='\0';
  if (prv!='\t') //the last one hasn't been retrieved
    {
      current_name[j]='\0';
      if (0!=strcmp(current_name,"Time"))
        {
          num_columns++;
          strcat(name_string,"\t");
          strcat(name_string,current_name);
          time_pos++;
        }
      else
        {
          *time_ID=time_pos;
        }
    }
  if (DEBUG==1)
    {
      printf("time_ID is %ld\n",*time_ID);
      printf("name_string is %s\n",name_string);
    }

  //start computing # of rows

  while ((ch = fgetc(f_src))!= EOF )
    {
      if (ch == '\n')
        {
          ++num_rows;
        }
      prev = ch;
    }
  if (prev!='\n')
    ++num_rows;

   if (num_rows<50)
  {
    fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010
	exit(0);
  }
	
  *file_Len=num_rows;
  *num_dm=num_columns; 

  printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm);
}



/************************************* Read the source file into uncomp_data **************************************/
void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, long time_ID)
{
  //long time_pass=0; //to mark whether the time_ID has been passed
  long index=0;

  long i=0;
  long j=0;
  long t=0;

  char src[LINE_LEN];
  char xc[LINE_LEN/10];

  src[0]='\0';
  fgets(src,LINE_LEN, f_src); //skip the first line about parameter names

  while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
    {
      src[0]='\0';	    
      fgets(src,LINE_LEN,f_src);
      i=0;
      //time_pass=0;
						
      //if (time_ID==-1)  //there is no time_ID
      //  {
          for (t=0;t<num_dm;t++) 
            {
              xc[0]='\0';
              j=0;
              while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
                {
                  xc[j]=src[i];
                  i++;
                  j++;
                }
		
              xc[j]='\0';	    
              i++;

              uncomp_data[index][t]=atof(xc);
            }	
       // }
      /*else
        {
          for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
            {
              xc[0]='\0';
              j=0;
              while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
                {
                  xc[j]=src[i];
                  i++;
                  j++;
                }
		
              xc[j]='\0';	    
              i++;

              if (t==time_ID)
                {
                  time_pass=1;
                  continue;
                }
				
              if (time_pass)
                uncomp_data[index][t-1]=atof(xc);
              else
                uncomp_data[index][t]=atof(xc);
            }
        }*/ //commented by Yu Qian on Aug 31, 2010 as we do not want to check time_ID anymore        	
      index++;     	
      //fprintf(fout_ID,"%s",src);
    } //end of while
	
  if (DEBUG == 1)
    {
      printf("the last line of the source data is:\n");
      for (j=0;j<num_dm;j++)
        printf("%f ",uncomp_data[index-1][j]);
      printf("\n");
    }
}


/**************************************** Normalization ******************************************/
void tran(double **orig_data, long file_Len, long num_dm, long norm_used, double **matrix_to_cluster)
{
  long i=0;
  long j=0;

  double biggest=0;
  double smallest=MAX_VALUE;

  double *aver; //average of each column
  double *std; //standard deviation of each column

  aver=(double*)malloc(sizeof(double)*file_Len);
  memset(aver,0,sizeof(double)*file_Len);

  std=(double*)malloc(sizeof(double)*file_Len);
  memset(std,0,sizeof(double)*file_Len);	
		
  if (norm_used==2) //z-score normalization
    {
      for (j=0;j<num_dm;j++)
        {
          aver[j]=0;
          for (i=0;i<file_Len;i++)
            aver[j]=aver[j]+orig_data[i][j];
          aver[j]=aver[j]/(double)file_Len;

          std[j]=0;
          for (i=0;i<file_Len;i++)
            std[j]=std[j]+((orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]));
          std[j]=sqrt(std[j]/(double)file_Len);
			
          for (i=0;i<file_Len;i++)
            matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j];  //z-score normalization
        }
    }

  if (norm_used==1) //0-1 min-max normalization
    {
      for (j=0;j<num_dm;j++)
        {
          biggest=0;
          smallest=MAX_VALUE;
          for (i=0;i<file_Len;i++)
            {
              if (orig_data[i][j]>biggest)
                biggest=orig_data[i][j];
              if (orig_data[i][j]<smallest)
                smallest=orig_data[i][j];
            }
			
          for (i=0;i<file_Len;i++)
            {
              if (biggest==smallest)
                matrix_to_cluster[i][j]=biggest;
              else
                matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
            }
        }
    }

  if (norm_used==0) //no normalization
    {
      for (i=0;i<file_Len;i++)
        for (j=0;j<num_dm;j++)
          matrix_to_cluster[i][j]=orig_data[i][j];
    }

}



/********************************************** RadixSort *******************************************/
/* Perform a radix sort using each dimension from the original data as a radix.
 * Outputs:
 * sorted_seq   -- a permutation vector mapping the ordered list onto the original data.
 *                  (sorted_seq[i] -> index in the original data of the ith element of the ordered list)
 * grid_ID      -- mapping between the original data and the "grids" (see below) found as a byproduct
 *                  of the sorting procedure.
 * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
 *                  by grid_ID)
 */

void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID)
{
  long i=0;
  long length=0; 
  long start=0;
  long prev_ID=0;
  long curr_ID=0;
	
  long j=0;
  long t=0;
  long p=0;
  long loc=0;
  long temp=0;
  long equal=0;
	
  long *count; //count[i]=j means there are j numbers having value i at the processing digit
  long *index; //index[i]=j means the starting position of grid i is j
  long *cp; //current position
  long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
  long *seq; //temporary sequence

  count=(long*)malloc(sizeof(long)*num_bin);
  memset(count,0,sizeof(long)*num_bin);

  cp=(long*)malloc(sizeof(long)*num_bin);
  memset(cp,0,sizeof(long)*num_bin);

  index=(long*)malloc(sizeof(long)*num_bin); // initialized below

  seq=(long*)malloc(sizeof(long)*file_Len);
  memset(seq,0,sizeof(long)*file_Len);
	
  mark=(long*)malloc(sizeof(long)*file_Len);
  memset(mark,0,sizeof(long)*file_Len);
	
  for (i=0;i<file_Len;i++)
    {
      sorted_seq[i]=i;
      mark[i]=0;
      seq[i]=0;
    }
  for (i=0;i<num_bin;i++)
    {
      index[i]=0;
      cp[i]=0;
      count[i]=0;
    }

  for (j=0;j<num_dm;j++)
    {
      if (j==0) //compute the initial values of mark
        {
          for (i=0;i<file_Len;i++)
            count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension

          index[0] = 0;
          for (i=0;i<num_bin-1;i++)
            {
              index[i+1]=index[i]+count[i];  //index[k]=x means k segment starts at x (in the ordered list)
              if ((index[i+1]>0) && (index[i+1]<=file_Len))
                {
                  mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
                }
              else
                {
                  printf("out of myboundary for mark at index[i+1]-1.\n");
                }
            }
          mark[file_Len-1]=1;
			
          for (i=0;i<file_Len;i++)
            {
              /* Build a permutation vector for the partially ordered data.  Store the PV in sorted_seq */
              loc=position[i][j];
              temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j 
              sorted_seq[temp]=i;  //sorted_seq[i]=temp is also another way to sort
              cp[loc]++;
            }
        }
      else
        {
          //reset count, index, loc, temp, cp, start, and length
          length=0;
          loc=0;
          temp=0;
          start=0;
          for (p=0;p<num_bin;p++)
            {
              cp[p]=0;
              count[p]=0;
              index[p]=0;
            }

          for (i=0;i<file_Len;i++)
            {
              long iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order.
              if (mark[i]!=1)
                {
                  /* Count the number of items in each bin of
                     dimension j, BUT we are going to reset at the end
                     of each "part".  Thus, the total effect is to do
                     a sort by bin on the jth dimension for each group
                     of data that has been identical for the
                     dimensions processed up to this point.  This is
                     the standard radix sort procedure, but doing it
                     this way saves us having to allocate buckets to
                     hold the data in each group of "identical-so-far"
                     elements. */
                  count[position[iperm][j]]++;  //count[position[i][j]]++;
                  length++;                     // This is the total length of the part, irrespective of the value of the jth component
                                                // (actually, less one, since we don't increment for the final element below)
                }
              if (mark[i]==1)
                {
                  //length++;
                  count[position[iperm][j]]++;//count[position[i][j]]++;  //the current point must be counted in
                  start=i-length; //this part starts from start to i: [start,i]
                  /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part.
                     This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively,
                     plus we could multi-thread over the parts.  (Hmmm...)
                  */
                  index[0] = start; // Let index give the offset within the whole ordered list.
                  for (t=0;t<num_bin-1;t++)
                    {
                      index[t+1]=index[t]+count[t];
						
                      if ((index[t+1]<=file_Len) && (index[t+1]>0))
                        {
                          mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
                        }
						
                    }
                  mark[i]=1;

                  /* Update the permutation vector for the current part (i.e., from start to i).  By the time we finish the loop over i
                     the PV will be completely updated for the partial ordering up to the current radix. */
                  for (t=start;t<=i;t++)
                    {
                      loc=position[sorted_seq[t]][j];//loc=position[t][j];
                      temp=index[loc]+cp[loc];
                      if ((temp<file_Len) && (temp>=0)) 
                        {
                          // seq is a temporary because we have to maintain the old PV until we have finished this step.
                          seq[temp]=sorted_seq[t];  //sorted_seq[i]=temp is also another way to sort
                          cp[loc]++;
                        }
                      else
                        {
                          printf("out of myboundary for seq at temp.\n");
                        }
                    }

                  for (t=start;t<=i;t++)
                    {
                      // copy the temporary back into sorted_seq.  sorted_seq is now updated for radix j up through
                      // entry i in the ordered list.
                      if ((t>=0) && (t<file_Len))
                        sorted_seq[t]=seq[t];
                      else
                        printf("out of myboundary for seq and sorted_seq at t.\n");
                    }
                  //reset count, index, seq, length, and cp
                  length=0;
                  loc=0;
                  temp=0;
                  for (p=0;p<num_bin;p++)
                    {
                      cp[p]=0;
                      count[p]=0;
                      index[p]=0;
                    }
                }
            }//end for i
        }//end else
    }//end for j

  /* sorted_seq[] now contains the ordered list for all radices.  mark[] gives the boundaries between groups of elements that are
     identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */
  
  for (i=0;i<file_Len;i++)
    grid_ID[i]=0; //in case the initial value hasn't been assigned
  *num_nonempty=1; //starting from 1!	

  /* assign the "grid" identifiers for all of the data.  A grid will be what we were calling a "part" above.  We will number them
     serially and tag the *unordered* data with the grid IDs.  We will also count the number of populated grids (in general there will
     be many possible combinations of bin values that simply never occur) */
  
  for (i=1;i<file_Len;i++)
    {
      equal=1;
      prev_ID=sorted_seq[i-1];
      curr_ID=sorted_seq[i];
      for (j=0;j<num_dm;j++)
        {
          if (position[prev_ID][j]!=position[curr_ID][j])
            {	
              equal=0;  //not equal
              break;
            }
        }
		
      if (equal)
        {
          grid_ID[curr_ID]=grid_ID[prev_ID];
        }
      else
        {
          *num_nonempty=*num_nonempty+1;
          grid_ID[curr_ID]=grid_ID[prev_ID]+1;
        }
      //all_grid_vol[grid_ID[curr_ID]]++;
    }

  //free memory
  free(count);
  free(index);	
  free(cp);	
  free(seq);
  free(mark); 
  
}

/********************************************** Compute Position of Events ************************************************/
void compute_position(double **data_in, long file_Len, long num_dm, long num_bin, long **position, double *interval)
{
  /* What we are really doing here is binning the data, with the bins
     spanning the range of the data and number of bins = num_bin */
  long i=0;
  long j=0;

  double *small; //small[j] is the smallest value within dimension j
  double *big; //big[j] is the biggest value within dimension j
		
  small=(double*)malloc(sizeof(double)*num_dm);
  memset(small,0,sizeof(double)*num_dm);

  big=(double*)malloc(sizeof(double)*num_dm);
  memset(big,0,sizeof(double)*num_dm);
	
	
  for (j=0;j<num_dm;j++)
    {
      big[j]=MAX_VALUE*(-1);
      small[j]=MAX_VALUE;
      for (i=0;i<file_Len;i++)
        {
          if (data_in[i][j]>big[j])
            big[j]=data_in[i][j];

          if (data_in[i][j]<small[j])
            small[j]=data_in[i][j];
        }
		
      interval[j]=(big[j]-small[j])/(double)num_bin;	//interval is computed using the biggest value and smallest value instead of the channel limit
      /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */
    }
    
  for (j=0;j<num_dm;j++)
  {
     for (i=0;i<file_Len;i++)
     {	
        if (data_in[i][j]>=big[j])
           position[i][j]=num_bin-1;
        else
        {
           position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
           if ((position[i][j]>=num_bin) || (position[i][j]<0))
           {
               //printf("position mis-computed in density analysis!\n");
               //exit(0);
			   fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
				exit(0);
           }
        }
     }
  }


  free(small);
  free(big);
}

/********************************************** select_bin to select the number of bins **********************************/
//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
/* Determine the number of bins to use in each dimension.  Additionally sort the data elements according to the binned
 * values, and partition the data into "grids" with identical (binned) values.  We try progressively more bins until we
 * maximize a merit function, then return the results obtained using the optimal number of bins. 
 *
 * Outputs:
 * position     -- binned data values
 * sorted_seq   -- permutation vector mapping the ordered list to the original data
 * all_grid_ID  -- grid to which each data element was assigned.
 * num_nonempty -- number of distinct values assumed by all_grid_ID
 * interval     -- bin width for each data dimension
 * return value -- the number of bins selected.
 */

long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq, 
                long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin)
{
 
  long num_bin=0;
  long select_num_bin=0;
  long m=0;
  long n=0;
	
  long i=0;
  long bin_scope=0;
  long temp_num_nonempty=0;

  long *temp_grid_ID;
  long *temp_sorted_seq;
  long **temp_position;

  //sorted_seq[i]=j means the event j ranks i

  double temp_index=0;
  double *bin_index;	
  double *temp_interval;
	
	
  temp_grid_ID=(long *)malloc(sizeof(long)*file_Len);
  memset(temp_grid_ID,0,sizeof(long)*file_Len);
	
  temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len);
  memset(temp_sorted_seq,0,sizeof(long)*file_Len);

  temp_position=(long **)malloc(sizeof(long*)*file_Len);
  memset(temp_position,0,sizeof(long*)*file_Len);
  for (m=0;m<file_Len;m++)
    {
      temp_position[m]=(long*)malloc(sizeof(long)*num_dm);
      memset(temp_position[m],0,sizeof(long)*num_dm);
    }

  temp_interval=(double*)malloc(sizeof(double)*num_dm);
  memset(temp_interval,0,sizeof(double)*num_dm);

  bin_scope=max_grid-min_grid+1;
  bin_index=(double *)malloc(sizeof(double)*bin_scope);
  memset(bin_index,0,sizeof(double)*bin_scope);

  i=0;

  for (num_bin=min_grid;num_bin<=max_grid;num_bin++)
    {
      
	  temp_num_nonempty=0;
	   /* compute_position bins the data into num_bin bins.  Each
         dimension is binned independently.

         Outputs:
         temp_position[i][j] -- bin for the jth component of data element i.
         temp_interval[j]    -- bin-width for the jth component
      */
      compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval);
      radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID);

      /* our figure of merit is the number of non-empty grids divided by number of bins per dimension.
         We declare victory when we have found a local maximum */
      bin_index[i]=((double)temp_num_nonempty)/((double)num_bin);
	  if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95)
		  break;
      if ((bin_index[i]<temp_index) && (user_num_bin==0))
         break;
	  if ((user_num_bin==num_bin-1) && (user_num_bin!=0))
		 break;
      
      /* Since we have accepted this trial bin, copy all the temporary results into
         the output buffers */
      memcpy(all_grid_ID,temp_grid_ID,sizeof(long)*file_Len);
      memcpy(sorted_seq,temp_sorted_seq,sizeof(long)*file_Len);
      memcpy(interval,temp_interval,sizeof(double)*num_dm);
		
      for (m=0;m<file_Len;m++)
        for (n=0;n<num_dm;n++)
          position[m][n]=temp_position[m][n];

      temp_index=bin_index[i];
      select_num_bin=num_bin;
      num_nonempty[0]=temp_num_nonempty;
      i++;
    }

   if ((select_num_bin<min_grid) || (select_num_bin>max_grid))
  {
    fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010
	exit(0);
  }
	
  if (temp_index==0)
  {
	 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
	 exit(0);
  }

  free(temp_grid_ID);
  free(temp_sorted_seq);
  free(bin_index);
  free(temp_interval);
	
  for (m=0;m<file_Len;m++)
    free(temp_position[m]);
  free(temp_position);

  return select_num_bin; 
}

/************************************* Select dense grids **********************************/
// compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol
// den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse);
/*
 * Prune away grids that are insufficiently "dense" (i.e., contain too few data items)
 *
 * Outputs:
 * num_dense_grids    -- number of dense grids
 * num_dense_events   -- total number of data items in all dense grids
 * dense_grid_reverse -- mapping from list of all grids to list of dense grids.
 * return value       -- density cutoff for separating dense from non-dense grids.
 */

int select_dense(long file_Len, long *all_grid_ID, long num_nonempty, long *num_dense_grids, long *num_dense_events, long *dense_grid_reverse, int den_t_event)
{
  

  long i=0;
  long vol_ID=0;
  long biggest_size=0; //biggest grid_size, to define grid_size_index
  long biggest_index=0;
  //long actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid
  //long num_remain=0; //number of remaining grids with different density thresholds
  long temp_num_dense_grids=0;
  long temp_num_dense_events=0;
	
  long *grid_size_index;
  long *all_grid_vol;
  long *grid_density_index;

  //double den_average=0;
 // double avr_index=0;
 

  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Compute all_grid_vol
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  all_grid_vol=(long *)malloc(sizeof(long)*num_nonempty);
  memset(all_grid_vol,0,sizeof(long)*num_nonempty);

  /* Grid "volume" is just the number of data contained in the grid. */
  for (i=0;i<file_Len;i++)
    {
      vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]];
      all_grid_vol[vol_ID]++;  //all_grid_vol[i]=j means grid i has j points
    }

 
	
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Compute grid_size_index (histogram of grid sizes)
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  for (i=0;i<num_nonempty;i++)
    {
      if (biggest_size<all_grid_vol[i])
        {
          biggest_size=all_grid_vol[i];
        }
    }

   if (biggest_size<3)
  {
	 fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
	 exit(0);
  }


  grid_size_index=(long*)malloc(sizeof(long)*biggest_size);
  memset(grid_size_index,0,sizeof(long)*biggest_size);
	
  for (i=0;i<num_nonempty;i++)
    {
      grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1
    }



  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Compute den_cutoff
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	
  grid_density_index=(long *)malloc(sizeof(long)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3
  memset(grid_density_index,0,sizeof(long)*(biggest_size-2));

  if (den_t_event==0)
  {
	  biggest_index=0;

	  for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined
	  {
		  grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]);
		  if (biggest_index<grid_density_index[i-1])
		  {
			biggest_index=grid_density_index[i-1];
			den_t_event=i+1;
		  }
	  }
  }

  if (den_t_event==0) //if biggest_size==3
  {
	 fprintf(stderr,"the densest hyperregion has only 3 events, smaller than the user-specified value. Therefore the density threshold is automatically changed to 3.\n");
	 den_t_event=3;
  }

  for (i=0;i<num_nonempty;i++)
	  if (all_grid_vol[i]>=den_t_event)
		temp_num_dense_grids++;

  if (temp_num_dense_grids<=1)
  {
	  //exit(0);
	  //printf("a too high density threshold is set! Please decrease your density threshold.\n");
	  fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010
	  exit(0);
  }

   if (temp_num_dense_grids>=100000)
  {
	  //modified on July 23, 2010
	  //printf("a too low density threshold is set! Please increase your density threshold.\n");
	  //exit(0);
	  fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010
	  exit(0);
  }
  
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Compute dense_grid_reverse
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  temp_num_dense_grids=0;
  temp_num_dense_events=0;
  for (i=0;i<num_nonempty;i++)
    {
      dense_grid_reverse[i]=-1;
		
      if (all_grid_vol[i]>=den_t_event) 
        {			
          dense_grid_reverse[i]=temp_num_dense_grids;  //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
          temp_num_dense_grids++;
          temp_num_dense_events+=all_grid_vol[i];						
        }
    }

  num_dense_grids[0]=temp_num_dense_grids;
  num_dense_events[0]=temp_num_dense_events;	

  free(grid_size_index);
  free(all_grid_vol);

  return den_t_event;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute densegridID_To_gridevent and eventID_To_denseventID
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//	grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
/*
 * Filter the data so that only the data belonging to dense grids is left
 *
 * Output:
 * eventID_To_denseeventID   -- mapping from original event ID to ID in list containing only events contained in dense grids.
 * densegridID_To_gridevent  -- mapping from dense grids to prototype members of the grids.
 *
 */

void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent)
{
  long i=0;
  long dense_grid_ID=0;
  long dense_event_ID=0;

  for (i=0;i<file_Len;i++)
    {
      dense_grid_ID=dense_grid_reverse[all_grid_ID[i]];
      eventID_To_denseventID[i]=-1;
      if (dense_grid_ID!=-1) //for point (i) belonging to dense grids 
        {			
          eventID_To_denseventID[i]=dense_event_ID;
          dense_event_ID++;
		
          if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected
            densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point			
        }		
    }
	
	
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute dense_grid_seq
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//	generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq);
/* Construct a table of binned data values for each dense grid.
 *
 * Output:
 *
 * dense_grid_seq  -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values)
 */

void generate_grid_seq(long num_dm, long num_dense_grids, long *densegridID_To_gridevent, long **position, long **dense_grid_seq)
{  

  long i=0;
  long j=0;
  long ReventID=0; //representative event ID of the dense grid

  for (i=0;i<num_dense_grids;i++)
    {
      ReventID = densegridID_To_gridevent[i];

      for (j=0;j<num_dm;j++)
        dense_grid_seq[i][j]=position[ReventID][j];
    }
}

//compare two vectors
long compare_value(long num_dm, long *search_value, long *seq_value)
{
  long i=0;

  for (i=0;i<num_dm;i++)
    {
      if (search_value[i]<seq_value[i])
        return 1;
      if (search_value[i]>seq_value[i])
        return -1;
      if (search_value[i]==seq_value[i])
        continue;
    }
  return 0;
}

//binary search the dense_grid_seq to return the dense grid ID if it exists
long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq)
{

  long low=0;
  long high=0;
  long mid=0;
  long comp_result=0;
  long match=0;
  //long found=0;
	
  low = 0;
  high = num_dense_grids-1;
    
  while (low <= high) 
    {
      mid = (long)((low + high)/2);
	
      comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
	
		
      switch(comp_result)
        {
        case 1:
          high=mid-1;
          break;
        case -1:
          low=mid+1;
          break;
        case 0:
          match=1;
          break;
        }
      if (match==1)
        break;
    }
	


  if (match==1)
    {
      return mid;
    }
  else
    return -1;   
}


/********************************************** Computing Centers Using IDs **********************************************/

void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center)
{
  long i=0;
  long j=0;
  long ID=0;
  long eventID=0;
  long *size_c;



  size_c=(long *)malloc(sizeof(long)*num_clust);
  memset(size_c,0,sizeof(long)*num_clust);

  for (i=0;i<num_clust;i++)
    for (j=0;j<num_dm;j++)
      population_center[i][j]=0;

  for (i=0;i<file_Len;i++)
    {
      eventID=eventID_To_denseventID[i];

      if (eventID!=-1) //only events in dense grids count
        {
          ID=cluster_ID[eventID];
			
          if (ID==-1)
            {
              //printf("ID==-1! in ID2Center\n");
              //exit(0);
			  fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010
			  exit(0);
            }

          for (j=0;j<num_dm;j++)
            population_center[ID][j]=population_center[ID][j]+data_in[i][j];
			
          size_c[ID]++;				
        }
    }
	

  for (i=0;i<num_clust;i++)
    {
      for (j=0;j<num_dm;j++)
        if (size_c[i]!=0)
          population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
        else
		{
          population_center[i][j]=0;
		  printf("size_c[%ld]=0 in ID2center\n",i);
		}
    }

  free(size_c);

}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//  Compute Population Center with all events
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void ID2Center_all(double **data_in, long file_Len, long num_dm, long num_clust, long *cluster_ID, double **population_center)
{
  long i=0;
  long j=0;
  long ID=0;
  long *size_c;



  size_c=(long *)malloc(sizeof(long)*num_clust);
  memset(size_c,0,sizeof(long)*num_clust);

  for (i=0;i<num_clust;i++)
    for (j=0;j<num_dm;j++)
      population_center[i][j]=0;

	
  for (i=0;i<file_Len;i++)
    {
         ID=cluster_ID[i];
			
         if (ID==-1)
         {
            //printf("ID==-1! in ID2Center_all\n");
            //exit(0);
			fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010
			exit(0);
         }

         for (j=0;j<num_dm;j++)
           population_center[ID][j]=population_center[ID][j]+data_in[i][j];
			
         size_c[ID]++;        
    }
	
 
  for (i=0;i<num_clust;i++)
    {
      for (j=0;j<num_dm;j++)
        if (size_c[i]!=0)
          population_center[i][j]=(population_center[i][j]/(double)(size_c[i]));
        else
		{
          population_center[i][j]=0;
		  printf("size_c[%ld]=0 in ID2center_all\n",i);
		}
    }


  free(size_c);

}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Merge neighboring grids to clusters
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

long merge_grids(double **normalized_data, double *interval, long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, 
                 long *dense_grid_reverse, long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
                 long *cluster_ID, long *grid_ID, long *grid_clusterID)
{
  

  long i=0;
  long j=0;
  long t=0;
  long p=0;
  long num_clust=0;
  long ReventID=0;
  long denseID=0;
  long neighbor_ID=0;
  //long temp_grid=0;

  long *grid_value;
  long *search_value;
	
  long **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node

  double real_dist=0;
  double **norm_grid_center;
	
  norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids);
  memset(norm_grid_center,0,sizeof(double*)*num_dense_grids);
	
  for (i=0;i<num_dense_grids;i++)
    {
      norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm);
      memset(norm_grid_center[i],0,sizeof(double)*num_dm);
    }

  for (i=0;i<file_Len;i++)
    {
      denseID=eventID_To_denseventID[i];
      if (denseID!=-1) //only dense events can enter
        {
          grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]];
			
          if (grid_ID[denseID]==-1)
            {
              fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n");
              exit(0);
            }			
        }
    }

	
  /* Find centroid (in the normalized data) for each dense grid */
  ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center);	
 
  //printf("pass the grid ID2 center\n");

	
  graph_of_grid=(long **)malloc(sizeof(long*)*num_dense_grids);
  memset(graph_of_grid,0,sizeof(long*)*num_dense_grids);
  for (i=0;i<num_dense_grids;i++)
    {
      graph_of_grid[i]=(long *)malloc(sizeof(long)*num_dm);
      memset(graph_of_grid[i],0,sizeof(long)*num_dm);		
		

      for (j=0;j<num_dm;j++)
        graph_of_grid[i][j]=-1;
    }	
	
  grid_value=(long *)malloc(sizeof(long)*num_dm);
  memset(grid_value,0,sizeof(long)*num_dm);

  search_value=(long *)malloc(sizeof(long)*num_dm);
  memset(search_value,0,sizeof(long)*num_dm);

  
  for (i=0;i<num_dense_grids;i++)
    {
      ReventID=densegridID_To_gridevent[i];

      for (j=0;j<num_dm;j++)
        {
          grid_value[j]=position[ReventID][j];
          
        }
     

      /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in
         the chosen dimension.  If the neighbor's centroid is not too far away, add it to this grid's neighbor
         list. */
      for (t=0;t<num_dm;t++)
        {
          for (p=0;p<num_dm;p++)
            search_value[p]=grid_value[p];

          if (grid_value[t]==num_bin-1)
            continue;

          search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side

          neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq);
			
          if (neighbor_ID!=-1) 
            {
              real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t];
	
              if (real_dist<0)
                real_dist=-real_dist;
				
              if (real_dist<2*interval[t]) //by using 2*interval, this switch is void
                graph_of_grid[i][t]=neighbor_ID;			
            }
        }
      grid_clusterID[i]=i; //initialize grid_clusterID
    } 
	
	
  //graph constructed
  //DFS-based search begins

  /* Use a depth-first search to construct a list of connected subgraphs (= "clusters").
     Note our graph as we have constructed it is a DAG, so we can use that to our advantage
     in our search. */
  //  num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID);
  num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID);

	
  //computes grid_ID and cluster_ID
  for (i=0;i<file_Len;i++)
    {
      denseID=eventID_To_denseventID[i];
      if (denseID!=-1) //only dense events can enter
	  {
        cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]];
		//if (cluster_ID[denseID]==-1)
		//	printf("catch you!\n");
	  }
    }
	
  free(search_value);
  free(grid_value);

  for (i=0;i<num_dense_grids;i++)
    {
      free(graph_of_grid[i]);
      free(norm_grid_center[i]);
    }
  free(graph_of_grid);
  free(norm_grid_center);

  return num_clust;
}

/********************************************* Merge Clusters to Populations *******************************************/

long merge_clusters(long num_clust, long num_dm, double *interval, double **cluster_center, long *cluster_populationID, long max_num_pop)
{
  long num_population=0;
  long temp_num_population=0;

  long i=0;
  long j=0;
  long t=0;
  long merge=0;
  long smid=0;
  long bgid=0;
  double merge_dist=1.1;

  long *map_ID;

  double diff=0;  

  map_ID=(long*)malloc(sizeof(long)*num_clust);
  memset(map_ID,0,sizeof(long)*num_clust);

  for (i=0;i<num_clust;i++)
  {
    	cluster_populationID[i]=i;
		map_ID[i]=i;
  }
    

  while (((num_population>max_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1)))
  {
  
	  if (num_population<=1)
	  	  merge_dist=merge_dist-0.1;

	  if (num_population>max_num_pop)
          merge_dist=merge_dist+0.1;
	  
	    
	 for (i=0;i<num_clust;i++)
		cluster_populationID[i]=i;

    for (i=0;i<num_clust-1;i++)
    {
      for (j=i+1;j<num_clust;j++)
        {
          merge=1;
			
          for (t=0;t<num_dm;t++)
            {
              diff=cluster_center[i][t]-cluster_center[j][t];
				
              if (diff<0)
                diff=-diff;
              if (diff>(merge_dist*interval[t]))
                merge=0;
            }

          if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
            {
              if (cluster_populationID[i]<cluster_populationID[j])  //they could not be equal
                {
                  smid = cluster_populationID[i];
                  bgid = cluster_populationID[j];
                }
              else
                {
                  smid = cluster_populationID[j];
                  bgid = cluster_populationID[i];
                }
              for (t=0;t<num_clust;t++)
                {
                  if (cluster_populationID[t]==bgid)
                    cluster_populationID[t]=smid;
                }
            }
        }
    }

  

  for (i=0;i<num_clust;i++)
    map_ID[i]=-1;

  for (i=0;i<num_clust;i++)
    map_ID[cluster_populationID[i]]=1;

  num_population=0;
  for (i=0;i<num_clust;i++)
    {
      if (map_ID[i]==1)
        {
          map_ID[i]=num_population;
          num_population++;
        }
    }

  if ((temp_num_population>max_num_pop) && (num_population==1))
	  break;
  else
	  temp_num_population=num_population;

  if (num_clust<=1)
	break;
  } //end while

  for (i=0;i<num_clust;i++)
    cluster_populationID[i]=map_ID[cluster_populationID[i]];
	
  free(map_ID);

  return num_population; 
}

///////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
double kmeans(double **Matrix, long k, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center)
{
	
	long i=0;
	long j=0;
	long t=0;
	long random=0;
	long random1=0;
	long random2=0;
	long times=0;
	long dist_used=0; //0 is Euclidean and 1 is Pearson
	long random_init=0; //0: not use random seeds; 
	long real_Len=0;
	long skipped=0;
	
 	long *num;  //num[i]=t means the ith cluster has t points
	
	double vvv=1.0; // the biggest variation;
	double distance=0.0;
	double xv=0.0;
	double variation=0.0;

	double mean_dx=0;
	double mean_dy=0;
	double sum_var=0;
	double dx=0;
	double dy=0;
	double sd_x=0;
	double sd_y=0;	
	double diff=0;
	double distortion=0;
	double sd=0; //standard deviation
	double shortest_distance;

	double *temp_center;	
			
	double **sum;	
 
	temp_center = (double *)malloc(sizeof(double)*num_dm);
	memset(temp_center,0,sizeof(double)*num_dm);

	if (random_init)
	{
		for (i=0;i<k;i++)
		{	
			random1=rand()*rand();
			random2=abs((random1%5)+1);
			for (t=0;t<random2;t++)
				random2=random2*rand()+rand();
	
			random=abs(random2%file_Len);
			for (j=0;j<num_dm;j++)
				center[i][j]=Matrix[random][j];				
		}
	}


	num = (long *)malloc(sizeof(long)*k);
	memset(num,0,sizeof(long)*k);

	sum = (double **)malloc(sizeof(double*)*k);
	memset(sum,0,sizeof(double*)*k);
	for (i=0;i<k;i++)
	{
		sum[i] = (double *)malloc(sizeof(double)*num_dm);
		memset(sum[i],0,sizeof(double)*num_dm);
	}


	times=0;
	real_Len=0;

	while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1)))
	{
		for (i=0;i<k;i++)
		{
			num[i]=0;
			for (j=0;j<num_dm;j++)
				sum[i][j]=0.0;  
		}
		
		for (i=0;i<file_Len;i++)  //for each data point i, we compute the distance between Matrix[i] and center[j]
		{		
			skipped = 0;
			shortest_distance=MAX_VALUE;
			for (j=0;j<k;j++)  //for each center j
			{
				distance=0.0;
								
				if (dist_used==0)  //Euclidean distance
				{
					for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions
					{
					
						diff=center[j][t]-Matrix[i][t];
					
						diff=diff*diff;  
						//this K-means is only used for dimension selection, and therefore for 1-dimensional data only, no need to use cube distance

						distance = distance+diff; //here we have a weight for each dimension
					}
				}
				else  //pearson correlation
				{
					mean_dx=0.0;
					mean_dy=0.0;
					sum_var=0.0;
					dx=0.0;
					dy=0.0;
					sd_x=0.0;
					sd_y=0.0;
					for (t=0;t<num_dm;t++)
					{
						mean_dx+=center[j][t];
						mean_dy+=Matrix[i][t];
					}
					mean_dx=mean_dx/(double)num_dm;
					mean_dy=mean_dy/(double)num_dm;
			
					for (t=0;t<num_dm;t++)
					{
						dx=center[j][t]-mean_dx;
						dy=Matrix[i][t]-mean_dy;
						sum_var+=dx*dy;
						sd_x+=dx*dx;
						sd_y+=dy*dy;
					}
					if (sqrt(sd_x*sd_y)==0)
						distance = 1.0;
					else
						distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
					//printf("distance=%f\n",distance);			
				}	//pearson correlation ends 				

				if ((distance<shortest_distance) && (skipped == 0))
				{
					shortest_distance=distance;						
					shortest_id[i]=j;  
				}			
			}//end for j
				real_Len++;
				num[shortest_id[i]]=num[shortest_id[i]]+1;
				for (t=0;t<num_dm;t++)
					sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];		
		}//end for i
	/* recompute the centers */
	//compute_mean(group);		
		vvv=0.0;
		for (j=0;j<k;j++)
		{
			memcpy(temp_center,center[j],sizeof(double)*num_dm);
			variation=0.0;
			if (num[j]!=0)
			{
				for (t=0;t<num_dm;t++)
				{
					center[j][t]=sum[j][t]/(double)num[j];
					xv=(temp_center[t]-center[j][t]);
					variation=variation+xv*xv;
				}
			}
			
			if (variation>vvv)
				vvv=variation;  //vvv is the biggest variation among the k clusters;			
		}
	//compute_variation;
		times++;
	} //end for while




	free(num);

	for (i=0;i<k;i++)
		free(sum[i]);
	free(sum);
	free(temp_center);


	return distortion;
		
}

//////////////////////////////////////////////////////
/*************************** Show *****************************/
void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string)
{
	int situ1=0;
	int situ2=0;

	long i=0;
	long id=0;
	long j=0;
	long info_id=0;
	long nearest_id=0;
	long insert=0;
	long temp=0;
	long m=0;
	long n=0;
	long t=0;
	
	long *size_c;
	


	long **size_mybound_1;
	long **size_mybound_2;
	long **size_mybound_3;
	long **size_mybound_0;

	double interval=0.0;

	double *big;
	double *small;


	double **center;
	double **mybound;
	
	long **prof; //prof[i][j]=1 means population i is + at parameter j
	
	FILE *fpcnt_id; //proportion id
	//FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
	FILE *fprof_id; //profile_id

	big=(double *)malloc(sizeof(double)*num_dm);
	memset(big,0,sizeof(double)*num_dm);

	small=(double *)malloc(sizeof(double)*num_dm);
	memset(small,0,sizeof(double)*num_dm);

	for (i=0;i<num_dm;i++)
	{
		big[i]=0.0;
		small[i]=(double)MAX_VALUE;
	}
	
	
	size_c=(long *)malloc(sizeof(long)*k);
	memset(size_c,0,sizeof(long)*k);

	center=(double**)malloc(sizeof(double*)*k);
	memset(center,0,sizeof(double*)*k);
	for (i=0;i<k;i++)
	{
		center[i]=(double*)malloc(sizeof(double)*num_dm);
		memset(center[i],0,sizeof(double)*num_dm);
	}

	mybound=(double**)malloc(sizeof(double*)*num_dm);
	memset(mybound,0,sizeof(double*)*num_dm);
	for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
	{
		mybound[i]=(double*)malloc(sizeof(double)*3);
		memset(mybound[i],0,sizeof(double)*3);
	}

	prof=(long **)malloc(sizeof(long*)*k);
	memset(prof,0,sizeof(long*)*k);
	for (i=0;i<k;i++)
	{
		prof[i]=(long *)malloc(sizeof(long)*num_dm);
		memset(prof[i],0,sizeof(long)*num_dm);
	}


	for (i=0;i<file_Len;i++)
	{
		id=cluster_id[i];
		for (j=0;j<num_dm;j++)
		{
			center[id][j]=center[id][j]+Matrix[i][j];
			if (big[j]<Matrix[i][j])
				big[j]=Matrix[i][j];
			if (small[j]>Matrix[i][j])
				small[j]=Matrix[i][j];
		}
		
		size_c[id]++;		
	}

	for (i=0;i<k;i++)
		for (j=0;j<num_dm;j++)
		{			
			if (size_c[i]!=0)
				center[i][j]=(center[i][j]/(double)(size_c[i]));
			else
				center[i][j]=0;	
		}

	for (j=0;j<num_dm;j++)
	{
		interval=((big[j]-small[j])/4.0);
		//printf("interval[%d] is %f\n",j,interval);
		for (i=0;i<3;i++)
			mybound[j][i]=small[j]+((double)(i+1)*interval);
	}
	

	size_mybound_0=(long **)malloc(sizeof(long*)*k);
	memset(size_mybound_0,0,sizeof(long*)*k);
	
	for (i=0;i<k;i++)
	{
		size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm);
		memset(size_mybound_0[i],0,sizeof(long)*num_dm);		
	}

	size_mybound_1=(long **)malloc(sizeof(long*)*k);
	memset(size_mybound_1,0,sizeof(long*)*k);
	
	for (i=0;i<k;i++)
	{
		size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm);
		memset(size_mybound_1[i],0,sizeof(long)*num_dm);		
	}

	size_mybound_2=(long **)malloc(sizeof(long*)*k);
	memset(size_mybound_2,0,sizeof(long*)*k);
	
	for (i=0;i<k;i++)
	{
		size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm);
		memset(size_mybound_2[i],0,sizeof(long)*num_dm);	
	}

	size_mybound_3=(long **)malloc(sizeof(long*)*k);
	memset(size_mybound_3,0,sizeof(long*)*k);

	for (i=0;i<k;i++)
	{
		size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm);
		memset(size_mybound_3[i],0,sizeof(long)*num_dm);
	}
	
	for (i=0;i<file_Len;i++)
		for (j=0;j<num_dm;j++)
			{
				if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
					size_mybound_0[cluster_id[i]][j]++;
				else
				{
					if (Matrix[i][j]<mybound[j][1])
						size_mybound_1[cluster_id[i]][j]++;
					else
					{
						if (Matrix[i][j]<mybound[j][2])
							size_mybound_2[cluster_id[i]][j]++;
						else
							//if (Matrix[i][j]!=big[j]) //the biggest values excluded
								size_mybound_3[cluster_id[i]][j]++;
					}

				}
			}

	fprof_id=fopen("profile.txt","w");
	fprintf(fprof_id,"Population_ID\t");
	fprintf(fprof_id,"%s\n",name_string);
	
	for (i=0;i<k;i++)
	{
		fprintf(fprof_id,"%ld\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009
		for (j=0;j<num_dm;j++)
		{
			
			if (size_mybound_0[i][j]>size_mybound_1[i][j])
				situ1=0;
			else
				situ1=1;
			if (size_mybound_2[i][j]>size_mybound_3[i][j])
				situ2=2;
			else
				situ2=3;

			if ((situ1==0) && (situ2==2))
			{
				if (size_mybound_0[i][j]>size_mybound_2[i][j])
					prof[i][j]=0;
				else
					prof[i][j]=2;
			}
			if ((situ1==0) && (situ2==3))
			{
				if (size_mybound_0[i][j]>size_mybound_3[i][j])
					prof[i][j]=0;
				else
					prof[i][j]=3;
			}
			if ((situ1==1) && (situ2==2))
			{
				if (size_mybound_1[i][j]>size_mybound_2[i][j])
					prof[i][j]=1;
				else
					prof[i][j]=2;
			}
			if ((situ1==1) && (situ2==3))
			{
				if (size_mybound_1[i][j]>size_mybound_3[i][j])
					prof[i][j]=1;
				else
					prof[i][j]=3;
			}
			
			//begin to output profile
			if (j==num_dm-1)
			{
				if (prof[i][j]==0)
					fprintf(fprof_id,"1\n");
				if (prof[i][j]==1)
					fprintf(fprof_id,"2\n");
				if (prof[i][j]==2)
					fprintf(fprof_id,"3\n");
				if (prof[i][j]==3)
					fprintf(fprof_id,"4\n");
			}
			else
			{
				if (prof[i][j]==0)
					fprintf(fprof_id,"1\t");
				if (prof[i][j]==1)
					fprintf(fprof_id,"2\t");
				if (prof[i][j]==2)
					fprintf(fprof_id,"3\t");
				if (prof[i][j]==3)
					fprintf(fprof_id,"4\t");
			}
		}
	}
	fclose(fprof_id);

	///////////////////////////////////////////////////////////
	

	fpcnt_id=fopen("percentage.txt","w");
	fprintf(fpcnt_id,"Population_ID\tPercentage\n");

	for (t=0;t<k;t++)
	{
		fprintf(fpcnt_id,"%ld\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len);	//t changed to t+1 to start from 1 instead of 0: April 16, 2009									
	}
	fclose(fpcnt_id);

	free(big);
	free(small);
	free(size_c);

	for (i=0;i<k;i++)
	{
		free(center[i]);
		free(prof[i]);
		free(size_mybound_0[i]);
		free(size_mybound_1[i]);
		free(size_mybound_2[i]);
		free(size_mybound_3[i]);
	}
	free(center);
	free(prof);
	free(size_mybound_0);
	free(size_mybound_1);
	free(size_mybound_2);
	free(size_mybound_3);

	for (i=0;i<num_dm;i++)
		free(mybound[i]);
	free(mybound);
	
}
///////////////////////////////////////////////////////////////////////////

double get_avg_dist(double *population_center, long smaller_pop_ID, long larger_pop_ID, long *population_ID, long num_population, long file_Len, 
				   long num_dm, double **normalized_data, long d1, long d2, long d3, long *size_c)
{
	long k=0;
	long temp=0;
	long i=0;
	long j=0;
	long t=0;
	long current_biggest=0;


	double total_dist=0.0;
	double distance=0.0;
	double dist=0.0;
	double biggest_distance=0.0;

	long *nearest_neighbors;
	double *nearest_distance;

	k=min(size_c[smaller_pop_ID],size_c[larger_pop_ID]);
	
	if (k>0)
	{
		k=(long)sqrt((double)k);
		if (num_dm<=3)
			k=(long)(2.7183*(double)k);  //e*k for low-dimensional space, added Nov. 4, 2010
		//printf("the k-nearest k is %d\n",k);
	}
	else
	{
		printf("error in k\n");
		exit(0);
	}

	nearest_neighbors=(long *)malloc(sizeof(long)*k);
	memset(nearest_neighbors,0,sizeof(long)*k);

	nearest_distance=(double *)malloc(sizeof(double)*k);
	memset(nearest_distance,0,sizeof(double)*k);

	temp=0;

	for (i=0;i<file_Len;i++)
	{
		if ((population_ID[i]==smaller_pop_ID) || (population_ID[i]==larger_pop_ID)) //the event belongs to the pair of populations
		{
			distance=0.0;
			for (t=0;t<num_dm;t++)
			{
				if ((t!=d1) && (t!=d2) && (t!=d3))
					continue;
				else
				{
					dist=population_center[t]-normalized_data[i][t];
					distance=distance+dist*dist;
				}
			}

			if (temp<k)
			{
				nearest_neighbors[temp]=i;
				nearest_distance[temp]=distance;
				temp++;
			}
			else
			{
			    biggest_distance=0.0;
				for (j=0;j<k;j++)
				{
					if (nearest_distance[j]>biggest_distance)
					{
						biggest_distance=nearest_distance[j];
						current_biggest=j;
					}
				}

				if (biggest_distance>distance)
				{
					nearest_neighbors[current_biggest]=i;
					nearest_distance[current_biggest]=distance;
				}
			}
		}
	}

    for (i=0;i<k;i++)
		total_dist=total_dist+nearest_distance[i];

	total_dist=total_dist/(double)k;

	free(nearest_distance);
	free(nearest_neighbors);
	
	return total_dist;
}


/******************************************************** Main Function **************************************************/
//for normalized data, there are five variables:
//cluster_ID
//population_center
//grid_clusterID
//grid_ID
//grid_Center

//the same five variables exist for the original data
//however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data
//also, data + cluster_ID -> population_center
//data + grid_ID -> grid_Center

/* what is the final output */
//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)

/*  what information should select_bin output */
//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
//is unknown, therefore I must use a prescreening to output 
//how many bins I should use
//the number of dense grids
//total number of events in the dense grids

/* basic procedure of main function */ 
//1. read raw file and normalize the raw file
//2. select_bin
//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. 
//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
//5. release normalized data; allocate memory for grid_Center and population_center
//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers

int main (int argc, char **argv)
{
  //inputs
  FILE *f_src; //source file pointer
 
  FILE *f_out; //coordinates
  FILE *f_cid; //population-ID of events
  FILE *f_ctr; //centroids of populations
  FILE *f_results; //coordinates file event and population column
  FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
  FILE *f_parameters; //number of bins and density calculated by
                      //the algorithm. Used to update the database
  FILE *f_properties; //Properties file used by Image generation software

  //char tmpbuf[128];

  char para_name_string[LINE_LEN];

  long time_ID=-1;
  long num_bin=0; //the bins I will use on each dimension
  long num_pop=0;	
  long file_Len=0; //number of events		
  long num_dm=0;
  long num_clust=0;
  long num_dense_events=0;
  long num_dense_grids=0;
  long num_nonempty=0;
  long num_population=0;
  long num_real_pop=0;
  long keep_merge=0;
  long num_checked_range=0;

  //below are read from configuration file
  long i=0;
  long j=0;
  long p=0;
  long t=0;
  long q=0;
  long temp_i=0;
  long temp_j=0;
  long first_i=0;
  long first_j=0;
  long d1=0;
  long d2=0;
  long d3=0;
  long d_d=0;
  long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d
  long index_id=0;
  long num_computed_population=0;
  long tot_size=0;

  int den_t_event=0;

  double distance=0.0;
  double dist=0.0;
  double current_smallest_dist=0;
  double max_d_dist=0.0;
  double Ei=0.0;
  double Ej=0.0;
  double E1=0.0;
  double E2=0.0;
  double E3=0.0;
  double Ep=0.0;
  double temp=0.0;
  double tmp=0.0;

  long *grid_clusterID; //(dense)gridID to clusterID
  long *grid_ID; //(dense)eventID to gridID
  long *cluster_ID; //(dense)eventID to clusterID
  long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
  long *all_grid_ID; //gridID for all events
  long *densegridID_To_gridevent;
  long *sorted_seq;
  long *dense_grid_reverse;
  long *population_ID; //denseeventID to populationID
  long *cluster_populationID; //clusterID to populationID
  long *grid_populationID; //gridID to populationID
  long *all_population_ID; //populationID of event
  long *size_c;
  long *size_p_2;
  long *partit;
  long *size_p;
  long *temp_size_j;
  long *all_computed_pop_ID;


  long **position;
  long **dense_grid_seq;
  long **temp_population_ID;

  double *interval;
  double *center_1;
  double *center_2;
  double *center_3;
  double *aver;
  double *std;
  double *center_p_1;
  double *center_p_2;
  double *center_p_3;
	
  double **population_center; //population centroids in the raw/original data
  double **cluster_center; //cluster centroids in the raw/original data
  double **new_population_center;
  double **temp_population_center;
  double **min_pop;
  double **max_pop;

  double **input_data;
  double **normalized_data;

  double **real_pop_center;
  double **distance_pop;

  double ***ind_pop;
  double ***ind_pop_3;
		
  int min = 999999;
  int max = 0;

  printf( "Starting time:\t\t\t\t");
  fflush(stdout);
  system("/bin/date");

  /*
   * Windows Version
  _strtime( tmpbuf );
  printf( "Starting time:\t\t\t\t%s\n", tmpbuf );
  _strdate( tmpbuf );
  printf( "Starting date:\t\t\t\t%s\n", tmpbuf );
  */

  /////////////////////////////////////////////////////////////

  if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6))
  {
      fprintf(stderr,"Incorrect number of input parameters!\n"); 
	  fprintf(stderr,"usage:\n");
	  fprintf(stderr,"basic mode: flock data_file\n");
	  fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n");
	  fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n");
	  fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n");
	  fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n");
      exit(0);
  }	

  f_src=fopen(argv[1],"r");

  if (argc==3)
  {
	max_num_pop=atoi(argv[2]);
	printf("num of maximum pop is %ld\n",max_num_pop);
  }
  
  if (argc==4)
  {
	num_bin=atoi(argv[2]);
	printf("num_bin is %ld\n",num_bin);

	den_t_event=atoi(argv[3]);
	printf("density_index is %d\n",den_t_event);
  }

  if (argc==5)
  {
	num_bin=atoi(argv[2]);
	printf("num_bin is %ld\n",num_bin);

	den_t_event=atoi(argv[3]);
	printf("density_index is %d\n",den_t_event);

	num_pop=atoi(argv[4]);
	printf("num of pop is %ld\n",num_pop);
  }

  if (argc==6)
  {
	num_bin=atoi(argv[2]);
	printf("num_bin is %ld\n",num_bin);

	den_t_event=atoi(argv[3]);
	printf("density_index is %d\n",den_t_event);

	num_pop=atoi(argv[4]);
	printf("num of pop is %ld\n",num_pop);

	max_num_pop=atoi(argv[5]);
	printf("num of pop is %ld\n",max_num_pop);
  }


  if (num_pop==1)
  {
	  printf("it is not allowed to specify only one population\n");
	  exit(0);
  }

    

  getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters

  
  if (max_num_pop==0)
  {
	  max_num_pop=(long)pow(2,num_dm);
		if (max_num_pop>MAX_POP_NUM)
			max_num_pop=MAX_POP_NUM;
  }

  /************************************************* Read the data *****************************************************/
	
  rewind(f_src); //reset the file pointer	

  input_data = (double **)malloc(sizeof(double*)*file_Len);
  memset(input_data,0,sizeof(double*)*file_Len);
  for (i=0;i<file_Len;i++)
  {
     input_data[i]=(double *)malloc(sizeof(double)*num_dm);
     memset(input_data[i],0,sizeof(double)*num_dm);
  }
	
  readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data;
	
  fclose(f_src);

  normalized_data=(double **)malloc(sizeof(double*)*file_Len);
  memset(normalized_data,0,sizeof(double*)*file_Len);
  for (i=0;i<file_Len;i++)
    {
      normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
      memset(normalized_data[i],0,sizeof(double)*num_dm);
    }
	
  tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data);
	

  position=(long **)malloc(sizeof(long*)*file_Len);
  memset(position,0,sizeof(long*)*file_Len);
  for (i=0;i<file_Len;i++)
    {
      position[i]=(long*)malloc(sizeof(long)*num_dm);
      memset(position[i],0,sizeof(long)*num_dm);
    }

  all_grid_ID=(long *)malloc(sizeof(long)*file_Len);
  memset(all_grid_ID,0,sizeof(long)*file_Len);

  sorted_seq=(long*)malloc(sizeof(long)*file_Len);
  memset(sorted_seq,0,sizeof(long)*file_Len);
	
  interval=(double*)malloc(sizeof(double)*num_dm);
  memset(interval,0,sizeof(double)*num_dm);

  /************************************************* select_bin *************************************************/
	
  if (num_bin>=1)  //num_bin has been selected by user
  {
  	select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
	printf("user set bin number is %ld\n",num_bin); 
  }
  else  //num_bin has not been selected by user
  {
	num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
	printf("selected bin number is %ld\n",num_bin);    
  }
  printf("number of non-empty grids is %ld\n",num_nonempty);
		
 

  /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
  free(sorted_seq);
	
	
  dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty);
  memset(dense_grid_reverse,0,sizeof(long)*num_nonempty);	

  /************************************************* select_dense **********************************************/

  if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
	den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
  else
  {
	den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
	printf("automated selected density threshold is %d\n",den_t_event);
  }		

  printf("Number of dense grids is %ld\n",num_dense_grids);

  densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids);
  memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids);

  for (i=0;i<num_dense_grids;i++)
    densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1
	

  eventID_To_denseventID=(long *)malloc(sizeof(long)*file_Len);
  memset(eventID_To_denseventID,0,sizeof(long)*file_Len);     //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k


  grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);

	
  dense_grid_seq=(long **)malloc(sizeof(long*)*num_dense_grids);
  memset(dense_grid_seq,0,sizeof(long*)*num_dense_grids);
  for (i=0;i<num_dense_grids;i++)
    {
      dense_grid_seq[i]=(long *)malloc(sizeof(long)*num_dm);
      memset(dense_grid_seq[i],0,sizeof(long)*num_dm);
    }


  /* Look up the binned data values for each dense grid */
  generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq); 	
	
	
  /************************************************* allocate memory *********************************************/
	
  grid_clusterID=(long *)malloc(sizeof(long)*num_dense_grids);
  memset(grid_clusterID,0,sizeof(long)*num_dense_grids);

  grid_ID=(long *)malloc(sizeof(long)*num_dense_events);
  memset(grid_ID,0,sizeof(long)*num_dense_events);

  cluster_ID=(long *)malloc(sizeof(long)*num_dense_events);
  memset(cluster_ID,0,sizeof(long)*num_dense_events);


  /*********************************************** merge_grids ***********************************************/
  //long merge_grids(long file_Len, long num_dm, long num_bin, long **position, long num_dense_grids, long *dense_grid_rank, long *dense_grid_reverse,
  //			 long **dense_grid_seq, long *eventID_To_denseventID, long *densegridID_To_gridevent, long *all_grid_ID,
  //			 long *cluster_ID, long *grid_ID, long *grid_clusterID)
	
  num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID);
	
  printf("computed number of groups is %ld\n",num_clust);	

	
  /************************************** release unnecessary memory and allocate memory and compute centers **********************************/
	
	
  for (i=0;i<file_Len;i++)
    free(position[i]);
  free(position);

  for (i=0;i<num_dense_grids;i++)
    free(dense_grid_seq[i]);
  free(dense_grid_seq);

  free(dense_grid_reverse);
	
  free(densegridID_To_gridevent);
  free(all_grid_ID);
	
  // cluster_center ////////////////////////////////////////////////////////////////////////////////////////////////////////
	
  cluster_center=(double **)malloc(sizeof(double*)*num_clust);
  memset(cluster_center,0,sizeof(double*)*num_clust);
  for (i=0;i<num_clust;i++)
  {
     cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
     memset(cluster_center[i],0,sizeof(double)*num_dm);
  }
	
  ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data
	
  //printf("pass the first ID2center\n");

  /*** population_ID and grid_populationID **/
	
  cluster_populationID=(long*)malloc(sizeof(long)*num_clust);
  memset(cluster_populationID,0,sizeof(long)*num_clust);

  grid_populationID=(long*)malloc(sizeof(long)*num_dense_grids);
  memset(grid_populationID,0,sizeof(long)*num_dense_grids);

  population_ID=(long*)malloc(sizeof(long)*num_dense_events);
  memset(population_ID,0,sizeof(long)*num_dense_events);

  num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID, max_num_pop);

  

  for (i=0;i<num_clust;i++)
    free(cluster_center[i]);
  free(cluster_center);

  free(interval);
	
  for (i=0;i<num_dense_grids;i++)
    {
      grid_populationID[i]=cluster_populationID[grid_clusterID[i]];
    }

  for (i=0;i<num_dense_events;i++)
    {
      population_ID[i]=cluster_populationID[cluster_ID[i]];
    }

  printf("Number of merged groups before second-run partitioning is %ld\n",num_population);

  

  // population_center /////////////////////////////////////////////////////////////////////////////////////////////////////

 
  population_center=(double **)malloc(sizeof(double*)*num_population);
  memset(population_center,0,sizeof(double*)*num_population);
  for (i=0;i<num_population;i++)
  {
     population_center[i]=(double*)malloc(sizeof(double)*num_dm);
     memset(population_center[i],0,sizeof(double)*num_dm);
  }

  
	
  ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data
	

  // show ////////////////////////////////////////////////////////////////////////////////
  all_population_ID=(long*)malloc(sizeof(long)*file_Len);
  memset(all_population_ID,0,sizeof(long)*file_Len);

  kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center);

  for (i=0;i<num_population;i++)
	free(population_center[i]);

  free(population_center);
  
  ////// there needs to be another step to further partition the data
  
	
	  center_p_1=(double *)malloc(sizeof(double)*3);
	  memset(center_p_1,0,sizeof(double)*3);

	  center_p_2=(double *)malloc(sizeof(double)*3);
	  memset(center_p_2,0,sizeof(double)*3);

	  center_p_3=(double *)malloc(sizeof(double)*3);
	  memset(center_p_3,0,sizeof(double)*3);

	  size_p=(long *)malloc(sizeof(long)*num_population);
	  memset(size_p,0,sizeof(long)*num_population);
	
	  temp_size_j=(long *)malloc(sizeof(long)*num_population);
	  memset(temp_size_j,0,sizeof(long)*num_population);

	  all_computed_pop_ID=(long *)malloc(sizeof(long)*file_Len);
	  memset(all_computed_pop_ID,0,sizeof(long)*file_Len);

	  for (i=0;i<file_Len;i++)
	  {
		size_p[all_population_ID[i]]++;
		all_computed_pop_ID[i]=all_population_ID[i];
	  }

	  ind_pop=(double***)malloc(sizeof(double**)*num_population);
	  memset(ind_pop,0,sizeof(double**)*num_population);

	  ind_pop_3=(double***)malloc(sizeof(double**)*num_population);
	  memset(ind_pop_3,0,sizeof(double**)*num_population);

	  for (i=0;i<num_population;i++)
	  {
		  ind_pop[i]=(double**)malloc(sizeof(double*)*size_p[i]);
		  memset(ind_pop[i],0,sizeof(double*)*size_p[i]);

		  ind_pop_3[i]=(double**)malloc(sizeof(double*)*size_p[i]);
		  memset(ind_pop_3[i],0,sizeof(double*)*size_p[i]);

		  for (j=0;j<size_p[i];j++)
		  {
			  ind_pop[i][j]=(double*)malloc(sizeof(double)*num_dm);
			  memset(ind_pop[i][j],0,sizeof(double)*num_dm);

			  ind_pop_3[i][j]=(double*)malloc(sizeof(double)*3);
			  memset(ind_pop_3[i][j],0,sizeof(double)*3);
		  }
		  temp_size_j[i]=0;
	  }

	  for (i=0;i<file_Len;i++) //to generate ind_pop[i]
	  {
		  index_id=all_population_ID[i];
		  
		  j=temp_size_j[index_id];

		  for (t=0;t<num_dm;t++)
		  {
			ind_pop[index_id][j][t]=input_data[i][t];
		  }
		  temp_size_j[index_id]++;		
	  }

	  aver=(double*)malloc(sizeof(double)*num_dm);
	  memset(aver,0,sizeof(double)*num_dm);

	  std=(double*)malloc(sizeof(double)*num_dm);
	  memset(std,0,sizeof(double)*num_dm);

	  temp_population_center=(double**)malloc(sizeof(double*)*2);
	  memset(temp_population_center,0,sizeof(double*)*2);
	  for (i=0;i<2;i++)
	  {
		temp_population_center[i]=(double*)malloc(sizeof(double)*3);
		memset(temp_population_center[i],0,sizeof(double)*3);
	  }

	  size_p_2=(long*)malloc(sizeof(long)*2);
	  memset(size_p_2,0,sizeof(long)*2);

	  partit=(long*)malloc(sizeof(long)*num_population);
	  memset(partit,0,sizeof(long)*num_population);

	  num_computed_population=num_population;

	  temp_population_ID=(long**)malloc(sizeof(long*)*num_population);
	  memset(temp_population_ID,0,sizeof(long*)*num_population);

	  for (i=0;i<num_population;i++)
	  {
		temp_population_ID[i]=(long*)malloc(sizeof(long)*size_p[i]);
		memset(temp_population_ID[i],0,sizeof(long)*size_p[i]);
	  }
	
	  //printf("num_population=%d\n",num_population);
	  
	  for (i=0;i<num_population;i++) 
	  {
		  partit[i]=0;
		  tran(ind_pop[i], size_p[i], num_dm, 1, ind_pop[i]);//0-1 normalize ind_pop[i]
          
		  //find the 3 dimensions with the largest std for ind_pop[i]
		  d1=-1;
		  d2=-1;
		  d3=-1;

		  for (t=0;t<num_dm;t++)
		  {
			aver[t]=0;
			for (j=0;j<size_p[i];j++)
			{
				aver[t]=aver[t]+ind_pop[i][j][t];
			}
			aver[t]=aver[t]/(double)size_p[i];

			std[t]=0;
			for (j=0;j<size_p[i];j++)
			{
				std[t]=std[t]+((ind_pop[i][j][t]-aver[t])*(ind_pop[i][j][t]-aver[t]));
			}
			std[t]=sqrt(std[t]/(double)size_p[i]);
		  }

		  

		  for (j=0;j<3;j++)
		  {
			max_d_dist=0;
			for (t=0;t<num_dm;t++)
			{
				if ((t!=d1) && (t!=d2))
				{
					dist=std[t];
				}
				else
					dist=-1;

				if (dist>max_d_dist)
				{
					max_d_dist=dist;
					d_d=t;						
				}
			}

			if (j==0)
				d1=d_d;
			if (j==1)
				d2=d_d;
			if (j==2)
				d3=d_d;
		  }
		 
		 
		  for (t=0;t<size_p[i];t++)
		  {
				ind_pop_3[i][t][0]=ind_pop[i][t][d1];
				ind_pop_3[i][t][1]=ind_pop[i][t][d2];
				ind_pop_3[i][t][2]=ind_pop[i][t][d3];
		  }
		

		 temp_population_center[0][0]=(aver[d1])/2.0;
		 
		 temp_population_center[0][1]=aver[d2];
	     temp_population_center[0][2]=aver[d3];
		 
		 temp_population_center[1][0]=(aver[d1]+1.0)/2.0;

		 temp_population_center[1][1]=aver[d2];
		 temp_population_center[1][2]=aver[d3];
		 
		 

		 //run K-means with K=2
		 kmeans(ind_pop_3[i], 2, KMEANS_TERM, size_p[i], 3, temp_population_ID[i], temp_population_center);

		 for (j=0;j<2;j++)
			 size_p_2[j]=0;

		 for (j=0;j<size_p[i];j++)
			 size_p_2[temp_population_ID[i][j]]++;

		 for (j=0;j<2;j++)
			 if (size_p_2[j]==0)
				 printf("size_p_2=0 at i=%ld\n",i);


		 //check whether the 2 parts should be merged
		

		 Ei=get_avg_dist(temp_population_center[0], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
		 Ej=get_avg_dist(temp_population_center[1], 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);

		 for (t=0;t<3;t++)
		 {
			center_p_1[t]=temp_population_center[0][t]*0.25+temp_population_center[1][t]*0.75;
			center_p_2[t]=temp_population_center[0][t]*0.5+temp_population_center[1][t]*0.5;
			center_p_3[t]=temp_population_center[0][t]*0.75+temp_population_center[1][t]*0.25;
		 }
		
		 E1=get_avg_dist(center_p_1, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);
		
		 E2=get_avg_dist(center_p_2, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);

		 E3=get_avg_dist(center_p_3, 0,1, temp_population_ID[i], 2,size_p[i], 3, ind_pop_3[i], 0,1,2,size_p_2);

		 //printf("i=%d;E1=%f;E2=%f;E3=%f\n",i,E1,E2,E3);

		 if (E1<E2)
			Ep=E2;
		 else
			Ep=E1;
			
		 Ep=max(Ep,E3); //Ep is the most sparse area

		 

		 if ((Ep>Ei) && (Ep>Ej)) //the two parts should be partitioned
		 {
			 partit[i]=num_computed_population;
			 num_computed_population++;			 
		 }
		
	  } //end for (i=0;i<num_population;i++)

	  
	  for (i=0;i<num_population;i++)
		  temp_size_j[i]=0;

	  for (i=0;i<file_Len;i++)
	  {
		index_id=all_population_ID[i];
		if (partit[index_id]>0)
		{
			j=temp_size_j[index_id];
			if (temp_population_ID[index_id][j]==1)
				all_computed_pop_ID[i]=partit[index_id];
					
			temp_size_j[index_id]++;
		}
	  }

	  printf("Number of groups after partitioning is %ld\n", num_computed_population);

	
      free(size_p_2);
	  
	  free(aver);
	  free(std);
	  free(partit);

	  free(center_p_1);
	  free(center_p_2);
	  free(center_p_3);

	 

	  for (i=0;i<num_population;i++)
		  free(temp_population_ID[i]);
	  free(temp_population_ID);

	  for (i=0;i<num_population;i++)
	  {
		for (j=0;j<size_p[i];j++)
		{
			  free(ind_pop[i][j]);
			  free(ind_pop_3[i][j]);
		}
	  }

	 

	  for (i=0;i<num_population;i++)
	  {
		  free(ind_pop[i]);
		  free(ind_pop_3[i]);
	  }
	  free(ind_pop);
	  free(ind_pop_3);

	  
	 

	  for (i=0;i<2;i++)
		  free(temp_population_center[i]);

	  free(temp_population_center);

	  free(temp_size_j);
	  free(size_p);

	 

	  //update the IDs, Centers, and # of populations
	  for (i=0;i<file_Len;i++)
	  {
		  all_population_ID[i]=all_computed_pop_ID[i];
		  if (all_population_ID[i]>=num_computed_population)
			  printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]);
	  }

	  num_population=num_computed_population;

	  free(all_computed_pop_ID);
    //end partitioning
  // since the num_population has changed, population_center needs to be redefined as below

  population_center=(double **)malloc(sizeof(double*)*num_population);
  memset(population_center,0,sizeof(double*)*num_population);
  for (i=0;i<num_population;i++)
  {
     population_center[i]=(double*)malloc(sizeof(double)*num_dm);
     memset(population_center[i],0,sizeof(double)*num_dm);
  }

  	  
  ID2Center_all(normalized_data,file_Len,num_dm,num_population,all_population_ID,population_center);
  


  ////// end of further partitioning

  ////////////////////////////////////////////////////////////
  //  to further merge populations to avoid overpartitioning
  //  Added June 20, 2010
  ////////////////////////////////////////////////////////////
  

  
  num_real_pop=num_population;
  keep_merge=1;

  if (num_pop>num_real_pop)
  {
		fprintf(stderr,"number of populations specified too large\n");
		exit(0);
  }

  center_1=(double *)malloc(sizeof(double)*num_dm);
  memset(center_1,0,sizeof(double)*num_dm);

  center_2=(double *)malloc(sizeof(double)*num_dm);
  memset(center_2,0,sizeof(double)*num_dm);

  center_3=(double *)malloc(sizeof(double)*num_dm);
  memset(center_3,0,sizeof(double)*num_dm);


    
  while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2)))
  {
	  
	keep_merge=0;  //so, if no entering merge function, while will exit when num_pop==0

	real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop);
    memset(real_pop_center,0,sizeof(double*)*num_real_pop);
    for (i=0;i<num_real_pop;i++)
    {
      real_pop_center[i]=(double*)malloc(sizeof(double)*num_dm);
      memset(real_pop_center[i],0,sizeof(double)*num_dm);
    }

	min_pop=(double **)malloc(sizeof(double*)*num_real_pop);
    memset(min_pop,0,sizeof(double*)*num_real_pop);
    for (i=0;i<num_real_pop;i++)
    {
      min_pop[i]=(double*)malloc(sizeof(double)*num_dm);
      memset(min_pop[i],0,sizeof(double)*num_dm);
    }

	max_pop=(double **)malloc(sizeof(double*)*num_real_pop);
    memset(max_pop,0,sizeof(double*)*num_real_pop);
    for (i=0;i<num_real_pop;i++)
    {
      max_pop[i]=(double*)malloc(sizeof(double)*num_dm);
      memset(max_pop[i],0,sizeof(double)*num_dm);
    }

	if (num_real_pop==num_population)
	{
		for (i=0;i<num_real_pop;i++)
			for (j=0;j<num_dm;j++)
				real_pop_center[i][j]=population_center[i][j];	
	}
	else
	{
		ID2Center_all(normalized_data,file_Len,num_dm,num_real_pop,all_population_ID,real_pop_center);
	}

	for (i=0;i<num_real_pop;i++)
	{
		for (j=0;j<num_dm;j++)
		{
			min_pop[i][j]=MAX_VALUE;
			
			max_pop[i][j]=0;
		}			
	}
	
	for (i=0;i<file_Len;i++)
	{
		index_id=all_population_ID[i];
		for (j=0;j<num_dm;j++)
		{
			if (normalized_data[i][j]<min_pop[index_id][j])
				min_pop[index_id][j]=normalized_data[i][j];
			
			if (normalized_data[i][j]>max_pop[index_id][j])
				max_pop[index_id][j]=normalized_data[i][j];
		}			
	}

/*	for (i=0;i<num_real_pop;i++)
	{
		for (j=0;j<num_dm;j++)
		{
			printf("min_pop is %f\t",min_pop[i][j]);
			//printf("max_pop is %f\n",max_pop[i][j]);
		}
		printf("\n");
	}
*/


	distance_pop=(double **)malloc(sizeof(double*)*num_real_pop);
    memset(distance_pop,0,sizeof(double*)*num_real_pop);
    for (i=0;i<num_real_pop;i++)
    {
      distance_pop[i]=(double*)malloc(sizeof(double)*num_real_pop);
      memset(distance_pop[i],0,sizeof(double)*num_real_pop);
    }

	for (i=0;i<num_real_pop-1;i++)
	{
		for (j=i+1;j<num_real_pop;j++)
		{
		  distance=0;
		
		  for (t=0;t<num_dm;t++)
		  {
			dist=real_pop_center[i][t]-real_pop_center[j][t];
			distance=distance+dist*dist;
		  }
		  distance_pop[i][j]=distance;
		  distance_pop[j][i]=distance;	      
		}
	}
 
	if (num_real_pop>2)
		num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances
	else
		num_checked_range=1;

	size_c=(long *)malloc(sizeof(long)*num_real_pop);
    memset(size_c,0,sizeof(long)*num_real_pop);
	
	for (i=0;i<file_Len;i++)
	  size_c[all_population_ID[i]]++;

	for (p=0;p<num_checked_range;p++)  
	{
		current_smallest_dist=MAX_VALUE;

		for (i=0;i<num_real_pop-1;i++)
		{
			for (j=i+1;j<num_real_pop;j++)
			{
				if (distance_pop[i][j]<current_smallest_dist)
				{
					current_smallest_dist=distance_pop[i][j];
				
					temp_i=i;
					temp_j=j;
				}
			}
		}

		if (p==0)
		{
			first_i=temp_i;
			first_j=temp_j;
		}

		distance_pop[temp_i][temp_j]=MAX_VALUE+1; //make sure this pair won't be selected next time
		
		//normalize and calculate std for temp_i and temp_j
		
		//pick the largest 3 dimensions on standard deviation
		d1=-1;
		d2=-1;
		d3=-1;

		for (i=0;i<3;i++)
		{
			max_d_dist=0;
			for (j=0;j<num_dm;j++)
			{
				if ((j!=d1) && (j!=d2))
				{
					temp=min(min_pop[temp_i][j],min_pop[temp_j][j]);
					tmp=max(max_pop[temp_i][j],max_pop[temp_j][j]);
					if (tmp<=temp)
					{
						//printf("min_pop[%d][%d]=%f \t min_pop[%d][%d]=%f \t max_pop[%d][%d]=%f \t max_pop[%d][%d]=%f\n",temp_i,j,min_pop[temp_i][j],temp_j, j, min_pop[temp_j][j],temp_i,j,max_pop[temp_i][j],temp_j,j,max_pop[temp_j][j]);
						dist=0;
					}
					else
						dist=(real_pop_center[temp_i][j]-real_pop_center[temp_j][j])/(tmp-temp);
					if (dist<0)
						dist=-dist;
				}
				else
					dist=-1;

				if (dist>max_d_dist)
				{
					max_d_dist=dist;
					d_d=j;						
				}
			}

			if (i==0)
				d1=d_d;
			if (i==1)
				d2=d_d;
			if (i==2)
				d3=d_d;
		}

		//printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3);

		Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
		Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);

		for (t=0;t<num_dm;t++)
		{
			center_1[t]=real_pop_center[temp_i][t]*0.25+real_pop_center[temp_j][t]*0.75;
			center_2[t]=real_pop_center[temp_i][t]*0.5+real_pop_center[temp_j][t]*0.5;
			center_3[t]=real_pop_center[temp_i][t]*0.75+real_pop_center[temp_j][t]*0.25;
		}
		
		E1=get_avg_dist(center_1, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
		
		E2=get_avg_dist(center_2, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);

		E3=get_avg_dist(center_3, temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);

		if (E1<E2)
			Ep=E2;
		else
			Ep=E1;
			
		Ep=max(Ep,E3); //Ep is the most sparse area

		Ep=Ep*E_T;
		//printf("Ep=%f;Ei=%f;Ej=%f\n",Ep,Ei,Ej);

		if ((Ep<=Ei) || (Ep<=Ej))//if the most sparse area between i and j are still denser than one of them, temp_i and temp_j should be merged
		{
			keep_merge=1;
			break;		
		}//end if (Ep)
	} //end for p

	//printf("keep_merge=%d\n",keep_merge);

	for (i=0;i<num_real_pop;i++)
	{
		free(real_pop_center[i]);
		free(distance_pop[i]);
		free(min_pop[i]);
		free(max_pop[i]);
	}
	free(real_pop_center);
	free(min_pop);
	free(max_pop);
	free(distance_pop);
	free(size_c);

	//printf("temp_i=%d;temp_j=%d\n",temp_i,temp_j);

	if (keep_merge)  //found one within p loop
	{
		for (i=0;i<file_Len;i++)
		{
			if (all_population_ID[i]>temp_j)
			{
				all_population_ID[i]=all_population_ID[i]-1;
			}
			else 
			{
				if (all_population_ID[i]==temp_j)
				{
					all_population_ID[i]=temp_i;
				}
			}
		}

		num_real_pop--;
	}
	else
	{
		if ((num_pop!=0) && (num_pop<num_real_pop)) //not reach the specified num of pop
		{
			for (i=0;i<file_Len;i++)
			{
				if (all_population_ID[i]>first_j)
				{
					all_population_ID[i]=all_population_ID[i]-1;
				}
				else
				{
					if (all_population_ID[i]==first_j)
					{
						all_population_ID[i]=first_i;
					}
				}
			}

			num_real_pop--;

		

		}
	}
		//printf("num_real_pop is %d\n",num_real_pop);
	
  } //end of while

  

  free(center_1);
  free(center_2);
  free(center_3);
  printf("Final number of populations is %ld\n",num_real_pop);

  new_population_center=(double **)malloc(sizeof(double*)*num_real_pop);
  memset(new_population_center,0,sizeof(double*)*num_real_pop);
  for (i=0;i<num_real_pop;i++)
  {
      new_population_center[i]=(double*)malloc(sizeof(double)*num_dm);
      memset(new_population_center[i],0,sizeof(double)*num_dm);
  }
  
 
  ///////////////////////////////////////////
  //End of population mapping
  ///////////////////////////////////////////

  show(input_data, all_population_ID, file_Len, num_real_pop, num_dm, para_name_string);

  ID2Center_all(input_data,file_Len,num_dm,num_real_pop,all_population_ID,new_population_center);
  

  f_cid=fopen("population_id.txt","w");
  f_ctr=fopen("population_center.txt","w");
  f_out=fopen("coordinates.txt","w");
  f_results=fopen("flock_results.txt","w");

/*
  f_parameters=fopen("parameters.txt","w");
  fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
  fprintf(f_parameters,"Density\t%f\n",aver_index);
  fclose(f_parameters);
*/

  for (i=0;i<file_Len;i++)
	fprintf(f_cid,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009

  /*
   * New to check for min/max to add to parameters.txt
   *
  */
  
  fprintf(f_out,"%s\n",para_name_string);
  //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string);
  fprintf(f_results,"%s\tPopulation\n",para_name_string);
  for (i=0;i<file_Len;i++)
  {
	for (j=0;j<num_dm;j++)
	{
		if (input_data[i][j] < min) {
			min = (int)input_data[i][j];
		}
		if (input_data[i][j] > max) {
			max = (int)input_data[i][j];
		}
		if (j==num_dm-1)
		{
			fprintf(f_out,"%d\n",(int)input_data[i][j]);
			fprintf(f_results,"%d\t",(int)input_data[i][j]);
		}
		else
		{
			fprintf(f_out,"%d\t",(int)input_data[i][j]);
			fprintf(f_results,"%d\t",(int)input_data[i][j]);
		}
	}
	//fprintf(f_results,"%ld\t",i + 1);
	fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
  }

/*
  f_parameters=fopen("parameters.txt","w");
  fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin);
  fprintf(f_parameters,"Density\t%d\n",den_t_event);
  fprintf(f_parameters,"Min\t%d\n",min);
  fprintf(f_parameters,"Max\t%d\n",max);
  fclose(f_parameters);
*/

  f_properties=fopen("fcs.properties","w");
  fprintf(f_properties,"Bins=%ld\n",num_bin);
  fprintf(f_properties,"Density=%d\n",den_t_event);
  fprintf(f_properties,"Min=%d\n",min);
  fprintf(f_properties,"Max=%d\n",max);
  fprintf(f_properties,"Populations=%ld\n",num_real_pop);
  fprintf(f_properties,"Events=%ld\n",file_Len);
  fprintf(f_properties,"Markers=%ld\n",num_dm);
  fclose(f_properties);

  for (i=0;i<num_real_pop;i++) {
	/* Add if we want to include population id in the output
	*/
	fprintf(f_ctr,"%ld\t",i+1);  //i changed to i+1 to start from 1 instead of 0: April 16, 2009

	for (j=0;j<num_dm;j++) {
		if (j==num_dm-1)
			fprintf(f_ctr,"%.0f\n",new_population_center[i][j]);
		else
			fprintf(f_ctr,"%.0f\t",new_population_center[i][j]);
	}
  }

  	//added April 16, 2009
	f_mfi=fopen("MFI.txt","w");

	for (i=0;i<num_real_pop;i++)
	{
		fprintf(f_mfi,"%ld\t",i+1);

		for (j=0;j<num_dm;j++)
		{
			if (j==num_dm-1)
				fprintf(f_mfi,"%.0f\n",new_population_center[i][j]);
			else
				fprintf(f_mfi,"%.0f\t",new_population_center[i][j]);
		}
	}
	fclose(f_mfi);

	//ended April 16, 2009
			
  fclose(f_cid);
  fclose(f_ctr);
  fclose(f_out);
  fclose(f_results);


  for (i=0;i<num_population;i++)
  	free(population_center[i]);
  
  free(population_center);
 
  for (i=0;i<num_real_pop;i++)
	  free(new_population_center[i]);
  
  free(new_population_center);

  for (i=0;i<file_Len;i++)
    free(normalized_data[i]);
  free(normalized_data);	
	
  free(grid_populationID);

  free(cluster_populationID);
  free(grid_clusterID);
  free(cluster_ID);

  for (i=0;i<file_Len;i++)
    free(input_data[i]);
  free(input_data);

  free(grid_ID);
  free(population_ID);
  free(all_population_ID);
  free(eventID_To_denseventID);
		
  ///////////////////////////////////////////////////////////
  printf("Ending time:\t\t\t\t");
  fflush(stdout);
  system("/bin/date");

  /*
   * Windows version
  _strtime( tmpbuf );
  printf( "Ending time:\t\t\t\t%s\n", tmpbuf );
  _strdate( tmpbuf );
  printf( "Ending date:\t\t\t\t%s\n", tmpbuf );
 */
  
  return 0;
}