Mercurial > repos > immport-devteam > cross_sample
changeset 2:311a4f16c483 draft
Deleted selected files
author | immport-devteam |
---|---|
date | Mon, 27 Feb 2017 13:27:43 -0500 |
parents | 7eab80f86779 |
children | 5f670146a9af |
files | src/README src/cent_adjust.c src/find_connected.c src/flock1.c |
diffstat | 4 files changed, 0 insertions(+), 3724 deletions(-) [+] |
line wrap: on
line diff
--- a/src/README Mon Feb 27 13:26:09 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -This package contains R code for converting and transforming a binary -FCS file, using the FCSTrans software, and the C code for running the -flock1 and flock2 population identification software. - -src - contains the C code for flock1, flock2 and cent_adjust -bin - contains the compiled C code for flock1, flock2 and cent_adjust, - plus the R code for using the FCSTrans algorithm for file - conversion and data transformation. -doc - documentation for FCSTrans and FLOCK algorithms -example - sample data and output from FCSTrans and FLOCK - -To run this software sucessfully the code assumes the software was installed -in the /usr/local/flock directory and that R, plus the Bioconductor flowCore -module has been installed. If you use the ipconvert.sh shell script, you -may need to adjust the location of the RScript executable and the location -of the FCSTrans.R code by editing the shell script. - -############################################################################# -# Overview -############################################################################# -ImmPort-FLOCK - -FLOCK (FLOw Clustering without K), an automated population discovery tool for -multidimensional FCM data was designed to specifically take into account the -unique feature of FCM data and produce objective segregation of cell -populations. - -FLOCK parameter settings can be customized by defining Bins and Density -Threshold. The number of bins is an integer specifying the number of -equal-sized regions the data will be partitioned into on each axis. Increasing -the number of bins increases the sensitivity to detect rare populations but -may also result in single populations being divided. Density Threshold is the -cut-off value to separate the dense regions from background. It is a floating -point number that helps define population centers; increasing the threshold -may help separate major populations but could cause the algorithm to overlook -rare populations. - -############################################################################# -# Compiling C code -############################################################################# -cd bin -cc -o flock1 ../src/flock1.c ../src/find_connected.c -lm -cc -o flock2 ../src/flock2.c -lm -cc -o cent_adjust ../src/cent_adjust.c -lm - -############################################################################# -# FCSTrans -############################################################################# -A shell script named ipconvert.sh is included that runs the FCSTrans R -code for converting and transforming a binary FCS file. The output consists -of one text file containing the transformed channel intensity values and -another file containing a list of the FCS parameters. - -cd bin -/usr/local/flock/bin/ipconvert.sh ../example/data/FCS2.fcs -/usr/local/flock/bin/ipconvert.sh ../example/data/FCS3.fcs - -############################################################################# -# Running flock1 or flock2 -############################################################################# -Running the FLOCK1 and FLOCK2 algorithms generate 8 output files that have -generic file names. For this reason, it is recommended that one output -directory be created for one input file to the program. - -cd example/output/FCS2 -/usr/local/flock/bin/flock1 ../../data/FCS2.txt - -cd example/output/FCS3 -/usr/local/flock/bin/flock2 ../../data/FCS3.txt - -Files created: MFI.txt, percentage.txt, population_id.txt, profile.txt, - flock_results.txt, coordinates.txt, population_center.txt - and percentage.txt - -Usage Information for FLOCK1 ----------------------------------------------------------------------------- -basic mode: flock1 fcs.txt -advanced_ mode: flock1 fcs.txt num_bin density_index max_num_pop - -Usage Information for FLOCK2 ----------------------------------------------------------------------------- -basic mode: flock data_file -advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop -advanced mode 1 (without # of pops): flock data_file num_bin density_index -advanced mode 2 (specify # of pops): flock data_file num_bin density_index - number_of_pop -advanced mode 3 (specify both # of pops): flock data_file num_bin density_index - number_of_pop max_num_pop - -FLOCK Output Files ----------------------------------------------------------------------------- -coordinates.txt: - Output is the intensity values for each marker and event - -flock_results.txt: - A combination of the input file, event identifiers and population - identifiers. - -MFI.txt: - Provides the mean fluorescence intensity for each population for each - marker/parameter - -population_id.txt: - Contains population identifiers (i.e, values from [1 to n] where n is - the population assigned to the corresponding events in the input data - file, one identifier per row.) - -population_center.txt: - Contains the centroid coordinates for each identified population - -percentage.txt: - Includes the population identifiers and percentage of events within that - population (relative to the whole data file) - -profile.txt: - Displays an expression profile, where the approximate expression level - for each marker is assigned a numeric value from 1-4, for each identified - population - -fcs_properties.txt: - Contains the number of events, number of populations, and number of - markers, as well as the algorithm parameters used in the analysis - -############################################################################# -# Running cent_adjust -############################################################################# -Running the cent_adjust algorithm generates 4 output files that have -generic file names. For this reason, it is recommened that one output -directory be created for one input file to the program. - -mkdir example/output/FCS2/cent_adjust -cd example/output/FCS2/cent_adjust -/usr/local/flock/bin/cent_adjust ../population_center.txt ../coordinates.txt - -Files created: MFI.txt, percentage.txt, population_id.txt and profile.txt - -Usage Information for cent_adjust ----------------------------------------------------------------------------- -basic mode: cent_adjust input_center input_data_file
--- a/src/cent_adjust.c Mon Feb 27 13:26:09 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1009 +0,0 @@ -///////////////////////////////////////////////////////// -// Cent_adjust version number and modification history -// ImmPort BISC project -// Author: Yu "Max" Qian -// v1.01: Oct 16, 2009 -// Line 899 of the main function: -// Changed kmean_term=1 to kmean_term=2 -////////////////////////////////////////////////////////// - - -#include <time.h> -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <string.h> - -#define DEBUG 0 -#define LINE_LEN 1024 -#define FILE_NAME_LEN 128 -#define PARA_NAME_LEN 64 -#define MAX_VALUE 1000000000 -#define CUBE 0 - - -void getctrfileinfo(FILE *f_src_ctr, long *num_clust) -{ - int ch='\n'; - int prev='\n'; - long num_rows=0; - - while ((ch = fgetc(f_src_ctr))!= EOF ) - { - if (ch == '\n') - { - ++num_rows; - } - prev = ch; - } - if (prev!='\n') - ++num_rows; - - *num_clust=num_rows; - //printf("center file has %ld rows\n", *num_clust); -} - -/************************************* Read basic info of the source file **************************************/ -void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID) -{ - char src[LINE_LEN]; - char current_name[64]; - char prv; - - long num_rows=0; - long num_columns=0; - int ch='\n'; - int prev='\n'; - long time_pos=0; - long i=0; - long j=0; - - - - src[0]='\0'; - fgets(src, LINE_LEN, f_src); - - name_string[0]='\0'; - current_name[0]='\0'; - prv='\n'; - - while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters - i++; - - while ((src[i]!='\r') && (src[i]!='\n')) //repeat until the end of the line - { - current_name[j]=src[i]; - - if ((src[i]=='\t') && (prv!='\t')) //a complete word - { - current_name[j]='\0'; - - /* - * Commented out John Campbell, June 10 2010 - * We no longer want to automatically remove Time column. - * This column should have been removed by column selection - if (0!=strcmp(current_name,"Time")) - { - num_columns++; //num_columns does not inlcude the column of Time - time_pos++; - strcat(name_string,current_name); - strcat(name_string,"\t"); - } - else - { - *time_ID=time_pos; - } - */ - - num_columns++; - strcat(name_string,current_name); - strcat(name_string,"\t"); - - current_name[0]='\0'; - j=0; - } - - if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space - { - current_name[0]='\0'; - j=0; - } - - if (src[i]!='\t') - j++; - - - prv=src[i]; - i++; - } - - if (prv!='\t') //the last one hasn't been retrieved - { - current_name[j]='\0'; - /* - * Commented out John Campbell, June 10 2010 - * We no longer want to automatically remove Time column. - * This column should have been removed by column selection - if (0!=strcmp(current_name,"Time")) - { - num_columns++; - strcat(name_string,current_name); - time_pos++; - } - else - { - *time_ID=time_pos; - } - */ - - num_columns++; - strcat(name_string,current_name); - } - - if (DEBUG==1) - { - printf("time_ID is %d\n",*time_ID); - printf("name_string is %s\n",name_string); - } - - // # of rows - - while ((ch = fgetc(f_src))!= EOF ) - { - if (ch == '\n') - { - ++num_rows; - } - prev = ch; - } - if (prev!='\n') - ++num_rows; - - *file_Len=num_rows; - *num_dm=num_columns; - - //printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm); -} - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/************************************* Read the source file into uncomp_data **************************************/ -void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, int time_ID) -{ - long time_pass=0; //to mark whether the time_ID has been passed - long index=0; - - long i=0; - long j=0; - long t=0; - - char src[LINE_LEN]; - char xc[LINE_LEN/10]; - - src[0]='\0'; - fgets(src,LINE_LEN, f_src); //skip the first line about parameter names - - while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1 - { - src[0]='\0'; - fgets(src,LINE_LEN,f_src); - i=0; - time_pass=0; - - if (time_ID==-1) - { - for (t=0;t<num_dm;t++) //there is no time_ID - { - xc[0]='\0'; - j=0; - while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) - { - xc[j]=src[i]; - i++; - j++; - } - - xc[j]='\0'; - i++; - - uncomp_data[index][t]=atof(xc); - } - } - else - { - for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns - { - xc[0]='\0'; - j=0; - while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) - { - xc[j]=src[i]; - i++; - j++; - } - - xc[j]='\0'; - i++; - - if (t==time_ID) - { - time_pass=1; - continue; - } - - if (time_pass) - uncomp_data[index][t-1]=atof(xc); - else - uncomp_data[index][t]=atof(xc); - } - } - index++; - //fprintf(fout_ID,"%s",src); - } //end of while - - if (DEBUG == 1) - { - printf("the last line of the source data is:\n"); - for (j=0;j<num_dm;j++) - printf("%f ",uncomp_data[index-1][j]); - printf("\n"); - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void readcenter(FILE *f_src_ctr, long num_clust, long num_dm, double **cluster_center, long *IDmapping) -{ - char src[LINE_LEN]; - char xc[LINE_LEN/10]; - - long i=0; - long j=0; - int m=0; - int t=0; - - for (i=0;i<num_clust;i++) - { - src[0]='\0'; - fgets(src,LINE_LEN, f_src_ctr); - m=0; - for (j=0;j<num_dm+1;j++) - { - xc[0]='\0'; - t=0; - while ((src[m]!='\r') && (src[m]!='\n') && (src[m]!=' ') && (src[m]!='\t')) - { - xc[t]=src[m]; - m++; - t++; - } - xc[t]='\0'; - m++; - if (j==0) - IDmapping[i]=atoi(xc); - else - cluster_center[i][j-1]=atof(xc); - //printf("cluster_center[%d][%d]=%f\n",i,j,cluster_center[i][j]); - } - } -} - - -/**************************************** Normalization ******************************************/ -void tran(double **orig_data, long clean_Len, long num_dm, long norm_used, double **matrix_to_cluster) -{ - long i=0; - long j=0; - - double biggest=0; - double smallest=MAX_VALUE; - - double *aver; //average of each column - double *std; //standard deviation of each column - - aver=(double*)malloc(sizeof(double)*clean_Len); - memset(aver,0,sizeof(double)*clean_Len); - - std=(double*)malloc(sizeof(double)*clean_Len); - memset(std,0,sizeof(double)*clean_Len); - - if (norm_used==2) //z-score normalization - { - for (j=0;j<num_dm;j++) - { - aver[j]=0; - for (i=0;i<clean_Len;i++) - aver[j]=aver[j]+orig_data[i][j]; - aver[j]=aver[j]/(double)clean_Len; - - std[j]=0; - for (i=0;i<clean_Len;i++) - std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]); - std[j]=sqrt(std[j]/(double)clean_Len); - - for (i=0;i<clean_Len;i++) - matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization - } - } - - if (norm_used==1) //0-1 min-max normalization - { - for (j=0;j<num_dm;j++) - { - biggest=0; - smallest=MAX_VALUE; - for (i=0;i<clean_Len;i++) - { - if (orig_data[i][j]>biggest) - biggest=orig_data[i][j]; - if (orig_data[i][j]<smallest) - smallest=orig_data[i][j]; - } - - for (i=0;i<clean_Len;i++) - { - if (biggest==smallest) - matrix_to_cluster[i][j]=biggest; - else - matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest); - } - } - } - - if (norm_used==0) //no normalization - { - for (i=0;i<clean_Len;i++) - for (j=0;j<num_dm;j++) - matrix_to_cluster[i][j]=orig_data[i][j]; - } - - -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void assign_event(double **Matrix, long k, long dist_used, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center, int random_init) -{ - - long i=0; - - long j=0; - long t=0; - long random=0; - long random1=0; - long random2=0; - long times=0; - long times_allowed=0; - - long *num; //num[i]=t means the ith cluster has t points - - double vvv=1.0; // the biggest variation; - double distance=0.0; - double xv=0.0; - double variation=0.0; - double EPS=0.0; - double diff=0.0; - double mean_dx=0; - double mean_dy=0; - double sum_var=0; - double dx=0; - double dy=0; - double sd_x=0; - double sd_y=0; - - double *temp_center; - double *shortest_distance; - - double **sum; - - temp_center = (double *)malloc(sizeof(double)*num_dm); - memset(temp_center,0,sizeof(double)*num_dm); - - /* Choosing Centers */ - if (random_init) - { - for (i=0;i<k;i++) - { - random1=rand()*rand(); - //srand( (unsigned)time( NULL ) ); - random2=abs((random1%5)+1); - for (t=0;t<random2;t++) - random2=random2*rand()+rand(); - - random=abs(random2%file_Len); - //printf("random=%d\n",random); - for (j=0;j<num_dm;j++) - center[i][j]=Matrix[random][j]; - - } - } - - //printf("finish random selection\n"); - /* To compute the nearest center for every point */ - - shortest_distance = (double *)malloc(sizeof(double)*file_Len); - memset(shortest_distance,0,sizeof(double)*file_Len); - - num = (long *)malloc(sizeof(long)*k); - memset(num,0,sizeof(long)*k); - - sum = (double **)malloc(sizeof(double*)*k); - memset(sum,0,sizeof(double*)*k); - for (i=0;i<k;i++) - { - sum[i] = (double *)malloc(sizeof(double)*num_dm); - memset(sum[i],0,sizeof(double)*num_dm); - } - - for (i=0;i<k;i++) - for (j=0;j<num_dm;j++) - sum[i][j]=0.0; //sum[i][j] = k means the sum of the jth dimension of all points in the ith group is k - - //printf("before recursion\n"); - if (kmean_term>=1) - times_allowed = (long)kmean_term; - else - EPS = kmean_term; - - times=0; - - while (((vvv>EPS) && (kmean_term<1)) || ((times<times_allowed) && (kmean_term>=1))) - { - for (i=0;i<k;i++) - { - num[i]=0; - for (j=0;j<num_dm;j++) - sum[i][j]=0.0; - } - - for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j] - { - shortest_distance[i]=MAX_VALUE; - for (j=0;j<k;j++) //for each center j - { - - distance=0.0; - - if (dist_used==0) //Euclidean distance - { - for (t=0;t<num_dm;t++) //for each dimension - { - diff=center[j][t]-Matrix[i][t]; - if (diff<0) - diff=-diff; - - if (CUBE) - distance = distance+(diff*diff*diff); - else - distance = distance+(diff*diff); - } - } - else //pearson correlation - { - mean_dx=0.0; - mean_dy=0.0; - sum_var=0.0; - dx=0.0; - dy=0.0; - sd_x=0.0; - sd_y=0.0; - for (t=0;t<num_dm;t++) - { - mean_dx+=center[j][t]; - mean_dy+=Matrix[i][t]; - } - mean_dx=mean_dx/(double)num_dm; - mean_dy=mean_dy/(double)num_dm; - //printf("mean_dx=%f\n",mean_dx); - - for (t=0;t<num_dm;t++) - { - dx=center[j][t]-mean_dx; - dy=Matrix[i][t]-mean_dy; - sum_var+=dx*dy; - sd_x+=dx*dx; - sd_y+=dy*dy; - } - if (sqrt(sd_x*sd_y)==0) - distance = 1.0; - else - distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2; - //printf("distance=%f\n",distance); - } //pearson correlation ends - - - if (distance<shortest_distance[i]) - { - shortest_distance[i]=distance; - shortest_id[i]=j; - } - }//end for j - num[shortest_id[i]]=num[shortest_id[i]]+1; - for (t=0;t<num_dm;t++) - sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t]; - }//end for i - /* recompute the centers */ - //compute_mean(group); - vvv=0.0; - for (j=0;j<k;j++) - { - memcpy(temp_center,center[j],sizeof(double)*num_dm); - variation=0.0; - if (num[j]!=0) - { - for (t=0;t<num_dm;t++) - { - center[j][t]=sum[j][t]/(double)num[j]; - xv=(temp_center[t]-center[j][t]); - variation=variation+xv*xv; - } - } - - if (variation>vvv) - vvv=variation; //vvv is the biggest variation among the k clusters; - } - //compute_variation; - times++; - } //end for while - - - - free(num); - for (i=0;i<k;i++) - free(sum[i]); - free(sum); - free(temp_center); - free(shortest_distance); - -} - -////////////////////////////////////////////////////// -/*************************** Show *****************************/ -void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string, long *IDmapping) -{ - int situ1=0; - int situ2=0; - - long i=0; - long id=0; - long j=0; - long info_id=0; - long nearest_id=0; - long insert=0; - long temp=0; - long m=0; - long n=0; - long t=0; - - long *size_c; - - - - long **size_mybound_1; - long **size_mybound_2; - long **size_mybound_3; - long **size_mybound_0; - - double interval=0.0; - - double *big; - double *small; - - - double **center; - double **mybound; - - long **prof; //prof[i][j]=1 means population i is + at parameter j - - FILE *fpcnt_id; //proportion id - //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data - FILE *fprof_id; //profile_id - - big=(double *)malloc(sizeof(double)*num_dm); - memset(big,0,sizeof(double)*num_dm); - - small=(double *)malloc(sizeof(double)*num_dm); - memset(small,0,sizeof(double)*num_dm); - - for (i=0;i<num_dm;i++) - { - big[i]=0.0; - small[i]=(double)MAX_VALUE; - } - - - size_c=(long *)malloc(sizeof(long)*k); - memset(size_c,0,sizeof(long)*k); - - center=(double**)malloc(sizeof(double*)*k); - memset(center,0,sizeof(double*)*k); - for (i=0;i<k;i++) - { - center[i]=(double*)malloc(sizeof(double)*num_dm); - memset(center[i],0,sizeof(double)*num_dm); - } - - mybound=(double**)malloc(sizeof(double*)*num_dm); - memset(mybound,0,sizeof(double*)*num_dm); - for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories - { - mybound[i]=(double*)malloc(sizeof(double)*3); - memset(mybound[i],0,sizeof(double)*3); - } - - prof=(long **)malloc(sizeof(long*)*k); - memset(prof,0,sizeof(long*)*k); - for (i=0;i<k;i++) - { - prof[i]=(long *)malloc(sizeof(long)*num_dm); - memset(prof[i],0,sizeof(long)*num_dm); - } - - - for (i=0;i<file_Len;i++) - { - id=cluster_id[i]; - for (j=0;j<num_dm;j++) - { - center[id][j]=center[id][j]+Matrix[i][j]; - if (big[j]<Matrix[i][j]) - big[j]=Matrix[i][j]; - if (small[j]>Matrix[i][j]) - small[j]=Matrix[i][j]; - } - - size_c[id]++; - } - - for (i=0;i<k;i++) - for (j=0;j<num_dm;j++) - { - if (size_c[i]!=0) - center[i][j]=(center[i][j]/(double)(size_c[i])); - else - center[i][j]=0; - } - - for (j=0;j<num_dm;j++) - { - interval=((big[j]-small[j])/4.0); - //printf("interval[%d] is %f\n",j,interval); - for (i=0;i<3;i++) - mybound[j][i]=small[j]+((double)(i+1)*interval); - } - - - size_mybound_0=(long **)malloc(sizeof(long*)*k); - memset(size_mybound_0,0,sizeof(long*)*k); - - for (i=0;i<k;i++) - { - size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm); - memset(size_mybound_0[i],0,sizeof(long)*num_dm); - } - - size_mybound_1=(long **)malloc(sizeof(long*)*k); - memset(size_mybound_1,0,sizeof(long*)*k); - - for (i=0;i<k;i++) - { - size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm); - memset(size_mybound_1[i],0,sizeof(long)*num_dm); - } - - size_mybound_2=(long **)malloc(sizeof(long*)*k); - memset(size_mybound_2,0,sizeof(long*)*k); - - for (i=0;i<k;i++) - { - size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm); - memset(size_mybound_2[i],0,sizeof(long)*num_dm); - } - - size_mybound_3=(long **)malloc(sizeof(long*)*k); - memset(size_mybound_3,0,sizeof(long*)*k); - - for (i=0;i<k;i++) - { - size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm); - memset(size_mybound_3[i],0,sizeof(long)*num_dm); - } - - for (i=0;i<file_Len;i++) - for (j=0;j<num_dm;j++) - { - if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded - size_mybound_0[cluster_id[i]][j]++; - else - { - if (Matrix[i][j]<mybound[j][1]) - size_mybound_1[cluster_id[i]][j]++; - else - { - if (Matrix[i][j]<mybound[j][2]) - size_mybound_2[cluster_id[i]][j]++; - else - //if (Matrix[i][j]!=big[j]) //the biggest values excluded - size_mybound_3[cluster_id[i]][j]++; - } - - } - } - - fprof_id=fopen("profile.txt","w"); - fprintf(fprof_id,"Population_ID\t"); - fprintf(fprof_id,"%s\n",name_string); - - for (i=0;i<k;i++) - { - fprintf(fprof_id,"%ld\t",IDmapping[i]); - for (j=0;j<num_dm;j++) - { - - if (size_mybound_0[i][j]>size_mybound_1[i][j]) - situ1=0; - else - situ1=1; - if (size_mybound_2[i][j]>size_mybound_3[i][j]) - situ2=2; - else - situ2=3; - - if ((situ1==0) && (situ2==2)) - { - if (size_mybound_0[i][j]>size_mybound_2[i][j]) - prof[i][j]=0; - else - prof[i][j]=2; - } - if ((situ1==0) && (situ2==3)) - { - if (size_mybound_0[i][j]>size_mybound_3[i][j]) - prof[i][j]=0; - else - prof[i][j]=3; - } - if ((situ1==1) && (situ2==2)) - { - if (size_mybound_1[i][j]>size_mybound_2[i][j]) - prof[i][j]=1; - else - prof[i][j]=2; - } - if ((situ1==1) && (situ2==3)) - { - if (size_mybound_1[i][j]>size_mybound_3[i][j]) - prof[i][j]=1; - else - prof[i][j]=3; - } - - //begin to output profile - if (j==num_dm-1) - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\n"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\n"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\n"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\n"); - } - else - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\t"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\t"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\t"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\t"); - } - } - } - fclose(fprof_id); - - /////////////////////////////////////////////////////////// - - - fpcnt_id=fopen("percentage.txt","w"); - fprintf(fpcnt_id,"Population_ID\tPercentage\n"); - - for (t=0;t<k;t++) - { - fprintf(fpcnt_id,"%ld\t%.2f\n",IDmapping[t],(double)size_c[t]*100.0/(double)file_Len); - } - fclose(fpcnt_id); - - free(big); - free(small); - free(size_c); - - for (i=0;i<k;i++) - { - free(center[i]); - free(prof[i]); - free(size_mybound_0[i]); - free(size_mybound_1[i]); - free(size_mybound_2[i]); - free(size_mybound_3[i]); - } - free(center); - free(prof); - free(size_mybound_0); - free(size_mybound_1); - free(size_mybound_2); - free(size_mybound_3); - - for (i=0;i<num_dm;i++) - free(mybound[i]); - free(mybound); - -} -/******************************************************** Main Function **************************************************/ -int main (int argc, char **argv) -{ - //inputs - FILE *f_src; //source file pointer - FILE *f_src_ctr; //source center file - //outputs - FILE *f_cid; //cluster-id file pointer - FILE *f_mfi; //added April 16, 2009 - - - char name_string[LINE_LEN]; //for name use - - int time_id=-1; - - long file_Len=0; - long num_clust=0; - long num_dm=0; - long norm_used=0; - long dist_used=0; - long i=0; - long j=0; - - long *cluster_id; - long *IDmapping; //this is to keep the original populationID of the center.txt - - double kmean_term=0; - - double **cluster_center; - double **orig_data; - double **normalized_data; - -/* - _strtime( tmpbuf ); - printf( "Starting time:\t\t\t\t%s\n", tmpbuf ); - _strdate( tmpbuf ); - printf( "Starting date:\t\t\t\t%s\n", tmpbuf ); -*/ - - if (argc!=3) - { - printf("usage: cent_adjust input_center input_data_file\n"); - exit(0); - } - - - f_src_ctr=fopen(argv[1],"r"); - - //read source data - f_src=fopen(argv[2],"r"); - - getfileinfo(f_src, &file_Len, &num_dm, name_string, &time_id); //get the filelength, number of dimensions, and num/name of parameters - - rewind(f_src); //reset data file pointer - - orig_data = (double **)malloc(sizeof(double*)*file_Len); - memset(orig_data,0,sizeof(double*)*file_Len); - for (i=0;i<file_Len;i++) - { - orig_data[i]=(double *)malloc(sizeof(double)*num_dm); - memset(orig_data[i],0,sizeof(double)*num_dm); - } - - readsource(f_src, file_Len, num_dm, orig_data, time_id); //read the data; - - fclose(f_src); - ///////////////////////////////////////////////////////////////////////////// - getctrfileinfo(f_src_ctr, &num_clust); //get how many populations - norm_used=0; - dist_used=0; - kmean_term=2; //modified on Oct 16, 2009: changed kmean_term=1 to kmean_term=2 - - rewind(f_src_ctr); //reset center file pointer - - //read population center - cluster_center=(double **)malloc(sizeof(double*)*num_clust); - memset(cluster_center,0,sizeof(double*)*num_clust); - for (i=0;i<num_clust;i++) - { - cluster_center[i]=(double*)malloc(sizeof(double)*num_dm); - memset(cluster_center[i],0,sizeof(double)*num_dm); - } - for (i=0;i<num_clust;i++) - for (j=0;j<num_dm;j++) - cluster_center[i][j]=0; - - IDmapping=(long *)malloc(sizeof(long)*num_clust); - memset(IDmapping,0,sizeof(long)*num_clust); - - readcenter(f_src_ctr,num_clust,num_dm,cluster_center,IDmapping); //read population center - fclose(f_src_ctr); - - ///////////////////////////////////////////////////////////////////////////// - normalized_data=(double **)malloc(sizeof(double*)*file_Len); - memset(normalized_data,0,sizeof(double*)*file_Len); - for (i=0;i<file_Len;i++) - { - normalized_data[i]=(double *)malloc(sizeof(double)*num_dm); - memset(normalized_data[i],0,sizeof(double)*num_dm); - } - - tran(orig_data, file_Len, num_dm, norm_used, normalized_data); - /************************************************* Compute number of clusters *************************************************/ - - cluster_id=(long*)malloc(sizeof(long)*file_Len); - memset(cluster_id,0,sizeof(long)*file_Len); - - assign_event(normalized_data,num_clust,dist_used,kmean_term,file_Len,num_dm,cluster_id,cluster_center,0); - - - //show(orig_data,cluster_id,file_Len,num_clust,num_dm,show_data,num_disp,name_string); - show(orig_data, cluster_id, file_Len, num_clust, num_dm, name_string, IDmapping); - - f_cid=fopen("population_id.txt","w"); - - for (i=0;i<file_Len;i++) - fprintf(f_cid,"%ld\n",IDmapping[cluster_id[i]]); - - - fclose(f_cid); - - //added April 16, 2009 - f_mfi=fopen("MFI.txt","w"); - - for (i=0;i<num_clust;i++) - { - fprintf(f_mfi,"%ld\t",IDmapping[i]); - - for (j=0;j<num_dm;j++) - { - if (j==num_dm-1) - fprintf(f_mfi,"%.0f\n",cluster_center[i][j]); - else - fprintf(f_mfi,"%.0f\t",cluster_center[i][j]); - } - } - fclose(f_mfi); - - //ended April 16, 2009 - - for (i=0;i<num_clust;i++) - free(cluster_center[i]); - free(cluster_center); - - - /********************************************** Release memory ******************************************/ - - for (i=0;i<file_Len;i++) - { - free(orig_data[i]); - free(normalized_data[i]); - } - - free(orig_data); - free(normalized_data); - free(cluster_id); - free(IDmapping); - -/* - _strtime( tmpbuf ); - printf( "Ending time:\t\t\t\t%s\n", tmpbuf ); - _strdate( tmpbuf ); - printf( "Ending date:\t\t\t\t%s\n", tmpbuf ); -*/ - -}
--- a/src/find_connected.c Mon Feb 27 13:26:09 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <assert.h> - -//static const char *rcsid = "$Id: find_connected.c,v 1.1 2008/09/05 21:54:40 rpl Exp $"; - -int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); -void depth_first(int startnode); - -void bail(const char *); /* exits via abort */ -static void check_clusters(int *gcID, int ndense); -static void merge_cluster(int from, int into); - - -/* Vars that will not change througout the depth-first recursion. We - store them here to avoid endless replication on the stack. */ -static int **Gr=0; -static int *gcID = 0; /* grid cluster IDs */ -static int *cluster_count=0; /* count of nodes per cluster */ -static int ndense=0; -static int ndim=0; -/* cid changes between depth-first searches, but is constant within a - single search, so it goes here. */ -static int cid=0; - -/* Find connected components in the graph of neighboring grids defined in G. - * - * Output: - * - * grid_clusterID[] -- cluster to which each dense grid was assigned - * return value -- number of clusters assigned. - */ -int find_connected(int **G, int n_dense_grids, int num_dm, int *grid_clusterID) -{ - int nclust=0; /* number of clusters found */ - int i; - int *subfac; - int subval=0,nempty=0; - int clustid=0; - - size_t sz = n_dense_grids*sizeof(int); - cluster_count = malloc(sz); - if(!cluster_count) - bail("find_connected: Unable to allocate %zd bytes.\n"); - memset(cluster_count,0,sz); - - /* set up the statics that will be used in the DFS */ - Gr=G; - gcID = grid_clusterID; - ndense = n_dense_grids; - ndim = num_dm; - - - for(i=0;i<ndense;++i) - grid_clusterID[i] = -1; - - for(i=0;i<ndense;++i) { - if(grid_clusterID[i] < 0) { /* grid hasn't been assigned yet */ - cid = nclust++; - depth_first(i); - } - } - -#ifndef NDEBUG - check_clusters(gcID,ndense); -#endif - - /* At this point we probably have some clusters that are empty due to merging. - We want to compact the cluster numbering to eliminate the empty clusters. */ - - subfac = malloc(sz); - if(!subfac) - bail("find_connected: Unable to allocate %zd bytes.\n"); - subval=0; - nempty=0; - - /* cluster #i needs to have its ID decremented by 1 for each empty cluster with - ID < i. Precaclulate the decrements in this loop: */ - for(i=0;i<nclust;++i) { - //clustid = grid_clusterID[i]; - if(cluster_count[i] == 0) { - subval++; - nempty++; - } - subfac[i] = subval; - } - - /* Now apply the decrements to all of the dense grids */ - for(i=0;i<ndense;++i) { - clustid = grid_clusterID[i]; - grid_clusterID[i] -= subfac[clustid]; - } - -#ifndef NDEBUG - // check_clusters(gcID,ndense); -#endif - - /* correct the number of clusters found */ - nclust -= nempty; - - return nclust; -} - - -/* Do a depth-first search for a single connected component in graph - * G. Start from node, tag the nodes found with cid, and record - * the tags in grid_clusterID. Also, record the node count in - * cluster_count. If we find a node that has already been assigned to - * a cluster, that means we're merging two clusters, so zero out the - * old cid's node count. - * - * Note that our graph is constructed as a DAG, so we can be - * guaranteed to terminate without checking for cycles. - * - * Note2: this function can potentially recurse to depth = ndense. - * Plan stack size accordingly. - * - * Output: - * - * grid_clusterID[] -- array where we tag the nodes. - * cluster_count[] -- count of the number of nodes per cluster. - */ - -void depth_first(int node) -{ - int i; - - if(gcID[node] == cid) // we're guaranteed no cycles, but it is possible to reach a node - return; // through two different paths in the same cluster. This early - // return saves us some unneeded work. - - /* Check to see if we are merging a cluster */ - if(gcID[node] >= 0) { - /* We are, so zero the count for the old cluster. */ - cluster_count[ gcID[node] ] = 0; - merge_cluster(gcID[node], cid); - return; - } - - /* Update for this node */ - gcID[node] = cid; - cluster_count[cid]++; - - /* Recursively search the child nodes */ - for(i=0; i<ndim; ++i) - if(Gr[node][i] >= 0) /* This is a child node */ - depth_first(Gr[node][i]); -} - -void bail(const char *msg) -{ - fprintf(stderr,"%s",msg); - abort(); -} - - -static void check_clusters(int *gcID, int ndense) -{ - int i; - - for(i=0; i<ndense; ++i) - if(gcID[i] < 0) { - fprintf(stderr,"faulty cluster id at i= %d\n",i); - abort(); - } -} - -static void merge_cluster(int from, int into) -{ - int i; - - for(i=0; i<ndense; ++i) - if(gcID[i] == from) - gcID[i] = into; -}
--- a/src/flock1.c Mon Feb 27 13:26:09 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2400 +0,0 @@ -/***************************************************************************** - - FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) - - Author: (Max) Yu Qian, Ph.D. - - Copyright: Scheuermann Lab, Dept. of Pathology, UTSW - - Development: November 2005 ~ Forever - - Algorithm Status: May 2007: Release 1.0 - - Usage: flock data_file - Note: the input file format must be channel values and the delimiter between two values must be a tab. - - Changes made July 23, 2010: made errors to STDERR - Changes made Nov 4, 2010: added one more error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR; - MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got - -******************************************************************************/ -#include <time.h> -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <string.h> -#include <sys/stat.h> -#include <unistd.h> -#include <assert.h> - - -#define DEBUG 0 -#define LINE_LEN 1024 -#define FILE_NAME_LEN 128 -#define PARA_NAME_LEN 64 -#define MAX_VALUE 1000000000 -#define MIN_GRID 6 -#define MAX_GRID 50 - -#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max -#define KMEANS_TERM 100 -//#define MAX_NUM_POP 30 - - -int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); - -/************* Read basic info of the source file ****************************/ -void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID) -{ - char src[LINE_LEN]; - char current_name[64]; - char prv; - - int num_rows=0; - int num_columns=0; - int ch='\n'; - int prev='\n'; - int time_pos=0; - int i=0; - int j=0; - int sw=0; - - src[0]='\0'; - fgets(src, LINE_LEN, f_src); - - if ((src[0]=='F') && (src[1]=='C') && (src[2]=='S')) - { - fprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\n"); - abort(); - } - - name_string[0]='\0'; - current_name[0]='\0'; - prv='\n'; - - // skip space and tab characters - while ((src[i]==' ') || (src[i]=='\t')) - i++; - - // repeat until the end of line is reached - while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!='\r')) - { - current_name[j]=src[i]; - - if ((src[i]=='\t') && (prv!='\t')) //a complete word - { - current_name[j]='\0'; - - if (0!=strcmp(current_name,"Time")) - { - num_columns++; //num_columns does not inlcude the column of Time - time_pos++; - if (sw) { - strcat(name_string,"\t"); - } - strcat(name_string,current_name); - sw = 1; - } - else - { - *time_ID=time_pos; - } - - - current_name[0]='\0'; - j=0; - } - - if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space - { - current_name[0]='\0'; - j=0; - } - - if (src[i]!='\t') - j++; - - prv=src[i]; - i++; - } - - if (prv!='\t') //the last one hasn't been retrieved - { - current_name[j]='\0'; - - if (0!=strcmp(current_name,"Time")) - { - num_columns++; - strcat(name_string,"\t"); - strcat(name_string,current_name); - time_pos++; - } - else - { - *time_ID=time_pos; - } - - - } - if (DEBUG==1) - { - printf("time_ID is %d\n",*time_ID); - printf("name_string is %s\n",name_string); - } - - //start computing # of rows - - while ((ch = fgetc(f_src))!= EOF ) - { - if (ch == '\n') - { - ++num_rows; - } - prev = ch; - } - if (prev!='\n') - ++num_rows; - - //added on July 23, 2010 - if (num_rows<50) - { - fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010 - abort(); - } - - *file_Len=num_rows; - *num_dm=num_columns; - - printf("original file size is %d; number of dimensions is %d\n", *file_Len, *num_dm); -} - - - -/************************************* Read the source file into uncomp_data **************************************/ -void readsource(FILE *f_src, int file_Len, int num_dm, double **uncomp_data, int time_ID) -{ - int time_pass=0; //to mark whether the time_ID has been passed - int index=0; - - int i=0; - int j=0; - int t=0; - - char src[LINE_LEN]; - char xc[LINE_LEN/10]; - - src[0]='\0'; - fgets(src,LINE_LEN, f_src); //skip the first line about parameter names - - while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1 - { - src[0]='\0'; - fgets(src,LINE_LEN,f_src); - i=0; - time_pass=0; - - if (time_ID==-1) - { - for (t=0;t<num_dm;t++) //there is no time_ID - { - xc[0]='\0'; - j=0; - while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) - { - xc[j]=src[i]; - i++; - j++; - } - - xc[j]='\0'; - i++; - - uncomp_data[index][t]=atof(xc); - } - } - else - { - for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns - { - xc[0]='\0'; - j=0; - while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) - { - xc[j]=src[i]; - i++; - j++; - } - - xc[j]='\0'; - i++; - - if (t==time_ID) - { - time_pass=1; - continue; - } - - if (time_pass) - uncomp_data[index][t-1]=atof(xc); - else - uncomp_data[index][t]=atof(xc); - } - } - index++; - //fprintf(fout_ID,"%s",src); - } //end of while - - if (DEBUG == 1) - { - printf("the last line of the source data is:\n"); - for (j=0;j<num_dm;j++) - printf("%f ",uncomp_data[index-1][j]); - printf("\n"); - } -} - - -/**************************************** Normalization ******************************************/ -void tran(double **orig_data, int file_Len, int num_dm, int norm_used, double **matrix_to_cluster) -{ - int i=0; - int j=0; - - double biggest=0; - double smallest=MAX_VALUE; - - double *aver; //average of each column - double *std; //standard deviation of each column - - aver=(double*)malloc(sizeof(double)*file_Len); - memset(aver,0,sizeof(double)*file_Len); - - std=(double*)malloc(sizeof(double)*file_Len); - memset(std,0,sizeof(double)*file_Len); - - if (norm_used==2) //z-score normalization - { - for (j=0;j<num_dm;j++) - { - aver[j]=0; - for (i=0;i<file_Len;i++) - aver[j]=aver[j]+orig_data[i][j]; - aver[j]=aver[j]/(double)file_Len; - - std[j]=0; - for (i=0;i<file_Len;i++) - std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]); - std[j]=sqrt(std[j]/(double)file_Len); - - for (i=0;i<file_Len;i++) - matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization - } - } - - if (norm_used==1) //0-1 min-max normalization - { - for (j=0;j<num_dm;j++) - { - biggest=0; - smallest=MAX_VALUE; - for (i=0;i<file_Len;i++) - { - if (orig_data[i][j]>biggest) - biggest=orig_data[i][j]; - if (orig_data[i][j]<smallest) - smallest=orig_data[i][j]; - } - - for (i=0;i<file_Len;i++) - { - if (biggest==smallest) - matrix_to_cluster[i][j]=biggest; - else - matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest); - } - } - } - - if (norm_used==0) //no normalization - { - for (i=0;i<file_Len;i++) - for (j=0;j<num_dm;j++) - matrix_to_cluster[i][j]=orig_data[i][j]; - } - -} - - - -/********************************************** RadixSort *******************************************/ -/* Perform a radix sort using each dimension from the original data as a radix. - * Outputs: - * sorted_seq -- a permutation vector mapping the ordered list onto the original data. - * (sorted_seq[i] -> index in the original data of the ith element of the ordered list) - * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct - * of the sorting procedure. - * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed - * by grid_ID) - */ - -void radixsort_flock(int **position,int file_Len,int num_dm,int num_bin,int *sorted_seq,int *num_nonempty,int *grid_ID) -{ - int i=0; - int length=0; - int start=0; - int prev_ID=0; - int curr_ID=0; - - int j=0; - int t=0; - int p=0; - int loc=0; - int temp=0; - int equal=0; - - int *count; //count[i]=j means there are j numbers having value i at the processing digit - int *index; //index[i]=j means the starting position of grid i is j - int *cp; //current position - int *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions) - int *seq; //temporary sequence - - count=(int*)malloc(sizeof(int)*num_bin); - memset(count,0,sizeof(int)*num_bin); - - cp=(int*)malloc(sizeof(int)*num_bin); - memset(cp,0,sizeof(int)*num_bin); - - index=(int*)malloc(sizeof(int)*num_bin); // initialized below - - seq=(int*)malloc(sizeof(int)*file_Len); - memset(seq,0,sizeof(int)*file_Len); - - mark=(int*)malloc(sizeof(int)*file_Len); - memset(mark,0,sizeof(int)*file_Len); - - for (i=0;i<file_Len;i++) - { - sorted_seq[i]=i; - mark[i]=0; - seq[i]=0; - } - for (i=0;i<num_bin;i++) - { - index[i]=0; - cp[i]=0; - count[i]=0; - } - - for (j=0;j<num_dm;j++) - { - if (j==0) //compute the initial values of mark - { - for (i=0;i<file_Len;i++) - count[position[i][j]]++; // initialize the count to the number of items in each bin of the 0th dimension - - index[0] = 0; - for (i=0;i<num_bin-1;i++) - { - index[i+1]=index[i]+count[i]; //index[k]=x means k segment starts at x (in the ordered list) - if ((index[i+1]>0) && (index[i+1]<=file_Len)) - { - mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list - } - else - { - printf("out of myboundary for mark at index[i+1]-1.\n"); - } - } - mark[file_Len-1]=1; - - for (i=0;i<file_Len;i++) - { - /* Build a permutation vector for the partially ordered data. Store the PV in sorted_seq */ - loc=position[i][j]; - temp=index[loc]+cp[loc]; //cp[i]=j means the offset from the starting position of grid i is j - sorted_seq[temp]=i; //sorted_seq[i]=temp is also another way to sort - cp[loc]++; - } - } - else - { - //reset count, index, loc, temp, cp, start, and length - length=0; - loc=0; - temp=0; - start=0; - for (p=0;p<num_bin;p++) - { - cp[p]=0; - count[p]=0; - index[p]=0; - } - - for (i=0;i<file_Len;i++) - { - int iperm = sorted_seq[i]; // iperm allows us to traverse the data in sorted order. - if (mark[i]!=1) - { - /* Count the number of items in each bin of - dimension j, BUT we are going to reset at the end - of each "part". Thus, the total effect is to do - a sort by bin on the jth dimension for each group - of data that has been identical for the - dimensions processed up to this point. This is - the standard radix sort procedure, but doing it - this way saves us having to allocate buckets to - hold the data in each group of "identical-so-far" - elements. */ - count[position[iperm][j]]++; //count[position[i][j]]++; - length++; // This is the total length of the part, irrespective of the value of the jth component - // (actually, less one, since we don't increment for the final element below) - } - if (mark[i]==1) - { - //length++; - count[position[iperm][j]]++;//count[position[i][j]]++; //the current point must be counted in - start=i-length; //this part starts from start to i: [start,i] - /* Now we sort on the jth radix, just like we did for the 0th above, but we restrict it to just the current part. - This would be a lot more clear if we broke this bit of code out into a separate function and processed recursively, - plus we could multi-thread over the parts. (Hmmm...) - */ - index[0] = start; // Let index give the offset within the whole ordered list. - for (t=0;t<num_bin-1;t++) - { - index[t+1]=index[t]+count[t]; - - if ((index[t+1]<=file_Len) && (index[t+1]>0)) - { - mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix. - } - - } - mark[i]=1; - - /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i - the PV will be completely updated for the partial ordering up to the current radix. */ - for (t=start;t<=i;t++) - { - loc=position[sorted_seq[t]][j];//loc=position[t][j]; - temp=index[loc]+cp[loc]; - if ((temp<file_Len) && (temp>=0)) - { - // seq is a temporary because we have to maintain the old PV until we have finished this step. - seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort - cp[loc]++; - } - else - { - printf("out of myboundary for seq at temp.\n"); - } - } - - for (t=start;t<=i;t++) - { - // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through - // entry i in the ordered list. - if ((t>=0) && (t<file_Len)) - sorted_seq[t]=seq[t]; - else - printf("out of myboundary for seq and sorted_seq at t.\n"); - } - //reset count, index, seq, length, and cp - length=0; - loc=0; - temp=0; - for (p=0;p<num_bin;p++) - { - cp[p]=0; - count[p]=0; - index[p]=0; - } - } - }//end for i - }//end else - }//end for j - - /* sorted_seq[] now contains the ordered list for all radices. mark[] gives the boundaries between groups of elements that are - identical over all radices (= dimensions in the original data) (although it appears we aren't going to make use of this fact) */ - - for (i=0;i<file_Len;i++) - grid_ID[i]=0; //in case the initial value hasn't been assigned - *num_nonempty=1; //starting from 1! - - /* assign the "grid" identifiers for all of the data. A grid will be what we were calling a "part" above. We will number them - serially and tag the *unordered* data with the grid IDs. We will also count the number of populated grids (in general there will - be many possible combinations of bin values that simply never occur) */ - - for (i=1;i<file_Len;i++) - { - equal=1; - prev_ID=sorted_seq[i-1]; - curr_ID=sorted_seq[i]; - for (j=0;j<num_dm;j++) - { - if (position[prev_ID][j]!=position[curr_ID][j]) - { - equal=0; //not equal - break; - } - } - - if (equal) - { - grid_ID[curr_ID]=grid_ID[prev_ID]; - } - else - { - *num_nonempty=*num_nonempty+1; - grid_ID[curr_ID]=grid_ID[prev_ID]+1; - } - //all_grid_vol[grid_ID[curr_ID]]++; - } - - //free memory - free(count); - free(index); - free(cp); - free(seq); - free(mark); - -} - -/********************************************** Compute Position of Events ************************************************/ -void compute_position(double **data_in, int file_Len, int num_dm, int num_bin, int **position, double *interval) -{ - /* What we are really doing here is binning the data, with the bins - spanning the range of the data and number of bins = num_bin */ - int i=0; - int j=0; - - double *small; //small[j] is the smallest value within dimension j - double *big; //big[j] is the biggest value within dimension j - - small=(double*)malloc(sizeof(double)*num_dm); - memset(small,0,sizeof(double)*num_dm); - - big=(double*)malloc(sizeof(double)*num_dm); - memset(big,0,sizeof(double)*num_dm); - - - for (j=0;j<num_dm;j++) - { - big[j]=MAX_VALUE*(-1); - small[j]=MAX_VALUE; - for (i=0;i<file_Len;i++) - { - if (data_in[i][j]>big[j]) - big[j]=data_in[i][j]; - - if (data_in[i][j]<small[j]) - small[j]=data_in[i][j]; - } - - interval[j]=(big[j]-small[j])/(double)num_bin; //interval is computed using the biggest value and smallest value instead of the channel limit - /* XXX: I'm pretty sure the denominator of the fraction above should be num_bin-1. */ - /* I don't think so: num_bin is the number of bins */ - } - - for (j=0;j<num_dm;j++) - { - for (i=0;i<file_Len;i++) - { - if (data_in[i][j]>=big[j]) - position[i][j]=num_bin-1; - else - { - position[i][j]=(int)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j - if ((position[i][j]>=num_bin) || (position[i][j]<0)) - { - //printf("position mis-computed in density analysis!\n"); - //exit(0); - fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010 - abort(); - - } - } - } - } - - - free(small); - free(big); -} - -/********************************************** select_bin to select the number of bins **********************************/ -//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty); -/* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned - * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we - * maximize a merit function, then return the results obtained using the optimal number of bins. - * - * Outputs: - * position -- binned data values - * sorted_seq -- permutation vector mapping the ordered list to the original data - * all_grid_ID -- grid to which each data element was assigned. - * num_nonempty -- number of distinct values assumed by all_grid_ID - * interval -- bin width for each data dimension - * return value -- the number of bins selected. - */ - -int select_bin(double **normalized_data, int file_Len, int num_dm, int min_grid, int max_grid, int **position, int *sorted_seq, - int *all_grid_ID, int *num_nonempty, double *interval, int user_num_bin) -{ - - int num_bin=0; - int select_num_bin=0; - int m=0; - int n=0; - - int i=0; - int bin_scope=0; - int temp_num_nonempty=0; - - int *temp_grid_ID; - int *temp_sorted_seq; - int **temp_position; - - //sorted_seq[i]=j means the event j ranks i - - double temp_index=0; - double *bin_index; - double *temp_interval; - - - - temp_grid_ID=(int *)malloc(sizeof(int)*file_Len); - memset(temp_grid_ID,0,sizeof(int)*file_Len); - - temp_sorted_seq=(int *)malloc(sizeof(int)*file_Len); - memset(temp_sorted_seq,0,sizeof(int)*file_Len); - - temp_position=(int **)malloc(sizeof(int*)*file_Len); - memset(temp_position,0,sizeof(int*)*file_Len); - for (m=0;m<file_Len;m++) - { - temp_position[m]=(int*)malloc(sizeof(int)*num_dm); - memset(temp_position[m],0,sizeof(int)*num_dm); - } - - temp_interval=(double*)malloc(sizeof(double)*num_dm); - memset(temp_interval,0,sizeof(double)*num_dm); - - bin_scope=max_grid-min_grid+1; - bin_index=(double *)malloc(sizeof(double)*bin_scope); - memset(bin_index,0,sizeof(double)*bin_scope); - - i=0; - - for (num_bin=min_grid;num_bin<=max_grid;num_bin++) - { - /* compute_position bins the data into num_bin bins. Each - dimension is binned independently. - - Outputs: - temp_position[i][j] -- bin for the jth component of data element i. - temp_interval[j] -- bin-width for the jth component - */ - compute_position(normalized_data, file_Len, num_dm, num_bin, temp_position, temp_interval); - radixsort_flock(temp_position,file_Len,num_dm,num_bin,temp_sorted_seq,&temp_num_nonempty,temp_grid_ID); - - /* our figure of merit is the number of non-empty grids divided by number of bins per dimension. - We declare victory when we have found a local maximum */ - bin_index[i]=((double)temp_num_nonempty)/((double)num_bin); - if ((double)(temp_num_nonempty)>=(double)(file_Len)*0.95) - break; - if ((bin_index[i]<temp_index) && (user_num_bin==0)) - break; - if ((user_num_bin==num_bin-1) && (user_num_bin!=0)) - break; - - /* Since we have accepted this trial bin, copy all the temporary results into - the output buffers */ - memcpy(all_grid_ID,temp_grid_ID,sizeof(int)*file_Len); - memcpy(sorted_seq,temp_sorted_seq,sizeof(int)*file_Len); - memcpy(interval,temp_interval,sizeof(double)*num_dm); - - for (m=0;m<file_Len;m++) - for (n=0;n<num_dm;n++) - position[m][n]=temp_position[m][n]; - - temp_index=bin_index[i]; - select_num_bin=num_bin; - num_nonempty[0]=temp_num_nonempty; - i++; - } - - if ((select_num_bin<min_grid) || (select_num_bin>max_grid)) - { - fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 4, 2010 - exit(0); - } - - if (temp_index==0) - { - fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 - abort(); - } - - - free(temp_grid_ID); - free(temp_sorted_seq); - free(bin_index); - free(temp_interval); - - for (m=0;m<file_Len;m++) - free(temp_position[m]); - free(temp_position); - - return select_num_bin; -} - -/************************************* Select dense grids **********************************/ -// compute num_dense_grids, num_dense_events, dense_grid_reverse, and all_grid_vol -// den_cutoff=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse); -/* - * Prune away grids that are insufficiently "dense" (i.e., contain too few data items) - * - * Outputs: - * num_dense_grids -- number of dense grids - * num_dense_events -- total number of data items in all dense grids - * dense_grid_reverse -- mapping from list of all grids to list of dense grids. - * return value -- density cutoff for separating dense from non-dense grids. - */ - -int select_dense(int file_Len, int *all_grid_ID, int num_nonempty, int *num_dense_grids, int *num_dense_events, int *dense_grid_reverse, int den_t_event) -{ - - - int i=0; - int vol_ID=0; - int biggest_size=0; //biggest grid_size, to define grid_size_index - int biggest_index=0; - //int actual_threshold=0; //the actual threshold on grid_size, e.g., 1 implies 1 event in the grid - //int num_remain=0; //number of remaining grids with different density thresholds - int temp_num_dense_grids=0; - int temp_num_dense_events=0; - - int *grid_size_index; - int *all_grid_vol; - int *grid_density_index; - - //double den_average=0; - // double avr_index=0; - - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Compute all_grid_vol - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - all_grid_vol=(int *)malloc(sizeof(int)*num_nonempty); - memset(all_grid_vol,0,sizeof(int)*num_nonempty); - - /* Grid "volume" is just the number of data contained in the grid. */ - for (i=0;i<file_Len;i++) - { - vol_ID=all_grid_ID[i]; //vol_ID=all_grid_ID[sorted_seq[i]]; - all_grid_vol[vol_ID]++; //all_grid_vol[i]=j means grid i has j points - } - - - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Compute grid_size_index (histogram of grid sizes) - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - for (i=0;i<num_nonempty;i++) - { - if (biggest_size<all_grid_vol[i]) - { - biggest_size=all_grid_vol[i]; - } - } - - //added on July 23, 2010 - if (biggest_size<3) - { - fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 - abort(); - } - - grid_size_index=(int*)malloc(sizeof(int)*biggest_size); - memset(grid_size_index,0,sizeof(int)*biggest_size); - - for (i=0;i<num_nonempty;i++) - { - grid_size_index[all_grid_vol[i]-1]++; //grid_size_index[0]=5 means there are 5 grids having size 1 - } - - - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Compute den_cutoff - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - grid_density_index=(int *)malloc(sizeof(int)*(biggest_size-2));//from event 2 to biggest_size-1, i.e., from 0 to biggest_size-3 - memset(grid_density_index,0,sizeof(int)*(biggest_size-2)); - - if (den_t_event==0) //user doesn't define the density threshold - { - biggest_index=0; - - for (i=2;i<biggest_size-1;i++) //the grid with 1 event will be skipped, i.e., grid_density_index[0] won't be defined - { - grid_density_index[i-1]=(grid_size_index[i-1]+grid_size_index[i+1]-2*grid_size_index[i]); - if (biggest_index<grid_density_index[i-1]) - { - biggest_index=grid_density_index[i-1]; - den_t_event=i+1; - } - } - } - - if (den_t_event==0) //if something is wrong - den_t_event=3; - - for (i=0;i<num_nonempty;i++) - if (all_grid_vol[i]>=den_t_event) - temp_num_dense_grids++; - - if (temp_num_dense_grids<=1) - { - //modified on July 23, 2010 - //printf("a too high density threshold is set! Please decrease your density threshold.\n"); - //exit(0); - fprintf(stderr,"a too high density threshold is set! Please decrease your density threshold.\n"); //modified on July 23, 2010 - abort(); - } - - if (temp_num_dense_grids>=100000) - { - //modified on July 23, 2010 - //printf("a too low density threshold is set! Please increase your density threshold.\n"); - //exit(0); - fprintf(stderr,"a too low density threshold is set! Please increase your density threshold.\n"); //modified on July 23, 2010 - abort(); - } - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Compute dense_grid_reverse - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - temp_num_dense_grids=0; - temp_num_dense_events=0; - for (i=0;i<num_nonempty;i++) - { - dense_grid_reverse[i]=-1; - - if (all_grid_vol[i]>=den_t_event) - { - dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids. - temp_num_dense_grids++; - temp_num_dense_events+=all_grid_vol[i]; - } - } - - num_dense_grids[0]=temp_num_dense_grids; - num_dense_events[0]=temp_num_dense_events; - - free(grid_size_index); - free(all_grid_vol); - - return den_t_event; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Compute densegridID_To_gridevent and eventID_To_denseventID -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); -/* - * Filter the data so that only the data belonging to dense grids is left - * - * Output: - * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids. - * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids. - * - */ - -void grid_To_event(int file_Len, int *dense_grid_reverse, int *all_grid_ID, int *eventID_To_denseventID, int *densegridID_To_gridevent) -{ - int i=0; - int dense_grid_ID=0; - int dense_event_ID=0; - - for (i=0;i<file_Len;i++) - { - dense_grid_ID=dense_grid_reverse[all_grid_ID[i]]; - eventID_To_denseventID[i]=-1; - if (dense_grid_ID!=-1) //for point (i) belonging to dense grids - { - eventID_To_denseventID[i]=dense_event_ID; - dense_event_ID++; - - if (densegridID_To_gridevent[dense_grid_ID]==-1) //for point i that hasn't been selected - densegridID_To_gridevent[dense_grid_ID]=i; //densegridID_To_gridevent maps dense_grid_ID to its representative point - } - } - - -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Compute dense_grid_seq -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// generate_grid_seq(file_Len, num_dm, sorted_seq, num_dense_grids, densegridID_To_gridevent, position, dense_grid_rank, dense_grid_seq); -/* Construct a table of binned data values for each dense grid. - * - * Output: - * - * dense_grid_seq -- table of binned data values for each dense grid (recall that all members of a grid have identical binned data values) - */ - -void generate_grid_seq(int num_dm, int num_dense_grids, int *densegridID_To_gridevent, int **position, int **dense_grid_seq) -{ - - int i=0; - int j=0; - int ReventID=0; //representative event ID of the dense grid - - for (i=0;i<num_dense_grids;i++) - { - ReventID = densegridID_To_gridevent[i]; - - for (j=0;j<num_dm;j++) - dense_grid_seq[i][j]=position[ReventID][j]; - } -} - -//compare two vectors -int compare_value(int num_dm, int *search_value, int *seq_value) -{ - int i=0; - - for (i=0;i<num_dm;i++) - { - if (search_value[i]<seq_value[i]) - return 1; - if (search_value[i]>seq_value[i]) - return -1; - if (search_value[i]==seq_value[i]) - continue; - } - return 0; -} - -//binary search the dense_grid_seq to return the dense grid ID if it exists -int binary_search(int num_dense_grids, int num_dm, int *search_value, int **dense_grid_seq) -{ - - int low=0; - int high=0; - int mid=0; - int comp_result=0; - int match=0; - //int found=0; - - low = 0; - high = num_dense_grids-1; - - while (low <= high) - { - mid = (int)((low + high)/2); - - comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]); - - - switch(comp_result) - { - case 1: - high=mid-1; - break; - case -1: - low=mid+1; - break; - case 0: - match=1; - break; - } - if (match==1) - break; - } - - - - if (match==1) - { - return mid; - } - else - return -1; -} - - -/********************************************** Computing Centers Using IDs **********************************************/ - -void ID2Center(double **data_in, int file_Len, int num_dm, int *eventID_To_denseventID, int num_clust, int *cluster_ID, double **population_center) -{ - int i=0; - int j=0; - int ID=0; - int eventID=0; - int *size_c; - - - - size_c=(int *)malloc(sizeof(int)*num_clust); - memset(size_c,0,sizeof(int)*num_clust); - - for (i=0;i<num_clust;i++) - for (j=0;j<num_dm;j++) - population_center[i][j]=0; - - for (i=0;i<file_Len;i++) - { - eventID=eventID_To_denseventID[i]; - - if (eventID!=-1) //only events in dense grids count - { - ID=cluster_ID[eventID]; - - if (ID==-1) - { - //modified on July 23, 2010 - //printf("ID==-1! in ID2Center\n"); - //exit(0); - fprintf(stderr,"Incorrect file format or input parameters (no dense regions found!)\n"); //modified on July 23, 2010 - abort(); - } - - for (j=0;j<num_dm;j++) - population_center[ID][j]=population_center[ID][j]+data_in[i][j]; - - size_c[ID]++; - } - } - - - for (i=0;i<num_clust;i++) - { - for (j=0;j<num_dm;j++) - if (size_c[i]!=0) - population_center[i][j]=(population_center[i][j]/(double)(size_c[i])); - else - population_center[i][j]=0; - } - - free(size_c); - -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Compute Population Center with all events -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void ID2Center_all(double **data_in, int file_Len, int num_dm, int num_clust, int *cluster_ID, double **population_center) -{ - int i=0; - int j=0; - int ID=0; - int *size_c; - - - - size_c=(int *)malloc(sizeof(int)*num_clust); - memset(size_c,0,sizeof(int)*num_clust); - - for (i=0;i<num_clust;i++) - for (j=0;j<num_dm;j++) - population_center[i][j]=0; - - for (i=0;i<file_Len;i++) - { - ID=cluster_ID[i]; - - if (ID==-1) - { - //commented on July 23, 2010 - //printf("ID==-1! in ID2Center_all\n"); - //exit(0); - fprintf(stderr,"Incorrect file format or input parameters (resulting in incorrect population IDs)\n"); //modified on July 23, 2010 - abort(); - } - - for (j=0;j<num_dm;j++) - population_center[ID][j]=population_center[ID][j]+data_in[i][j]; - - size_c[ID]++; - } - - - for (i=0;i<num_clust;i++) - { - for (j=0;j<num_dm;j++) - if (size_c[i]!=0) - population_center[i][j]=(population_center[i][j]/(double)(size_c[i])); - else - population_center[i][j]=0; - } - - free(size_c); - -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Merge neighboring grids to clusters -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -int merge_grids(double **normalized_data, double *interval, int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids, - int *dense_grid_reverse, int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID, - int *cluster_ID, int *grid_ID, int *grid_clusterID) -{ - - - int i=0; - int j=0; - int t=0; - int p=0; - int num_clust=0; - int ReventID=0; - int denseID=0; - int neighbor_ID=0; - //int temp_grid=0; - - int *grid_value; - int *search_value; - - int **graph_of_grid; //the graph constructed for dense grids: each dense grid is a graph node - - double real_dist=0; - double **norm_grid_center; - - norm_grid_center=(double **)malloc(sizeof(double*)*num_dense_grids); - memset(norm_grid_center,0,sizeof(double*)*num_dense_grids); - - for (i=0;i<num_dense_grids;i++) - { - norm_grid_center[i]=(double *)malloc(sizeof(double)*num_dm); - memset(norm_grid_center[i],0,sizeof(double)*num_dm); - } - - for (i=0;i<file_Len;i++) - { - denseID=eventID_To_denseventID[i]; - if (denseID!=-1) //only dense events can enter - { - grid_ID[denseID]=dense_grid_reverse[all_grid_ID[i]]; - - if (grid_ID[denseID]==-1) - { - fprintf(stderr,"Incorrect input file format or input parameters (no dense region found)!\n"); //modified on July 23, 2010 - abort(); - } - } - } - - - /* Find centroid (in the normalized data) for each dense grid */ - ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_dense_grids,grid_ID,norm_grid_center); - - //printf("pass the grid ID2 center\n"); //commmented on July 23, 2010 - - - graph_of_grid=(int **)malloc(sizeof(int*)*num_dense_grids); - memset(graph_of_grid,0,sizeof(int*)*num_dense_grids); - for (i=0;i<num_dense_grids;i++) - { - graph_of_grid[i]=(int *)malloc(sizeof(int)*num_dm); - memset(graph_of_grid[i],0,sizeof(int)*num_dm); - - - for (j=0;j<num_dm;j++) - graph_of_grid[i][j]=-1; - } - - grid_value=(int *)malloc(sizeof(int)*num_dm); - memset(grid_value,0,sizeof(int)*num_dm); - - search_value=(int *)malloc(sizeof(int)*num_dm); - memset(search_value,0,sizeof(int)*num_dm); - - - for (i=0;i<num_dense_grids;i++) - { - ReventID=densegridID_To_gridevent[i]; - - for (j=0;j<num_dm;j++) - { - grid_value[j]=position[ReventID][j]; - - } - - - /* For each dimension, find the neighbor, if any, that is equal in all other dimensions and 1 greater in - the chosen dimension. If the neighbor's centroid is not too far away, add it to this grid's neighbor - list. */ - for (t=0;t<num_dm;t++) - { - for (p=0;p<num_dm;p++) - search_value[p]=grid_value[p]; - - if (grid_value[t]==num_bin-1) - continue; - - search_value[t]=grid_value[t]+1; //we only consider the neighbor at the bigger side - - neighbor_ID=binary_search(num_dense_grids, num_dm, search_value, dense_grid_seq); - - if (neighbor_ID!=-1) - { - real_dist=norm_grid_center[i][t]-norm_grid_center[neighbor_ID][t]; - - if (real_dist<0) - real_dist=-real_dist; - - if (real_dist<2*interval[t]) - graph_of_grid[i][t]=neighbor_ID; - } - } - grid_clusterID[i]=i; //initialize grid_clusterID - } - - - //graph constructed - //DFS-based search begins - - /* Use a depth-first search to construct a list of connected subgraphs (= "clusters"). - Note our graph as we have constructed it is a DAG, so we can use that to our advantage - in our search. */ - // num_clust=dfs(graph_of_grid,num_dense_grids,num_dm,grid_clusterID); - num_clust=find_connected(graph_of_grid, num_dense_grids, num_dm, grid_clusterID); - - - //computes grid_ID and cluster_ID - for (i=0;i<file_Len;i++) - { - denseID=eventID_To_denseventID[i]; - if (denseID!=-1) //only dense events can enter - { - cluster_ID[denseID]=grid_clusterID[grid_ID[denseID]]; - //if (cluster_ID[denseID]==-1) - // printf("catch you!\n"); - } - } - - free(search_value); - free(grid_value); - - for (i=0;i<num_dense_grids;i++) - { - free(graph_of_grid[i]); - free(norm_grid_center[i]); - } - free(graph_of_grid); - free(norm_grid_center); - - return num_clust; -} - -/********************************************* Merge Clusters to Populations *******************************************/ -// This is the function future work can be on because it is about how to cluster the about 500 points accurately - -int merge_clusters(int num_clust, int num_dm, double *interval, double **cluster_center, int *cluster_populationID, int max_num_pop) -{ - int num_population=0; - int temp_num_population=0; - - int i=0; - int j=0; - int t=0; - int merge=0; - int smid=0; - int bgid=0; - double merge_dist=1.1; //initial value of merge_dist*interval should be slightly larger than the bin width - - int *map_ID; - - double diff=0; - - map_ID=(int*)malloc(sizeof(int)*num_clust); - memset(map_ID,0,sizeof(int)*num_clust); - - while ((num_population>max_num_pop) || (num_population<=1)) - { - - if (num_population<=1) - merge_dist=merge_dist-0.1; - - if (num_population>max_num_pop) - merge_dist=merge_dist+0.1; - - - for (i=0;i<num_clust;i++) - cluster_populationID[i]=i; - - for (i=0;i<num_clust-1;i++) - { - for (j=i+1;j<num_clust;j++) - { - merge=1; - - for (t=0;t<num_dm;t++) - { - diff=cluster_center[i][t]-cluster_center[j][t]; - - if (diff<0) - diff=-diff; - if (diff>(merge_dist*interval[t])) - merge=0; - } - - if ((merge) && (cluster_populationID[i]!=cluster_populationID[j])) - { - if (cluster_populationID[i]<cluster_populationID[j]) //they could not be equal - { - smid = cluster_populationID[i]; - bgid = cluster_populationID[j]; - } - else - { - smid = cluster_populationID[j]; - bgid = cluster_populationID[i]; - } - for (t=0;t<num_clust;t++) - { - if (cluster_populationID[t]==bgid) - cluster_populationID[t]=smid; - } - } - } - } - - - - for (i=0;i<num_clust;i++) - map_ID[i]=-1; - - for (i=0;i<num_clust;i++) - map_ID[cluster_populationID[i]]=1; - - num_population=0; - for (i=0;i<num_clust;i++) - { - if (map_ID[i]==1) - { - map_ID[i]=num_population; - num_population++; - } - } - - if ((temp_num_population>max_num_pop) && (num_population==1)) - break; - else - temp_num_population=num_population; - - if (num_clust<=1) - break; - } - - for (i=0;i<num_clust;i++) - cluster_populationID[i]=map_ID[cluster_populationID[i]]; - - free(map_ID); - - return num_population; -} - -/////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -double kmeans(double **Matrix, int k, double kmean_term, int file_Len, int num_dm, int *shortest_id, double **center) -{ - - int i=0; - int j=0; - int t=0; - int random=0; - int random1=0; - int random2=0; - int times=0; - int dist_used=0; //0 is Euclidean and 1 is Pearson - int random_init=0; //0: not use random seeds; - int real_Len=0; - int skipped=0; - - int *num; //num[i]=t means the ith cluster has t points - - double vvv=1.0; // the biggest variation; - double distance=0.0; - double xv=0.0; - double variation=0.0; - - double mean_dx=0; - double mean_dy=0; - double sum_var=0; - double dx=0; - double dy=0; - double sd_x=0; - double sd_y=0; - double diff=0; - double distortion=0; - - double shortest_distance; - - double *temp_center; - - double **sum; - - temp_center = (double *)malloc(sizeof(double)*num_dm); - memset(temp_center,0,sizeof(double)*num_dm); - - if (random_init) - { - for (i=0;i<k;i++) - { - random1=rand()*rand(); - random2=abs((random1%5)+1); - for (t=0;t<random2;t++) - random2=random2*rand()+rand(); - - random=abs(random2%file_Len); - for (j=0;j<num_dm;j++) - center[i][j]=Matrix[random][j]; - } - } - - - num = (int *)malloc(sizeof(int)*k); - memset(num,0,sizeof(int)*k); - - sum = (double **)malloc(sizeof(double*)*k); - memset(sum,0,sizeof(double*)*k); - for (i=0;i<k;i++) - { - sum[i] = (double *)malloc(sizeof(double)*num_dm); - memset(sum[i],0,sizeof(double)*num_dm); - } - - - times=0; - real_Len=0; - - while (((vvv>kmean_term) && (kmean_term<1)) || ((times<kmean_term) && (kmean_term>=1))) - { - for (i=0;i<k;i++) - { - num[i]=0; - for (j=0;j<num_dm;j++) - sum[i][j]=0.0; - } - - for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j] - { - skipped = 0; - shortest_distance=MAX_VALUE; - for (j=0;j<k;j++) //for each center j - { - distance=0.0; - - if (dist_used==0) //Euclidean distance - { - for (t=0;t<num_dm;t++) //for each dimension here num_dm is always 1 as we consider individual dimensions - { - - diff=center[j][t]-Matrix[i][t]; - - diff=diff*diff; - - distance = distance+diff; //here we have a weight for each dimension - } - } - else //pearson correlation - { - mean_dx=0.0; - mean_dy=0.0; - sum_var=0.0; - dx=0.0; - dy=0.0; - sd_x=0.0; - sd_y=0.0; - for (t=0;t<num_dm;t++) - { - mean_dx+=center[j][t]; - mean_dy+=Matrix[i][t]; - } - mean_dx=mean_dx/(double)num_dm; - mean_dy=mean_dy/(double)num_dm; - - for (t=0;t<num_dm;t++) - { - dx=center[j][t]-mean_dx; - dy=Matrix[i][t]-mean_dy; - sum_var+=dx*dy; - sd_x+=dx*dx; - sd_y+=dy*dy; - } - if (sqrt(sd_x*sd_y)==0) - distance = 1.0; - else - distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2; - //printf("distance=%f\n",distance); - } //pearson correlation ends - - if ((distance<shortest_distance) && (skipped == 0)) - { - shortest_distance=distance; - shortest_id[i]=j; - } - }//end for j - real_Len++; - num[shortest_id[i]]=num[shortest_id[i]]+1; - for (t=0;t<num_dm;t++) - sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t]; - }//end for i - /* recompute the centers */ - //compute_mean(group); - vvv=0.0; - for (j=0;j<k;j++) - { - memcpy(temp_center,center[j],sizeof(double)*num_dm); - variation=0.0; - if (num[j]!=0) - { - for (t=0;t<num_dm;t++) - { - center[j][t]=sum[j][t]/(double)num[j]; - xv=(temp_center[t]-center[j][t]); - variation=variation+xv*xv; - } - } - - if (variation>vvv) - vvv=variation; //vvv is the biggest variation among the k clusters; - } - //compute_variation; - times++; - } //end for while - - - - - free(num); - - for (i=0;i<k;i++) - free(sum[i]); - free(sum); - free(temp_center); - - - return distortion; - -} - -////////////////////////////////////////////////////// -/*************************** Show *****************************/ -void show(double **Matrix, int *cluster_id, int file_Len, int k, int num_dm, char *name_string) -{ - int situ1=0; - int situ2=0; - - int i=0; - int id=0; - int j=0; - int t=0; - - int *size_c; - - - - int **size_mybound_1; - int **size_mybound_2; - int **size_mybound_3; - int **size_mybound_0; - - double interval=0.0; - - double *big; - double *small; - - - double **center; - double **mybound; - - int **prof; //prof[i][j]=1 means population i is + at parameter j - - FILE *fpcnt_id; //proportion id - //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data - FILE *fprof_id; //profile_id - - big=(double *)malloc(sizeof(double)*num_dm); - memset(big,0,sizeof(double)*num_dm); - - small=(double *)malloc(sizeof(double)*num_dm); - memset(small,0,sizeof(double)*num_dm); - - for (i=0;i<num_dm;i++) - { - big[i]=0.0; - small[i]=(double)MAX_VALUE; - } - - - size_c=(int *)malloc(sizeof(int)*k); - memset(size_c,0,sizeof(int)*k); - - center=(double**)malloc(sizeof(double*)*k); - memset(center,0,sizeof(double*)*k); - for (i=0;i<k;i++) - { - center[i]=(double*)malloc(sizeof(double)*num_dm); - memset(center[i],0,sizeof(double)*num_dm); - } - - mybound=(double**)malloc(sizeof(double*)*num_dm); - memset(mybound,0,sizeof(double*)*num_dm); - for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories - { - mybound[i]=(double*)malloc(sizeof(double)*3); - memset(mybound[i],0,sizeof(double)*3); - } - - prof=(int **)malloc(sizeof(int*)*k); - memset(prof,0,sizeof(int*)*k); - for (i=0;i<k;i++) - { - prof[i]=(int *)malloc(sizeof(int)*num_dm); - memset(prof[i],0,sizeof(int)*num_dm); - } - - - for (i=0;i<file_Len;i++) - { - id=cluster_id[i]; - for (j=0;j<num_dm;j++) - { - center[id][j]=center[id][j]+Matrix[i][j]; - if (big[j]<Matrix[i][j]) - big[j]=Matrix[i][j]; - if (small[j]>Matrix[i][j]) - small[j]=Matrix[i][j]; - } - - size_c[id]++; - } - - for (i=0;i<k;i++) - for (j=0;j<num_dm;j++) - { - if (size_c[i]!=0) - center[i][j]=(center[i][j]/(double)(size_c[i])); - else - center[i][j]=0; - } - - for (j=0;j<num_dm;j++) - { - interval=((big[j]-small[j])/4.0); - //printf("interval[%d] is %f\n",j,interval); - for (i=0;i<3;i++) - mybound[j][i]=small[j]+((double)(i+1)*interval); - } - - - size_mybound_0=(int **)malloc(sizeof(int*)*k); - memset(size_mybound_0,0,sizeof(int*)*k); - - for (i=0;i<k;i++) - { - size_mybound_0[i]=(int*)malloc(sizeof(int)*num_dm); - memset(size_mybound_0[i],0,sizeof(int)*num_dm); - } - - size_mybound_1=(int **)malloc(sizeof(int*)*k); - memset(size_mybound_1,0,sizeof(int*)*k); - - for (i=0;i<k;i++) - { - size_mybound_1[i]=(int*)malloc(sizeof(int)*num_dm); - memset(size_mybound_1[i],0,sizeof(int)*num_dm); - } - - size_mybound_2=(int **)malloc(sizeof(int*)*k); - memset(size_mybound_2,0,sizeof(int*)*k); - - for (i=0;i<k;i++) - { - size_mybound_2[i]=(int*)malloc(sizeof(int)*num_dm); - memset(size_mybound_2[i],0,sizeof(int)*num_dm); - } - - size_mybound_3=(int **)malloc(sizeof(int*)*k); - memset(size_mybound_3,0,sizeof(int*)*k); - - for (i=0;i<k;i++) - { - size_mybound_3[i]=(int*)malloc(sizeof(int)*num_dm); - memset(size_mybound_3[i],0,sizeof(int)*num_dm); - } - - for (i=0;i<file_Len;i++) - for (j=0;j<num_dm;j++) - { - if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded - size_mybound_0[cluster_id[i]][j]++; - else - { - if (Matrix[i][j]<mybound[j][1]) - size_mybound_1[cluster_id[i]][j]++; - else - { - if (Matrix[i][j]<mybound[j][2]) - size_mybound_2[cluster_id[i]][j]++; - else - //if (Matrix[i][j]!=big[j]) //the biggest values excluded - size_mybound_3[cluster_id[i]][j]++; - } - - } - } - - fprof_id=fopen("profile.txt","w"); - fprintf(fprof_id,"Population_ID\t"); - fprintf(fprof_id,"%s\n",name_string); - - for (i=0;i<k;i++) - { - fprintf(fprof_id,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009 - for (j=0;j<num_dm;j++) - { - - if (size_mybound_0[i][j]>size_mybound_1[i][j]) - situ1=0; - else - situ1=1; - if (size_mybound_2[i][j]>size_mybound_3[i][j]) - situ2=2; - else - situ2=3; - - if ((situ1==0) && (situ2==2)) - { - if (size_mybound_0[i][j]>size_mybound_2[i][j]) - prof[i][j]=0; - else - prof[i][j]=2; - } - if ((situ1==0) && (situ2==3)) - { - if (size_mybound_0[i][j]>size_mybound_3[i][j]) - prof[i][j]=0; - else - prof[i][j]=3; - } - if ((situ1==1) && (situ2==2)) - { - if (size_mybound_1[i][j]>size_mybound_2[i][j]) - prof[i][j]=1; - else - prof[i][j]=2; - } - if ((situ1==1) && (situ2==3)) - { - if (size_mybound_1[i][j]>size_mybound_3[i][j]) - prof[i][j]=1; - else - prof[i][j]=3; - } - - //begin to output profile - if (j==num_dm-1) - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\n"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\n"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\n"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\n"); - } - else - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\t"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\t"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\t"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\t"); - } - } - } - fclose(fprof_id); - - /////////////////////////////////////////////////////////// - - - fpcnt_id=fopen("percentage.txt","w"); - fprintf(fpcnt_id,"Population_ID\tPercentage\n"); - - for (t=0;t<k;t++) - { - fprintf(fpcnt_id,"%d\t%.2f\n",t+1,(double)size_c[t]*100.0/(double)file_Len); //t changed to t+1 to start from 1 instead of 0: April 16, 2009 - } - fclose(fpcnt_id); - - free(big); - free(small); - free(size_c); - - for (i=0;i<k;i++) - { - free(center[i]); - free(prof[i]); - free(size_mybound_0[i]); - free(size_mybound_1[i]); - free(size_mybound_2[i]); - free(size_mybound_3[i]); - } - free(center); - free(prof); - free(size_mybound_0); - free(size_mybound_1); - free(size_mybound_2); - free(size_mybound_3); - - for (i=0;i<num_dm;i++) - free(mybound[i]); - free(mybound); - -} - - - -/******************************************************** Main Function **************************************************/ -//for normalized data, there are five variables: -//cluster_ID -//population_center -//grid_clusterID -//grid_ID -//grid_Center - -//the same five variables exist for the original data -//however, the three IDs (cluster_ID, grid_ID, grid_clusterID) don't change in either normalized or original data -//also, data + cluster_ID -> population_center -//data + grid_ID -> grid_Center - -/* what is the final output */ -//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data -//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters) - -/* what information should select_bin output */ -//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids -//is unknown, therefore I must use a prescreening to output -//how many bins I should use -//the number of dense grids -//total number of events in the dense grids - -/* basic procedure of main function */ -//1. read raw file and normalize the raw file -//2. select_bin -//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. -//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID. -//5. release normalized data; allocate memory for grid_Center and population_center -//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers - -int main (int argc, char **argv) -{ - //inputs - FILE *f_src; //source file pointer - - FILE *f_out; //coordinates - FILE *f_cid; //population-ID of events - FILE *f_ctr; //centroids of populations - FILE *f_results; //coordinates file event and population column - FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity - FILE *f_parameters; //number of bins and density calculated by - //the algorithm. Used to update the database - FILE *f_properties; //Properties file used by Image generation software - - char para_name_string[LINE_LEN]; - - int time_ID=-1; - int num_bin=0; //the bins I will use on each dimension - - int file_Len=0; //number of events - int num_dm=0; - int num_clust=0; - int num_dense_events=0; - int num_dense_grids=0; - int num_nonempty=0; - int num_population=0; - //int temp=0; - - //below are read from configuration file - int i=0; - int j=0; - int max_num_pop=0; - - int den_t_event=0; - - int *grid_clusterID; //(dense)gridID to clusterID - int *grid_ID; //(dense)eventID to gridID - int *cluster_ID; //(dense)eventID to clusterID - int *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k - int *all_grid_ID; //gridID for all events - int *densegridID_To_gridevent; - int *sorted_seq; - int *dense_grid_reverse; - int *population_ID; //denseeventID to populationID - int *cluster_populationID; //clusterID to populationID - int *grid_populationID; //gridID to populationID - int *all_population_ID; //populationID of event - - int **position; - int **dense_grid_seq; - - double *interval; - - double **population_center; //population centroids in the raw/original data - double **cluster_center; //cluster centroids in the raw/original data - - double **input_data; - double **normalized_data; - - int min = 999999; - int max = 0; - - printf( "Starting time:\t\t\t\t"); - fflush(stdout); - system("/bin/date"); - ///////////////////////////////////////////////////////////// - - if ((argc!=2) && (argc!=4) && (argc!=5)) - { - //modified on Jul 23, 2010 - //printf("usage:\n"); - //printf("basic mode: flock data_file\n"); - //printf("advanced mode: flock data_file num_bin density_index\n"); - //exit(0); - - fprintf(stderr,"Incorrect number of input parameters!\n"); //modified on July 23, 2010 - //fprintf(stderr,"basic mode: flock data_file\n"); //modified on July 23, 2010 - //fprintf(stderr,"advanced mode1: flock data_file num_bin density_index\n"); //modified on July 23, 2010 - fprintf(stderr,"advanced mode: flock data_file num_bin density_index max_num_pop\n"); //added on Dec 16, 2010 for GenePattern - abort(); - } - - f_src=fopen(argv[1],"r"); - - if (argc==2) - { - max_num_pop=30; //default value, maximum 30 clusters - } - - if (argc==4) - { - num_bin=atoi(argv[2]); - printf("num_bin is %d\n",num_bin); - - den_t_event=atoi(argv[3]); - printf("density_index is %d\n",den_t_event); - - max_num_pop=30; - - if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) - { - fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); - abort(); - } - - if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) - { - fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); - abort(); - } - } - - if (argc==5) - { - num_bin=atoi(argv[2]); - printf("num_bin is %d\n",num_bin); - - den_t_event=atoi(argv[3]); - printf("density_index is %d\n",den_t_event); - - max_num_pop=atoi(argv[4]); - printf("max number of clusters is %d\n",max_num_pop); - - if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) - { - fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); - abort(); - } - - if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) - { - fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); - abort(); - } - - if ((max_num_pop<5) || (max_num_pop>999)) - { - fprintf(stderr,"Incorrect input range of maximum number of populations, which should be larger than 4 and smaller than 1000\n"); - abort(); - } - } - - - getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters - - /************************************************* Read the data *****************************************************/ - - rewind(f_src); //reset the file pointer - - input_data = (double **)malloc(sizeof(double*)*file_Len); - memset(input_data,0,sizeof(double*)*file_Len); - for (i=0;i<file_Len;i++) - { - input_data[i]=(double *)malloc(sizeof(double)*num_dm); - memset(input_data[i],0,sizeof(double)*num_dm); - } - - readsource(f_src, file_Len, num_dm, input_data, time_ID); //read the data; - - fclose(f_src); - - normalized_data=(double **)malloc(sizeof(double*)*file_Len); - memset(normalized_data,0,sizeof(double*)*file_Len); - for (i=0;i<file_Len;i++) - { - normalized_data[i]=(double *)malloc(sizeof(double)*num_dm); - memset(normalized_data[i],0,sizeof(double)*num_dm); - } - - tran(input_data, file_Len, num_dm, NORM_METHOD, normalized_data); - - - position=(int **)malloc(sizeof(int*)*file_Len); - memset(position,0,sizeof(int*)*file_Len); - for (i=0;i<file_Len;i++) - { - position[i]=(int*)malloc(sizeof(int)*num_dm); - memset(position[i],0,sizeof(int)*num_dm); - } - - all_grid_ID=(int *)malloc(sizeof(int)*file_Len); - memset(all_grid_ID,0,sizeof(int)*file_Len); - - sorted_seq=(int*)malloc(sizeof(int)*file_Len); - memset(sorted_seq,0,sizeof(int)*file_Len); - - interval=(double*)malloc(sizeof(double)*num_dm); - memset(interval,0,sizeof(double)*num_dm); - - /************************************************* select_bin *************************************************/ - - if (num_bin>=1) //num_bin has been selected by user - select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); - else //num_bin has not been selected by user - { - num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); - printf("selected bin number is %d\n",num_bin); - } - printf("number of non-empty grids is %d\n",num_nonempty); - - - - /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */ - free(sorted_seq); - - - dense_grid_reverse=(int*)malloc(sizeof(int)*num_nonempty); - memset(dense_grid_reverse,0,sizeof(int)*num_nonempty); - - /************************************************* select_dense **********************************************/ - - if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it - select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); - else - { - den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); - printf("automated selected density threshold is %d\n",den_t_event); - } - - printf("Number of dense grids is %d\n",num_dense_grids); - - densegridID_To_gridevent = (int *)malloc(sizeof(int)*num_dense_grids); - memset(densegridID_To_gridevent,0,sizeof(int)*num_dense_grids); - - for (i=0;i<num_dense_grids;i++) - densegridID_To_gridevent[i]=-1; //initialize all densegridID_To_gridevent values to -1 - - - eventID_To_denseventID=(int *)malloc(sizeof(int)*file_Len); - memset(eventID_To_denseventID,0,sizeof(int)*file_Len); //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k - - - grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); - - - dense_grid_seq=(int **)malloc(sizeof(int*)*num_dense_grids); - memset(dense_grid_seq,0,sizeof(int*)*num_dense_grids); - for (i=0;i<num_dense_grids;i++) - { - dense_grid_seq[i]=(int *)malloc(sizeof(int)*num_dm); - memset(dense_grid_seq[i],0,sizeof(int)*num_dm); - } - - - /* Look up the binned data values for each dense grid */ - generate_grid_seq(num_dm, num_dense_grids, densegridID_To_gridevent, position, dense_grid_seq); - - - /************************************************* allocate memory *********************************************/ - - grid_clusterID=(int *)malloc(sizeof(int)*num_dense_grids); - memset(grid_clusterID,0,sizeof(int)*num_dense_grids); - - grid_ID=(int *)malloc(sizeof(int)*num_dense_events); - memset(grid_ID,0,sizeof(int)*num_dense_events); - - cluster_ID=(int *)malloc(sizeof(int)*num_dense_events); - memset(cluster_ID,0,sizeof(int)*num_dense_events); - - - /*********************************************** merge_grids ***********************************************/ - //int merge_grids(int file_Len, int num_dm, int num_bin, int **position, int num_dense_grids, int *dense_grid_rank, int *dense_grid_reverse, - // int **dense_grid_seq, int *eventID_To_denseventID, int *densegridID_To_gridevent, int *all_grid_ID, - // int *cluster_ID, int *grid_ID, int *grid_clusterID) - - num_clust = merge_grids(normalized_data, interval, file_Len, num_dm, num_bin, position, num_dense_grids, dense_grid_reverse, dense_grid_seq, eventID_To_denseventID, densegridID_To_gridevent, all_grid_ID, cluster_ID, grid_ID, grid_clusterID); - - printf("computed number of groups is %d\n",num_clust); - - - /************************************** release unnecessary memory and allocate memory and compute centers **********************************/ - - - for (i=0;i<file_Len;i++) - free(position[i]); - free(position); - - for (i=0;i<num_dense_grids;i++) - free(dense_grid_seq[i]); - free(dense_grid_seq); - - free(dense_grid_reverse); - - free(densegridID_To_gridevent); - free(all_grid_ID); - - // cluster_center //////////////////////////////////////////////////////////////////////////////////////////////////////// - - cluster_center=(double **)malloc(sizeof(double*)*num_clust); - memset(cluster_center,0,sizeof(double*)*num_clust); - for (i=0;i<num_clust;i++) - { - cluster_center[i]=(double*)malloc(sizeof(double)*num_dm); - memset(cluster_center[i],0,sizeof(double)*num_dm); - } - - ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_clust,cluster_ID,cluster_center); //produce the centers with normalized data - - //printf("pass the first ID2center\n"); //commented on July 23, 2010 - - /*** population_ID and grid_populationID **/ - - cluster_populationID=(int*)malloc(sizeof(int)*num_clust); - memset(cluster_populationID,0,sizeof(int)*num_clust); - - grid_populationID=(int*)malloc(sizeof(int)*num_dense_grids); - memset(grid_populationID,0,sizeof(int)*num_dense_grids); - - population_ID=(int*)malloc(sizeof(int)*num_dense_events); - memset(population_ID,0,sizeof(int)*num_dense_events); - - num_population = merge_clusters(num_clust, num_dm, interval, cluster_center, cluster_populationID,max_num_pop); - - - - for (i=0;i<num_clust;i++) - free(cluster_center[i]); - free(cluster_center); - - free(interval); - - for (i=0;i<num_dense_grids;i++) - { - grid_populationID[i]=cluster_populationID[grid_clusterID[i]]; - } - - for (i=0;i<num_dense_events;i++) - { - population_ID[i]=cluster_populationID[cluster_ID[i]]; - } - - printf("computed number of populations is %d\n",num_population); - - - - // population_center ///////////////////////////////////////////////////////////////////////////////////////////////////// - - - population_center=(double **)malloc(sizeof(double*)*num_population); - memset(population_center,0,sizeof(double*)*num_population); - for (i=0;i<num_population;i++) - { - population_center[i]=(double*)malloc(sizeof(double)*num_dm); - memset(population_center[i],0,sizeof(double)*num_dm); - } - - - - ID2Center(normalized_data,file_Len,num_dm,eventID_To_denseventID,num_population,population_ID,population_center); //produce population centers with normalized data - - - // show //////////////////////////////////////////////////////////////////////////////// - all_population_ID=(int*)malloc(sizeof(int)*file_Len); - memset(all_population_ID,0,sizeof(int)*file_Len); - - kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center); - show(input_data, all_population_ID, file_Len, num_population, num_dm, para_name_string); - - ID2Center_all(input_data,file_Len,num_dm,num_population,all_population_ID,population_center); - - - f_cid=fopen("population_id.txt","w"); - f_ctr=fopen("population_center.txt","w"); - f_out=fopen("coordinates.txt","w"); - f_results=fopen("flock_results.txt","w"); - -/* - f_parameters=fopen("parameters.txt","w"); - fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin); - fprintf(f_parameters,"Density\t%f\n",aver_index); - fclose(f_parameters); -*/ - - for (i=0;i<file_Len;i++) - fprintf(f_cid,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 - - /* - * New to check for min/max to add to parameters.txt - * - */ - - fprintf(f_out,"%s\n",para_name_string); - //fprintf(f_results,"%s\tEvent\tPopulation\n",para_name_string); - fprintf(f_results,"%s\tPopulation\n",para_name_string); - for (i=0;i<file_Len;i++) - { - for (j=0;j<num_dm;j++) - { - if (input_data[i][j] < min) { - min = (int)input_data[i][j]; - } - if (input_data[i][j] > max) { - max = (int)input_data[i][j]; - } - if (j==num_dm-1) - { - fprintf(f_out,"%d\n",(int)input_data[i][j]); - fprintf(f_results,"%d\t",(int)input_data[i][j]); - } - else - { - fprintf(f_out,"%d\t",(int)input_data[i][j]); - fprintf(f_results,"%d\t",(int)input_data[i][j]); - } - } - //fprintf(f_results,"%d\t",i + 1); - fprintf(f_results,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 - } - -/* - f_parameters=fopen("parameters.txt","w"); - fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin); - fprintf(f_parameters,"Density\t%d\n",den_t_event); - fprintf(f_parameters,"Min\t%d\n",min); - fprintf(f_parameters,"Max\t%d\n",max); - fclose(f_parameters); -*/ - - f_properties=fopen("fcs.properties","w"); - fprintf(f_properties,"Bins=%d\n",num_bin); - fprintf(f_properties,"Density=%d\n",den_t_event); - fprintf(f_properties,"Min=%d\n",min); - fprintf(f_properties,"Max=%d\n",max); - fprintf(f_properties,"Populations=%d\n",num_population); - fprintf(f_properties,"Events=%d\n",file_Len); - fprintf(f_properties,"Markers=%d\n",num_dm); - fclose(f_properties); - - - for (i=0;i<num_population;i++) { - /* Add if we want to include population id in the output - */ - fprintf(f_ctr,"%d\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009 - - for (j=0;j<num_dm;j++) { - if (j==num_dm-1) - fprintf(f_ctr,"%.0f\n",population_center[i][j]); - else - fprintf(f_ctr,"%.0f\t",population_center[i][j]); - } - } - - //added April 16, 2009 - f_mfi=fopen("MFI.txt","w"); - - for (i=0;i<num_population;i++) - { - fprintf(f_mfi,"%d\t",i+1); - - for (j=0;j<num_dm;j++) - { - if (j==num_dm-1) - fprintf(f_mfi,"%.0f\n",population_center[i][j]); - else - fprintf(f_mfi,"%.0f\t",population_center[i][j]); - } - } - fclose(f_mfi); - - //ended April 16, 2009 - - fclose(f_cid); - fclose(f_ctr); - fclose(f_out); - fclose(f_results); - - - for (i=0;i<num_population;i++) - { - free(population_center[i]); - } - free(population_center); - - - for (i=0;i<file_Len;i++) - free(normalized_data[i]); - free(normalized_data); - - free(grid_populationID); - - free(cluster_populationID); - free(grid_clusterID); - free(cluster_ID); - - for (i=0;i<file_Len;i++) - free(input_data[i]); - free(input_data); - - free(grid_ID); - free(population_ID); - free(all_population_ID); - free(eventID_To_denseventID); - - /////////////////////////////////////////////////////////// - printf("Ending time:\t\t\t\t"); - fflush(stdout); - system("/bin/date"); - - return 0; - -}