# HG changeset patch
# User azomics
# Date 1594991214 14400
# Node ID b6b4d08b685869ff0c52ff301b858d84919d0cd8
# Parent 81f9b44f5242dcbcd7f89ea82e420b28325c7af7
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/run_flock commit 7e94637827c3637229f3b568fa7f9d38428d6607"
diff -r 81f9b44f5242 -r b6b4d08b6858 runFlockMFI.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/runFlockMFI.py Fri Jul 17 09:06:54 2020 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+######################################################################
+# Copyright (c) 2016 Northrop Grumman.
+# All rights reserved.
+######################################################################
+# version 2
+
+import sys
+import os
+from argparse import ArgumentParser
+import pandas as pd
+from scipy.stats import gmean
+
+
+def run_FLOCK(input_file, method, bins, density, output_file, mfi_file,
+ mfi_calc, profile):
+ run_command = method + " " + input_file
+ if bins:
+ run_command += " " + bins
+ if density:
+ run_command += " " + density
+
+ os.system(run_command)
+
+ move_command = "mv flock_results.txt " + output_file
+ os.system(move_command)
+
+ # Here add some way to calculate the count and tack it on to profile file.
+ flockdf = pd.read_table(output_file)
+ if mfi_calc == "mfi":
+ MFIs = flockdf.groupby('Population').mean().round(decimals=2)
+ elif mfi_calc == "gmfi":
+ MFIs = flockdf.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2)
+ else:
+ MFIs = flockdf.groupby('Population').median().round(decimals=2)
+
+ with open(mfi_file, "w") as outf:
+ MFIs.to_csv(outf, sep="\t", float_format='%.0f')
+
+ (events, columns) = flockdf.shape
+ fstats = {}
+ fstats['population'] = flockdf.iloc[:, -1:].iloc[:, 0]
+ fstats['population_freq'] = fstats['population'].value_counts()
+ fstats['population_freq_sort'] = fstats['population_freq'].sort_index()
+ fstats['population_per'] = (fstats['population'].value_counts(normalize=True) * 100).round(decimals=2)
+ fstats['population_per_sort'] = fstats['population_per'].sort_index()
+ fstats['population_all'] = pd.concat([fstats['population_freq_sort'], fstats['population_per_sort']], axis=1)
+ fstats['population_all'].columns = ['Count', 'Percentage']
+ fstats['population_all']['Population_ID'] = fstats['population_all'].index
+
+ flock_profile = pd.read_table('profile.txt')
+ profile_pop = flock_profile.merge(fstats['population_all'], on='Population_ID')
+ profile_pop.to_csv(profile, sep="\t", float_format='%.2f', index=False)
+
+# get_profile = "mv profile.txt " + profile
+# os.system(get_profile)
+ return
+
+
+if __name__ == "__main__":
+ parser = ArgumentParser(
+ prog="runFlockMFI",
+ description="Run Flock on text file and generate centroid file")
+
+ parser.add_argument(
+ '-i',
+ dest="input_file",
+ required=True,
+ help="File location for the FCS file.")
+
+ parser.add_argument(
+ '-m',
+ dest="method",
+ required=True,
+ help="Run flock1 or flock2.")
+
+ parser.add_argument(
+ '-M',
+ dest="mfi_calc",
+ required=True,
+ help="what to calculate for centroids.")
+
+ parser.add_argument(
+ '-b',
+ dest="bins",
+ required=False,
+ help="Number of Bins.")
+
+ parser.add_argument(
+ '-d',
+ dest="density",
+ required=False,
+ help="Density.")
+
+ parser.add_argument(
+ '-o',
+ dest="output_file",
+ required=True,
+ help="File location for the output file.")
+
+ parser.add_argument(
+ '-c',
+ dest="centroids",
+ required=True,
+ help="File location for the output centroid file.")
+
+ parser.add_argument(
+ '-p',
+ dest="profile",
+ required=True,
+ help="File location for the output profile file.")
+
+ args = parser.parse_args()
+ run_FLOCK(args.input_file, args.method, args.bins,
+ args.density, args.output_file, args.centroids, args.mfi_calc,
+ args.profile)
diff -r 81f9b44f5242 -r b6b4d08b6858 runFlockMFI.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/runFlockMFI.xml Fri Jul 17 09:06:54 2020 -0400
@@ -0,0 +1,166 @@
+
+ using a FCS file that was converted/transformed to a text file
+
+ scipy
+ pandas
+ flock
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 10.1002/cyto.b.20554
+
+
diff -r 81f9b44f5242 -r b6b4d08b6858 run_flock/runFlockMFI.py
--- a/run_flock/runFlockMFI.py Mon Feb 27 13:30:23 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,125 +0,0 @@
-#!/usr/bin/env python
-######################################################################
-# Copyright (c) 2016 Northrop Grumman.
-# All rights reserved.
-######################################################################
-# version 2
-from __future__ import print_function
-
-import sys
-import os
-from argparse import ArgumentParser
-import pandas as pd
-from scipy.stats import gmean
-
-
-def run_FLOCK(input_file, method, bins, density, output_file, mfi_file,
- mfi_calc, profile, tool_directory):
- run_command = tool_directory + "/bin/" + method + " " + input_file
- if bins:
- run_command += " " + bins
- if density:
- run_command += " " + density
-
- os.system(run_command)
-
- move_command = "mv flock_results.txt " + output_file
- os.system(move_command)
-
- # Here add some way to calculate the count and tack it on to profile file.
- flockdf = pd.read_table(output_file)
- if mfi_calc == "mfi":
- MFIs = flockdf.groupby('Population').mean().round(decimals=2)
- elif mfi_calc == "gmfi":
- MFIs = flockdf.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2)
- else:
- MFIs = flockdf.groupby('Population').median().round(decimals=2)
-
- with open(mfi_file, "w") as outf:
- MFIs.to_csv(outf, sep="\t", float_format='%.0f')
-
- (events, columns) = flockdf.shape
- fstats = {}
- fstats['population'] = flockdf.iloc[:, -1:].iloc[:, 0]
- fstats['population_freq'] = fstats['population'].value_counts()
- fstats['population_freq_sort'] = fstats['population_freq'].sort_index()
- fstats['population_per'] = (fstats['population'].value_counts(normalize=True) * 100).round(decimals=2)
- fstats['population_per_sort'] = fstats['population_per'].sort_index()
- fstats['population_all'] = pd.concat([fstats['population_freq_sort'], fstats['population_per_sort']], axis=1)
- fstats['population_all'].columns = ['Count', 'Percentage']
- fstats['population_all']['Population_ID'] = fstats['population_all'].index
-
- flock_profile = pd.read_table('profile.txt')
- profile_pop = flock_profile.merge(fstats['population_all'], on='Population_ID')
- profile_pop.to_csv(profile, sep="\t", float_format='%.2f', index=False)
-
-# get_profile = "mv profile.txt " + profile
-# os.system(get_profile)
- return
-
-
-if __name__ == "__main__":
- parser = ArgumentParser(
- prog="runFlockMFI",
- description="Run Flock on text file and generate centroid file")
-
- parser.add_argument(
- '-i',
- dest="input_file",
- required=True,
- help="File location for the FCS file.")
-
- parser.add_argument(
- '-m',
- dest="method",
- required=True,
- help="Run flock1 or flock2.")
-
- parser.add_argument(
- '-M',
- dest="mfi_calc",
- required=True,
- help="what to calculate for centroids.")
-
- parser.add_argument(
- '-b',
- dest="bins",
- required=False,
- help="Number of Bins.")
-
- parser.add_argument(
- '-d',
- dest="density",
- required=False,
- help="Density.")
-
- parser.add_argument(
- '-o',
- dest="output_file",
- required=True,
- help="File location for the output file.")
-
- parser.add_argument(
- '-t',
- dest="tool_directory",
- required=True,
- help="File location for the output file.")
-
- parser.add_argument(
- '-c',
- dest="centroids",
- required=True,
- help="File location for the output centroid file.")
-
- parser.add_argument(
- '-p',
- dest="profile",
- required=True,
- help="File location for the output profile file.")
-
- args = parser.parse_args()
- run_FLOCK(args.input_file, args.method, args.bins,
- args.density, args.output_file, args.centroids, args.mfi_calc,
- args.profile, args.tool_directory)
-
- sys.exit(0)
diff -r 81f9b44f5242 -r b6b4d08b6858 run_flock/runFlockMFI.xml
--- a/run_flock/runFlockMFI.xml Mon Feb 27 13:30:23 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,158 +0,0 @@
-
- using a FCS file that was converted/transformed to a text file.
-
- scipy
- pandas
- flock
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 10.1002/cyto.b.20554
-
-
diff -r 81f9b44f5242 -r b6b4d08b6858 run_flock/src/README
--- a/run_flock/src/README Mon Feb 27 13:30:23 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,139 +0,0 @@
-This package contains R code for converting and transforming a binary
-FCS file, using the FCSTrans software, and the C code for running the
-flock1 and flock2 population identification software.
-
-src - contains the C code for flock1, flock2 and cent_adjust
-bin - contains the compiled C code for flock1, flock2 and cent_adjust,
- plus the R code for using the FCSTrans algorithm for file
- conversion and data transformation.
-doc - documentation for FCSTrans and FLOCK algorithms
-example - sample data and output from FCSTrans and FLOCK
-
-To run this software sucessfully the code assumes the software was installed
-in the /usr/local/flock directory and that R, plus the Bioconductor flowCore
-module has been installed. If you use the ipconvert.sh shell script, you
-may need to adjust the location of the RScript executable and the location
-of the FCSTrans.R code by editing the shell script.
-
-#############################################################################
-# Overview
-#############################################################################
-ImmPort-FLOCK
-
-FLOCK (FLOw Clustering without K), an automated population discovery tool for
-multidimensional FCM data was designed to specifically take into account the
-unique feature of FCM data and produce objective segregation of cell
-populations.
-
-FLOCK parameter settings can be customized by defining Bins and Density
-Threshold. The number of bins is an integer specifying the number of
-equal-sized regions the data will be partitioned into on each axis. Increasing
-the number of bins increases the sensitivity to detect rare populations but
-may also result in single populations being divided. Density Threshold is the
-cut-off value to separate the dense regions from background. It is a floating
-point number that helps define population centers; increasing the threshold
-may help separate major populations but could cause the algorithm to overlook
-rare populations.
-
-#############################################################################
-# Compiling C code
-#############################################################################
-cd bin
-cc -o flock1 ../src/flock1.c ../src/find_connected.c -lm
-cc -o flock2 ../src/flock2.c -lm
-cc -o cent_adjust ../src/cent_adjust.c -lm
-
-#############################################################################
-# FCSTrans
-#############################################################################
-A shell script named ipconvert.sh is included that runs the FCSTrans R
-code for converting and transforming a binary FCS file. The output consists
-of one text file containing the transformed channel intensity values and
-another file containing a list of the FCS parameters.
-
-cd bin
-/usr/local/flock/bin/ipconvert.sh ../example/data/FCS2.fcs
-/usr/local/flock/bin/ipconvert.sh ../example/data/FCS3.fcs
-
-#############################################################################
-# Running flock1 or flock2
-#############################################################################
-Running the FLOCK1 and FLOCK2 algorithms generate 8 output files that have
-generic file names. For this reason, it is recommended that one output
-directory be created for one input file to the program.
-
-cd example/output/FCS2
-/usr/local/flock/bin/flock1 ../../data/FCS2.txt
-
-cd example/output/FCS3
-/usr/local/flock/bin/flock2 ../../data/FCS3.txt
-
-Files created: MFI.txt, percentage.txt, population_id.txt, profile.txt,
- flock_results.txt, coordinates.txt, population_center.txt
- and percentage.txt
-
-Usage Information for FLOCK1
-----------------------------------------------------------------------------
-basic mode: flock1 fcs.txt
-advanced_ mode: flock1 fcs.txt num_bin density_index max_num_pop
-
-Usage Information for FLOCK2
-----------------------------------------------------------------------------
-basic mode: flock data_file
-advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop
-advanced mode 1 (without # of pops): flock data_file num_bin density_index
-advanced mode 2 (specify # of pops): flock data_file num_bin density_index
- number_of_pop
-advanced mode 3 (specify both # of pops): flock data_file num_bin density_index
- number_of_pop max_num_pop
-
-FLOCK Output Files
-----------------------------------------------------------------------------
-coordinates.txt:
- Output is the intensity values for each marker and event
-
-flock_results.txt:
- A combination of the input file, event identifiers and population
- identifiers.
-
-MFI.txt:
- Provides the mean fluorescence intensity for each population for each
- marker/parameter
-
-population_id.txt:
- Contains population identifiers (i.e, values from [1 to n] where n is
- the population assigned to the corresponding events in the input data
- file, one identifier per row.)
-
-population_center.txt:
- Contains the centroid coordinates for each identified population
-
-percentage.txt:
- Includes the population identifiers and percentage of events within that
- population (relative to the whole data file)
-
-profile.txt:
- Displays an expression profile, where the approximate expression level
- for each marker is assigned a numeric value from 1-4, for each identified
- population
-
-fcs_properties.txt:
- Contains the number of events, number of populations, and number of
- markers, as well as the algorithm parameters used in the analysis
-
-#############################################################################
-# Running cent_adjust
-#############################################################################
-Running the cent_adjust algorithm generates 4 output files that have
-generic file names. For this reason, it is recommened that one output
-directory be created for one input file to the program.
-
-mkdir example/output/FCS2/cent_adjust
-cd example/output/FCS2/cent_adjust
-/usr/local/flock/bin/cent_adjust ../population_center.txt ../coordinates.txt
-
-Files created: MFI.txt, percentage.txt, population_id.txt and profile.txt
-
-Usage Information for cent_adjust
-----------------------------------------------------------------------------
-basic mode: cent_adjust input_center input_data_file
diff -r 81f9b44f5242 -r b6b4d08b6858 run_flock/src/cent_adjust.c
--- a/run_flock/src/cent_adjust.c Mon Feb 27 13:30:23 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1009 +0,0 @@
-/////////////////////////////////////////////////////////
-// Cent_adjust version number and modification history
-// ImmPort BISC project
-// Author: Yu "Max" Qian
-// v1.01: Oct 16, 2009
-// Line 899 of the main function:
-// Changed kmean_term=1 to kmean_term=2
-//////////////////////////////////////////////////////////
-
-
-#include
-#include
-#include
-#include
-#include
-
-#define DEBUG 0
-#define LINE_LEN 1024
-#define FILE_NAME_LEN 128
-#define PARA_NAME_LEN 64
-#define MAX_VALUE 1000000000
-#define CUBE 0
-
-
-void getctrfileinfo(FILE *f_src_ctr, long *num_clust)
-{
- int ch='\n';
- int prev='\n';
- long num_rows=0;
-
- while ((ch = fgetc(f_src_ctr))!= EOF )
- {
- if (ch == '\n')
- {
- ++num_rows;
- }
- prev = ch;
- }
- if (prev!='\n')
- ++num_rows;
-
- *num_clust=num_rows;
- //printf("center file has %ld rows\n", *num_clust);
-}
-
-/************************************* Read basic info of the source file **************************************/
-void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID)
-{
- char src[LINE_LEN];
- char current_name[64];
- char prv;
-
- long num_rows=0;
- long num_columns=0;
- int ch='\n';
- int prev='\n';
- long time_pos=0;
- long i=0;
- long j=0;
-
-
-
- src[0]='\0';
- fgets(src, LINE_LEN, f_src);
-
- name_string[0]='\0';
- current_name[0]='\0';
- prv='\n';
-
- while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters
- i++;
-
- while ((src[i]!='\r') && (src[i]!='\n')) //repeat until the end of the line
- {
- current_name[j]=src[i];
-
- if ((src[i]=='\t') && (prv!='\t')) //a complete word
- {
- current_name[j]='\0';
-
- /*
- * Commented out John Campbell, June 10 2010
- * We no longer want to automatically remove Time column.
- * This column should have been removed by column selection
- if (0!=strcmp(current_name,"Time"))
- {
- num_columns++; //num_columns does not inlcude the column of Time
- time_pos++;
- strcat(name_string,current_name);
- strcat(name_string,"\t");
- }
- else
- {
- *time_ID=time_pos;
- }
- */
-
- num_columns++;
- strcat(name_string,current_name);
- strcat(name_string,"\t");
-
- current_name[0]='\0';
- j=0;
- }
-
- if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
- {
- current_name[0]='\0';
- j=0;
- }
-
- if (src[i]!='\t')
- j++;
-
-
- prv=src[i];
- i++;
- }
-
- if (prv!='\t') //the last one hasn't been retrieved
- {
- current_name[j]='\0';
- /*
- * Commented out John Campbell, June 10 2010
- * We no longer want to automatically remove Time column.
- * This column should have been removed by column selection
- if (0!=strcmp(current_name,"Time"))
- {
- num_columns++;
- strcat(name_string,current_name);
- time_pos++;
- }
- else
- {
- *time_ID=time_pos;
- }
- */
-
- num_columns++;
- strcat(name_string,current_name);
- }
-
- if (DEBUG==1)
- {
- printf("time_ID is %d\n",*time_ID);
- printf("name_string is %s\n",name_string);
- }
-
- // # of rows
-
- while ((ch = fgetc(f_src))!= EOF )
- {
- if (ch == '\n')
- {
- ++num_rows;
- }
- prev = ch;
- }
- if (prev!='\n')
- ++num_rows;
-
- *file_Len=num_rows;
- *num_dm=num_columns;
-
- //printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm);
-}
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/************************************* Read the source file into uncomp_data **************************************/
-void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, int time_ID)
-{
- long time_pass=0; //to mark whether the time_ID has been passed
- long index=0;
-
- long i=0;
- long j=0;
- long t=0;
-
- char src[LINE_LEN];
- char xc[LINE_LEN/10];
-
- src[0]='\0';
- fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
-
- while (!feof(f_src) && (indexbiggest)
- biggest=orig_data[i][j];
- if (orig_data[i][j]=1)
- times_allowed = (long)kmean_term;
- else
- EPS = kmean_term;
-
- times=0;
-
- while (((vvv>EPS) && (kmean_term<1)) || ((times=1)))
- {
- for (i=0;ivvv)
- vvv=variation; //vvv is the biggest variation among the k clusters;
- }
- //compute_variation;
- times++;
- } //end for while
-
-
-
- free(num);
- for (i=0;iMatrix[i][j])
- small[j]=Matrix[i][j];
- }
-
- size_c[id]++;
- }
-
- for (i=0;i0)) //the smallest values excluded
- size_mybound_0[cluster_id[i]][j]++;
- else
- {
- if (Matrix[i][j]size_mybound_1[i][j])
- situ1=0;
- else
- situ1=1;
- if (size_mybound_2[i][j]>size_mybound_3[i][j])
- situ2=2;
- else
- situ2=3;
-
- if ((situ1==0) && (situ2==2))
- {
- if (size_mybound_0[i][j]>size_mybound_2[i][j])
- prof[i][j]=0;
- else
- prof[i][j]=2;
- }
- if ((situ1==0) && (situ2==3))
- {
- if (size_mybound_0[i][j]>size_mybound_3[i][j])
- prof[i][j]=0;
- else
- prof[i][j]=3;
- }
- if ((situ1==1) && (situ2==2))
- {
- if (size_mybound_1[i][j]>size_mybound_2[i][j])
- prof[i][j]=1;
- else
- prof[i][j]=2;
- }
- if ((situ1==1) && (situ2==3))
- {
- if (size_mybound_1[i][j]>size_mybound_3[i][j])
- prof[i][j]=1;
- else
- prof[i][j]=3;
- }
-
- //begin to output profile
- if (j==num_dm-1)
- {
- if (prof[i][j]==0)
- fprintf(fprof_id,"1\n");
- if (prof[i][j]==1)
- fprintf(fprof_id,"2\n");
- if (prof[i][j]==2)
- fprintf(fprof_id,"3\n");
- if (prof[i][j]==3)
- fprintf(fprof_id,"4\n");
- }
- else
- {
- if (prof[i][j]==0)
- fprintf(fprof_id,"1\t");
- if (prof[i][j]==1)
- fprintf(fprof_id,"2\t");
- if (prof[i][j]==2)
- fprintf(fprof_id,"3\t");
- if (prof[i][j]==3)
- fprintf(fprof_id,"4\t");
- }
- }
- }
- fclose(fprof_id);
-
- ///////////////////////////////////////////////////////////
-
-
- fpcnt_id=fopen("percentage.txt","w");
- fprintf(fpcnt_id,"Population_ID\tPercentage\n");
-
- for (t=0;t
-#include
-#include
-#include
-
-//static const char *rcsid = "$Id: find_connected.c,v 1.1 2008/09/05 21:54:40 rpl Exp $";
-
-int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID);
-void depth_first(int startnode);
-
-void bail(const char *); /* exits via abort */
-static void check_clusters(int *gcID, int ndense);
-static void merge_cluster(int from, int into);
-
-
-/* Vars that will not change througout the depth-first recursion. We
- store them here to avoid endless replication on the stack. */
-static int **Gr=0;
-static int *gcID = 0; /* grid cluster IDs */
-static int *cluster_count=0; /* count of nodes per cluster */
-static int ndense=0;
-static int ndim=0;
-/* cid changes between depth-first searches, but is constant within a
- single search, so it goes here. */
-static int cid=0;
-
-/* Find connected components in the graph of neighboring grids defined in G.
- *
- * Output:
- *
- * grid_clusterID[] -- cluster to which each dense grid was assigned
- * return value -- number of clusters assigned.
- */
-int find_connected(int **G, int n_dense_grids, int num_dm, int *grid_clusterID)
-{
- int nclust=0; /* number of clusters found */
- int i;
- int *subfac;
- int subval=0,nempty=0;
- int clustid=0;
-
- size_t sz = n_dense_grids*sizeof(int);
- cluster_count = malloc(sz);
- if(!cluster_count)
- bail("find_connected: Unable to allocate %zd bytes.\n");
- memset(cluster_count,0,sz);
-
- /* set up the statics that will be used in the DFS */
- Gr=G;
- gcID = grid_clusterID;
- ndense = n_dense_grids;
- ndim = num_dm;
-
-
- for(i=0;i= 0) {
- /* We are, so zero the count for the old cluster. */
- cluster_count[ gcID[node] ] = 0;
- merge_cluster(gcID[node], cid);
- return;
- }
-
- /* Update for this node */
- gcID[node] = cid;
- cluster_count[cid]++;
-
- /* Recursively search the child nodes */
- for(i=0; i= 0) /* This is a child node */
- depth_first(Gr[node][i]);
-}
-
-void bail(const char *msg)
-{
- fprintf(stderr,"%s",msg);
- abort();
-}
-
-
-static void check_clusters(int *gcID, int ndense)
-{
- int i;
-
- for(i=0; imax_grid) to STDERR;
- MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got
-
-******************************************************************************/
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-#define DEBUG 0
-#define LINE_LEN 1024
-#define FILE_NAME_LEN 128
-#define PARA_NAME_LEN 64
-#define MAX_VALUE 1000000000
-#define MIN_GRID 6
-#define MAX_GRID 50
-
-#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max
-#define KMEANS_TERM 100
-//#define MAX_NUM_POP 30
-
-
-int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID);
-
-/************* Read basic info of the source file ****************************/
-void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID)
-{
- char src[LINE_LEN];
- char current_name[64];
- char prv;
-
- int num_rows=0;
- int num_columns=0;
- int ch='\n';
- int prev='\n';
- int time_pos=0;
- int i=0;
- int j=0;
- int sw=0;
-
- src[0]='\0';
- fgets(src, LINE_LEN, f_src);
-
- if ((src[0]=='F') && (src[1]=='C') && (src[2]=='S'))
- {
- fprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\n");
- abort();
- }
-
- name_string[0]='\0';
- current_name[0]='\0';
- prv='\n';
-
- // skip space and tab characters
- while ((src[i]==' ') || (src[i]=='\t'))
- i++;
-
- // repeat until the end of line is reached
- while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!='\r'))
- {
- current_name[j]=src[i];
-
- if ((src[i]=='\t') && (prv!='\t')) //a complete word
- {
- current_name[j]='\0';
-
- if (0!=strcmp(current_name,"Time"))
- {
- num_columns++; //num_columns does not inlcude the column of Time
- time_pos++;
- if (sw) {
- strcat(name_string,"\t");
- }
- strcat(name_string,current_name);
- sw = 1;
- }
- else
- {
- *time_ID=time_pos;
- }
-
-
- current_name[0]='\0';
- j=0;
- }
-
- if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
- {
- current_name[0]='\0';
- j=0;
- }
-
- if (src[i]!='\t')
- j++;
-
- prv=src[i];
- i++;
- }
-
- if (prv!='\t') //the last one hasn't been retrieved
- {
- current_name[j]='\0';
-
- if (0!=strcmp(current_name,"Time"))
- {
- num_columns++;
- strcat(name_string,"\t");
- strcat(name_string,current_name);
- time_pos++;
- }
- else
- {
- *time_ID=time_pos;
- }
-
-
- }
- if (DEBUG==1)
- {
- printf("time_ID is %d\n",*time_ID);
- printf("name_string is %s\n",name_string);
- }
-
- //start computing # of rows
-
- while ((ch = fgetc(f_src))!= EOF )
- {
- if (ch == '\n')
- {
- ++num_rows;
- }
- prev = ch;
- }
- if (prev!='\n')
- ++num_rows;
-
- //added on July 23, 2010
- if (num_rows<50)
- {
- fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010
- abort();
- }
-
- *file_Len=num_rows;
- *num_dm=num_columns;
-
- printf("original file size is %d; number of dimensions is %d\n", *file_Len, *num_dm);
-}
-
-
-
-/************************************* Read the source file into uncomp_data **************************************/
-void readsource(FILE *f_src, int file_Len, int num_dm, double **uncomp_data, int time_ID)
-{
- int time_pass=0; //to mark whether the time_ID has been passed
- int index=0;
-
- int i=0;
- int j=0;
- int t=0;
-
- char src[LINE_LEN];
- char xc[LINE_LEN/10];
-
- src[0]='\0';
- fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
-
- while (!feof(f_src) && (indexbiggest)
- biggest=orig_data[i][j];
- if (orig_data[i][j] index in the original data of the ith element of the ordered list)
- * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct
- * of the sorting procedure.
- * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
- * by grid_ID)
- */
-
-void radixsort_flock(int **position,int file_Len,int num_dm,int num_bin,int *sorted_seq,int *num_nonempty,int *grid_ID)
-{
- int i=0;
- int length=0;
- int start=0;
- int prev_ID=0;
- int curr_ID=0;
-
- int j=0;
- int t=0;
- int p=0;
- int loc=0;
- int temp=0;
- int equal=0;
-
- int *count; //count[i]=j means there are j numbers having value i at the processing digit
- int *index; //index[i]=j means the starting position of grid i is j
- int *cp; //current position
- int *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
- int *seq; //temporary sequence
-
- count=(int*)malloc(sizeof(int)*num_bin);
- memset(count,0,sizeof(int)*num_bin);
-
- cp=(int*)malloc(sizeof(int)*num_bin);
- memset(cp,0,sizeof(int)*num_bin);
-
- index=(int*)malloc(sizeof(int)*num_bin); // initialized below
-
- seq=(int*)malloc(sizeof(int)*file_Len);
- memset(seq,0,sizeof(int)*file_Len);
-
- mark=(int*)malloc(sizeof(int)*file_Len);
- memset(mark,0,sizeof(int)*file_Len);
-
- for (i=0;i0) && (index[i+1]<=file_Len))
- {
- mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
- }
- else
- {
- printf("out of myboundary for mark at index[i+1]-1.\n");
- }
- }
- mark[file_Len-1]=1;
-
- for (i=0;i0))
- {
- mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
- }
-
- }
- mark[i]=1;
-
- /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i
- the PV will be completely updated for the partial ordering up to the current radix. */
- for (t=start;t<=i;t++)
- {
- loc=position[sorted_seq[t]][j];//loc=position[t][j];
- temp=index[loc]+cp[loc];
- if ((temp=0))
- {
- // seq is a temporary because we have to maintain the old PV until we have finished this step.
- seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort
- cp[loc]++;
- }
- else
- {
- printf("out of myboundary for seq at temp.\n");
- }
- }
-
- for (t=start;t<=i;t++)
- {
- // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through
- // entry i in the ordered list.
- if ((t>=0) && (tbig[j])
- big[j]=data_in[i][j];
-
- if (data_in[i][j]=big[j])
- position[i][j]=num_bin-1;
- else
- {
- position[i][j]=(int)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
- if ((position[i][j]>=num_bin) || (position[i][j]<0))
- {
- //printf("position mis-computed in density analysis!\n");
- //exit(0);
- fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
- abort();
-
- }
- }
- }
- }
-
-
- free(small);
- free(big);
-}
-
-/********************************************** select_bin to select the number of bins **********************************/
-//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
-/* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned
- * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we
- * maximize a merit function, then return the results obtained using the optimal number of bins.
- *
- * Outputs:
- * position -- binned data values
- * sorted_seq -- permutation vector mapping the ordered list to the original data
- * all_grid_ID -- grid to which each data element was assigned.
- * num_nonempty -- number of distinct values assumed by all_grid_ID
- * interval -- bin width for each data dimension
- * return value -- the number of bins selected.
- */
-
-int select_bin(double **normalized_data, int file_Len, int num_dm, int min_grid, int max_grid, int **position, int *sorted_seq,
- int *all_grid_ID, int *num_nonempty, double *interval, int user_num_bin)
-{
-
- int num_bin=0;
- int select_num_bin=0;
- int m=0;
- int n=0;
-
- int i=0;
- int bin_scope=0;
- int temp_num_nonempty=0;
-
- int *temp_grid_ID;
- int *temp_sorted_seq;
- int **temp_position;
-
- //sorted_seq[i]=j means the event j ranks i
-
- double temp_index=0;
- double *bin_index;
- double *temp_interval;
-
-
-
- temp_grid_ID=(int *)malloc(sizeof(int)*file_Len);
- memset(temp_grid_ID,0,sizeof(int)*file_Len);
-
- temp_sorted_seq=(int *)malloc(sizeof(int)*file_Len);
- memset(temp_sorted_seq,0,sizeof(int)*file_Len);
-
- temp_position=(int **)malloc(sizeof(int*)*file_Len);
- memset(temp_position,0,sizeof(int*)*file_Len);
- for (m=0;m=(double)(file_Len)*0.95)
- break;
- if ((bin_index[i]max_grid))
- {
- fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 4, 2010
- exit(0);
- }
-
- if (temp_index==0)
- {
- fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
- abort();
- }
-
-
- free(temp_grid_ID);
- free(temp_sorted_seq);
- free(bin_index);
- free(temp_interval);
-
- for (m=0;m=den_t_event)
- temp_num_dense_grids++;
-
- if (temp_num_dense_grids<=1)
- {
- //modified on July 23, 2010
- //printf("a too high density threshold is set! Please decrease your density threshold.\n");
- //exit(0);
- fprintf(stderr,"a too high density threshold is set! Please decrease your density threshold.\n"); //modified on July 23, 2010
- abort();
- }
-
- if (temp_num_dense_grids>=100000)
- {
- //modified on July 23, 2010
- //printf("a too low density threshold is set! Please increase your density threshold.\n");
- //exit(0);
- fprintf(stderr,"a too low density threshold is set! Please increase your density threshold.\n"); //modified on July 23, 2010
- abort();
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- // Compute dense_grid_reverse
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- temp_num_dense_grids=0;
- temp_num_dense_events=0;
- for (i=0;i=den_t_event)
- {
- dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
- temp_num_dense_grids++;
- temp_num_dense_events+=all_grid_vol[i];
- }
- }
-
- num_dense_grids[0]=temp_num_dense_grids;
- num_dense_events[0]=temp_num_dense_events;
-
- free(grid_size_index);
- free(all_grid_vol);
-
- return den_t_event;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Compute densegridID_To_gridevent and eventID_To_denseventID
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
-/*
- * Filter the data so that only the data belonging to dense grids is left
- *
- * Output:
- * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids.
- * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids.
- *
- */
-
-void grid_To_event(int file_Len, int *dense_grid_reverse, int *all_grid_ID, int *eventID_To_denseventID, int *densegridID_To_gridevent)
-{
- int i=0;
- int dense_grid_ID=0;
- int dense_event_ID=0;
-
- for (i=0;iseq_value[i])
- return -1;
- if (search_value[i]==seq_value[i])
- continue;
- }
- return 0;
-}
-
-//binary search the dense_grid_seq to return the dense grid ID if it exists
-int binary_search(int num_dense_grids, int num_dm, int *search_value, int **dense_grid_seq)
-{
-
- int low=0;
- int high=0;
- int mid=0;
- int comp_result=0;
- int match=0;
- //int found=0;
-
- low = 0;
- high = num_dense_grids-1;
-
- while (low <= high)
- {
- mid = (int)((low + high)/2);
-
- comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
-
-
- switch(comp_result)
- {
- case 1:
- high=mid-1;
- break;
- case -1:
- low=mid+1;
- break;
- case 0:
- match=1;
- break;
- }
- if (match==1)
- break;
- }
-
-
-
- if (match==1)
- {
- return mid;
- }
- else
- return -1;
-}
-
-
-/********************************************** Computing Centers Using IDs **********************************************/
-
-void ID2Center(double **data_in, int file_Len, int num_dm, int *eventID_To_denseventID, int num_clust, int *cluster_ID, double **population_center)
-{
- int i=0;
- int j=0;
- int ID=0;
- int eventID=0;
- int *size_c;
-
-
-
- size_c=(int *)malloc(sizeof(int)*num_clust);
- memset(size_c,0,sizeof(int)*num_clust);
-
- for (i=0;imax_num_pop) || (num_population<=1))
- {
-
- if (num_population<=1)
- merge_dist=merge_dist-0.1;
-
- if (num_population>max_num_pop)
- merge_dist=merge_dist+0.1;
-
-
- for (i=0;i(merge_dist*interval[t]))
- merge=0;
- }
-
- if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
- {
- if (cluster_populationID[i]max_num_pop) && (num_population==1))
- break;
- else
- temp_num_population=num_population;
-
- if (num_clust<=1)
- break;
- }
-
- for (i=0;ikmean_term) && (kmean_term<1)) || ((times=1)))
- {
- for (i=0;ivvv)
- vvv=variation; //vvv is the biggest variation among the k clusters;
- }
- //compute_variation;
- times++;
- } //end for while
-
-
-
-
- free(num);
-
- for (i=0;iMatrix[i][j])
- small[j]=Matrix[i][j];
- }
-
- size_c[id]++;
- }
-
- for (i=0;i0)) //the smallest values excluded
- size_mybound_0[cluster_id[i]][j]++;
- else
- {
- if (Matrix[i][j]size_mybound_1[i][j])
- situ1=0;
- else
- situ1=1;
- if (size_mybound_2[i][j]>size_mybound_3[i][j])
- situ2=2;
- else
- situ2=3;
-
- if ((situ1==0) && (situ2==2))
- {
- if (size_mybound_0[i][j]>size_mybound_2[i][j])
- prof[i][j]=0;
- else
- prof[i][j]=2;
- }
- if ((situ1==0) && (situ2==3))
- {
- if (size_mybound_0[i][j]>size_mybound_3[i][j])
- prof[i][j]=0;
- else
- prof[i][j]=3;
- }
- if ((situ1==1) && (situ2==2))
- {
- if (size_mybound_1[i][j]>size_mybound_2[i][j])
- prof[i][j]=1;
- else
- prof[i][j]=2;
- }
- if ((situ1==1) && (situ2==3))
- {
- if (size_mybound_1[i][j]>size_mybound_3[i][j])
- prof[i][j]=1;
- else
- prof[i][j]=3;
- }
-
- //begin to output profile
- if (j==num_dm-1)
- {
- if (prof[i][j]==0)
- fprintf(fprof_id,"1\n");
- if (prof[i][j]==1)
- fprintf(fprof_id,"2\n");
- if (prof[i][j]==2)
- fprintf(fprof_id,"3\n");
- if (prof[i][j]==3)
- fprintf(fprof_id,"4\n");
- }
- else
- {
- if (prof[i][j]==0)
- fprintf(fprof_id,"1\t");
- if (prof[i][j]==1)
- fprintf(fprof_id,"2\t");
- if (prof[i][j]==2)
- fprintf(fprof_id,"3\t");
- if (prof[i][j]==3)
- fprintf(fprof_id,"4\t");
- }
- }
- }
- fclose(fprof_id);
-
- ///////////////////////////////////////////////////////////
-
-
- fpcnt_id=fopen("percentage.txt","w");
- fprintf(fpcnt_id,"Population_ID\tPercentage\n");
-
- for (t=0;t population_center
-//data + grid_ID -> grid_Center
-
-/* what is the final output */
-//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
-//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)
-
-/* what information should select_bin output */
-//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
-//is unknown, therefore I must use a prescreening to output
-//how many bins I should use
-//the number of dense grids
-//total number of events in the dense grids
-
-/* basic procedure of main function */
-//1. read raw file and normalize the raw file
-//2. select_bin
-//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID.
-//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
-//5. release normalized data; allocate memory for grid_Center and population_center
-//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers
-
-int main (int argc, char **argv)
-{
- //inputs
- FILE *f_src; //source file pointer
-
- FILE *f_out; //coordinates
- FILE *f_cid; //population-ID of events
- FILE *f_ctr; //centroids of populations
- FILE *f_results; //coordinates file event and population column
- FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
- FILE *f_parameters; //number of bins and density calculated by
- //the algorithm. Used to update the database
- FILE *f_properties; //Properties file used by Image generation software
-
- char para_name_string[LINE_LEN];
-
- int time_ID=-1;
- int num_bin=0; //the bins I will use on each dimension
-
- int file_Len=0; //number of events
- int num_dm=0;
- int num_clust=0;
- int num_dense_events=0;
- int num_dense_grids=0;
- int num_nonempty=0;
- int num_population=0;
- //int temp=0;
-
- //below are read from configuration file
- int i=0;
- int j=0;
- int max_num_pop=0;
-
- int den_t_event=0;
-
- int *grid_clusterID; //(dense)gridID to clusterID
- int *grid_ID; //(dense)eventID to gridID
- int *cluster_ID; //(dense)eventID to clusterID
- int *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
- int *all_grid_ID; //gridID for all events
- int *densegridID_To_gridevent;
- int *sorted_seq;
- int *dense_grid_reverse;
- int *population_ID; //denseeventID to populationID
- int *cluster_populationID; //clusterID to populationID
- int *grid_populationID; //gridID to populationID
- int *all_population_ID; //populationID of event
-
- int **position;
- int **dense_grid_seq;
-
- double *interval;
-
- double **population_center; //population centroids in the raw/original data
- double **cluster_center; //cluster centroids in the raw/original data
-
- double **input_data;
- double **normalized_data;
-
- int min = 999999;
- int max = 0;
-
- printf( "Starting time:\t\t\t\t");
- fflush(stdout);
- system("/bin/date");
- /////////////////////////////////////////////////////////////
-
- if ((argc!=2) && (argc!=4) && (argc!=5))
- {
- //modified on Jul 23, 2010
- //printf("usage:\n");
- //printf("basic mode: flock data_file\n");
- //printf("advanced mode: flock data_file num_bin density_index\n");
- //exit(0);
-
- fprintf(stderr,"Incorrect number of input parameters!\n"); //modified on July 23, 2010
- //fprintf(stderr,"basic mode: flock data_file\n"); //modified on July 23, 2010
- //fprintf(stderr,"advanced mode1: flock data_file num_bin density_index\n"); //modified on July 23, 2010
- fprintf(stderr,"advanced mode: flock data_file num_bin density_index max_num_pop\n"); //added on Dec 16, 2010 for GenePattern
- abort();
- }
-
- f_src=fopen(argv[1],"r");
-
- if (argc==2)
- {
- max_num_pop=30; //default value, maximum 30 clusters
- }
-
- if (argc==4)
- {
- num_bin=atoi(argv[2]);
- printf("num_bin is %d\n",num_bin);
-
- den_t_event=atoi(argv[3]);
- printf("density_index is %d\n",den_t_event);
-
- max_num_pop=30;
-
- if (((num_bin<6) && (num_bin!=0)) || (num_bin>29))
- {
- fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n");
- abort();
- }
-
- if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99))
- {
- fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n");
- abort();
- }
- }
-
- if (argc==5)
- {
- num_bin=atoi(argv[2]);
- printf("num_bin is %d\n",num_bin);
-
- den_t_event=atoi(argv[3]);
- printf("density_index is %d\n",den_t_event);
-
- max_num_pop=atoi(argv[4]);
- printf("max number of clusters is %d\n",max_num_pop);
-
- if (((num_bin<6) && (num_bin!=0)) || (num_bin>29))
- {
- fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n");
- abort();
- }
-
- if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99))
- {
- fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n");
- abort();
- }
-
- if ((max_num_pop<5) || (max_num_pop>999))
- {
- fprintf(stderr,"Incorrect input range of maximum number of populations, which should be larger than 4 and smaller than 1000\n");
- abort();
- }
- }
-
-
- getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters
-
- /************************************************* Read the data *****************************************************/
-
- rewind(f_src); //reset the file pointer
-
- input_data = (double **)malloc(sizeof(double*)*file_Len);
- memset(input_data,0,sizeof(double*)*file_Len);
- for (i=0;i=1) //num_bin has been selected by user
- select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
- else //num_bin has not been selected by user
- {
- num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
- printf("selected bin number is %d\n",num_bin);
- }
- printf("number of non-empty grids is %d\n",num_nonempty);
-
-
-
- /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
- free(sorted_seq);
-
-
- dense_grid_reverse=(int*)malloc(sizeof(int)*num_nonempty);
- memset(dense_grid_reverse,0,sizeof(int)*num_nonempty);
-
- /************************************************* select_dense **********************************************/
-
- if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
- select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
- else
- {
- den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
- printf("automated selected density threshold is %d\n",den_t_event);
- }
-
- printf("Number of dense grids is %d\n",num_dense_grids);
-
- densegridID_To_gridevent = (int *)malloc(sizeof(int)*num_dense_grids);
- memset(densegridID_To_gridevent,0,sizeof(int)*num_dense_grids);
-
- for (i=0;i max) {
- max = (int)input_data[i][j];
- }
- if (j==num_dm-1)
- {
- fprintf(f_out,"%d\n",(int)input_data[i][j]);
- fprintf(f_results,"%d\t",(int)input_data[i][j]);
- }
- else
- {
- fprintf(f_out,"%d\t",(int)input_data[i][j]);
- fprintf(f_results,"%d\t",(int)input_data[i][j]);
- }
- }
- //fprintf(f_results,"%d\t",i + 1);
- fprintf(f_results,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
- }
-
-/*
- f_parameters=fopen("parameters.txt","w");
- fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin);
- fprintf(f_parameters,"Density\t%d\n",den_t_event);
- fprintf(f_parameters,"Min\t%d\n",min);
- fprintf(f_parameters,"Max\t%d\n",max);
- fclose(f_parameters);
-*/
-
- f_properties=fopen("fcs.properties","w");
- fprintf(f_properties,"Bins=%d\n",num_bin);
- fprintf(f_properties,"Density=%d\n",den_t_event);
- fprintf(f_properties,"Min=%d\n",min);
- fprintf(f_properties,"Max=%d\n",max);
- fprintf(f_properties,"Populations=%d\n",num_population);
- fprintf(f_properties,"Events=%d\n",file_Len);
- fprintf(f_properties,"Markers=%d\n",num_dm);
- fclose(f_properties);
-
-
- for (i=0;imax_grid) to STDERR
-// 8. Fixed a bug for 2D data by using K=e*K
-// 9. Added some header files, may not be necessary
-// 10. Added a lower bound (at least two) for number of populations
-/***************************************************************************************************************************************
-
- FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann)
-
- Author: (Max) Yu Qian, Ph.D.
-
- Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
-
- Development: November 2005 ~ Forever
-
- Status: July 2010: Release 2.0
-
- Usage: flock data_file
- Note: the input file format must be channel values and the delimiter between two values must be a tab.
-
-
-
-****************************************************************************************************************************************/
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-
-#define DEBUG 0
-#define LINE_LEN 1024
-#define FILE_NAME_LEN 128
-#define PARA_NAME_LEN 64
-#define MAX_VALUE 1000000000
-#define MIN_GRID 6
-#define MAX_GRID 50
-#define E_T 1.0
-
-#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max
-#define KMEANS_TERM 10
-#define MAX_POP_NUM 128
-
-#ifndef max
- #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
-#endif
-
-#ifndef min
- #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
-#endif
-
-static long **Gr=0;
-static long *gcID = 0; /* grid cluster IDs */
-static long *cluster_count=0; /* count of nodes per cluster */
-static long ndense=0;
-static long ndim=0;
-/* cid changes between depth-first searches, but is constant within a
- single search, so it goes here. */
-static long cid=0;
-/* Do a depth-first search for a single connected component in graph
- * G. Start from node, tag the nodes found with cid, and record
- * the tags in grid_clusterID. Also, record the node count in
- * cluster_count. If we find a node that has already been assigned to
- * a cluster, that means we're merging two clusters, so zero out the
- * old cid's node count.
- *
- * Note that our graph is constructed as a DAG, so we can be
- * guaranteed to terminate without checking for cycles.
- *
- * Note2: this function can potentially recurse to depth = ndense.
- * Plan stack size accordingly.
- *
- * Output:
- *
- * grid_clusterID[] -- array where we tag the nodes.
- * cluster_count[] -- count of the number of nodes per cluster.
- */
-static void merge_cluster(long from, long into)
-{
- int i;
-
- for(i=0; i= 0) {
- /* We are, so zero the count for the old cluster. */
- cluster_count[ gcID[node] ] = 0;
- merge_cluster(gcID[node], cid);
- return;
- }
-
- /* Update for this node */
- gcID[node] = cid;
- cluster_count[cid]++;
-
- /* Recursively search the child nodes */
- for(i=0; i= 0) /* This is a child node */
- depth_first(Gr[node][i]);
-}
-
-void bail(const char *msg)
-{
- fprintf(stderr,"%s",msg);
- exit(0);
-}
-
-
-static void check_clusters(long *gcID, int ndense)
-{
- int i;
-
- for(i=0; ibiggest)
- biggest=orig_data[i][j];
- if (orig_data[i][j] index in the original data of the ith element of the ordered list)
- * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct
- * of the sorting procedure.
- * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed
- * by grid_ID)
- */
-
-void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID)
-{
- long i=0;
- long length=0;
- long start=0;
- long prev_ID=0;
- long curr_ID=0;
-
- long j=0;
- long t=0;
- long p=0;
- long loc=0;
- long temp=0;
- long equal=0;
-
- long *count; //count[i]=j means there are j numbers having value i at the processing digit
- long *index; //index[i]=j means the starting position of grid i is j
- long *cp; //current position
- long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions)
- long *seq; //temporary sequence
-
- count=(long*)malloc(sizeof(long)*num_bin);
- memset(count,0,sizeof(long)*num_bin);
-
- cp=(long*)malloc(sizeof(long)*num_bin);
- memset(cp,0,sizeof(long)*num_bin);
-
- index=(long*)malloc(sizeof(long)*num_bin); // initialized below
-
- seq=(long*)malloc(sizeof(long)*file_Len);
- memset(seq,0,sizeof(long)*file_Len);
-
- mark=(long*)malloc(sizeof(long)*file_Len);
- memset(mark,0,sizeof(long)*file_Len);
-
- for (i=0;i0) && (index[i+1]<=file_Len))
- {
- mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list
- }
- else
- {
- printf("out of myboundary for mark at index[i+1]-1.\n");
- }
- }
- mark[file_Len-1]=1;
-
- for (i=0;i0))
- {
- mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix.
- }
-
- }
- mark[i]=1;
-
- /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i
- the PV will be completely updated for the partial ordering up to the current radix. */
- for (t=start;t<=i;t++)
- {
- loc=position[sorted_seq[t]][j];//loc=position[t][j];
- temp=index[loc]+cp[loc];
- if ((temp=0))
- {
- // seq is a temporary because we have to maintain the old PV until we have finished this step.
- seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort
- cp[loc]++;
- }
- else
- {
- printf("out of myboundary for seq at temp.\n");
- }
- }
-
- for (t=start;t<=i;t++)
- {
- // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through
- // entry i in the ordered list.
- if ((t>=0) && (tbig[j])
- big[j]=data_in[i][j];
-
- if (data_in[i][j]=big[j])
- position[i][j]=num_bin-1;
- else
- {
- position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j
- if ((position[i][j]>=num_bin) || (position[i][j]<0))
- {
- //printf("position mis-computed in density analysis!\n");
- //exit(0);
- fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010
- exit(0);
- }
- }
- }
- }
-
-
- free(small);
- free(big);
-}
-
-/********************************************** select_bin to select the number of bins **********************************/
-//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty);
-/* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned
- * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we
- * maximize a merit function, then return the results obtained using the optimal number of bins.
- *
- * Outputs:
- * position -- binned data values
- * sorted_seq -- permutation vector mapping the ordered list to the original data
- * all_grid_ID -- grid to which each data element was assigned.
- * num_nonempty -- number of distinct values assumed by all_grid_ID
- * interval -- bin width for each data dimension
- * return value -- the number of bins selected.
- */
-
-long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq,
- long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin)
-{
-
- long num_bin=0;
- long select_num_bin=0;
- long m=0;
- long n=0;
-
- long i=0;
- long bin_scope=0;
- long temp_num_nonempty=0;
-
- long *temp_grid_ID;
- long *temp_sorted_seq;
- long **temp_position;
-
- //sorted_seq[i]=j means the event j ranks i
-
- double temp_index=0;
- double *bin_index;
- double *temp_interval;
-
-
- temp_grid_ID=(long *)malloc(sizeof(long)*file_Len);
- memset(temp_grid_ID,0,sizeof(long)*file_Len);
-
- temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len);
- memset(temp_sorted_seq,0,sizeof(long)*file_Len);
-
- temp_position=(long **)malloc(sizeof(long*)*file_Len);
- memset(temp_position,0,sizeof(long*)*file_Len);
- for (m=0;m=(double)(file_Len)*0.95)
- break;
- if ((bin_index[i]max_grid))
- {
- fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010
- exit(0);
- }
-
- if (temp_index==0)
- {
- fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010
- exit(0);
- }
-
- free(temp_grid_ID);
- free(temp_sorted_seq);
- free(bin_index);
- free(temp_interval);
-
- for (m=0;m=den_t_event)
- temp_num_dense_grids++;
-
- if (temp_num_dense_grids<=1)
- {
- //exit(0);
- //printf("a too high density threshold is set! Please decrease your density threshold.\n");
- fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010
- exit(0);
- }
-
- if (temp_num_dense_grids>=100000)
- {
- //modified on July 23, 2010
- //printf("a too low density threshold is set! Please increase your density threshold.\n");
- //exit(0);
- fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010
- exit(0);
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- // Compute dense_grid_reverse
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- temp_num_dense_grids=0;
- temp_num_dense_events=0;
- for (i=0;i=den_t_event)
- {
- dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids.
- temp_num_dense_grids++;
- temp_num_dense_events+=all_grid_vol[i];
- }
- }
-
- num_dense_grids[0]=temp_num_dense_grids;
- num_dense_events[0]=temp_num_dense_events;
-
- free(grid_size_index);
- free(all_grid_vol);
-
- return den_t_event;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Compute densegridID_To_gridevent and eventID_To_denseventID
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent);
-/*
- * Filter the data so that only the data belonging to dense grids is left
- *
- * Output:
- * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids.
- * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids.
- *
- */
-
-void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent)
-{
- long i=0;
- long dense_grid_ID=0;
- long dense_event_ID=0;
-
- for (i=0;iseq_value[i])
- return -1;
- if (search_value[i]==seq_value[i])
- continue;
- }
- return 0;
-}
-
-//binary search the dense_grid_seq to return the dense grid ID if it exists
-long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq)
-{
-
- long low=0;
- long high=0;
- long mid=0;
- long comp_result=0;
- long match=0;
- //long found=0;
-
- low = 0;
- high = num_dense_grids-1;
-
- while (low <= high)
- {
- mid = (long)((low + high)/2);
-
- comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]);
-
-
- switch(comp_result)
- {
- case 1:
- high=mid-1;
- break;
- case -1:
- low=mid+1;
- break;
- case 0:
- match=1;
- break;
- }
- if (match==1)
- break;
- }
-
-
-
- if (match==1)
- {
- return mid;
- }
- else
- return -1;
-}
-
-
-/********************************************** Computing Centers Using IDs **********************************************/
-
-void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center)
-{
- long i=0;
- long j=0;
- long ID=0;
- long eventID=0;
- long *size_c;
-
-
-
- size_c=(long *)malloc(sizeof(long)*num_clust);
- memset(size_c,0,sizeof(long)*num_clust);
-
- for (i=0;imax_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1)))
- {
-
- if (num_population<=1)
- merge_dist=merge_dist-0.1;
-
- if (num_population>max_num_pop)
- merge_dist=merge_dist+0.1;
-
-
- for (i=0;i(merge_dist*interval[t]))
- merge=0;
- }
-
- if ((merge) && (cluster_populationID[i]!=cluster_populationID[j]))
- {
- if (cluster_populationID[i]max_num_pop) && (num_population==1))
- break;
- else
- temp_num_population=num_population;
-
- if (num_clust<=1)
- break;
- } //end while
-
- for (i=0;ikmean_term) && (kmean_term<1)) || ((times=1)))
- {
- for (i=0;ivvv)
- vvv=variation; //vvv is the biggest variation among the k clusters;
- }
- //compute_variation;
- times++;
- } //end for while
-
-
-
-
- free(num);
-
- for (i=0;iMatrix[i][j])
- small[j]=Matrix[i][j];
- }
-
- size_c[id]++;
- }
-
- for (i=0;i0)) //the smallest values excluded
- size_mybound_0[cluster_id[i]][j]++;
- else
- {
- if (Matrix[i][j]size_mybound_1[i][j])
- situ1=0;
- else
- situ1=1;
- if (size_mybound_2[i][j]>size_mybound_3[i][j])
- situ2=2;
- else
- situ2=3;
-
- if ((situ1==0) && (situ2==2))
- {
- if (size_mybound_0[i][j]>size_mybound_2[i][j])
- prof[i][j]=0;
- else
- prof[i][j]=2;
- }
- if ((situ1==0) && (situ2==3))
- {
- if (size_mybound_0[i][j]>size_mybound_3[i][j])
- prof[i][j]=0;
- else
- prof[i][j]=3;
- }
- if ((situ1==1) && (situ2==2))
- {
- if (size_mybound_1[i][j]>size_mybound_2[i][j])
- prof[i][j]=1;
- else
- prof[i][j]=2;
- }
- if ((situ1==1) && (situ2==3))
- {
- if (size_mybound_1[i][j]>size_mybound_3[i][j])
- prof[i][j]=1;
- else
- prof[i][j]=3;
- }
-
- //begin to output profile
- if (j==num_dm-1)
- {
- if (prof[i][j]==0)
- fprintf(fprof_id,"1\n");
- if (prof[i][j]==1)
- fprintf(fprof_id,"2\n");
- if (prof[i][j]==2)
- fprintf(fprof_id,"3\n");
- if (prof[i][j]==3)
- fprintf(fprof_id,"4\n");
- }
- else
- {
- if (prof[i][j]==0)
- fprintf(fprof_id,"1\t");
- if (prof[i][j]==1)
- fprintf(fprof_id,"2\t");
- if (prof[i][j]==2)
- fprintf(fprof_id,"3\t");
- if (prof[i][j]==3)
- fprintf(fprof_id,"4\t");
- }
- }
- }
- fclose(fprof_id);
-
- ///////////////////////////////////////////////////////////
-
-
- fpcnt_id=fopen("percentage.txt","w");
- fprintf(fpcnt_id,"Population_ID\tPercentage\n");
-
- for (t=0;t0)
- {
- k=(long)sqrt((double)k);
- if (num_dm<=3)
- k=(long)(2.7183*(double)k); //e*k for low-dimensional space, added Nov. 4, 2010
- //printf("the k-nearest k is %d\n",k);
- }
- else
- {
- printf("error in k\n");
- exit(0);
- }
-
- nearest_neighbors=(long *)malloc(sizeof(long)*k);
- memset(nearest_neighbors,0,sizeof(long)*k);
-
- nearest_distance=(double *)malloc(sizeof(double)*k);
- memset(nearest_distance,0,sizeof(double)*k);
-
- temp=0;
-
- for (i=0;ibiggest_distance)
- {
- biggest_distance=nearest_distance[j];
- current_biggest=j;
- }
- }
-
- if (biggest_distance>distance)
- {
- nearest_neighbors[current_biggest]=i;
- nearest_distance[current_biggest]=distance;
- }
- }
- }
- }
-
- for (i=0;i population_center
-//data + grid_ID -> grid_Center
-
-/* what is the final output */
-//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data
-//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters)
-
-/* what information should select_bin output */
-//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids
-//is unknown, therefore I must use a prescreening to output
-//how many bins I should use
-//the number of dense grids
-//total number of events in the dense grids
-
-/* basic procedure of main function */
-//1. read raw file and normalize the raw file
-//2. select_bin
-//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID.
-//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID.
-//5. release normalized data; allocate memory for grid_Center and population_center
-//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers
-
-int main (int argc, char **argv)
-{
- //inputs
- FILE *f_src; //source file pointer
-
- FILE *f_out; //coordinates
- FILE *f_cid; //population-ID of events
- FILE *f_ctr; //centroids of populations
- FILE *f_results; //coordinates file event and population column
- FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity
- FILE *f_parameters; //number of bins and density calculated by
- //the algorithm. Used to update the database
- FILE *f_properties; //Properties file used by Image generation software
-
- //char tmpbuf[128];
-
- char para_name_string[LINE_LEN];
-
- long time_ID=-1;
- long num_bin=0; //the bins I will use on each dimension
- long num_pop=0;
- long file_Len=0; //number of events
- long num_dm=0;
- long num_clust=0;
- long num_dense_events=0;
- long num_dense_grids=0;
- long num_nonempty=0;
- long num_population=0;
- long num_real_pop=0;
- long keep_merge=0;
- long num_checked_range=0;
-
- //below are read from configuration file
- long i=0;
- long j=0;
- long p=0;
- long t=0;
- long q=0;
- long temp_i=0;
- long temp_j=0;
- long first_i=0;
- long first_j=0;
- long d1=0;
- long d2=0;
- long d3=0;
- long d_d=0;
- long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d
- long index_id=0;
- long num_computed_population=0;
- long tot_size=0;
-
- int den_t_event=0;
-
- double distance=0.0;
- double dist=0.0;
- double current_smallest_dist=0;
- double max_d_dist=0.0;
- double Ei=0.0;
- double Ej=0.0;
- double E1=0.0;
- double E2=0.0;
- double E3=0.0;
- double Ep=0.0;
- double temp=0.0;
- double tmp=0.0;
-
- long *grid_clusterID; //(dense)gridID to clusterID
- long *grid_ID; //(dense)eventID to gridID
- long *cluster_ID; //(dense)eventID to clusterID
- long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k
- long *all_grid_ID; //gridID for all events
- long *densegridID_To_gridevent;
- long *sorted_seq;
- long *dense_grid_reverse;
- long *population_ID; //denseeventID to populationID
- long *cluster_populationID; //clusterID to populationID
- long *grid_populationID; //gridID to populationID
- long *all_population_ID; //populationID of event
- long *size_c;
- long *size_p_2;
- long *partit;
- long *size_p;
- long *temp_size_j;
- long *all_computed_pop_ID;
-
-
- long **position;
- long **dense_grid_seq;
- long **temp_population_ID;
-
- double *interval;
- double *center_1;
- double *center_2;
- double *center_3;
- double *aver;
- double *std;
- double *center_p_1;
- double *center_p_2;
- double *center_p_3;
-
- double **population_center; //population centroids in the raw/original data
- double **cluster_center; //cluster centroids in the raw/original data
- double **new_population_center;
- double **temp_population_center;
- double **min_pop;
- double **max_pop;
-
- double **input_data;
- double **normalized_data;
-
- double **real_pop_center;
- double **distance_pop;
-
- double ***ind_pop;
- double ***ind_pop_3;
-
- int min = 999999;
- int max = 0;
-
- printf( "Starting time:\t\t\t\t");
- fflush(stdout);
- system("/bin/date");
-
- /*
- * Windows Version
- _strtime( tmpbuf );
- printf( "Starting time:\t\t\t\t%s\n", tmpbuf );
- _strdate( tmpbuf );
- printf( "Starting date:\t\t\t\t%s\n", tmpbuf );
- */
-
- /////////////////////////////////////////////////////////////
-
- if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6))
- {
- fprintf(stderr,"Incorrect number of input parameters!\n");
- fprintf(stderr,"usage:\n");
- fprintf(stderr,"basic mode: flock data_file\n");
- fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n");
- fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n");
- fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n");
- fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n");
- exit(0);
- }
-
- f_src=fopen(argv[1],"r");
-
- if (argc==3)
- {
- max_num_pop=atoi(argv[2]);
- printf("num of maximum pop is %ld\n",max_num_pop);
- }
-
- if (argc==4)
- {
- num_bin=atoi(argv[2]);
- printf("num_bin is %ld\n",num_bin);
-
- den_t_event=atoi(argv[3]);
- printf("density_index is %d\n",den_t_event);
- }
-
- if (argc==5)
- {
- num_bin=atoi(argv[2]);
- printf("num_bin is %ld\n",num_bin);
-
- den_t_event=atoi(argv[3]);
- printf("density_index is %d\n",den_t_event);
-
- num_pop=atoi(argv[4]);
- printf("num of pop is %ld\n",num_pop);
- }
-
- if (argc==6)
- {
- num_bin=atoi(argv[2]);
- printf("num_bin is %ld\n",num_bin);
-
- den_t_event=atoi(argv[3]);
- printf("density_index is %d\n",den_t_event);
-
- num_pop=atoi(argv[4]);
- printf("num of pop is %ld\n",num_pop);
-
- max_num_pop=atoi(argv[5]);
- printf("num of pop is %ld\n",max_num_pop);
- }
-
-
- if (num_pop==1)
- {
- printf("it is not allowed to specify only one population\n");
- exit(0);
- }
-
-
-
- getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters
-
-
- if (max_num_pop==0)
- {
- max_num_pop=(long)pow(2,num_dm);
- if (max_num_pop>MAX_POP_NUM)
- max_num_pop=MAX_POP_NUM;
- }
-
- /************************************************* Read the data *****************************************************/
-
- rewind(f_src); //reset the file pointer
-
- input_data = (double **)malloc(sizeof(double*)*file_Len);
- memset(input_data,0,sizeof(double*)*file_Len);
- for (i=0;i=1) //num_bin has been selected by user
- {
- select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
- printf("user set bin number is %ld\n",num_bin);
- }
- else //num_bin has not been selected by user
- {
- num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin);
- printf("selected bin number is %ld\n",num_bin);
- }
- printf("number of non-empty grids is %ld\n",num_nonempty);
-
-
-
- /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */
- free(sorted_seq);
-
-
- dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty);
- memset(dense_grid_reverse,0,sizeof(long)*num_nonempty);
-
- /************************************************* select_dense **********************************************/
-
- if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it
- den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
- else
- {
- den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event);
- printf("automated selected density threshold is %d\n",den_t_event);
- }
-
- printf("Number of dense grids is %ld\n",num_dense_grids);
-
- densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids);
- memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids);
-
- for (i=0;imax_d_dist)
- {
- max_d_dist=dist;
- d_d=t;
- }
- }
-
- if (j==0)
- d1=d_d;
- if (j==1)
- d2=d_d;
- if (j==2)
- d3=d_d;
- }
-
-
- for (t=0;tEi) && (Ep>Ej)) //the two parts should be partitioned
- {
- partit[i]=num_computed_population;
- num_computed_population++;
- }
-
- } //end for (i=0;i0)
- {
- j=temp_size_j[index_id];
- if (temp_population_ID[index_id][j]==1)
- all_computed_pop_ID[i]=partit[index_id];
-
- temp_size_j[index_id]++;
- }
- }
-
- printf("Number of groups after partitioning is %ld\n", num_computed_population);
-
-
- free(size_p_2);
-
- free(aver);
- free(std);
- free(partit);
-
- free(center_p_1);
- free(center_p_2);
- free(center_p_3);
-
-
-
- for (i=0;i=num_computed_population)
- printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]);
- }
-
- num_population=num_computed_population;
-
- free(all_computed_pop_ID);
- //end partitioning
- // since the num_population has changed, population_center needs to be redefined as below
-
- population_center=(double **)malloc(sizeof(double*)*num_population);
- memset(population_center,0,sizeof(double*)*num_population);
- for (i=0;inum_real_pop)
- {
- fprintf(stderr,"number of populations specified too large\n");
- exit(0);
- }
-
- center_1=(double *)malloc(sizeof(double)*num_dm);
- memset(center_1,0,sizeof(double)*num_dm);
-
- center_2=(double *)malloc(sizeof(double)*num_dm);
- memset(center_2,0,sizeof(double)*num_dm);
-
- center_3=(double *)malloc(sizeof(double)*num_dm);
- memset(center_3,0,sizeof(double)*num_dm);
-
-
-
- while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2)))
- {
-
- keep_merge=0; //so, if no entering merge function, while will exit when num_pop==0
-
- real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop);
- memset(real_pop_center,0,sizeof(double*)*num_real_pop);
- for (i=0;imax_pop[index_id][j])
- max_pop[index_id][j]=normalized_data[i][j];
- }
- }
-
-/* for (i=0;i2)
- num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances
- else
- num_checked_range=1;
-
- size_c=(long *)malloc(sizeof(long)*num_real_pop);
- memset(size_c,0,sizeof(long)*num_real_pop);
-
- for (i=0;imax_d_dist)
- {
- max_d_dist=dist;
- d_d=j;
- }
- }
-
- if (i==0)
- d1=d_d;
- if (i==1)
- d2=d_d;
- if (i==2)
- d3=d_d;
- }
-
- //printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3);
-
- Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
- Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c);
-
- for (t=0;ttemp_j)
- {
- all_population_ID[i]=all_population_ID[i]-1;
- }
- else
- {
- if (all_population_ID[i]==temp_j)
- {
- all_population_ID[i]=temp_i;
- }
- }
- }
-
- num_real_pop--;
- }
- else
- {
- if ((num_pop!=0) && (num_popfirst_j)
- {
- all_population_ID[i]=all_population_ID[i]-1;
- }
- else
- {
- if (all_population_ID[i]==first_j)
- {
- all_population_ID[i]=first_i;
- }
- }
- }
-
- num_real_pop--;
-
-
-
- }
- }
- //printf("num_real_pop is %d\n",num_real_pop);
-
- } //end of while
-
-
-
- free(center_1);
- free(center_2);
- free(center_3);
- printf("Final number of populations is %ld\n",num_real_pop);
-
- new_population_center=(double **)malloc(sizeof(double*)*num_real_pop);
- memset(new_population_center,0,sizeof(double*)*num_real_pop);
- for (i=0;i max) {
- max = (int)input_data[i][j];
- }
- if (j==num_dm-1)
- {
- fprintf(f_out,"%d\n",(int)input_data[i][j]);
- fprintf(f_results,"%d\t",(int)input_data[i][j]);
- }
- else
- {
- fprintf(f_out,"%d\t",(int)input_data[i][j]);
- fprintf(f_results,"%d\t",(int)input_data[i][j]);
- }
- }
- //fprintf(f_results,"%ld\t",i + 1);
- fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009
- }
-
-/*
- f_parameters=fopen("parameters.txt","w");
- fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin);
- fprintf(f_parameters,"Density\t%d\n",den_t_event);
- fprintf(f_parameters,"Min\t%d\n",min);
- fprintf(f_parameters,"Max\t%d\n",max);
- fclose(f_parameters);
-*/
-
- f_properties=fopen("fcs.properties","w");
- fprintf(f_properties,"Bins=%ld\n",num_bin);
- fprintf(f_properties,"Density=%d\n",den_t_event);
- fprintf(f_properties,"Min=%d\n",min);
- fprintf(f_properties,"Max=%d\n",max);
- fprintf(f_properties,"Populations=%ld\n",num_real_pop);
- fprintf(f_properties,"Events=%ld\n",file_Len);
- fprintf(f_properties,"Markers=%ld\n",num_dm);
- fclose(f_properties);
-
- for (i=0;i