annotate ARTS/ARTS.pl @ 0:3723b54935cb draft

Uploaded
author mmaiensc
date Wed, 13 Nov 2013 16:13:17 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1 #!/usr/bin/perl
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
2 # ARTS: Automated Randomization of multiple Traits for Study Design, using diploidly GA
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
3 # Mark Maienschein-Cline, last updated 8/19/2013
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
4 # mmaiensc@uic.edu
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
5 # Center for Research Informatics, University of Illinois at Chicago
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
6 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
7 # Copyright (C) 2013 Mark Maienschein-Cline
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
8 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
9 # This program is free software; you can redistribute it and/or modify
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
10 # it under the terms of the GNU General Public License as published by
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
11 # the Free Software Foundation; either version 2 of the License, or
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
12 # (at your option) any later version.
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
13 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
14 # This program is distributed in the hope that it will be useful,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
17 # GNU General Public License for more details.
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
18 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
19 # You should have received a copy of the GNU General Public License along
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
20 # with this program; if not, write to the Free Software Foundation, Inc.,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
21 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
22
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
23
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
24
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
25
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
26
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
27 use Getopt::Long qw(:config no_ignore_case);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
28 #use Time::HiRes qw( clock_gettime );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
29 use Math::Trig;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
30 $|++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
31
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
32 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
33 # initialize random number parameters
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
34 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
35 &ran1_init();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
36
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
37 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
38 # read command line
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
39 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
40 &read_command_line();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
41
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
42 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
43 # read phenotype list: print the title lines of columns used for verbose
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
44 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
45 &read_data();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
46 if( $verb eq "y" || $verb eq "l" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
47 printf("Using traits:");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
48 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
49 print "\t$titlevals[$allcols[$i]]";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
50 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
51 print "\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
52 printf("Using trait combinations:");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
53 for($i=0; $i<= $#cols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
54 printf("\t{%s", $titlevals[$cols[$i][0]]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
55 for($j=1; $j<= $#{$cols[$i]}; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
56 printf(",%s", $titlevals[$cols[$i][$j]]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
57 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
58 printf("}");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
59 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
60 print "\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
61 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
62
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
63 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
64 # initialize GA parameters
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
65 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
66 &ga_init();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
67
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
68 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
69 # if using the batchcolumn, fill in the batch
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
70 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
71 if( $bcolumn ne "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
72 if( $verb eq "y" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
73 printf("Looking at column %i (%s) for batch assignments\n", $bcolumn+1, $titlevals[$bcolumn]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
74 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
75 # fill in batch from last column of @data
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
76 $numbatches = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
77 $foundbatchhash = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
78 @batchsizes = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
79 for($i=0; $i<=$#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
80 if( $foundbatchhash->{$data[$i][$bcolumn]} eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
81 $foundbatchhash->{$data[$i][$bcolumn]} = $numbatches;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
82 $numbatches++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
83 push(@batchnames, $data[$i][$bcolumn]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
84 push(@batchsizes, 0);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
85 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
86 $batchsizes[$foundbatchhash->{$data[$i][$bcolumn]}]++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
87 $data[$i][$#{$data[0]}] = $data[$i][$bcolumn];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
88 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
89 $mi = &mutual_info( $numbatches );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
90 $bestmi = $mi;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
91 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
92
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
93 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
94 # else do sampling: run GA
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
95 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
96 if( $bcolumn eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
97 &initialize_population();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
98
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
99 $oldavg = 1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
100 $err = 0.0001;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
101 for($n=0; $n< $numgen; $n++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
102 &add_immigrants();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
103 @population = &permute( \@population );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
104 $k = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
105 $k = &crossover( $k );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
106 $k = &mutate( $k );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
107 $k = &add_parents( $k );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
108 @pool = sort{$a->{score} <=> $b->{score}} @pool;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
109 $average = &fill_population();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
110
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
111 # check if we've done enough already, and print out status
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
112 if( $verb eq "y" ){printf(" Generation %i of %i, average fitness %0.4f\n", $n+1, $numgen, $average );}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
113 if( $oldavg >= $average && $oldavg - $average < $err ){last;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
114 $oldavg = $average;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
115 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
116
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
117 # save the final best one
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
118 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
119 &fill_assignments( \@{$population[0]->{assignments}} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
120 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
121 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
122
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
123 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
124 # print final log to stdout
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
125 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
126 if( $verb eq "y" || $verb eq "l" ){&print_info;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
127
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
128 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
129 # print result
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
130 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
131 if( $out ne "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
132 open(OUT,">$out");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
133 print OUT "$title\t$bname\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
134 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
135 $orig[$i][1] = $data[$i][$#{$data[0]}];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
136 printf OUT ("%s\t%i\n", $orig[$i][0], $orig[$i][1]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
137 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
138 close(OUT);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
139 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
140
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
141 ###############
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
142 # SUBROUTINES #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
143 ###############
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
144
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
145 # read command line options
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
146 sub read_command_line{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
147 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
148
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
149 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
150 # option default values
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
151 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
152 $in = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
153 $out = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
154 $bcolumn = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
155 $batch = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
156 $bname = "batch";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
157 $phenocols = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
158 $contcols = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
159 $datecols = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
160 $bins = 5;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
161 @blist = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
162 $verb = "y";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
163 $mmi = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
164
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
165 $options = "
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
166 Usage: ./ARTS.pl <OPTIONS>
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
167 REQUIRED:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
168 -i input traits (rectangular, tab-delimited matrix, including title line with column names)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
169 -c trait columns to randomize
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
170 comma- and semicolon delimited list, columns indexed from 1
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
171 all traits indicated by commas are used in joint distributions
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
172 AND EITHER -b AND -o, OR -p:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
173 -b batch sizes (a single number, or a comma-delimited list)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
174 -o output file (formatted same as input file, with batch added as last column)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
175 -or-
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
176 -p <batch column index>: print MI statistic for input traits using this column as batch designations
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
177 -p will not do any sampling
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
178 OTHER OPTIONS:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
179 -cc continuously-valued columns (will be binned)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
180 -cd date-valued columns (should be M/D/Y); should also list these as continuous (in -cc)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
181 -cb number of bins to use for continuous or date columns (default: $bins for each)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
182 can give 1 value, or a list of the same length as -cc; if a list, will be assigned in the same order as -cc
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
183 -bn batch name (title of added column, default $bname)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
184 -s random number seed (large negative integer, default: $seed)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
185 ";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
186
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
187 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
188 # Secret options:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
189 # -v y or l (verbose: print all, or just print status from beginning or end)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
190 # -mmi force use of MMI objective function on all columns indicated by -c, over-riding any other settings from -c
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
191 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
192
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
193 GetOptions('i=s' => \$in,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
194 'o=s' => \$out,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
195 'p=i' => \$bcolumn,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
196 'b=s' => \$batch,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
197 'c=s' => \$phenocols,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
198 'cc=s' => \$contcols,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
199 'cd=s' => \$datecols,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
200 'cb=s' => \$bins,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
201 'bn=s' => \$bname,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
202 's=i' => \$seed,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
203 'mmi' => \$mmi,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
204 'v=s' => \$verb,
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
205 ) || die "$options\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
206
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
207 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
208 # check that required inputs exist
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
209 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
210 if( $in eq "" ){&exit_required("i");}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
211 if( ($out eq "" || $batch eq "") && $bcolumn eq "" ){&exit_required("b and -o, or -p,");}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
212 if( $phenocols eq "" || $phenocols eq "None" ){&exit_required("c");}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
213
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
214 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
215 # check that inputs values are OK
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
216 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
217 if( $bcolumn ne "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
218 if( $bcolumn < 1 ){&exit_err("p","at least 1");}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
219 $bcolumn--;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
220 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
221 if( $verb ne "y" && $verb ne "n" && $verb ne "l" ){&exit_err("v","y or n or l");}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
222 if( $seed > 0 ){$seed*= -1;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
223 if( $seed == 0 ){&exit_err("s","non-zero");}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
224
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
225 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
226 # if mmi, reset phenocols value using all found columns
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
227 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
228 if( $mmi ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
229 @initcs = split(/[,;]/, $phenocols);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
230 # remove duplicates
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
231 @clist = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
232 $cinds = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
233 for($i=0; $i<= $#initcs; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
234 if( $cinds->{$initcs[$i]} eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
235 $cinds->{$initcs[$i]} = 1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
236 push(@clist, $initcs[$i]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
237 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
238 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
239 # add to new phenocols
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
240 $phenocols = "$clist[0]";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
241 for($i=1; $i<= $#clist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
242 $phenocols = sprintf("%s,%s", $phenocols, $clist[$i]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
243 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
244 $phenocols = sprintf("%s;%s", $phenocols, $clist[0]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
245 for($i=1; $i<= $#clist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
246 $phenocols = sprintf("%s;%s", $phenocols, $clist[$i]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
247 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
248 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
249
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
250 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
251 # extract phenotype columns
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
252 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
253 @cols = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
254 @allcols = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
255 $alllist = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
256 @jointlist = split(';',$phenocols);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
257 for($i=0; $i<= $#jointlist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
258 @tmp = split(',',$jointlist[$i]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
259 @tmp = &fix_cols( \@tmp );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
260 push(@cols, [@tmp]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
261 for($j=0; $j<= $#tmp; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
262 if( $alllist->{$tmp[$j]} eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
263 $alllist->{$tmp[$j]} = 1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
264 push(@allcols, $tmp[$j]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
265 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
266 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
267 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
268
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
269 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
270 # extract continuous and date columns
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
271 # sort continuous columns so that bins correspond to them in order
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
272 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
273 if( $contcols ne "" && $contcols ne "None" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
274 @conts = split(',',$contcols);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
275 @conts = &fix_cols( \@conts );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
276 $numconts = $#conts+1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
277 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
278 if( $datecols ne "" && $datecols ne "None" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
279 @dates = split(',',$datecols);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
280 @dates = &fix_cols( \@dates );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
281 $numdates = $#dates+1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
282 # check that date columns are among continuous columns
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
283 for($i=0; $i<= $#dates; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
284 for($j=0; $j<= $#conts; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
285 if( $dates[$i] == $conts[$j] ){last;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
286 if( $j==$#conts ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
287 printf("Error: please specify date column %i as continuous\n", $dates[$i]+1 );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
288 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
289 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
290 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
291 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
292 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
293 if( $bins =~ /,/ ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
294 @blist = split(',',$bins);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
295 if( $#blist+1 != $#conts + 1 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
296 printf("Error: you input %i bins, but %i columns that need binning\n", $#blist+1, $#conts+1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
297 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
298 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
299 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
300 else{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
301 for($i=0; $i<= $#conts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
302 push(@blist, $bins);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
303 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
304 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
305 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
306
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
307 # print error message for flag $_[0], with correct values $_[1], and print usage
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
308 sub exit_err{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
309 printf("Error: set -%s to be %s\n%s\n", $_[0], $_[1], $options);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
310 exit;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
311 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
312 # print error message saying flag $_[0] is required
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
313 sub exit_required{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
314 printf("Error: option -%s is required\n%s\n", $_[0], $options);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
315 exit;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
316 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
317
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
318 # fix all indices in array $_[0]: cast to integer, check at least 1, and subtract 1
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
319 sub fix_cols{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
320 my @list;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
321 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
322 @list = @{$_[0]};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
323 for($i=0; $i<= $#list; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
324 $list[$i] = sprintf("%i", $list[$i]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
325 if( $list[$i] < 1 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
326 print "Error: column indices should be at least 1\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
327 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
328 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
329 $list[$i]--;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
330 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
331 return @list;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
332 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
333
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
334 # print info about best randomization
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
335 sub print_info{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
336 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
337 # get MI of each phenotype and average
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
338 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
339 $bestmi = &mutual_info();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
340 @bestmilist = &individual_mi( $numbatches );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
341 $bestavgmi = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
342 for($i=0; $i<= $#bestmilist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
343 $bestavgmi+= $bestmilist[$i]/($#bestmilist+1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
344 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
345
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
346 printf("Final MI %0.4f ; Individual trait MIs (mean %0.4f ): ", $bestmi, $bestavgmi);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
347 for($i=0; $i<= $#bestmilist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
348 printf("\t%0.4f", $bestmilist[$i]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
349 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
350 print "\n-----------------------------------------------------------------\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
351 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
352 # print the counts for each phenotype in each batch
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
353 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
354 # first title line: phenotype names
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
355 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
356 printf("\t%s values", $titlevals[$allcols[$i]]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
357 for($j=1; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
358 printf("\t");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
359 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
360 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
361 print "\nBatch (size)";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
362 # second title line: phenotype values
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
363 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
364 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
365 if( $items->{$allcols[$i]}->{list}[$j] ne "" ){printf("\t%s", &name($items->{$allcols[$i]}->{list}[$j], $allcols[$i]) );}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
366 else{printf("\tempty");}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
367 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
368 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
369 print "\n-------";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
370 # print a line of dashes to separate
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
371 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
372 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
373 printf("\t-------");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
374 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
375 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
376 print "\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
377 for($k=0; $k< $numbatches; $k++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
378 printf("%s (%i)", $batchnames[$k], $batchsizes[$k] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
379 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
380 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
381 printf("\t%i", &count( $#{$data[0]}, $batchnames[$k], $allcols[$i], $items->{$allcols[$i]}->{list}[$j] ) );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
382 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
383 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
384 print "\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
385 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
386 print "-------";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
387 # print a line of dashes to separate
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
388 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
389 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
390 printf("\t-------");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
391 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
392 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
393 # print totals for each type
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
394 print "\nTotal";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
395 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
396 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
397 printf("\t%i", $items->{$allcols[$i]}->{$items->{$allcols[$i]}->{list}[$j]}[1] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
398 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
399 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
400 print "\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
401 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
402
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
403 # for continuous valued columns, checked by $_[1], convert value $_[0] back to a range
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
404 sub name{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
405 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
406 my $binw;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
407
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
408 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
409 # if there aren't any continuous columns, or $_[1] doesn't match one, just return $_[0]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
410 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
411 if( $#conts < 0 ){return $_[0];}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
412 for($i=0; $i<= $#conts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
413 if( $_[1] == $conts[$i] ){last;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
414 if( $i==$#conts ){return $_[0];}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
415 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
416
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
417 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
418 # convert bin value back to continuous value
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
419 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
420 $binw = ($contstats[$i][2]-$contstats[$i][0])/$blist[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
421 $val1 = $binw*$_[0]+$contstats[$i][0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
422 $val2 = $binw*($_[0]+1)+$contstats[$i][0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
423
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
424 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
425 # if there aren't any date columns, or $_[1] doesn't match one, just return the range val1-val2
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
426 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
427 if( $#dates < 0 ){return sprintf("%s-%s", $val1, $val2);}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
428 for($i=0; $i<= $#dates; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
429 if( $_[1] == $dates[$i] ){last;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
430 if( $i==$#dates ){return sprintf("%s-%s", $val1, $val2);}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
431 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
432 $val1 = sprintf("%i", $val1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
433 $val2 = sprintf("%i", $val2);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
434 return sprintf("%s-%s", &convert_date( $val1 ), &convert_date( $val2 ));
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
435
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
436 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
437
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
438 # read in regular matrix from $in
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
439 # for continuous (including date-value) columns, make histograms
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
440 sub read_data{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
441 my @lines;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
442 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
443 @data = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
444 $items = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
445 @orig = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
446 @titlevals = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
447 @batchsizes = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
448 $numbatches = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
449
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
450 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
451 # fix newline convention
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
452 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
453
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
454
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
455 open(IN,"$in") || die "Error: can't open $in\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
456 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
457 # read in all lines and check formatting
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
458 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
459 @lines = <IN>;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
460 if( $lines[0] =~ /\r/ && $#lines == 0 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
461 # this happens with tab-delimited text saved from excel
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
462 @lines = split('\r', $lines[0]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
463 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
464
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
465 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
466 # read title line
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
467 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
468 $title = $lines[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
469 chomp($title);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
470 @titlevals = split('\t',$title);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
471 for($k=1; $k<= $#lines; $k++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
472 $line = $lines[$k];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
473 chomp($line);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
474 if( $line ne "" ){ # ignore blank lines
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
475 @parts = split('\t',$line);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
476 for($i=$#parts+1; $i<= $#titlevals; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
477 push(@parts, "");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
478 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
479 if( $#parts != $#titlevals ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
480 printf("Error: not enough columns in line %i\n", $#data+2);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
481 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
482 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
483 # push 1 extra for the batch
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
484 push(@parts, 0);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
485 push(@data, [(@parts)] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
486 push(@orig, [($line, 0)] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
487 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
488 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
489 close(IN);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
490
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
491 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
492 # exit if no data read
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
493 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
494 if( $#data < 0 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
495 printf("Error: no samples were read in\n");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
496 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
497 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
498
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
499 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
500 # if batch is not empty, check batches:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
501 # if no commas, cast to integer and count how many are needed
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
502 # if there are commas, get batched on given sizes
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
503 # double check that we add up
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
504 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
505 @batchnames = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
506 if( $batch ne "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
507 if( $batch !~ /,/ ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
508 $batch = sprintf("%i", $batch);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
509 # fix batch size if too big
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
510 if( $batch > $#data + 1 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
511 printf("Warning: you have %i samples, but asked for a batch size of %i, so there is only 1 batch\n", $#data+1, $batch);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
512 $batch = $#data+1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
513 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
514 $numbatches = ($#data+1)/$batch;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
515 $exactbatches = sprintf("%i", $numbatches);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
516 if( $exactbatches < $numbatches ){$exactbatches++;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
517 $numbatches = $exactbatches;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
518 for($i=0; $i< $numbatches-1; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
519 push(@batchsizes, $batch);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
520 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
521 push(@batchsizes, $batch - ($numbatches*$batch - ($#data+1)) );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
522 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
523 else{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
524 @batchsizes = split(',',$batch);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
525 $numbatches = $#batchsizes+1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
526 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
527 $tot = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
528 for($i=0; $i< $numbatches; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
529 push(@batchnames, $i+1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
530 $tot+= $batchsizes[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
531 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
532 if( $tot != $#data+1 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
533 printf("Error: have %i spaces in all batches, but %i samples\n", $tot, $#data+1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
534 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
535 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
536 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
537
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
538 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
539 # convert dates to numbers
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
540 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
541 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
542 for($j=0; $j<= $#dates; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
543 if( $data[$i][$dates[$j]] ne "" ){$data[$i][$dates[$j]] = &convert_date( $data[$i][$dates[$j]] );}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
544 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
545 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
546
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
547 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
548 # for all continuous columns, compute median and fill in missing values
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
549 # also record max and min for binning
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
550 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
551 @contstats = (); # records min, median, max for each continuous column
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
552 for($j=0; $j<= $#conts; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
553 @tmp = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
554 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
555 if( $data[$i][$conts[$j]] ne "" ){push(@tmp, $data[$i][$conts[$j]]);}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
556 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
557 @tmp = sort{ $a <=> $b } @tmp;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
558 $median = $tmp[sprintf("%i", ($#tmp+1)/2)];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
559 push(@contstats, [($tmp[0], $median, $tmp[$#tmp])] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
560 # for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
561 # if( $data[$i][$conts[$j]] eq "" ){$data[$i][$conts[$j]] = $median;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
562 # }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
563 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
564
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
565 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
566 # for all continuous columns, bin data
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
567 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
568 for($j=0; $j<= $#conts; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
569 $binw = ($contstats[$j][2] - $contstats[$j][0])/($blist[$j]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
570 if( $binw == 0 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
571 printf("Error: max and min of column %i are equal (max/min are %f/%f)\n", $conts[$j]+1, $contstats[$j][2], $contstats[$j][0] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
572 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
573 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
574 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
575 if( $data[$i][$conts[$j]] ne "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
576 $data[$i][$conts[$j]] = sprintf("%i", ($data[$i][$conts[$j]] - $contstats[$j][0])/$binw);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
577 if( $data[$i][$conts[$j]] >= $blist[$j] ){$data[$i][$conts[$j]] = $blist[$j]-1;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
578 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
579 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
580 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
581
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
582 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
583 # for each column we're using, count how many item types there are
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
584 # empty phenotypes are considered their own, distinct phenotype
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
585 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
586 $items = &itemize( \@allcols );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
587 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
588
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
589 # count how many item types of @{$_[0]} there are in @data
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
590 sub itemize{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
591 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
592 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
593 my $info;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
594 my @cols;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
595 my $items;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
596 @cols = @{$_[0]};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
597
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
598 for($j=0; $j<= $#cols; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
599 $info = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
600 $info->{list} = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
601 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
602 if( $info->{$data[$i][$cols[$j]]} eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
603 $info->{$data[$i][$cols[$j]]} = [($#{$info->{list}}+1,0)];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
604 push(@{$info->{list}}, $data[$i][$cols[$j]]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
605 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
606 $info->{$data[$i][$cols[$j]]}[1]++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
607 $info->{count}++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
608 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
609 $info->{num} = $#{$info->{list}}+1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
610 $items->{$cols[$j]} = $info;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
611 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
612
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
613 for($j=0; $j<= $#cols; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
614 # this set prints the number of values and counts for each phenotype
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
615 #printf("%i,%s:", $cols[$j], $titlevals[$cols[$j]]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
616 #for($k=0; $k<= $#{$items->{$cols[$j]}->{list}}; $k++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
617 # printf("\t%s,%i", $items->{$cols[$j]}->{list}[$k], $items->{$cols[$j]}->{$items->{$cols[$j]}->{list}[$k]}[1] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
618 #}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
619 #print "\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
620 if( $items->{$cols[$j]}->{num} > 20 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
621 printf("Warning: column %i (%s) has %i values; should you make it continuous?\n", $cols[$j], $titlevals[$cols[$j]], $items->{$cols[$j]}->{num} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
622 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
623 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
624 return $items;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
625 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
626
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
627 # convert date in M/D/Y to integer, or integer to M/D/Y
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
628 sub convert_date{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
629 my $date;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
630 my $month;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
631 my $day;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
632 my $year;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
633 my $months;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
634 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
635 # cumulative days per month
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
636 $months->{0} = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
637 $months->{1} = 31;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
638 $months->{2} = 59;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
639 $months->{3} = 90;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
640 $months->{4} = 120;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
641 $months->{5} = 151;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
642 $months->{6} = 181;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
643 $months->{7} = 212;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
644 $months->{8} = 243;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
645 $months->{9} = 273;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
646 $months->{10} = 304;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
647 $months->{11} = 334;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
648 $months->{12} = 365;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
649 $date = $_[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
650
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
651 # convert date to integer
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
652 if( $date =~ /\// ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
653 ($month, $day, $year) = split('/',$date);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
654 $month = sprintf("%i", $month);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
655 $day = sprintf("%i", $day);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
656 $year = sprintf("%i", $year);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
657 if( $month < 1 || $month > 12 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
658 print "Error: found a month $month not between 1 and 12\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
659 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
660 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
661 if( $day < 1 || $day > 31 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
662 print "Error: found a day $day not between 1 and 31\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
663 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
664 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
665
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
666 return $day + $months->{$month-1} + $year*$months->{12};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
667 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
668 # convert integer to date
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
669 elsif( $date == sprintf("%i", $date) ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
670 $year = sprintf("%i", $date/($months->{12}));
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
671 $month = $date-$year*$months->{12};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
672 for($i=1; $i<=12; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
673 if( $month < $months->{$i} ){last;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
674 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
675 $day = $month - $months->{$i-1};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
676 $month = $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
677 return sprintf("%s/%s/%s", $month, $day, $year);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
678 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
679 else{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
680 printf("\nError: unrecognized format in convert_date(): %s\n", $date);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
681 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
682 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
683 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
684
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
685 # set globals used by ran1
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
686 sub ran1_init{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
687 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
688 # random number variables
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
689 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
690 $iset = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
691 $gset = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
692 #$iseed = clock_gettime(CLOCK_REALTIME);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
693 #($first, $second) = split('\.', $iseed);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
694 #$seed = sprintf("-%i%i", $second, $first);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
695 $seed = -10854829;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
696 $M1 = 259200;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
697 $IA1 = 7141;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
698 $IC1 = 54773;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
699 $RM1 = 1.0/$M1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
700 $M2 = 134456;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
701 $IA2 = 8121;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
702 $IC2 = 28411;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
703 $RM2 = 1.0/$M2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
704 $M3 = 243000;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
705 $IA3 = 4561;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
706 $IC3 = 51349;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
707 $iff = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
708 $ix1 = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
709 $ix2 = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
710 $ix3 = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
711 @ranarray = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
712 for($i=0; $i< 98; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
713 push(@ranarray, 0);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
714 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
715 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
716
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
717 # uniform random number generator, seed, iff, and various capital-letter variables set in beginning
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
718 sub ran1{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
719 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
720 my $temp;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
721
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
722 if( $seed < 0 || $iff == 0 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
723 $iff = 1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
724 $ix1 = ($IC1 - $seed)%$M1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
725 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
726 $ix2 = $ix1%$M2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
727 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
728 $ix3 = $ix1%$M3;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
729 for($j=1; $j<= 97; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
730 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
731 $ix2 = ($IA2*$ix2 + $IC2)%$M2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
732 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
733 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
734 $seed = 1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
735 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
736 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
737 $ix2 = ($IA2*$ix2 + $IC2)%$M2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
738 $ix3 = ($IA3*$ix3 + $IC3)%$M3;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
739
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
740 $j = sprintf("%i", 1 + ((97*$ix3)/$M3) );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
741 if( $j> 97 || $j< 1 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
742 printf("Error in ran1: $j outside of [1:97]\n");
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
743 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
744 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
745 $temp = $ranarray[$j];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
746 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
747 return $temp;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
748 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
749
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
750 # permute array $_[0]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
751 sub permute{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
752 my @assignments;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
753 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
754 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
755 my $tmp;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
756
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
757 @assignments = @{$_[0]};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
758
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
759 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
760 # shuffle batches randomly
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
761 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
762 for($i=$#assignments; $i>= 0; $i--){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
763 $j = sprintf("%i", ($i+1)*&ran1() );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
764 $tmp = $assignments[$j];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
765 $assignments[$j] = $assignments[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
766 $assignments[$i] = $tmp;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
767 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
768 return @assignments;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
769 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
770
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
771 # fill data with assignments $_[0]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
772 sub fill_assignments{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
773 my @list;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
774 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
775 @list = @{$_[0]};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
776 if( $#list != $#data ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
777 print "Error in fill_assignments: mismatching list lengths\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
778 die;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
779 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
780 for($i=0; $i<= $#list; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
781 $data[$i][$#{$data[0]}] = $list[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
782 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
783 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
784
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
785 # compute mutual information of a batch assignment
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
786 sub mutual_info{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
787 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
788 my $s;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
789 my $mi;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
790 my $stot;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
791
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
792 $mi = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
793 for($i=0; $i<= $#cols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
794 $mi += &this_mi( $_[0], $#{$data[0]}, \@{$cols[$i]} )/($#cols+1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
795 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
796
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
797 return $mi;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
798 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
799
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
800 # compute all single-phenotype mutual information
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
801 sub individual_mi{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
802 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
803 my @list;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
804 my @milist;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
805
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
806 @milist = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
807 for($i=0; $i<= $#allcols; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
808 @list = ($allcols[$i]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
809 push(@milist, &this_mi( $_[0], $#{$data[0]}, \@list ) );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
810 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
811 return @milist;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
812 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
813
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
814 # compute mutual information of columns $_[1] ($_[0] bins) and all of @{$_[2]}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
815 sub this_mi{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
816 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
817 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
818 my $summand;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
819 my @list;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
820 my $jprob;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
821 my $m1prob;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
822 my $m2prob;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
823 my $jbin;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
824 my $m1bin;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
825 my $m2bin;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
826 my $jbinstot;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
827 my $m1binstot;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
828 my $m2binstot;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
829 my @jbinlist;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
830 my @m1binlist;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
831 my @m2binlist;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
832 my $mi;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
833 my $s1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
834 my $s2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
835 my $s;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
836 @list = @{$_[2]};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
837
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
838 # initialize probabilities
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
839 $jprob = {}; # joint distribution
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
840 $m1prob = {}; # batch marginal dist
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
841 $m2prob = {}; # pheno marginal dist
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
842 @jbinlist = (); # phenotype combos found in joint distribution
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
843 @m1binlist = (); # batches found in batches distribution (1st marginal dist)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
844 @m2binlist = (); # phenotype combos found in phenotypes distribution (2nd marginal dist)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
845
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
846 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
847 # read through data and add to distributions
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
848 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
849 $summand = 1.0/($#data+1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
850 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
851 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
852 # define bin names based on phenotype/batch
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
853 # for phenotypes p1, p2, etc., batch b:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
854 # joint = p1_p2_..._pn_b
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
855 # 1st marginal = b
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
856 # 2nd marginal = p1_p2_..._pn
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
857 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
858 $jbin = sprintf("%s", $data[$i][$#{$data[0]}]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
859 $m1bin = sprintf("%s", $data[$i][$#{$data[0]}]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
860 $m2bin = "";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
861 for($j=0; $j<= $#list; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
862 # NOTE:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
863 # $list[$j] is a phenotype column (e.g., gender)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
864 # $data[$i][$list[$j]] is the value of that phenotype in sample $i (e.g., M or F)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
865 # $items->{$list[$j]}->{$data[$i][$list[$j]]}[0] is the bin index (e.g., M->0, F->1) of that phenotype
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
866
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
867 $jbin = sprintf("%s_%i", $jbin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
868 if( $j>0 ){$m2bin = sprintf("%s_", $m2bin);}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
869 $m2bin = sprintf("%s%i", $m2bin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
870 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
871
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
872 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
873 # check if we've already seen this bin, for each distribution
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
874 # initialize probabilities and add to list if it's the first time
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
875 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
876 if( $jprob->{$jbin} eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
877 $jprob->{$jbin} = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
878 push(@jbinlist, [($jbin, $m1bin, $m2bin)] );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
879 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
880 if( $m1prob->{$m1bin} eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
881 $m1prob->{$m1bin} = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
882 push(@m1binlist, $m1bin);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
883 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
884 if( $m2prob->{$m2bin} eq "" ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
885 $m2prob->{$m2bin} = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
886 push(@m2binlist, $m2bin);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
887 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
888
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
889 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
890 # add a count to each distribution
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
891 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
892 $jprob->{$jbin} += $summand;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
893 $m1prob->{$m1bin} += $summand;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
894 $m2prob->{$m2bin} += $summand;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
895 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
896
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
897 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
898 # compute mutual information, and entropy of m1prob and m2prob (for normalization)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
899 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
900 $mi = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
901 $s1 = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
902 $s2 = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
903 for($i=0; $i<= $#jbinlist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
904 $mi+= ($jprob->{$jbinlist[$i][0]}) * log( ($jprob->{$jbinlist[$i][0]})/($m1prob->{$jbinlist[$i][1]} * $m2prob->{$jbinlist[$i][2]}) );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
905 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
906 for($i=0; $i<= $#m1binlist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
907 $s1-= $m1prob->{$m1binlist[$i]} * log( $m1prob->{$m1binlist[$i]} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
908 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
909 for($i=0; $i<= $#m2binlist; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
910 $s2-= $m2prob->{$m2binlist[$i]} * log( $m2prob->{$m2binlist[$i]} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
911 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
912 $s = sqrt($s1*$s2);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
913
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
914 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
915 # normalize mi
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
916 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
917 if( $s>0 ){$mi/= $s;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
918
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
919 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
920 # return normalized mi (0=independent, 1=completely dependent)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
921 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
922 return $mi;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
923 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
924
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
925 # count how many of @data have column $_[0] equal $_[1] and column $_[2] equal $_[3]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
926 sub count{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
927 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
928 my $tot;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
929
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
930 $tot = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
931 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
932 if( $data[$i][$_[0]] eq $_[1] && $data[$i][$_[2]] eq $_[3] ){$tot++;}
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
933 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
934 return $tot;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
935 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
936
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
937 # initialize GA parameters and large matrices
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
938 sub ga_init{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
939 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
940 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
941 my $info;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
942
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
943 $popsize = 100;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
944 $numgen = 300;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
945 $nchrmuts = 2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
946 $nnewimm = 10;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
947 $nkeepparents = 2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
948 $nchrpool = ($nnewimm+$popsize) + ($nnewimm+$popsize)*$nchrmuts + $nkeepparents;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
949
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
950 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
951 # population to turn over each generation
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
952 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
953 @population = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
954 for($i=0; $i< $popsize+$nnewimm; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
955 $info = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
956 $info->{score} = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
957 $info->{assignments} = [()];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
958 for($j=0; $j<= $#data; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
959 push(@{$info->{assignments}}, 0);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
960 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
961 push(@population, $info);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
962 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
963
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
964 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
965 # new individuals to fill each generation
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
966 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
967 @pool = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
968 for($i=0; $i< $nchrpool; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
969 $info = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
970 $info->{score} = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
971 $info->{assignments} = [()];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
972 for($j=0; $j<= $#data; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
973 push(@{$info->{assignments}}, 0);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
974 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
975 push(@pool, $info);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
976 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
977
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
978 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
979 # array to randomize for batch assignments
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
980 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
981 @batched = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
982 @bcounts = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
983 for($i=0; $i<= $#batchsizes; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
984 push(@bcounts, 0);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
985 for($j=0; $j< $batchsizes[$i]; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
986 push(@batched, $i+1);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
987 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
988 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
989 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
990
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
991 # initialize the population array: randomize $popsize batches and score each one
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
992 sub initialize_population{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
993 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
994
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
995 for($i=0; $i< $popsize; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
996 @{$population[$i]->{assignments}} = &permute( \@batched );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
997 &fill_assignments( \@{$population[$i]->{assignments}} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
998 $population[$i]->{score} = &mutual_info( $numbatches );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
999 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1000 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1001
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1002 # complete the crossover step, keeping track of our index with $_[0]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1003 sub crossover{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1004 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1005 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1006 my $k;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1007
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1008 $k = $_[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1009
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1010 for($i=0; $i< $popsize + $nnewimm; $i+=2){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1011 &do_cross($i, $k);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1012 &fill_assignments( \@{$pool[$k]->{assignments}} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1013 $pool[$k]->{score} = &mutual_info( $numbatches );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1014 $k++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1015 &fill_assignments( \@{$pool[$k]->{assignments}} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1016 $pool[$k]->{score} = &mutual_info( $numbatches );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1017 $k++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1018 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1019 return $k;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1020 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1021
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1022 # do a crossover between population members $_[0] and $_[0]+1, fill to pool members $_[1] and $_[1]+1
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1023 sub do_cross{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1024 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1025 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1026 my $popmem;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1027 my $poolmem;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1028 my $index;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1029 my @swap;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1030 my @subswap;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1031 my $swapinds;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1032
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1033 $popmem = $_[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1034 $poolmem = $_[1];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1035 $index = sprintf("%i", $#data * &ran1() );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1036
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1037 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1038 # count how many of each batch to switch
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1039 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1040 for($i=0; $i<= $#bcounts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1041 $bcounts[$i] = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1042 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1043 for($i=0; $i<= $index; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1044 $bcounts[$population[$popmem]->{assignments}[$i] - 1]++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1045 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1046
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1047 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1048 # for each batch i:
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1049 # record into subswap all indices with that batch from popmem+1
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1050 # permute subswap
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1051 # add first bcounts[$i] into swap
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1052 # then sort swap
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1053 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1054 @swap = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1055 $swapinds = {};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1056 for($i=0; $i<= $#bcounts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1057 @subswap = ();
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1058 for($j=0; $j<= $#data; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1059 if( $population[$popmem+1]->{assignments}[$j] == $i+1 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1060 push(@subswap, $j);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1061 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1062 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1063 @subswap = &permute( \@subswap );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1064 for($j=0; $j< $bcounts[$i]; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1065 push(@swap, $subswap[$j]);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1066 $swapinds->{$subswap[$j]} = 1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1067 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1068 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1069 @swap = sort{$a <=> $b} @swap;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1070
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1071 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1072 # fill start of first new chr from swap indices of second chr, and end from end of first chr
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1073 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1074 for($i=0; $i<= $index; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1075 $pool[$poolmem]->{assignments}[$i] = $population[$popmem+1]->{assignments}[$swap[$i]];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1076 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1077 for($i=$index+1; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1078 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1079 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1080
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1081 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1082 # fill start of second chr from start of first chr, and end from remaining parts of second chr
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1083 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1084 for($i=0; $i<= $index; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1085 $pool[$poolmem+1]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1086 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1087 $j = $index+1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1088 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1089 if( $swapinds->{$i} != 1 ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1090 $pool[$poolmem+1]->{assignments}[$j] = $population[$popmem+1]->{assignments}[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1091 $j++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1092 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1093 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1094
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1095
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1096 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1097 # check that batch counts are still ok
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1098 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1099 $checkbatch = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1100 if( $checkbatch ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1101 for($i=0; $i<= $#bcounts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1102 $bcounts[$i] = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1103 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1104 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1105 $bcounts[$pool[$poolmem]->{assignments}[$i] - 1]++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1106 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1107 for($i=0; $i<= $#bcounts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1108 if( $bcounts[$i] != $batchsizes[$i] ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1109 print "Error in do_cross: lost some batch counts in first daughter chr\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1110 exit;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1111 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1112 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1113 for($i=0; $i<= $#bcounts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1114 $bcounts[$i] = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1115 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1116 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1117 $bcounts[$pool[$poolmem+1]->{assignments}[$i] - 1]++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1118 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1119 for($i=0; $i<= $#bcounts; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1120 if( $bcounts[$i] != $batchsizes[$i] ){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1121 print "Error in do_cross: lost some batch counts in first daughter chr\n";
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1122 exit;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1123 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1124 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1125 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1126 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1127
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1128 # complete the mutation step, keeping track of our index with $_[0]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1129 sub mutate{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1130 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1131 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1132 my $k;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1133
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1134 $k = $_[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1135
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1136 for($i=0; $i< $popsize+$nnewimm; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1137 for($j=0; $j< $nchrmuts; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1138 &do_mutation($i, $k);
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1139 &fill_assignments( \@{$pool[$k]->{assignments}} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1140 $pool[$k]->{score} = &mutual_info( $numbatches );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1141 $k++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1142 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1143 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1144 return $k;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1145 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1146
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1147 # do a mutation for population member $_[0], fill to pool member $_[1]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1148 sub do_mutation{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1149 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1150 my $popmem;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1151 my $poolmem;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1152 my $index1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1153 my $index2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1154
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1155 $popmem = $_[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1156 $poolmem = $_[1];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1157
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1158 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1159 # fill all of poolmem
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1160 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1161 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1162 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1163 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1164
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1165 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1166 # switch two members
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1167 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1168 $index1 = sprintf("%i", ($#data+1) * &ran1() );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1169 $index2 = sprintf("%i", ($#data+1) * &ran1() );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1170
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1171 $pool[$poolmem]->{assignments}[$index1] = $population[$popmem]->{assignments}[$index2];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1172 $pool[$poolmem]->{assignments}[$index2] = $population[$popmem]->{assignments}[$index1];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1173 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1174
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1175 # add immigrants, keeping track of our index with $_[0]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1176 sub add_immigrants{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1177 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1178
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1179 for($j=0; $j< $nnewimm; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1180 @{$population[$popsize+$j]->{assignments}} = &permute( \@batched );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1181 &fill_assignments( \@{$population[$popsize+$j]->{assignments}} );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1182 $population[$popsize+$j]->{score} = &mutual_info( $numbatches );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1183 $k++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1184 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1185 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1186
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1187 # add top-scoring parents, keeping track of our index with $_[0]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1188 sub add_parents{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1189 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1190 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1191 my $k;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1192
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1193 $k = $_[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1194 # sort population now
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1195 @population = sort{$a->{score} <=> $b->{score}} @population;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1196
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1197 for($j=0; $j< $nkeepparents; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1198 &copy_parents( $j, $k );
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1199 # will copy score also in copy_parents()
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1200 $k++;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1201 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1202 return $k;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1203 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1204
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1205 # add population member $_[0] to pool member $_[1]
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1206 sub copy_parents{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1207 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1208 my $popmem;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1209 my $poolmem;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1210 my $index1;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1211 my $index2;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1212
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1213 $popmem = $_[0];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1214 $poolmem = $_[1];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1215
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1216 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1217 # fill all of poolmem
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1218 #
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1219 for($i=0; $i<= $#data; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1220 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1221 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1222 $pool[$poolmem]->{score} = $population[$popmem]->{score};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1223 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1224
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1225 # copy top of pool to population (assumes pool is sorted)
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1226 sub fill_population{
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1227 my $i;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1228 my $j;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1229 my $m;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1230
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1231 $m = 0;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1232 for($i=0; $i< $popsize; $i++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1233 $population[$i]->{score} = $pool[$i]->{score};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1234 $m+= $population[$i]->{score};
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1235 for($j=0; $j<= $#data; $j++){
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1236 $population[$i]->{assignments}[$j] = $pool[$i]->{assignments}[$j];
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1237 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1238 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1239 return $m/$popsize;
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1240 }
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1241
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1242
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1243
3723b54935cb Uploaded
mmaiensc
parents:
diff changeset
1244