Mercurial > repos > mmaiensc > arts
comparison ARTS/ARTS.pl @ 0:3723b54935cb draft
Uploaded
author | mmaiensc |
---|---|
date | Wed, 13 Nov 2013 16:13:17 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3723b54935cb |
---|---|
1 #!/usr/bin/perl | |
2 # ARTS: Automated Randomization of multiple Traits for Study Design, using diploidly GA | |
3 # Mark Maienschein-Cline, last updated 8/19/2013 | |
4 # mmaiensc@uic.edu | |
5 # Center for Research Informatics, University of Illinois at Chicago | |
6 # | |
7 # Copyright (C) 2013 Mark Maienschein-Cline | |
8 # | |
9 # This program is free software; you can redistribute it and/or modify | |
10 # it under the terms of the GNU General Public License as published by | |
11 # the Free Software Foundation; either version 2 of the License, or | |
12 # (at your option) any later version. | |
13 # | |
14 # This program is distributed in the hope that it will be useful, | |
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 # GNU General Public License for more details. | |
18 # | |
19 # You should have received a copy of the GNU General Public License along | |
20 # with this program; if not, write to the Free Software Foundation, Inc., | |
21 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |
22 | |
23 | |
24 | |
25 | |
26 | |
27 use Getopt::Long qw(:config no_ignore_case); | |
28 #use Time::HiRes qw( clock_gettime ); | |
29 use Math::Trig; | |
30 $|++; | |
31 | |
32 # | |
33 # initialize random number parameters | |
34 # | |
35 &ran1_init(); | |
36 | |
37 # | |
38 # read command line | |
39 # | |
40 &read_command_line(); | |
41 | |
42 # | |
43 # read phenotype list: print the title lines of columns used for verbose | |
44 # | |
45 &read_data(); | |
46 if( $verb eq "y" || $verb eq "l" ){ | |
47 printf("Using traits:"); | |
48 for($i=0; $i<= $#allcols; $i++){ | |
49 print "\t$titlevals[$allcols[$i]]"; | |
50 } | |
51 print "\n"; | |
52 printf("Using trait combinations:"); | |
53 for($i=0; $i<= $#cols; $i++){ | |
54 printf("\t{%s", $titlevals[$cols[$i][0]]); | |
55 for($j=1; $j<= $#{$cols[$i]}; $j++){ | |
56 printf(",%s", $titlevals[$cols[$i][$j]]); | |
57 } | |
58 printf("}"); | |
59 } | |
60 print "\n"; | |
61 } | |
62 | |
63 # | |
64 # initialize GA parameters | |
65 # | |
66 &ga_init(); | |
67 | |
68 # | |
69 # if using the batchcolumn, fill in the batch | |
70 # | |
71 if( $bcolumn ne "" ){ | |
72 if( $verb eq "y" ){ | |
73 printf("Looking at column %i (%s) for batch assignments\n", $bcolumn+1, $titlevals[$bcolumn]); | |
74 } | |
75 # fill in batch from last column of @data | |
76 $numbatches = 0; | |
77 $foundbatchhash = {}; | |
78 @batchsizes = (); | |
79 for($i=0; $i<=$#data; $i++){ | |
80 if( $foundbatchhash->{$data[$i][$bcolumn]} eq "" ){ | |
81 $foundbatchhash->{$data[$i][$bcolumn]} = $numbatches; | |
82 $numbatches++; | |
83 push(@batchnames, $data[$i][$bcolumn]); | |
84 push(@batchsizes, 0); | |
85 } | |
86 $batchsizes[$foundbatchhash->{$data[$i][$bcolumn]}]++; | |
87 $data[$i][$#{$data[0]}] = $data[$i][$bcolumn]; | |
88 } | |
89 $mi = &mutual_info( $numbatches ); | |
90 $bestmi = $mi; | |
91 } | |
92 | |
93 # | |
94 # else do sampling: run GA | |
95 # | |
96 if( $bcolumn eq "" ){ | |
97 &initialize_population(); | |
98 | |
99 $oldavg = 1; | |
100 $err = 0.0001; | |
101 for($n=0; $n< $numgen; $n++){ | |
102 &add_immigrants(); | |
103 @population = &permute( \@population ); | |
104 $k = 0; | |
105 $k = &crossover( $k ); | |
106 $k = &mutate( $k ); | |
107 $k = &add_parents( $k ); | |
108 @pool = sort{$a->{score} <=> $b->{score}} @pool; | |
109 $average = &fill_population(); | |
110 | |
111 # check if we've done enough already, and print out status | |
112 if( $verb eq "y" ){printf(" Generation %i of %i, average fitness %0.4f\n", $n+1, $numgen, $average );} | |
113 if( $oldavg >= $average && $oldavg - $average < $err ){last;} | |
114 $oldavg = $average; | |
115 } | |
116 | |
117 # save the final best one | |
118 for($i=0; $i<= $#data; $i++){ | |
119 &fill_assignments( \@{$population[0]->{assignments}} ); | |
120 } | |
121 } | |
122 | |
123 # | |
124 # print final log to stdout | |
125 # | |
126 if( $verb eq "y" || $verb eq "l" ){&print_info;} | |
127 | |
128 # | |
129 # print result | |
130 # | |
131 if( $out ne "" ){ | |
132 open(OUT,">$out"); | |
133 print OUT "$title\t$bname\n"; | |
134 for($i=0; $i<= $#data; $i++){ | |
135 $orig[$i][1] = $data[$i][$#{$data[0]}]; | |
136 printf OUT ("%s\t%i\n", $orig[$i][0], $orig[$i][1]); | |
137 } | |
138 close(OUT); | |
139 } | |
140 | |
141 ############### | |
142 # SUBROUTINES # | |
143 ############### | |
144 | |
145 # read command line options | |
146 sub read_command_line{ | |
147 my $i; | |
148 | |
149 # | |
150 # option default values | |
151 # | |
152 $in = ""; | |
153 $out = ""; | |
154 $bcolumn = ""; | |
155 $batch = ""; | |
156 $bname = "batch"; | |
157 $phenocols = ""; | |
158 $contcols = ""; | |
159 $datecols = ""; | |
160 $bins = 5; | |
161 @blist = (); | |
162 $verb = "y"; | |
163 $mmi = 0; | |
164 | |
165 $options = " | |
166 Usage: ./ARTS.pl <OPTIONS> | |
167 REQUIRED: | |
168 -i input traits (rectangular, tab-delimited matrix, including title line with column names) | |
169 -c trait columns to randomize | |
170 comma- and semicolon delimited list, columns indexed from 1 | |
171 all traits indicated by commas are used in joint distributions | |
172 AND EITHER -b AND -o, OR -p: | |
173 -b batch sizes (a single number, or a comma-delimited list) | |
174 -o output file (formatted same as input file, with batch added as last column) | |
175 -or- | |
176 -p <batch column index>: print MI statistic for input traits using this column as batch designations | |
177 -p will not do any sampling | |
178 OTHER OPTIONS: | |
179 -cc continuously-valued columns (will be binned) | |
180 -cd date-valued columns (should be M/D/Y); should also list these as continuous (in -cc) | |
181 -cb number of bins to use for continuous or date columns (default: $bins for each) | |
182 can give 1 value, or a list of the same length as -cc; if a list, will be assigned in the same order as -cc | |
183 -bn batch name (title of added column, default $bname) | |
184 -s random number seed (large negative integer, default: $seed) | |
185 "; | |
186 | |
187 # | |
188 # Secret options: | |
189 # -v y or l (verbose: print all, or just print status from beginning or end) | |
190 # -mmi force use of MMI objective function on all columns indicated by -c, over-riding any other settings from -c | |
191 # | |
192 | |
193 GetOptions('i=s' => \$in, | |
194 'o=s' => \$out, | |
195 'p=i' => \$bcolumn, | |
196 'b=s' => \$batch, | |
197 'c=s' => \$phenocols, | |
198 'cc=s' => \$contcols, | |
199 'cd=s' => \$datecols, | |
200 'cb=s' => \$bins, | |
201 'bn=s' => \$bname, | |
202 's=i' => \$seed, | |
203 'mmi' => \$mmi, | |
204 'v=s' => \$verb, | |
205 ) || die "$options\n"; | |
206 | |
207 # | |
208 # check that required inputs exist | |
209 # | |
210 if( $in eq "" ){&exit_required("i");} | |
211 if( ($out eq "" || $batch eq "") && $bcolumn eq "" ){&exit_required("b and -o, or -p,");} | |
212 if( $phenocols eq "" || $phenocols eq "None" ){&exit_required("c");} | |
213 | |
214 # | |
215 # check that inputs values are OK | |
216 # | |
217 if( $bcolumn ne "" ){ | |
218 if( $bcolumn < 1 ){&exit_err("p","at least 1");} | |
219 $bcolumn--; | |
220 } | |
221 if( $verb ne "y" && $verb ne "n" && $verb ne "l" ){&exit_err("v","y or n or l");} | |
222 if( $seed > 0 ){$seed*= -1;} | |
223 if( $seed == 0 ){&exit_err("s","non-zero");} | |
224 | |
225 # | |
226 # if mmi, reset phenocols value using all found columns | |
227 # | |
228 if( $mmi ){ | |
229 @initcs = split(/[,;]/, $phenocols); | |
230 # remove duplicates | |
231 @clist = (); | |
232 $cinds = {}; | |
233 for($i=0; $i<= $#initcs; $i++){ | |
234 if( $cinds->{$initcs[$i]} eq "" ){ | |
235 $cinds->{$initcs[$i]} = 1; | |
236 push(@clist, $initcs[$i]); | |
237 } | |
238 } | |
239 # add to new phenocols | |
240 $phenocols = "$clist[0]"; | |
241 for($i=1; $i<= $#clist; $i++){ | |
242 $phenocols = sprintf("%s,%s", $phenocols, $clist[$i]); | |
243 } | |
244 $phenocols = sprintf("%s;%s", $phenocols, $clist[0]); | |
245 for($i=1; $i<= $#clist; $i++){ | |
246 $phenocols = sprintf("%s;%s", $phenocols, $clist[$i]); | |
247 } | |
248 } | |
249 | |
250 # | |
251 # extract phenotype columns | |
252 # | |
253 @cols = (); | |
254 @allcols = (); | |
255 $alllist = {}; | |
256 @jointlist = split(';',$phenocols); | |
257 for($i=0; $i<= $#jointlist; $i++){ | |
258 @tmp = split(',',$jointlist[$i]); | |
259 @tmp = &fix_cols( \@tmp ); | |
260 push(@cols, [@tmp]); | |
261 for($j=0; $j<= $#tmp; $j++){ | |
262 if( $alllist->{$tmp[$j]} eq "" ){ | |
263 $alllist->{$tmp[$j]} = 1; | |
264 push(@allcols, $tmp[$j]); | |
265 } | |
266 } | |
267 } | |
268 | |
269 # | |
270 # extract continuous and date columns | |
271 # sort continuous columns so that bins correspond to them in order | |
272 # | |
273 if( $contcols ne "" && $contcols ne "None" ){ | |
274 @conts = split(',',$contcols); | |
275 @conts = &fix_cols( \@conts ); | |
276 $numconts = $#conts+1; | |
277 } | |
278 if( $datecols ne "" && $datecols ne "None" ){ | |
279 @dates = split(',',$datecols); | |
280 @dates = &fix_cols( \@dates ); | |
281 $numdates = $#dates+1; | |
282 # check that date columns are among continuous columns | |
283 for($i=0; $i<= $#dates; $i++){ | |
284 for($j=0; $j<= $#conts; $j++){ | |
285 if( $dates[$i] == $conts[$j] ){last;} | |
286 if( $j==$#conts ){ | |
287 printf("Error: please specify date column %i as continuous\n", $dates[$i]+1 ); | |
288 die; | |
289 } | |
290 } | |
291 } | |
292 } | |
293 if( $bins =~ /,/ ){ | |
294 @blist = split(',',$bins); | |
295 if( $#blist+1 != $#conts + 1 ){ | |
296 printf("Error: you input %i bins, but %i columns that need binning\n", $#blist+1, $#conts+1); | |
297 die; | |
298 } | |
299 } | |
300 else{ | |
301 for($i=0; $i<= $#conts; $i++){ | |
302 push(@blist, $bins); | |
303 } | |
304 } | |
305 } | |
306 | |
307 # print error message for flag $_[0], with correct values $_[1], and print usage | |
308 sub exit_err{ | |
309 printf("Error: set -%s to be %s\n%s\n", $_[0], $_[1], $options); | |
310 exit; | |
311 } | |
312 # print error message saying flag $_[0] is required | |
313 sub exit_required{ | |
314 printf("Error: option -%s is required\n%s\n", $_[0], $options); | |
315 exit; | |
316 } | |
317 | |
318 # fix all indices in array $_[0]: cast to integer, check at least 1, and subtract 1 | |
319 sub fix_cols{ | |
320 my @list; | |
321 my $i; | |
322 @list = @{$_[0]}; | |
323 for($i=0; $i<= $#list; $i++){ | |
324 $list[$i] = sprintf("%i", $list[$i]); | |
325 if( $list[$i] < 1 ){ | |
326 print "Error: column indices should be at least 1\n"; | |
327 die; | |
328 } | |
329 $list[$i]--; | |
330 } | |
331 return @list; | |
332 } | |
333 | |
334 # print info about best randomization | |
335 sub print_info{ | |
336 # | |
337 # get MI of each phenotype and average | |
338 # | |
339 $bestmi = &mutual_info(); | |
340 @bestmilist = &individual_mi( $numbatches ); | |
341 $bestavgmi = 0; | |
342 for($i=0; $i<= $#bestmilist; $i++){ | |
343 $bestavgmi+= $bestmilist[$i]/($#bestmilist+1); | |
344 } | |
345 | |
346 printf("Final MI %0.4f ; Individual trait MIs (mean %0.4f ): ", $bestmi, $bestavgmi); | |
347 for($i=0; $i<= $#bestmilist; $i++){ | |
348 printf("\t%0.4f", $bestmilist[$i]); | |
349 } | |
350 print "\n-----------------------------------------------------------------\n"; | |
351 # | |
352 # print the counts for each phenotype in each batch | |
353 # | |
354 # first title line: phenotype names | |
355 for($i=0; $i<= $#allcols; $i++){ | |
356 printf("\t%s values", $titlevals[$allcols[$i]]); | |
357 for($j=1; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){ | |
358 printf("\t"); | |
359 } | |
360 } | |
361 print "\nBatch (size)"; | |
362 # second title line: phenotype values | |
363 for($i=0; $i<= $#allcols; $i++){ | |
364 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){ | |
365 if( $items->{$allcols[$i]}->{list}[$j] ne "" ){printf("\t%s", &name($items->{$allcols[$i]}->{list}[$j], $allcols[$i]) );} | |
366 else{printf("\tempty");} | |
367 } | |
368 } | |
369 print "\n-------"; | |
370 # print a line of dashes to separate | |
371 for($i=0; $i<= $#allcols; $i++){ | |
372 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){ | |
373 printf("\t-------"); | |
374 } | |
375 } | |
376 print "\n"; | |
377 for($k=0; $k< $numbatches; $k++){ | |
378 printf("%s (%i)", $batchnames[$k], $batchsizes[$k] ); | |
379 for($i=0; $i<= $#allcols; $i++){ | |
380 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){ | |
381 printf("\t%i", &count( $#{$data[0]}, $batchnames[$k], $allcols[$i], $items->{$allcols[$i]}->{list}[$j] ) ); | |
382 } | |
383 } | |
384 print "\n"; | |
385 } | |
386 print "-------"; | |
387 # print a line of dashes to separate | |
388 for($i=0; $i<= $#allcols; $i++){ | |
389 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){ | |
390 printf("\t-------"); | |
391 } | |
392 } | |
393 # print totals for each type | |
394 print "\nTotal"; | |
395 for($i=0; $i<= $#allcols; $i++){ | |
396 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){ | |
397 printf("\t%i", $items->{$allcols[$i]}->{$items->{$allcols[$i]}->{list}[$j]}[1] ); | |
398 } | |
399 } | |
400 print "\n"; | |
401 } | |
402 | |
403 # for continuous valued columns, checked by $_[1], convert value $_[0] back to a range | |
404 sub name{ | |
405 my $i; | |
406 my $binw; | |
407 | |
408 # | |
409 # if there aren't any continuous columns, or $_[1] doesn't match one, just return $_[0] | |
410 # | |
411 if( $#conts < 0 ){return $_[0];} | |
412 for($i=0; $i<= $#conts; $i++){ | |
413 if( $_[1] == $conts[$i] ){last;} | |
414 if( $i==$#conts ){return $_[0];} | |
415 } | |
416 | |
417 # | |
418 # convert bin value back to continuous value | |
419 # | |
420 $binw = ($contstats[$i][2]-$contstats[$i][0])/$blist[$i]; | |
421 $val1 = $binw*$_[0]+$contstats[$i][0]; | |
422 $val2 = $binw*($_[0]+1)+$contstats[$i][0]; | |
423 | |
424 # | |
425 # if there aren't any date columns, or $_[1] doesn't match one, just return the range val1-val2 | |
426 # | |
427 if( $#dates < 0 ){return sprintf("%s-%s", $val1, $val2);} | |
428 for($i=0; $i<= $#dates; $i++){ | |
429 if( $_[1] == $dates[$i] ){last;} | |
430 if( $i==$#dates ){return sprintf("%s-%s", $val1, $val2);} | |
431 } | |
432 $val1 = sprintf("%i", $val1); | |
433 $val2 = sprintf("%i", $val2); | |
434 return sprintf("%s-%s", &convert_date( $val1 ), &convert_date( $val2 )); | |
435 | |
436 } | |
437 | |
438 # read in regular matrix from $in | |
439 # for continuous (including date-value) columns, make histograms | |
440 sub read_data{ | |
441 my @lines; | |
442 my $i; | |
443 @data = (); | |
444 $items = {}; | |
445 @orig = (); | |
446 @titlevals = (); | |
447 @batchsizes = (); | |
448 $numbatches = 0; | |
449 | |
450 # | |
451 # fix newline convention | |
452 # | |
453 | |
454 | |
455 open(IN,"$in") || die "Error: can't open $in\n"; | |
456 # | |
457 # read in all lines and check formatting | |
458 # | |
459 @lines = <IN>; | |
460 if( $lines[0] =~ /\r/ && $#lines == 0 ){ | |
461 # this happens with tab-delimited text saved from excel | |
462 @lines = split('\r', $lines[0]); | |
463 } | |
464 | |
465 # | |
466 # read title line | |
467 # | |
468 $title = $lines[0]; | |
469 chomp($title); | |
470 @titlevals = split('\t',$title); | |
471 for($k=1; $k<= $#lines; $k++){ | |
472 $line = $lines[$k]; | |
473 chomp($line); | |
474 if( $line ne "" ){ # ignore blank lines | |
475 @parts = split('\t',$line); | |
476 for($i=$#parts+1; $i<= $#titlevals; $i++){ | |
477 push(@parts, ""); | |
478 } | |
479 if( $#parts != $#titlevals ){ | |
480 printf("Error: not enough columns in line %i\n", $#data+2); | |
481 die; | |
482 } | |
483 # push 1 extra for the batch | |
484 push(@parts, 0); | |
485 push(@data, [(@parts)] ); | |
486 push(@orig, [($line, 0)] ); | |
487 } | |
488 } | |
489 close(IN); | |
490 | |
491 # | |
492 # exit if no data read | |
493 # | |
494 if( $#data < 0 ){ | |
495 printf("Error: no samples were read in\n"); | |
496 die; | |
497 } | |
498 | |
499 # | |
500 # if batch is not empty, check batches: | |
501 # if no commas, cast to integer and count how many are needed | |
502 # if there are commas, get batched on given sizes | |
503 # double check that we add up | |
504 # | |
505 @batchnames = (); | |
506 if( $batch ne "" ){ | |
507 if( $batch !~ /,/ ){ | |
508 $batch = sprintf("%i", $batch); | |
509 # fix batch size if too big | |
510 if( $batch > $#data + 1 ){ | |
511 printf("Warning: you have %i samples, but asked for a batch size of %i, so there is only 1 batch\n", $#data+1, $batch); | |
512 $batch = $#data+1; | |
513 } | |
514 $numbatches = ($#data+1)/$batch; | |
515 $exactbatches = sprintf("%i", $numbatches); | |
516 if( $exactbatches < $numbatches ){$exactbatches++;} | |
517 $numbatches = $exactbatches; | |
518 for($i=0; $i< $numbatches-1; $i++){ | |
519 push(@batchsizes, $batch); | |
520 } | |
521 push(@batchsizes, $batch - ($numbatches*$batch - ($#data+1)) ); | |
522 } | |
523 else{ | |
524 @batchsizes = split(',',$batch); | |
525 $numbatches = $#batchsizes+1; | |
526 } | |
527 $tot = 0; | |
528 for($i=0; $i< $numbatches; $i++){ | |
529 push(@batchnames, $i+1); | |
530 $tot+= $batchsizes[$i]; | |
531 } | |
532 if( $tot != $#data+1 ){ | |
533 printf("Error: have %i spaces in all batches, but %i samples\n", $tot, $#data+1); | |
534 die; | |
535 } | |
536 } | |
537 | |
538 # | |
539 # convert dates to numbers | |
540 # | |
541 for($i=0; $i<= $#data; $i++){ | |
542 for($j=0; $j<= $#dates; $j++){ | |
543 if( $data[$i][$dates[$j]] ne "" ){$data[$i][$dates[$j]] = &convert_date( $data[$i][$dates[$j]] );} | |
544 } | |
545 } | |
546 | |
547 # | |
548 # for all continuous columns, compute median and fill in missing values | |
549 # also record max and min for binning | |
550 # | |
551 @contstats = (); # records min, median, max for each continuous column | |
552 for($j=0; $j<= $#conts; $j++){ | |
553 @tmp = (); | |
554 for($i=0; $i<= $#data; $i++){ | |
555 if( $data[$i][$conts[$j]] ne "" ){push(@tmp, $data[$i][$conts[$j]]);} | |
556 } | |
557 @tmp = sort{ $a <=> $b } @tmp; | |
558 $median = $tmp[sprintf("%i", ($#tmp+1)/2)]; | |
559 push(@contstats, [($tmp[0], $median, $tmp[$#tmp])] ); | |
560 # for($i=0; $i<= $#data; $i++){ | |
561 # if( $data[$i][$conts[$j]] eq "" ){$data[$i][$conts[$j]] = $median;} | |
562 # } | |
563 } | |
564 | |
565 # | |
566 # for all continuous columns, bin data | |
567 # | |
568 for($j=0; $j<= $#conts; $j++){ | |
569 $binw = ($contstats[$j][2] - $contstats[$j][0])/($blist[$j]); | |
570 if( $binw == 0 ){ | |
571 printf("Error: max and min of column %i are equal (max/min are %f/%f)\n", $conts[$j]+1, $contstats[$j][2], $contstats[$j][0] ); | |
572 die; | |
573 } | |
574 for($i=0; $i<= $#data; $i++){ | |
575 if( $data[$i][$conts[$j]] ne "" ){ | |
576 $data[$i][$conts[$j]] = sprintf("%i", ($data[$i][$conts[$j]] - $contstats[$j][0])/$binw); | |
577 if( $data[$i][$conts[$j]] >= $blist[$j] ){$data[$i][$conts[$j]] = $blist[$j]-1;} | |
578 } | |
579 } | |
580 } | |
581 | |
582 # | |
583 # for each column we're using, count how many item types there are | |
584 # empty phenotypes are considered their own, distinct phenotype | |
585 # | |
586 $items = &itemize( \@allcols ); | |
587 } | |
588 | |
589 # count how many item types of @{$_[0]} there are in @data | |
590 sub itemize{ | |
591 my $i; | |
592 my $j; | |
593 my $info; | |
594 my @cols; | |
595 my $items; | |
596 @cols = @{$_[0]}; | |
597 | |
598 for($j=0; $j<= $#cols; $j++){ | |
599 $info = {}; | |
600 $info->{list} = (); | |
601 for($i=0; $i<= $#data; $i++){ | |
602 if( $info->{$data[$i][$cols[$j]]} eq "" ){ | |
603 $info->{$data[$i][$cols[$j]]} = [($#{$info->{list}}+1,0)]; | |
604 push(@{$info->{list}}, $data[$i][$cols[$j]]); | |
605 } | |
606 $info->{$data[$i][$cols[$j]]}[1]++; | |
607 $info->{count}++; | |
608 } | |
609 $info->{num} = $#{$info->{list}}+1; | |
610 $items->{$cols[$j]} = $info; | |
611 } | |
612 | |
613 for($j=0; $j<= $#cols; $j++){ | |
614 # this set prints the number of values and counts for each phenotype | |
615 #printf("%i,%s:", $cols[$j], $titlevals[$cols[$j]]); | |
616 #for($k=0; $k<= $#{$items->{$cols[$j]}->{list}}; $k++){ | |
617 # printf("\t%s,%i", $items->{$cols[$j]}->{list}[$k], $items->{$cols[$j]}->{$items->{$cols[$j]}->{list}[$k]}[1] ); | |
618 #} | |
619 #print "\n"; | |
620 if( $items->{$cols[$j]}->{num} > 20 ){ | |
621 printf("Warning: column %i (%s) has %i values; should you make it continuous?\n", $cols[$j], $titlevals[$cols[$j]], $items->{$cols[$j]}->{num} ); | |
622 } | |
623 } | |
624 return $items; | |
625 } | |
626 | |
627 # convert date in M/D/Y to integer, or integer to M/D/Y | |
628 sub convert_date{ | |
629 my $date; | |
630 my $month; | |
631 my $day; | |
632 my $year; | |
633 my $months; | |
634 my $i; | |
635 # cumulative days per month | |
636 $months->{0} = 0; | |
637 $months->{1} = 31; | |
638 $months->{2} = 59; | |
639 $months->{3} = 90; | |
640 $months->{4} = 120; | |
641 $months->{5} = 151; | |
642 $months->{6} = 181; | |
643 $months->{7} = 212; | |
644 $months->{8} = 243; | |
645 $months->{9} = 273; | |
646 $months->{10} = 304; | |
647 $months->{11} = 334; | |
648 $months->{12} = 365; | |
649 $date = $_[0]; | |
650 | |
651 # convert date to integer | |
652 if( $date =~ /\// ){ | |
653 ($month, $day, $year) = split('/',$date); | |
654 $month = sprintf("%i", $month); | |
655 $day = sprintf("%i", $day); | |
656 $year = sprintf("%i", $year); | |
657 if( $month < 1 || $month > 12 ){ | |
658 print "Error: found a month $month not between 1 and 12\n"; | |
659 die; | |
660 } | |
661 if( $day < 1 || $day > 31 ){ | |
662 print "Error: found a day $day not between 1 and 31\n"; | |
663 die; | |
664 } | |
665 | |
666 return $day + $months->{$month-1} + $year*$months->{12}; | |
667 } | |
668 # convert integer to date | |
669 elsif( $date == sprintf("%i", $date) ){ | |
670 $year = sprintf("%i", $date/($months->{12})); | |
671 $month = $date-$year*$months->{12}; | |
672 for($i=1; $i<=12; $i++){ | |
673 if( $month < $months->{$i} ){last;} | |
674 } | |
675 $day = $month - $months->{$i-1}; | |
676 $month = $i; | |
677 return sprintf("%s/%s/%s", $month, $day, $year); | |
678 } | |
679 else{ | |
680 printf("\nError: unrecognized format in convert_date(): %s\n", $date); | |
681 die; | |
682 } | |
683 } | |
684 | |
685 # set globals used by ran1 | |
686 sub ran1_init{ | |
687 # | |
688 # random number variables | |
689 # | |
690 $iset = 0; | |
691 $gset = 0; | |
692 #$iseed = clock_gettime(CLOCK_REALTIME); | |
693 #($first, $second) = split('\.', $iseed); | |
694 #$seed = sprintf("-%i%i", $second, $first); | |
695 $seed = -10854829; | |
696 $M1 = 259200; | |
697 $IA1 = 7141; | |
698 $IC1 = 54773; | |
699 $RM1 = 1.0/$M1; | |
700 $M2 = 134456; | |
701 $IA2 = 8121; | |
702 $IC2 = 28411; | |
703 $RM2 = 1.0/$M2; | |
704 $M3 = 243000; | |
705 $IA3 = 4561; | |
706 $IC3 = 51349; | |
707 $iff = 0; | |
708 $ix1 = 0; | |
709 $ix2 = 0; | |
710 $ix3 = 0; | |
711 @ranarray = (); | |
712 for($i=0; $i< 98; $i++){ | |
713 push(@ranarray, 0); | |
714 } | |
715 } | |
716 | |
717 # uniform random number generator, seed, iff, and various capital-letter variables set in beginning | |
718 sub ran1{ | |
719 my $j; | |
720 my $temp; | |
721 | |
722 if( $seed < 0 || $iff == 0 ){ | |
723 $iff = 1; | |
724 $ix1 = ($IC1 - $seed)%$M1; | |
725 $ix1 = ($IA1*$ix1 + $IC1)%$M1; | |
726 $ix2 = $ix1%$M2; | |
727 $ix1 = ($IA1*$ix1 + $IC1)%$M1; | |
728 $ix3 = $ix1%$M3; | |
729 for($j=1; $j<= 97; $j++){ | |
730 $ix1 = ($IA1*$ix1 + $IC1)%$M1; | |
731 $ix2 = ($IA2*$ix2 + $IC2)%$M2; | |
732 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1; | |
733 } | |
734 $seed = 1; | |
735 } | |
736 $ix1 = ($IA1*$ix1 + $IC1)%$M1; | |
737 $ix2 = ($IA2*$ix2 + $IC2)%$M2; | |
738 $ix3 = ($IA3*$ix3 + $IC3)%$M3; | |
739 | |
740 $j = sprintf("%i", 1 + ((97*$ix3)/$M3) ); | |
741 if( $j> 97 || $j< 1 ){ | |
742 printf("Error in ran1: $j outside of [1:97]\n"); | |
743 die; | |
744 } | |
745 $temp = $ranarray[$j]; | |
746 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1; | |
747 return $temp; | |
748 } | |
749 | |
750 # permute array $_[0] | |
751 sub permute{ | |
752 my @assignments; | |
753 my $i; | |
754 my $j; | |
755 my $tmp; | |
756 | |
757 @assignments = @{$_[0]}; | |
758 | |
759 # | |
760 # shuffle batches randomly | |
761 # | |
762 for($i=$#assignments; $i>= 0; $i--){ | |
763 $j = sprintf("%i", ($i+1)*&ran1() ); | |
764 $tmp = $assignments[$j]; | |
765 $assignments[$j] = $assignments[$i]; | |
766 $assignments[$i] = $tmp; | |
767 } | |
768 return @assignments; | |
769 } | |
770 | |
771 # fill data with assignments $_[0] | |
772 sub fill_assignments{ | |
773 my @list; | |
774 my $i; | |
775 @list = @{$_[0]}; | |
776 if( $#list != $#data ){ | |
777 print "Error in fill_assignments: mismatching list lengths\n"; | |
778 die; | |
779 } | |
780 for($i=0; $i<= $#list; $i++){ | |
781 $data[$i][$#{$data[0]}] = $list[$i]; | |
782 } | |
783 } | |
784 | |
785 # compute mutual information of a batch assignment | |
786 sub mutual_info{ | |
787 my $i; | |
788 my $s; | |
789 my $mi; | |
790 my $stot; | |
791 | |
792 $mi = 0; | |
793 for($i=0; $i<= $#cols; $i++){ | |
794 $mi += &this_mi( $_[0], $#{$data[0]}, \@{$cols[$i]} )/($#cols+1); | |
795 } | |
796 | |
797 return $mi; | |
798 } | |
799 | |
800 # compute all single-phenotype mutual information | |
801 sub individual_mi{ | |
802 my $i; | |
803 my @list; | |
804 my @milist; | |
805 | |
806 @milist = (); | |
807 for($i=0; $i<= $#allcols; $i++){ | |
808 @list = ($allcols[$i]); | |
809 push(@milist, &this_mi( $_[0], $#{$data[0]}, \@list ) ); | |
810 } | |
811 return @milist; | |
812 } | |
813 | |
814 # compute mutual information of columns $_[1] ($_[0] bins) and all of @{$_[2]} | |
815 sub this_mi{ | |
816 my $i; | |
817 my $j; | |
818 my $summand; | |
819 my @list; | |
820 my $jprob; | |
821 my $m1prob; | |
822 my $m2prob; | |
823 my $jbin; | |
824 my $m1bin; | |
825 my $m2bin; | |
826 my $jbinstot; | |
827 my $m1binstot; | |
828 my $m2binstot; | |
829 my @jbinlist; | |
830 my @m1binlist; | |
831 my @m2binlist; | |
832 my $mi; | |
833 my $s1; | |
834 my $s2; | |
835 my $s; | |
836 @list = @{$_[2]}; | |
837 | |
838 # initialize probabilities | |
839 $jprob = {}; # joint distribution | |
840 $m1prob = {}; # batch marginal dist | |
841 $m2prob = {}; # pheno marginal dist | |
842 @jbinlist = (); # phenotype combos found in joint distribution | |
843 @m1binlist = (); # batches found in batches distribution (1st marginal dist) | |
844 @m2binlist = (); # phenotype combos found in phenotypes distribution (2nd marginal dist) | |
845 | |
846 # | |
847 # read through data and add to distributions | |
848 # | |
849 $summand = 1.0/($#data+1); | |
850 for($i=0; $i<= $#data; $i++){ | |
851 # | |
852 # define bin names based on phenotype/batch | |
853 # for phenotypes p1, p2, etc., batch b: | |
854 # joint = p1_p2_..._pn_b | |
855 # 1st marginal = b | |
856 # 2nd marginal = p1_p2_..._pn | |
857 # | |
858 $jbin = sprintf("%s", $data[$i][$#{$data[0]}]); | |
859 $m1bin = sprintf("%s", $data[$i][$#{$data[0]}]); | |
860 $m2bin = ""; | |
861 for($j=0; $j<= $#list; $j++){ | |
862 # NOTE: | |
863 # $list[$j] is a phenotype column (e.g., gender) | |
864 # $data[$i][$list[$j]] is the value of that phenotype in sample $i (e.g., M or F) | |
865 # $items->{$list[$j]}->{$data[$i][$list[$j]]}[0] is the bin index (e.g., M->0, F->1) of that phenotype | |
866 | |
867 $jbin = sprintf("%s_%i", $jbin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]); | |
868 if( $j>0 ){$m2bin = sprintf("%s_", $m2bin);} | |
869 $m2bin = sprintf("%s%i", $m2bin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]); | |
870 } | |
871 | |
872 # | |
873 # check if we've already seen this bin, for each distribution | |
874 # initialize probabilities and add to list if it's the first time | |
875 # | |
876 if( $jprob->{$jbin} eq "" ){ | |
877 $jprob->{$jbin} = 0; | |
878 push(@jbinlist, [($jbin, $m1bin, $m2bin)] ); | |
879 } | |
880 if( $m1prob->{$m1bin} eq "" ){ | |
881 $m1prob->{$m1bin} = 0; | |
882 push(@m1binlist, $m1bin); | |
883 } | |
884 if( $m2prob->{$m2bin} eq "" ){ | |
885 $m2prob->{$m2bin} = 0; | |
886 push(@m2binlist, $m2bin); | |
887 } | |
888 | |
889 # | |
890 # add a count to each distribution | |
891 # | |
892 $jprob->{$jbin} += $summand; | |
893 $m1prob->{$m1bin} += $summand; | |
894 $m2prob->{$m2bin} += $summand; | |
895 } | |
896 | |
897 # | |
898 # compute mutual information, and entropy of m1prob and m2prob (for normalization) | |
899 # | |
900 $mi = 0; | |
901 $s1 = 0; | |
902 $s2 = 0; | |
903 for($i=0; $i<= $#jbinlist; $i++){ | |
904 $mi+= ($jprob->{$jbinlist[$i][0]}) * log( ($jprob->{$jbinlist[$i][0]})/($m1prob->{$jbinlist[$i][1]} * $m2prob->{$jbinlist[$i][2]}) ); | |
905 } | |
906 for($i=0; $i<= $#m1binlist; $i++){ | |
907 $s1-= $m1prob->{$m1binlist[$i]} * log( $m1prob->{$m1binlist[$i]} ); | |
908 } | |
909 for($i=0; $i<= $#m2binlist; $i++){ | |
910 $s2-= $m2prob->{$m2binlist[$i]} * log( $m2prob->{$m2binlist[$i]} ); | |
911 } | |
912 $s = sqrt($s1*$s2); | |
913 | |
914 # | |
915 # normalize mi | |
916 # | |
917 if( $s>0 ){$mi/= $s;} | |
918 | |
919 # | |
920 # return normalized mi (0=independent, 1=completely dependent) | |
921 # | |
922 return $mi; | |
923 } | |
924 | |
925 # count how many of @data have column $_[0] equal $_[1] and column $_[2] equal $_[3] | |
926 sub count{ | |
927 my $i; | |
928 my $tot; | |
929 | |
930 $tot = 0; | |
931 for($i=0; $i<= $#data; $i++){ | |
932 if( $data[$i][$_[0]] eq $_[1] && $data[$i][$_[2]] eq $_[3] ){$tot++;} | |
933 } | |
934 return $tot; | |
935 } | |
936 | |
937 # initialize GA parameters and large matrices | |
938 sub ga_init{ | |
939 my $i; | |
940 my $j; | |
941 my $info; | |
942 | |
943 $popsize = 100; | |
944 $numgen = 300; | |
945 $nchrmuts = 2; | |
946 $nnewimm = 10; | |
947 $nkeepparents = 2; | |
948 $nchrpool = ($nnewimm+$popsize) + ($nnewimm+$popsize)*$nchrmuts + $nkeepparents; | |
949 | |
950 # | |
951 # population to turn over each generation | |
952 # | |
953 @population = (); | |
954 for($i=0; $i< $popsize+$nnewimm; $i++){ | |
955 $info = {}; | |
956 $info->{score} = 0; | |
957 $info->{assignments} = [()]; | |
958 for($j=0; $j<= $#data; $j++){ | |
959 push(@{$info->{assignments}}, 0); | |
960 } | |
961 push(@population, $info); | |
962 } | |
963 | |
964 # | |
965 # new individuals to fill each generation | |
966 # | |
967 @pool = (); | |
968 for($i=0; $i< $nchrpool; $i++){ | |
969 $info = {}; | |
970 $info->{score} = 0; | |
971 $info->{assignments} = [()]; | |
972 for($j=0; $j<= $#data; $j++){ | |
973 push(@{$info->{assignments}}, 0); | |
974 } | |
975 push(@pool, $info); | |
976 } | |
977 | |
978 # | |
979 # array to randomize for batch assignments | |
980 # | |
981 @batched = (); | |
982 @bcounts = (); | |
983 for($i=0; $i<= $#batchsizes; $i++){ | |
984 push(@bcounts, 0); | |
985 for($j=0; $j< $batchsizes[$i]; $j++){ | |
986 push(@batched, $i+1); | |
987 } | |
988 } | |
989 } | |
990 | |
991 # initialize the population array: randomize $popsize batches and score each one | |
992 sub initialize_population{ | |
993 my $i; | |
994 | |
995 for($i=0; $i< $popsize; $i++){ | |
996 @{$population[$i]->{assignments}} = &permute( \@batched ); | |
997 &fill_assignments( \@{$population[$i]->{assignments}} ); | |
998 $population[$i]->{score} = &mutual_info( $numbatches ); | |
999 } | |
1000 } | |
1001 | |
1002 # complete the crossover step, keeping track of our index with $_[0] | |
1003 sub crossover{ | |
1004 my $i; | |
1005 my $j; | |
1006 my $k; | |
1007 | |
1008 $k = $_[0]; | |
1009 | |
1010 for($i=0; $i< $popsize + $nnewimm; $i+=2){ | |
1011 &do_cross($i, $k); | |
1012 &fill_assignments( \@{$pool[$k]->{assignments}} ); | |
1013 $pool[$k]->{score} = &mutual_info( $numbatches ); | |
1014 $k++; | |
1015 &fill_assignments( \@{$pool[$k]->{assignments}} ); | |
1016 $pool[$k]->{score} = &mutual_info( $numbatches ); | |
1017 $k++; | |
1018 } | |
1019 return $k; | |
1020 } | |
1021 | |
1022 # do a crossover between population members $_[0] and $_[0]+1, fill to pool members $_[1] and $_[1]+1 | |
1023 sub do_cross{ | |
1024 my $i; | |
1025 my $j; | |
1026 my $popmem; | |
1027 my $poolmem; | |
1028 my $index; | |
1029 my @swap; | |
1030 my @subswap; | |
1031 my $swapinds; | |
1032 | |
1033 $popmem = $_[0]; | |
1034 $poolmem = $_[1]; | |
1035 $index = sprintf("%i", $#data * &ran1() ); | |
1036 | |
1037 # | |
1038 # count how many of each batch to switch | |
1039 # | |
1040 for($i=0; $i<= $#bcounts; $i++){ | |
1041 $bcounts[$i] = 0; | |
1042 } | |
1043 for($i=0; $i<= $index; $i++){ | |
1044 $bcounts[$population[$popmem]->{assignments}[$i] - 1]++; | |
1045 } | |
1046 | |
1047 # | |
1048 # for each batch i: | |
1049 # record into subswap all indices with that batch from popmem+1 | |
1050 # permute subswap | |
1051 # add first bcounts[$i] into swap | |
1052 # then sort swap | |
1053 # | |
1054 @swap = (); | |
1055 $swapinds = {}; | |
1056 for($i=0; $i<= $#bcounts; $i++){ | |
1057 @subswap = (); | |
1058 for($j=0; $j<= $#data; $j++){ | |
1059 if( $population[$popmem+1]->{assignments}[$j] == $i+1 ){ | |
1060 push(@subswap, $j); | |
1061 } | |
1062 } | |
1063 @subswap = &permute( \@subswap ); | |
1064 for($j=0; $j< $bcounts[$i]; $j++){ | |
1065 push(@swap, $subswap[$j]); | |
1066 $swapinds->{$subswap[$j]} = 1; | |
1067 } | |
1068 } | |
1069 @swap = sort{$a <=> $b} @swap; | |
1070 | |
1071 # | |
1072 # fill start of first new chr from swap indices of second chr, and end from end of first chr | |
1073 # | |
1074 for($i=0; $i<= $index; $i++){ | |
1075 $pool[$poolmem]->{assignments}[$i] = $population[$popmem+1]->{assignments}[$swap[$i]]; | |
1076 } | |
1077 for($i=$index+1; $i<= $#data; $i++){ | |
1078 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i]; | |
1079 } | |
1080 | |
1081 # | |
1082 # fill start of second chr from start of first chr, and end from remaining parts of second chr | |
1083 # | |
1084 for($i=0; $i<= $index; $i++){ | |
1085 $pool[$poolmem+1]->{assignments}[$i] = $population[$popmem]->{assignments}[$i]; | |
1086 } | |
1087 $j = $index+1; | |
1088 for($i=0; $i<= $#data; $i++){ | |
1089 if( $swapinds->{$i} != 1 ){ | |
1090 $pool[$poolmem+1]->{assignments}[$j] = $population[$popmem+1]->{assignments}[$i]; | |
1091 $j++; | |
1092 } | |
1093 } | |
1094 | |
1095 | |
1096 # | |
1097 # check that batch counts are still ok | |
1098 # | |
1099 $checkbatch = 0; | |
1100 if( $checkbatch ){ | |
1101 for($i=0; $i<= $#bcounts; $i++){ | |
1102 $bcounts[$i] = 0; | |
1103 } | |
1104 for($i=0; $i<= $#data; $i++){ | |
1105 $bcounts[$pool[$poolmem]->{assignments}[$i] - 1]++; | |
1106 } | |
1107 for($i=0; $i<= $#bcounts; $i++){ | |
1108 if( $bcounts[$i] != $batchsizes[$i] ){ | |
1109 print "Error in do_cross: lost some batch counts in first daughter chr\n"; | |
1110 exit; | |
1111 } | |
1112 } | |
1113 for($i=0; $i<= $#bcounts; $i++){ | |
1114 $bcounts[$i] = 0; | |
1115 } | |
1116 for($i=0; $i<= $#data; $i++){ | |
1117 $bcounts[$pool[$poolmem+1]->{assignments}[$i] - 1]++; | |
1118 } | |
1119 for($i=0; $i<= $#bcounts; $i++){ | |
1120 if( $bcounts[$i] != $batchsizes[$i] ){ | |
1121 print "Error in do_cross: lost some batch counts in first daughter chr\n"; | |
1122 exit; | |
1123 } | |
1124 } | |
1125 } | |
1126 } | |
1127 | |
1128 # complete the mutation step, keeping track of our index with $_[0] | |
1129 sub mutate{ | |
1130 my $i; | |
1131 my $j; | |
1132 my $k; | |
1133 | |
1134 $k = $_[0]; | |
1135 | |
1136 for($i=0; $i< $popsize+$nnewimm; $i++){ | |
1137 for($j=0; $j< $nchrmuts; $j++){ | |
1138 &do_mutation($i, $k); | |
1139 &fill_assignments( \@{$pool[$k]->{assignments}} ); | |
1140 $pool[$k]->{score} = &mutual_info( $numbatches ); | |
1141 $k++; | |
1142 } | |
1143 } | |
1144 return $k; | |
1145 } | |
1146 | |
1147 # do a mutation for population member $_[0], fill to pool member $_[1] | |
1148 sub do_mutation{ | |
1149 my $i; | |
1150 my $popmem; | |
1151 my $poolmem; | |
1152 my $index1; | |
1153 my $index2; | |
1154 | |
1155 $popmem = $_[0]; | |
1156 $poolmem = $_[1]; | |
1157 | |
1158 # | |
1159 # fill all of poolmem | |
1160 # | |
1161 for($i=0; $i<= $#data; $i++){ | |
1162 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i]; | |
1163 } | |
1164 | |
1165 # | |
1166 # switch two members | |
1167 # | |
1168 $index1 = sprintf("%i", ($#data+1) * &ran1() ); | |
1169 $index2 = sprintf("%i", ($#data+1) * &ran1() ); | |
1170 | |
1171 $pool[$poolmem]->{assignments}[$index1] = $population[$popmem]->{assignments}[$index2]; | |
1172 $pool[$poolmem]->{assignments}[$index2] = $population[$popmem]->{assignments}[$index1]; | |
1173 } | |
1174 | |
1175 # add immigrants, keeping track of our index with $_[0] | |
1176 sub add_immigrants{ | |
1177 my $j; | |
1178 | |
1179 for($j=0; $j< $nnewimm; $j++){ | |
1180 @{$population[$popsize+$j]->{assignments}} = &permute( \@batched ); | |
1181 &fill_assignments( \@{$population[$popsize+$j]->{assignments}} ); | |
1182 $population[$popsize+$j]->{score} = &mutual_info( $numbatches ); | |
1183 $k++; | |
1184 } | |
1185 } | |
1186 | |
1187 # add top-scoring parents, keeping track of our index with $_[0] | |
1188 sub add_parents{ | |
1189 my $i; | |
1190 my $j; | |
1191 my $k; | |
1192 | |
1193 $k = $_[0]; | |
1194 # sort population now | |
1195 @population = sort{$a->{score} <=> $b->{score}} @population; | |
1196 | |
1197 for($j=0; $j< $nkeepparents; $j++){ | |
1198 ©_parents( $j, $k ); | |
1199 # will copy score also in copy_parents() | |
1200 $k++; | |
1201 } | |
1202 return $k; | |
1203 } | |
1204 | |
1205 # add population member $_[0] to pool member $_[1] | |
1206 sub copy_parents{ | |
1207 my $i; | |
1208 my $popmem; | |
1209 my $poolmem; | |
1210 my $index1; | |
1211 my $index2; | |
1212 | |
1213 $popmem = $_[0]; | |
1214 $poolmem = $_[1]; | |
1215 | |
1216 # | |
1217 # fill all of poolmem | |
1218 # | |
1219 for($i=0; $i<= $#data; $i++){ | |
1220 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i]; | |
1221 } | |
1222 $pool[$poolmem]->{score} = $population[$popmem]->{score}; | |
1223 } | |
1224 | |
1225 # copy top of pool to population (assumes pool is sorted) | |
1226 sub fill_population{ | |
1227 my $i; | |
1228 my $j; | |
1229 my $m; | |
1230 | |
1231 $m = 0; | |
1232 for($i=0; $i< $popsize; $i++){ | |
1233 $population[$i]->{score} = $pool[$i]->{score}; | |
1234 $m+= $population[$i]->{score}; | |
1235 for($j=0; $j<= $#data; $j++){ | |
1236 $population[$i]->{assignments}[$j] = $pool[$i]->{assignments}[$j]; | |
1237 } | |
1238 } | |
1239 return $m/$popsize; | |
1240 } | |
1241 | |
1242 | |
1243 | |
1244 |