comparison ARTS/ARTS.pl @ 0:3723b54935cb draft

Uploaded
author mmaiensc
date Wed, 13 Nov 2013 16:13:17 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3723b54935cb
1 #!/usr/bin/perl
2 # ARTS: Automated Randomization of multiple Traits for Study Design, using diploidly GA
3 # Mark Maienschein-Cline, last updated 8/19/2013
4 # mmaiensc@uic.edu
5 # Center for Research Informatics, University of Illinois at Chicago
6 #
7 # Copyright (C) 2013 Mark Maienschein-Cline
8 #
9 # This program is free software; you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation; either version 2 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License along
20 # with this program; if not, write to the Free Software Foundation, Inc.,
21 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22
23
24
25
26
27 use Getopt::Long qw(:config no_ignore_case);
28 #use Time::HiRes qw( clock_gettime );
29 use Math::Trig;
30 $|++;
31
32 #
33 # initialize random number parameters
34 #
35 &ran1_init();
36
37 #
38 # read command line
39 #
40 &read_command_line();
41
42 #
43 # read phenotype list: print the title lines of columns used for verbose
44 #
45 &read_data();
46 if( $verb eq "y" || $verb eq "l" ){
47 printf("Using traits:");
48 for($i=0; $i<= $#allcols; $i++){
49 print "\t$titlevals[$allcols[$i]]";
50 }
51 print "\n";
52 printf("Using trait combinations:");
53 for($i=0; $i<= $#cols; $i++){
54 printf("\t{%s", $titlevals[$cols[$i][0]]);
55 for($j=1; $j<= $#{$cols[$i]}; $j++){
56 printf(",%s", $titlevals[$cols[$i][$j]]);
57 }
58 printf("}");
59 }
60 print "\n";
61 }
62
63 #
64 # initialize GA parameters
65 #
66 &ga_init();
67
68 #
69 # if using the batchcolumn, fill in the batch
70 #
71 if( $bcolumn ne "" ){
72 if( $verb eq "y" ){
73 printf("Looking at column %i (%s) for batch assignments\n", $bcolumn+1, $titlevals[$bcolumn]);
74 }
75 # fill in batch from last column of @data
76 $numbatches = 0;
77 $foundbatchhash = {};
78 @batchsizes = ();
79 for($i=0; $i<=$#data; $i++){
80 if( $foundbatchhash->{$data[$i][$bcolumn]} eq "" ){
81 $foundbatchhash->{$data[$i][$bcolumn]} = $numbatches;
82 $numbatches++;
83 push(@batchnames, $data[$i][$bcolumn]);
84 push(@batchsizes, 0);
85 }
86 $batchsizes[$foundbatchhash->{$data[$i][$bcolumn]}]++;
87 $data[$i][$#{$data[0]}] = $data[$i][$bcolumn];
88 }
89 $mi = &mutual_info( $numbatches );
90 $bestmi = $mi;
91 }
92
93 #
94 # else do sampling: run GA
95 #
96 if( $bcolumn eq "" ){
97 &initialize_population();
98
99 $oldavg = 1;
100 $err = 0.0001;
101 for($n=0; $n< $numgen; $n++){
102 &add_immigrants();
103 @population = &permute( \@population );
104 $k = 0;
105 $k = &crossover( $k );
106 $k = &mutate( $k );
107 $k = &add_parents( $k );
108 @pool = sort{$a->{score} <=> $b->{score}} @pool;
109 $average = &fill_population();
110
111 # check if we've done enough already, and print out status
112 if( $verb eq "y" ){printf(" Generation %i of %i, average fitness %0.4f\n", $n+1, $numgen, $average );}
113 if( $oldavg >= $average && $oldavg - $average < $err ){last;}
114 $oldavg = $average;
115 }
116
117 # save the final best one
118 for($i=0; $i<= $#data; $i++){
119 &fill_assignments( \@{$population[0]->{assignments}} );
120 }
121 }
122
123 #
124 # print final log to stdout
125 #
126 if( $verb eq "y" || $verb eq "l" ){&print_info;}
127
128 #
129 # print result
130 #
131 if( $out ne "" ){
132 open(OUT,">$out");
133 print OUT "$title\t$bname\n";
134 for($i=0; $i<= $#data; $i++){
135 $orig[$i][1] = $data[$i][$#{$data[0]}];
136 printf OUT ("%s\t%i\n", $orig[$i][0], $orig[$i][1]);
137 }
138 close(OUT);
139 }
140
141 ###############
142 # SUBROUTINES #
143 ###############
144
145 # read command line options
146 sub read_command_line{
147 my $i;
148
149 #
150 # option default values
151 #
152 $in = "";
153 $out = "";
154 $bcolumn = "";
155 $batch = "";
156 $bname = "batch";
157 $phenocols = "";
158 $contcols = "";
159 $datecols = "";
160 $bins = 5;
161 @blist = ();
162 $verb = "y";
163 $mmi = 0;
164
165 $options = "
166 Usage: ./ARTS.pl <OPTIONS>
167 REQUIRED:
168 -i input traits (rectangular, tab-delimited matrix, including title line with column names)
169 -c trait columns to randomize
170 comma- and semicolon delimited list, columns indexed from 1
171 all traits indicated by commas are used in joint distributions
172 AND EITHER -b AND -o, OR -p:
173 -b batch sizes (a single number, or a comma-delimited list)
174 -o output file (formatted same as input file, with batch added as last column)
175 -or-
176 -p <batch column index>: print MI statistic for input traits using this column as batch designations
177 -p will not do any sampling
178 OTHER OPTIONS:
179 -cc continuously-valued columns (will be binned)
180 -cd date-valued columns (should be M/D/Y); should also list these as continuous (in -cc)
181 -cb number of bins to use for continuous or date columns (default: $bins for each)
182 can give 1 value, or a list of the same length as -cc; if a list, will be assigned in the same order as -cc
183 -bn batch name (title of added column, default $bname)
184 -s random number seed (large negative integer, default: $seed)
185 ";
186
187 #
188 # Secret options:
189 # -v y or l (verbose: print all, or just print status from beginning or end)
190 # -mmi force use of MMI objective function on all columns indicated by -c, over-riding any other settings from -c
191 #
192
193 GetOptions('i=s' => \$in,
194 'o=s' => \$out,
195 'p=i' => \$bcolumn,
196 'b=s' => \$batch,
197 'c=s' => \$phenocols,
198 'cc=s' => \$contcols,
199 'cd=s' => \$datecols,
200 'cb=s' => \$bins,
201 'bn=s' => \$bname,
202 's=i' => \$seed,
203 'mmi' => \$mmi,
204 'v=s' => \$verb,
205 ) || die "$options\n";
206
207 #
208 # check that required inputs exist
209 #
210 if( $in eq "" ){&exit_required("i");}
211 if( ($out eq "" || $batch eq "") && $bcolumn eq "" ){&exit_required("b and -o, or -p,");}
212 if( $phenocols eq "" || $phenocols eq "None" ){&exit_required("c");}
213
214 #
215 # check that inputs values are OK
216 #
217 if( $bcolumn ne "" ){
218 if( $bcolumn < 1 ){&exit_err("p","at least 1");}
219 $bcolumn--;
220 }
221 if( $verb ne "y" && $verb ne "n" && $verb ne "l" ){&exit_err("v","y or n or l");}
222 if( $seed > 0 ){$seed*= -1;}
223 if( $seed == 0 ){&exit_err("s","non-zero");}
224
225 #
226 # if mmi, reset phenocols value using all found columns
227 #
228 if( $mmi ){
229 @initcs = split(/[,;]/, $phenocols);
230 # remove duplicates
231 @clist = ();
232 $cinds = {};
233 for($i=0; $i<= $#initcs; $i++){
234 if( $cinds->{$initcs[$i]} eq "" ){
235 $cinds->{$initcs[$i]} = 1;
236 push(@clist, $initcs[$i]);
237 }
238 }
239 # add to new phenocols
240 $phenocols = "$clist[0]";
241 for($i=1; $i<= $#clist; $i++){
242 $phenocols = sprintf("%s,%s", $phenocols, $clist[$i]);
243 }
244 $phenocols = sprintf("%s;%s", $phenocols, $clist[0]);
245 for($i=1; $i<= $#clist; $i++){
246 $phenocols = sprintf("%s;%s", $phenocols, $clist[$i]);
247 }
248 }
249
250 #
251 # extract phenotype columns
252 #
253 @cols = ();
254 @allcols = ();
255 $alllist = {};
256 @jointlist = split(';',$phenocols);
257 for($i=0; $i<= $#jointlist; $i++){
258 @tmp = split(',',$jointlist[$i]);
259 @tmp = &fix_cols( \@tmp );
260 push(@cols, [@tmp]);
261 for($j=0; $j<= $#tmp; $j++){
262 if( $alllist->{$tmp[$j]} eq "" ){
263 $alllist->{$tmp[$j]} = 1;
264 push(@allcols, $tmp[$j]);
265 }
266 }
267 }
268
269 #
270 # extract continuous and date columns
271 # sort continuous columns so that bins correspond to them in order
272 #
273 if( $contcols ne "" && $contcols ne "None" ){
274 @conts = split(',',$contcols);
275 @conts = &fix_cols( \@conts );
276 $numconts = $#conts+1;
277 }
278 if( $datecols ne "" && $datecols ne "None" ){
279 @dates = split(',',$datecols);
280 @dates = &fix_cols( \@dates );
281 $numdates = $#dates+1;
282 # check that date columns are among continuous columns
283 for($i=0; $i<= $#dates; $i++){
284 for($j=0; $j<= $#conts; $j++){
285 if( $dates[$i] == $conts[$j] ){last;}
286 if( $j==$#conts ){
287 printf("Error: please specify date column %i as continuous\n", $dates[$i]+1 );
288 die;
289 }
290 }
291 }
292 }
293 if( $bins =~ /,/ ){
294 @blist = split(',',$bins);
295 if( $#blist+1 != $#conts + 1 ){
296 printf("Error: you input %i bins, but %i columns that need binning\n", $#blist+1, $#conts+1);
297 die;
298 }
299 }
300 else{
301 for($i=0; $i<= $#conts; $i++){
302 push(@blist, $bins);
303 }
304 }
305 }
306
307 # print error message for flag $_[0], with correct values $_[1], and print usage
308 sub exit_err{
309 printf("Error: set -%s to be %s\n%s\n", $_[0], $_[1], $options);
310 exit;
311 }
312 # print error message saying flag $_[0] is required
313 sub exit_required{
314 printf("Error: option -%s is required\n%s\n", $_[0], $options);
315 exit;
316 }
317
318 # fix all indices in array $_[0]: cast to integer, check at least 1, and subtract 1
319 sub fix_cols{
320 my @list;
321 my $i;
322 @list = @{$_[0]};
323 for($i=0; $i<= $#list; $i++){
324 $list[$i] = sprintf("%i", $list[$i]);
325 if( $list[$i] < 1 ){
326 print "Error: column indices should be at least 1\n";
327 die;
328 }
329 $list[$i]--;
330 }
331 return @list;
332 }
333
334 # print info about best randomization
335 sub print_info{
336 #
337 # get MI of each phenotype and average
338 #
339 $bestmi = &mutual_info();
340 @bestmilist = &individual_mi( $numbatches );
341 $bestavgmi = 0;
342 for($i=0; $i<= $#bestmilist; $i++){
343 $bestavgmi+= $bestmilist[$i]/($#bestmilist+1);
344 }
345
346 printf("Final MI %0.4f ; Individual trait MIs (mean %0.4f ): ", $bestmi, $bestavgmi);
347 for($i=0; $i<= $#bestmilist; $i++){
348 printf("\t%0.4f", $bestmilist[$i]);
349 }
350 print "\n-----------------------------------------------------------------\n";
351 #
352 # print the counts for each phenotype in each batch
353 #
354 # first title line: phenotype names
355 for($i=0; $i<= $#allcols; $i++){
356 printf("\t%s values", $titlevals[$allcols[$i]]);
357 for($j=1; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
358 printf("\t");
359 }
360 }
361 print "\nBatch (size)";
362 # second title line: phenotype values
363 for($i=0; $i<= $#allcols; $i++){
364 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
365 if( $items->{$allcols[$i]}->{list}[$j] ne "" ){printf("\t%s", &name($items->{$allcols[$i]}->{list}[$j], $allcols[$i]) );}
366 else{printf("\tempty");}
367 }
368 }
369 print "\n-------";
370 # print a line of dashes to separate
371 for($i=0; $i<= $#allcols; $i++){
372 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
373 printf("\t-------");
374 }
375 }
376 print "\n";
377 for($k=0; $k< $numbatches; $k++){
378 printf("%s (%i)", $batchnames[$k], $batchsizes[$k] );
379 for($i=0; $i<= $#allcols; $i++){
380 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
381 printf("\t%i", &count( $#{$data[0]}, $batchnames[$k], $allcols[$i], $items->{$allcols[$i]}->{list}[$j] ) );
382 }
383 }
384 print "\n";
385 }
386 print "-------";
387 # print a line of dashes to separate
388 for($i=0; $i<= $#allcols; $i++){
389 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
390 printf("\t-------");
391 }
392 }
393 # print totals for each type
394 print "\nTotal";
395 for($i=0; $i<= $#allcols; $i++){
396 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
397 printf("\t%i", $items->{$allcols[$i]}->{$items->{$allcols[$i]}->{list}[$j]}[1] );
398 }
399 }
400 print "\n";
401 }
402
403 # for continuous valued columns, checked by $_[1], convert value $_[0] back to a range
404 sub name{
405 my $i;
406 my $binw;
407
408 #
409 # if there aren't any continuous columns, or $_[1] doesn't match one, just return $_[0]
410 #
411 if( $#conts < 0 ){return $_[0];}
412 for($i=0; $i<= $#conts; $i++){
413 if( $_[1] == $conts[$i] ){last;}
414 if( $i==$#conts ){return $_[0];}
415 }
416
417 #
418 # convert bin value back to continuous value
419 #
420 $binw = ($contstats[$i][2]-$contstats[$i][0])/$blist[$i];
421 $val1 = $binw*$_[0]+$contstats[$i][0];
422 $val2 = $binw*($_[0]+1)+$contstats[$i][0];
423
424 #
425 # if there aren't any date columns, or $_[1] doesn't match one, just return the range val1-val2
426 #
427 if( $#dates < 0 ){return sprintf("%s-%s", $val1, $val2);}
428 for($i=0; $i<= $#dates; $i++){
429 if( $_[1] == $dates[$i] ){last;}
430 if( $i==$#dates ){return sprintf("%s-%s", $val1, $val2);}
431 }
432 $val1 = sprintf("%i", $val1);
433 $val2 = sprintf("%i", $val2);
434 return sprintf("%s-%s", &convert_date( $val1 ), &convert_date( $val2 ));
435
436 }
437
438 # read in regular matrix from $in
439 # for continuous (including date-value) columns, make histograms
440 sub read_data{
441 my @lines;
442 my $i;
443 @data = ();
444 $items = {};
445 @orig = ();
446 @titlevals = ();
447 @batchsizes = ();
448 $numbatches = 0;
449
450 #
451 # fix newline convention
452 #
453
454
455 open(IN,"$in") || die "Error: can't open $in\n";
456 #
457 # read in all lines and check formatting
458 #
459 @lines = <IN>;
460 if( $lines[0] =~ /\r/ && $#lines == 0 ){
461 # this happens with tab-delimited text saved from excel
462 @lines = split('\r', $lines[0]);
463 }
464
465 #
466 # read title line
467 #
468 $title = $lines[0];
469 chomp($title);
470 @titlevals = split('\t',$title);
471 for($k=1; $k<= $#lines; $k++){
472 $line = $lines[$k];
473 chomp($line);
474 if( $line ne "" ){ # ignore blank lines
475 @parts = split('\t',$line);
476 for($i=$#parts+1; $i<= $#titlevals; $i++){
477 push(@parts, "");
478 }
479 if( $#parts != $#titlevals ){
480 printf("Error: not enough columns in line %i\n", $#data+2);
481 die;
482 }
483 # push 1 extra for the batch
484 push(@parts, 0);
485 push(@data, [(@parts)] );
486 push(@orig, [($line, 0)] );
487 }
488 }
489 close(IN);
490
491 #
492 # exit if no data read
493 #
494 if( $#data < 0 ){
495 printf("Error: no samples were read in\n");
496 die;
497 }
498
499 #
500 # if batch is not empty, check batches:
501 # if no commas, cast to integer and count how many are needed
502 # if there are commas, get batched on given sizes
503 # double check that we add up
504 #
505 @batchnames = ();
506 if( $batch ne "" ){
507 if( $batch !~ /,/ ){
508 $batch = sprintf("%i", $batch);
509 # fix batch size if too big
510 if( $batch > $#data + 1 ){
511 printf("Warning: you have %i samples, but asked for a batch size of %i, so there is only 1 batch\n", $#data+1, $batch);
512 $batch = $#data+1;
513 }
514 $numbatches = ($#data+1)/$batch;
515 $exactbatches = sprintf("%i", $numbatches);
516 if( $exactbatches < $numbatches ){$exactbatches++;}
517 $numbatches = $exactbatches;
518 for($i=0; $i< $numbatches-1; $i++){
519 push(@batchsizes, $batch);
520 }
521 push(@batchsizes, $batch - ($numbatches*$batch - ($#data+1)) );
522 }
523 else{
524 @batchsizes = split(',',$batch);
525 $numbatches = $#batchsizes+1;
526 }
527 $tot = 0;
528 for($i=0; $i< $numbatches; $i++){
529 push(@batchnames, $i+1);
530 $tot+= $batchsizes[$i];
531 }
532 if( $tot != $#data+1 ){
533 printf("Error: have %i spaces in all batches, but %i samples\n", $tot, $#data+1);
534 die;
535 }
536 }
537
538 #
539 # convert dates to numbers
540 #
541 for($i=0; $i<= $#data; $i++){
542 for($j=0; $j<= $#dates; $j++){
543 if( $data[$i][$dates[$j]] ne "" ){$data[$i][$dates[$j]] = &convert_date( $data[$i][$dates[$j]] );}
544 }
545 }
546
547 #
548 # for all continuous columns, compute median and fill in missing values
549 # also record max and min for binning
550 #
551 @contstats = (); # records min, median, max for each continuous column
552 for($j=0; $j<= $#conts; $j++){
553 @tmp = ();
554 for($i=0; $i<= $#data; $i++){
555 if( $data[$i][$conts[$j]] ne "" ){push(@tmp, $data[$i][$conts[$j]]);}
556 }
557 @tmp = sort{ $a <=> $b } @tmp;
558 $median = $tmp[sprintf("%i", ($#tmp+1)/2)];
559 push(@contstats, [($tmp[0], $median, $tmp[$#tmp])] );
560 # for($i=0; $i<= $#data; $i++){
561 # if( $data[$i][$conts[$j]] eq "" ){$data[$i][$conts[$j]] = $median;}
562 # }
563 }
564
565 #
566 # for all continuous columns, bin data
567 #
568 for($j=0; $j<= $#conts; $j++){
569 $binw = ($contstats[$j][2] - $contstats[$j][0])/($blist[$j]);
570 if( $binw == 0 ){
571 printf("Error: max and min of column %i are equal (max/min are %f/%f)\n", $conts[$j]+1, $contstats[$j][2], $contstats[$j][0] );
572 die;
573 }
574 for($i=0; $i<= $#data; $i++){
575 if( $data[$i][$conts[$j]] ne "" ){
576 $data[$i][$conts[$j]] = sprintf("%i", ($data[$i][$conts[$j]] - $contstats[$j][0])/$binw);
577 if( $data[$i][$conts[$j]] >= $blist[$j] ){$data[$i][$conts[$j]] = $blist[$j]-1;}
578 }
579 }
580 }
581
582 #
583 # for each column we're using, count how many item types there are
584 # empty phenotypes are considered their own, distinct phenotype
585 #
586 $items = &itemize( \@allcols );
587 }
588
589 # count how many item types of @{$_[0]} there are in @data
590 sub itemize{
591 my $i;
592 my $j;
593 my $info;
594 my @cols;
595 my $items;
596 @cols = @{$_[0]};
597
598 for($j=0; $j<= $#cols; $j++){
599 $info = {};
600 $info->{list} = ();
601 for($i=0; $i<= $#data; $i++){
602 if( $info->{$data[$i][$cols[$j]]} eq "" ){
603 $info->{$data[$i][$cols[$j]]} = [($#{$info->{list}}+1,0)];
604 push(@{$info->{list}}, $data[$i][$cols[$j]]);
605 }
606 $info->{$data[$i][$cols[$j]]}[1]++;
607 $info->{count}++;
608 }
609 $info->{num} = $#{$info->{list}}+1;
610 $items->{$cols[$j]} = $info;
611 }
612
613 for($j=0; $j<= $#cols; $j++){
614 # this set prints the number of values and counts for each phenotype
615 #printf("%i,%s:", $cols[$j], $titlevals[$cols[$j]]);
616 #for($k=0; $k<= $#{$items->{$cols[$j]}->{list}}; $k++){
617 # printf("\t%s,%i", $items->{$cols[$j]}->{list}[$k], $items->{$cols[$j]}->{$items->{$cols[$j]}->{list}[$k]}[1] );
618 #}
619 #print "\n";
620 if( $items->{$cols[$j]}->{num} > 20 ){
621 printf("Warning: column %i (%s) has %i values; should you make it continuous?\n", $cols[$j], $titlevals[$cols[$j]], $items->{$cols[$j]}->{num} );
622 }
623 }
624 return $items;
625 }
626
627 # convert date in M/D/Y to integer, or integer to M/D/Y
628 sub convert_date{
629 my $date;
630 my $month;
631 my $day;
632 my $year;
633 my $months;
634 my $i;
635 # cumulative days per month
636 $months->{0} = 0;
637 $months->{1} = 31;
638 $months->{2} = 59;
639 $months->{3} = 90;
640 $months->{4} = 120;
641 $months->{5} = 151;
642 $months->{6} = 181;
643 $months->{7} = 212;
644 $months->{8} = 243;
645 $months->{9} = 273;
646 $months->{10} = 304;
647 $months->{11} = 334;
648 $months->{12} = 365;
649 $date = $_[0];
650
651 # convert date to integer
652 if( $date =~ /\// ){
653 ($month, $day, $year) = split('/',$date);
654 $month = sprintf("%i", $month);
655 $day = sprintf("%i", $day);
656 $year = sprintf("%i", $year);
657 if( $month < 1 || $month > 12 ){
658 print "Error: found a month $month not between 1 and 12\n";
659 die;
660 }
661 if( $day < 1 || $day > 31 ){
662 print "Error: found a day $day not between 1 and 31\n";
663 die;
664 }
665
666 return $day + $months->{$month-1} + $year*$months->{12};
667 }
668 # convert integer to date
669 elsif( $date == sprintf("%i", $date) ){
670 $year = sprintf("%i", $date/($months->{12}));
671 $month = $date-$year*$months->{12};
672 for($i=1; $i<=12; $i++){
673 if( $month < $months->{$i} ){last;}
674 }
675 $day = $month - $months->{$i-1};
676 $month = $i;
677 return sprintf("%s/%s/%s", $month, $day, $year);
678 }
679 else{
680 printf("\nError: unrecognized format in convert_date(): %s\n", $date);
681 die;
682 }
683 }
684
685 # set globals used by ran1
686 sub ran1_init{
687 #
688 # random number variables
689 #
690 $iset = 0;
691 $gset = 0;
692 #$iseed = clock_gettime(CLOCK_REALTIME);
693 #($first, $second) = split('\.', $iseed);
694 #$seed = sprintf("-%i%i", $second, $first);
695 $seed = -10854829;
696 $M1 = 259200;
697 $IA1 = 7141;
698 $IC1 = 54773;
699 $RM1 = 1.0/$M1;
700 $M2 = 134456;
701 $IA2 = 8121;
702 $IC2 = 28411;
703 $RM2 = 1.0/$M2;
704 $M3 = 243000;
705 $IA3 = 4561;
706 $IC3 = 51349;
707 $iff = 0;
708 $ix1 = 0;
709 $ix2 = 0;
710 $ix3 = 0;
711 @ranarray = ();
712 for($i=0; $i< 98; $i++){
713 push(@ranarray, 0);
714 }
715 }
716
717 # uniform random number generator, seed, iff, and various capital-letter variables set in beginning
718 sub ran1{
719 my $j;
720 my $temp;
721
722 if( $seed < 0 || $iff == 0 ){
723 $iff = 1;
724 $ix1 = ($IC1 - $seed)%$M1;
725 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
726 $ix2 = $ix1%$M2;
727 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
728 $ix3 = $ix1%$M3;
729 for($j=1; $j<= 97; $j++){
730 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
731 $ix2 = ($IA2*$ix2 + $IC2)%$M2;
732 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
733 }
734 $seed = 1;
735 }
736 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
737 $ix2 = ($IA2*$ix2 + $IC2)%$M2;
738 $ix3 = ($IA3*$ix3 + $IC3)%$M3;
739
740 $j = sprintf("%i", 1 + ((97*$ix3)/$M3) );
741 if( $j> 97 || $j< 1 ){
742 printf("Error in ran1: $j outside of [1:97]\n");
743 die;
744 }
745 $temp = $ranarray[$j];
746 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
747 return $temp;
748 }
749
750 # permute array $_[0]
751 sub permute{
752 my @assignments;
753 my $i;
754 my $j;
755 my $tmp;
756
757 @assignments = @{$_[0]};
758
759 #
760 # shuffle batches randomly
761 #
762 for($i=$#assignments; $i>= 0; $i--){
763 $j = sprintf("%i", ($i+1)*&ran1() );
764 $tmp = $assignments[$j];
765 $assignments[$j] = $assignments[$i];
766 $assignments[$i] = $tmp;
767 }
768 return @assignments;
769 }
770
771 # fill data with assignments $_[0]
772 sub fill_assignments{
773 my @list;
774 my $i;
775 @list = @{$_[0]};
776 if( $#list != $#data ){
777 print "Error in fill_assignments: mismatching list lengths\n";
778 die;
779 }
780 for($i=0; $i<= $#list; $i++){
781 $data[$i][$#{$data[0]}] = $list[$i];
782 }
783 }
784
785 # compute mutual information of a batch assignment
786 sub mutual_info{
787 my $i;
788 my $s;
789 my $mi;
790 my $stot;
791
792 $mi = 0;
793 for($i=0; $i<= $#cols; $i++){
794 $mi += &this_mi( $_[0], $#{$data[0]}, \@{$cols[$i]} )/($#cols+1);
795 }
796
797 return $mi;
798 }
799
800 # compute all single-phenotype mutual information
801 sub individual_mi{
802 my $i;
803 my @list;
804 my @milist;
805
806 @milist = ();
807 for($i=0; $i<= $#allcols; $i++){
808 @list = ($allcols[$i]);
809 push(@milist, &this_mi( $_[0], $#{$data[0]}, \@list ) );
810 }
811 return @milist;
812 }
813
814 # compute mutual information of columns $_[1] ($_[0] bins) and all of @{$_[2]}
815 sub this_mi{
816 my $i;
817 my $j;
818 my $summand;
819 my @list;
820 my $jprob;
821 my $m1prob;
822 my $m2prob;
823 my $jbin;
824 my $m1bin;
825 my $m2bin;
826 my $jbinstot;
827 my $m1binstot;
828 my $m2binstot;
829 my @jbinlist;
830 my @m1binlist;
831 my @m2binlist;
832 my $mi;
833 my $s1;
834 my $s2;
835 my $s;
836 @list = @{$_[2]};
837
838 # initialize probabilities
839 $jprob = {}; # joint distribution
840 $m1prob = {}; # batch marginal dist
841 $m2prob = {}; # pheno marginal dist
842 @jbinlist = (); # phenotype combos found in joint distribution
843 @m1binlist = (); # batches found in batches distribution (1st marginal dist)
844 @m2binlist = (); # phenotype combos found in phenotypes distribution (2nd marginal dist)
845
846 #
847 # read through data and add to distributions
848 #
849 $summand = 1.0/($#data+1);
850 for($i=0; $i<= $#data; $i++){
851 #
852 # define bin names based on phenotype/batch
853 # for phenotypes p1, p2, etc., batch b:
854 # joint = p1_p2_..._pn_b
855 # 1st marginal = b
856 # 2nd marginal = p1_p2_..._pn
857 #
858 $jbin = sprintf("%s", $data[$i][$#{$data[0]}]);
859 $m1bin = sprintf("%s", $data[$i][$#{$data[0]}]);
860 $m2bin = "";
861 for($j=0; $j<= $#list; $j++){
862 # NOTE:
863 # $list[$j] is a phenotype column (e.g., gender)
864 # $data[$i][$list[$j]] is the value of that phenotype in sample $i (e.g., M or F)
865 # $items->{$list[$j]}->{$data[$i][$list[$j]]}[0] is the bin index (e.g., M->0, F->1) of that phenotype
866
867 $jbin = sprintf("%s_%i", $jbin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
868 if( $j>0 ){$m2bin = sprintf("%s_", $m2bin);}
869 $m2bin = sprintf("%s%i", $m2bin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
870 }
871
872 #
873 # check if we've already seen this bin, for each distribution
874 # initialize probabilities and add to list if it's the first time
875 #
876 if( $jprob->{$jbin} eq "" ){
877 $jprob->{$jbin} = 0;
878 push(@jbinlist, [($jbin, $m1bin, $m2bin)] );
879 }
880 if( $m1prob->{$m1bin} eq "" ){
881 $m1prob->{$m1bin} = 0;
882 push(@m1binlist, $m1bin);
883 }
884 if( $m2prob->{$m2bin} eq "" ){
885 $m2prob->{$m2bin} = 0;
886 push(@m2binlist, $m2bin);
887 }
888
889 #
890 # add a count to each distribution
891 #
892 $jprob->{$jbin} += $summand;
893 $m1prob->{$m1bin} += $summand;
894 $m2prob->{$m2bin} += $summand;
895 }
896
897 #
898 # compute mutual information, and entropy of m1prob and m2prob (for normalization)
899 #
900 $mi = 0;
901 $s1 = 0;
902 $s2 = 0;
903 for($i=0; $i<= $#jbinlist; $i++){
904 $mi+= ($jprob->{$jbinlist[$i][0]}) * log( ($jprob->{$jbinlist[$i][0]})/($m1prob->{$jbinlist[$i][1]} * $m2prob->{$jbinlist[$i][2]}) );
905 }
906 for($i=0; $i<= $#m1binlist; $i++){
907 $s1-= $m1prob->{$m1binlist[$i]} * log( $m1prob->{$m1binlist[$i]} );
908 }
909 for($i=0; $i<= $#m2binlist; $i++){
910 $s2-= $m2prob->{$m2binlist[$i]} * log( $m2prob->{$m2binlist[$i]} );
911 }
912 $s = sqrt($s1*$s2);
913
914 #
915 # normalize mi
916 #
917 if( $s>0 ){$mi/= $s;}
918
919 #
920 # return normalized mi (0=independent, 1=completely dependent)
921 #
922 return $mi;
923 }
924
925 # count how many of @data have column $_[0] equal $_[1] and column $_[2] equal $_[3]
926 sub count{
927 my $i;
928 my $tot;
929
930 $tot = 0;
931 for($i=0; $i<= $#data; $i++){
932 if( $data[$i][$_[0]] eq $_[1] && $data[$i][$_[2]] eq $_[3] ){$tot++;}
933 }
934 return $tot;
935 }
936
937 # initialize GA parameters and large matrices
938 sub ga_init{
939 my $i;
940 my $j;
941 my $info;
942
943 $popsize = 100;
944 $numgen = 300;
945 $nchrmuts = 2;
946 $nnewimm = 10;
947 $nkeepparents = 2;
948 $nchrpool = ($nnewimm+$popsize) + ($nnewimm+$popsize)*$nchrmuts + $nkeepparents;
949
950 #
951 # population to turn over each generation
952 #
953 @population = ();
954 for($i=0; $i< $popsize+$nnewimm; $i++){
955 $info = {};
956 $info->{score} = 0;
957 $info->{assignments} = [()];
958 for($j=0; $j<= $#data; $j++){
959 push(@{$info->{assignments}}, 0);
960 }
961 push(@population, $info);
962 }
963
964 #
965 # new individuals to fill each generation
966 #
967 @pool = ();
968 for($i=0; $i< $nchrpool; $i++){
969 $info = {};
970 $info->{score} = 0;
971 $info->{assignments} = [()];
972 for($j=0; $j<= $#data; $j++){
973 push(@{$info->{assignments}}, 0);
974 }
975 push(@pool, $info);
976 }
977
978 #
979 # array to randomize for batch assignments
980 #
981 @batched = ();
982 @bcounts = ();
983 for($i=0; $i<= $#batchsizes; $i++){
984 push(@bcounts, 0);
985 for($j=0; $j< $batchsizes[$i]; $j++){
986 push(@batched, $i+1);
987 }
988 }
989 }
990
991 # initialize the population array: randomize $popsize batches and score each one
992 sub initialize_population{
993 my $i;
994
995 for($i=0; $i< $popsize; $i++){
996 @{$population[$i]->{assignments}} = &permute( \@batched );
997 &fill_assignments( \@{$population[$i]->{assignments}} );
998 $population[$i]->{score} = &mutual_info( $numbatches );
999 }
1000 }
1001
1002 # complete the crossover step, keeping track of our index with $_[0]
1003 sub crossover{
1004 my $i;
1005 my $j;
1006 my $k;
1007
1008 $k = $_[0];
1009
1010 for($i=0; $i< $popsize + $nnewimm; $i+=2){
1011 &do_cross($i, $k);
1012 &fill_assignments( \@{$pool[$k]->{assignments}} );
1013 $pool[$k]->{score} = &mutual_info( $numbatches );
1014 $k++;
1015 &fill_assignments( \@{$pool[$k]->{assignments}} );
1016 $pool[$k]->{score} = &mutual_info( $numbatches );
1017 $k++;
1018 }
1019 return $k;
1020 }
1021
1022 # do a crossover between population members $_[0] and $_[0]+1, fill to pool members $_[1] and $_[1]+1
1023 sub do_cross{
1024 my $i;
1025 my $j;
1026 my $popmem;
1027 my $poolmem;
1028 my $index;
1029 my @swap;
1030 my @subswap;
1031 my $swapinds;
1032
1033 $popmem = $_[0];
1034 $poolmem = $_[1];
1035 $index = sprintf("%i", $#data * &ran1() );
1036
1037 #
1038 # count how many of each batch to switch
1039 #
1040 for($i=0; $i<= $#bcounts; $i++){
1041 $bcounts[$i] = 0;
1042 }
1043 for($i=0; $i<= $index; $i++){
1044 $bcounts[$population[$popmem]->{assignments}[$i] - 1]++;
1045 }
1046
1047 #
1048 # for each batch i:
1049 # record into subswap all indices with that batch from popmem+1
1050 # permute subswap
1051 # add first bcounts[$i] into swap
1052 # then sort swap
1053 #
1054 @swap = ();
1055 $swapinds = {};
1056 for($i=0; $i<= $#bcounts; $i++){
1057 @subswap = ();
1058 for($j=0; $j<= $#data; $j++){
1059 if( $population[$popmem+1]->{assignments}[$j] == $i+1 ){
1060 push(@subswap, $j);
1061 }
1062 }
1063 @subswap = &permute( \@subswap );
1064 for($j=0; $j< $bcounts[$i]; $j++){
1065 push(@swap, $subswap[$j]);
1066 $swapinds->{$subswap[$j]} = 1;
1067 }
1068 }
1069 @swap = sort{$a <=> $b} @swap;
1070
1071 #
1072 # fill start of first new chr from swap indices of second chr, and end from end of first chr
1073 #
1074 for($i=0; $i<= $index; $i++){
1075 $pool[$poolmem]->{assignments}[$i] = $population[$popmem+1]->{assignments}[$swap[$i]];
1076 }
1077 for($i=$index+1; $i<= $#data; $i++){
1078 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
1079 }
1080
1081 #
1082 # fill start of second chr from start of first chr, and end from remaining parts of second chr
1083 #
1084 for($i=0; $i<= $index; $i++){
1085 $pool[$poolmem+1]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
1086 }
1087 $j = $index+1;
1088 for($i=0; $i<= $#data; $i++){
1089 if( $swapinds->{$i} != 1 ){
1090 $pool[$poolmem+1]->{assignments}[$j] = $population[$popmem+1]->{assignments}[$i];
1091 $j++;
1092 }
1093 }
1094
1095
1096 #
1097 # check that batch counts are still ok
1098 #
1099 $checkbatch = 0;
1100 if( $checkbatch ){
1101 for($i=0; $i<= $#bcounts; $i++){
1102 $bcounts[$i] = 0;
1103 }
1104 for($i=0; $i<= $#data; $i++){
1105 $bcounts[$pool[$poolmem]->{assignments}[$i] - 1]++;
1106 }
1107 for($i=0; $i<= $#bcounts; $i++){
1108 if( $bcounts[$i] != $batchsizes[$i] ){
1109 print "Error in do_cross: lost some batch counts in first daughter chr\n";
1110 exit;
1111 }
1112 }
1113 for($i=0; $i<= $#bcounts; $i++){
1114 $bcounts[$i] = 0;
1115 }
1116 for($i=0; $i<= $#data; $i++){
1117 $bcounts[$pool[$poolmem+1]->{assignments}[$i] - 1]++;
1118 }
1119 for($i=0; $i<= $#bcounts; $i++){
1120 if( $bcounts[$i] != $batchsizes[$i] ){
1121 print "Error in do_cross: lost some batch counts in first daughter chr\n";
1122 exit;
1123 }
1124 }
1125 }
1126 }
1127
1128 # complete the mutation step, keeping track of our index with $_[0]
1129 sub mutate{
1130 my $i;
1131 my $j;
1132 my $k;
1133
1134 $k = $_[0];
1135
1136 for($i=0; $i< $popsize+$nnewimm; $i++){
1137 for($j=0; $j< $nchrmuts; $j++){
1138 &do_mutation($i, $k);
1139 &fill_assignments( \@{$pool[$k]->{assignments}} );
1140 $pool[$k]->{score} = &mutual_info( $numbatches );
1141 $k++;
1142 }
1143 }
1144 return $k;
1145 }
1146
1147 # do a mutation for population member $_[0], fill to pool member $_[1]
1148 sub do_mutation{
1149 my $i;
1150 my $popmem;
1151 my $poolmem;
1152 my $index1;
1153 my $index2;
1154
1155 $popmem = $_[0];
1156 $poolmem = $_[1];
1157
1158 #
1159 # fill all of poolmem
1160 #
1161 for($i=0; $i<= $#data; $i++){
1162 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
1163 }
1164
1165 #
1166 # switch two members
1167 #
1168 $index1 = sprintf("%i", ($#data+1) * &ran1() );
1169 $index2 = sprintf("%i", ($#data+1) * &ran1() );
1170
1171 $pool[$poolmem]->{assignments}[$index1] = $population[$popmem]->{assignments}[$index2];
1172 $pool[$poolmem]->{assignments}[$index2] = $population[$popmem]->{assignments}[$index1];
1173 }
1174
1175 # add immigrants, keeping track of our index with $_[0]
1176 sub add_immigrants{
1177 my $j;
1178
1179 for($j=0; $j< $nnewimm; $j++){
1180 @{$population[$popsize+$j]->{assignments}} = &permute( \@batched );
1181 &fill_assignments( \@{$population[$popsize+$j]->{assignments}} );
1182 $population[$popsize+$j]->{score} = &mutual_info( $numbatches );
1183 $k++;
1184 }
1185 }
1186
1187 # add top-scoring parents, keeping track of our index with $_[0]
1188 sub add_parents{
1189 my $i;
1190 my $j;
1191 my $k;
1192
1193 $k = $_[0];
1194 # sort population now
1195 @population = sort{$a->{score} <=> $b->{score}} @population;
1196
1197 for($j=0; $j< $nkeepparents; $j++){
1198 &copy_parents( $j, $k );
1199 # will copy score also in copy_parents()
1200 $k++;
1201 }
1202 return $k;
1203 }
1204
1205 # add population member $_[0] to pool member $_[1]
1206 sub copy_parents{
1207 my $i;
1208 my $popmem;
1209 my $poolmem;
1210 my $index1;
1211 my $index2;
1212
1213 $popmem = $_[0];
1214 $poolmem = $_[1];
1215
1216 #
1217 # fill all of poolmem
1218 #
1219 for($i=0; $i<= $#data; $i++){
1220 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
1221 }
1222 $pool[$poolmem]->{score} = $population[$popmem]->{score};
1223 }
1224
1225 # copy top of pool to population (assumes pool is sorted)
1226 sub fill_population{
1227 my $i;
1228 my $j;
1229 my $m;
1230
1231 $m = 0;
1232 for($i=0; $i< $popsize; $i++){
1233 $population[$i]->{score} = $pool[$i]->{score};
1234 $m+= $population[$i]->{score};
1235 for($j=0; $j<= $#data; $j++){
1236 $population[$i]->{assignments}[$j] = $pool[$i]->{assignments}[$j];
1237 }
1238 }
1239 return $m/$popsize;
1240 }
1241
1242
1243
1244