Mercurial > repos > mmaiensc > ember
view GALAXY_FILES/tools/EMBER/PreProcess_Expression_Data.pl @ 1:cd8def22f313
Deleted selected files
author | mmaiensc |
---|---|
date | Wed, 29 Feb 2012 15:12:29 -0500 |
parents | 003f802d4c7d |
children |
line wrap: on
line source
#!/usr/bin/perl # prepare microarray data for use in EMBER w/ discretized behaviors use Getopt::Long; $|++; # # command line arguments # $options = "Usage: ./PreProcess_Expresssion_Data.pl <OPTIONS> -i microarray data (required) format: <id> <value 1> <value 2> ... <value N> this requires a title line (starting with #): #ID <condition 1><replicate 1> <condition 2><replicate 2> ... where <condition> should match the conditions in input -c, and the <condition><replicate> pair tells what the data in the column is from -c list of comparisons (required) format: <condition1> <condition2> -a annotation file (required) format: <id> <gene name> <chr> <start> <end> <strand> -o output (required) -p percentile threshold (number between 0 and 1, default 0.63) this value is used as follows: all data are concatenated into one list, and the pth percentile of that list is taken as the threshold. Then, a probe is removed if its value is less than the threshold in ALL conditions. p=1.0 means all probes are retained, p=0.0 means none are -l log-transform the data (y or n, default n) -v verbose (print progress to stdout: y or n, default y) \n"; $in = ""; $c = ""; $a = ""; $o = ""; $p = 0.63; $l = "n"; $v = "y"; GetOptions('i=s' => \$in, 'c=s' => \$c, 'a=s' => \$a, 'o=s' => \$o, 'p=f' => \$p, 'l=s' => \$l, 'v=s' => \$v, ) || die "\n$options\n"; if( $in eq "" ){ print "\nError: set a value for -i\n\n$options"; exit; } if( $c eq "" ){ print "\nError: set a value for -c\n\n$options"; exit; } if( $a eq "" ){ print "\nError: set a value for -a\n\n$options"; exit; } if( $o eq "" ){ print "\nError: set a value for -o\n\n$options"; exit; } if( $p < 0 || $p > 1 ){ print "\nError: choose -p to be between 0 and 1\n\n$options"; exit; } if( $l ne "y" && $l ne "n" ){ print "\nError: choose -l to be y or n\n\n$options"; exit; } if( $v ne "y" && $v ne "n" ){ print "\nError: choose -v to be y or n\n\n$options"; exit; } ################################################# # SECTION 1 - read in file names and conditions # ################################################# # # read in data list (title line of $in) and list of comparisons # arrays: @names, @comps # open(IN,"$in") || die "Error: can't open file $in\n"; $line = <IN>; close(IN); @names = (); @parts = split(' ',$line); if( substr($parts[0], 0, 1) ne "#" ){ print "Error: no title line detected in file $in\n"; exit; } for($i=1; $i<= $#parts; $i++){ if( $parts[$i] !~ /#/ ){ printf("Error: column %i (%s) does not have a replicate number\n", $i, $parts[$i]); exit; } ($condition, $replicate) = split('#', $parts[$i]); push(@names, [($condition, $replicate)] ); } #while($line = <IN>){ # chomp($line); # push(@files, $line); # @parts = split('/',$line); # ($base, $suff) = split('\.txt', $parts[$#parts]); # push(@names, $base); #} #close(IN); open(IN,"$c") || die "Error: can't open file $c\n"; @comps = (); while($line = <IN>){ chomp($line); @parts = split(' ',$line); if( $#parts != 1 ){ print "Error: file $c does not have 2 columns\n"; exit; } $info1 = {}; $info1->{name} = $parts[0]; $info1->{namei} = 0; $info2 = {}; $info2->{name} = $parts[1]; $info2->{namei} = 0; push(@comps, [($info1, $info2)] ); } close(IN); # # count how many conditions we have, and how many replicates of each # arrays: @conds # @conds = (); for($i=0; $i<= $#comps; $i++){ for($j=0; $j< 2; $j++){ $ok = 1; $k=-1; $end = $#conds; while($k< $end && $ok == 1){ $k++; if( $conds[$k]->{name} eq $comps[$i][$j]->{name} ){ $ok = 0; } } if( $ok == 1 ){ @empty1 = (); $info = {}; $info->{name} = $comps[$i][$j]->{name}; $info->{reps} = [@empty1]; $info->{nreps} = 0; push(@conds, $info); $comps[$i][$j]->{namei} = $#conds; } else{ $comps[$i][$j]->{namei} = $k; } } } for($i=0; $i<= $#conds; $i++){ for($j=0; $j<= $#names; $j++){ if( $names[$j][0] eq $conds[$i]->{name} ){ push(@{$conds[$i]->{reps}}, $j); $conds[$i]->{nreps}++; } } } # # print summary to stdout # if( $v eq "y" ){ printf("\nReading in %i conditions, %i total data columns:\n", $#conds+1, $#names+1); for($i=0; $i<= $#conds; $i++){ printf(" %s, %i replicates\n", $conds[$i]->{name}, $conds[$i]->{nreps}); } printf("\nMaking %i comparisons:\n", $#comps+1); for($i=0; $i<= $#comps; $i++){ printf(" %s vs %s\n", $comps[$i][0]->{name}, $comps[$i][1]->{name}); } } ########################################################### # SECTION 2 - read in annotation file and microarray data # ########################################################### if( $v eq "y" ){ print "\nReading in annotation file and data\n"; } # # read in probe annotations # arrays: @probes # open(IN,"$a") || die "Error: can't open file $a\n"; @vals = (); while($line = <IN>){ chomp($line); @parts = split(' ',$line); if( $#parts != 5 ){ print "Error: file $a does not have 6 columns\n"; exit; } @empty1 = (); @empty2 = (); $info = {}; $info->{id} = $parts[0]; $info->{name} = $parts[1]; $info->{chr} = $parts[2]; $info->{start} = $parts[3]; $info->{end} = $parts[4]; $info->{strand} = $parts[5]; $info->{vals} = [@empty1]; $info->{classes} = [@empty2]; $info->{use} = 0; push(@vals, $info); } close(IN); @probes = sort{ $a->{id} cmp $b->{id} } @vals; # # read in data, sort by ids, and add to @probes # record average for each file # arrays: @averages # @averages = (); open(IN,"$in"); $line = <IN>; @data = (); while( $line = <IN> ){ chomp($line); @parts = split(' ',$line); push(@data, [@parts] ); } @sdata = sort{ $a->[0] cmp $b->[0] } @data; close(IN); $j=0; $k=0; $endd = $#sdata; $endp = $#probes; while( $j <= $endd && $k<= $endp ){ if( $probes[$k]->{id} gt $sdata[$j][0] ){$j++;} elsif( $probes[$k]->{id} lt $sdata[$j][0] ){ $probes[$k]->{use} = 0; $k++; } elsif( $probes[$k]->{id} eq $sdata[$j][0] ){ for( $i=0; $i<= $#names; $i++){ if( $l eq "n" ){push(@{$probes[$k]->{vals}}, $sdata[$j][$i+1] );} if( $l eq "y" ){push(@{$probes[$k]->{vals}}, log($sdata[$j][$i+1])/log(2) );} } $probes[$k]->{use} = 1; $j++; $k++; } else{ printf("Unknown line comparison\n"); exit; } } for($i=0; $i<= $#names; $i++){ $avg=0; # # compute average # $avg = 0; $tot = 0; for($k=0; $k<= $#probes; $k++){ if( $probes[$k]->{use} == 1 ){ $avg+= $probes[$k]->{vals}[$i]; $tot++; } } $avg/= $tot; push(@averages, $avg); } if( $v eq "y" ){ printf(" using %i probes\n", $tot); } ################################################################## # SECTION 3 - determine threshold and label probes to be removed # ################################################################## if( $v eq "y" ){ printf("\nDetermining threshold\n"); } # # find threshold # variables: $thresh # @allvals = (); for($i=0; $i<= $#probes; $i++){ for($j=0; $j<= $#names; $j++){ if( $probes[$i]->{use} == 1 ){ push(@allvals, ($probes[$i]->{vals}[$j])/$averages[$j]); } } } @sallvals = sort{ $a <=> $b } @allvals; $end = (1-$p)*$#sallvals; $i = sprintf("%i", $end); $thresh = $sallvals[$i]; # # label probes as on or off # $totuse = 0; for($i=0; $i<= $#probes; $i++){ $j=0; $probes[$i]->{use} = 0; while($j<= $#names && $probes[$i]->{use} == 0 ){ if( ($probes[$i]->{vals}[$j])/$averages[$j] > $thresh ){ $probes[$i]->{use} = 1; $totuse++; } $j++; } } if( $v eq "y" ){ printf(" %i probes retained for analysis\n", $totuse); } ############################################# # SECTION 4 - make discrete classifications # ############################################# if( $v eq "y" ){ printf("\nClassifying probes\n"); } # # for each probe, # for each comparison, # compute avg and stdev of first and second conditions # apply rules for classification # for($i=0; $i<= $#probes; $i++){ for($j=0; $j<= $#comps; $j++){ $avg = (0,0); $sd = (0,0); for($k=0; $k< 2; $k++){ $condi = $comps[$j][$k]->{namei}; $avg[$k] = 0; $sd[$k] = 0; for($l=0; $l< $conds[$condi]->{nreps}; $l++){ $filei = $conds[$condi]->{reps}[$l]; $avg[$k]+= $probes[$i]->{vals}[$filei]; } $avg[$k] /= $conds[$condi]->{nreps}; for($l=0; $l< $conds[$condi]->{nreps}; $l++){ $filei = $conds[$condi]->{reps}[$l]; $sd[$k]+= ($avg[$k]-$probes[$i]->{vals}[$filei])**2; } $sd[$k] = sqrt($sd[$k]/($conds[$condi]->{nreps})); } $s = $sd[0] + $sd[1]; # ++ category if( $avg[1]-$avg[0] > 3*$s ){ push( @{$probes[$i]->{classes}}, 2); } # + category elsif( $avg[1]-$avg[0] > $s ){ push(@{$probes[$i]->{classes}}, 1); } # -- category elsif( $avg[0]-$avg[1] > 3*$s ){ push(@{$probes[$i]->{classes}}, -2); } # 0 category elsif( $avg[0]-$avg[1] > $s ){ push(@{$probes[$i]->{classes}}, -1); } else{ push(@{$probes[$i]->{classes}}, 0); } } } ############################ # SECTION 5 - print result # ############################ if( $v eq "y" ){ printf("\nPrinting result\n\n"); } open(OUT,">$o"); $delim = " "; for($i=0; $i<= $#probes; $i++){ printf OUT ("%s%s%s%s%s%s%s%s%s%s%s", $probes[$i]->{id}, $delim, $probes[$i]->{name}, $delim, $probes[$i]->{chr}, $delim, $probes[$i]->{start}, $delim, $probes[$i]->{end}, $delim, $probes[$i]->{strand} ); if( $probes[$i]->{use} == 1 ){ for($j=0; $j<= $#comps; $j++){ printf OUT ("%s%i", $delim, $probes[$i]->{classes}[$j]); } print OUT "\n"; } else{ print OUT "$delim-1\n"; } } close(OUT);