Mercurial > repos > mmaiensc > ember
diff GALAXY_FILES/tools/EMBER/PreProcess_Expression_Data.pl @ 3:037c3edda16e
Uploaded
author | mmaiensc |
---|---|
date | Thu, 22 Mar 2012 13:49:52 -0400 |
parents | 003f802d4c7d |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GALAXY_FILES/tools/EMBER/PreProcess_Expression_Data.pl Thu Mar 22 13:49:52 2012 -0400 @@ -0,0 +1,413 @@ +#!/usr/bin/perl +# prepare microarray data for use in EMBER w/ discretized behaviors + +use Getopt::Long; +$|++; + +# +# command line arguments +# +$options = "Usage: ./PreProcess_Expresssion_Data.pl <OPTIONS> + -i microarray data (required) + format: <id> <value 1> <value 2> ... <value N> + this requires a title line (starting with #): + #ID <condition 1><replicate 1> <condition 2><replicate 2> ... + where <condition> should match the conditions in input -c, + and the <condition><replicate> pair tells what the data in the column is from + -c list of comparisons (required) + format: <condition1> <condition2> + -a annotation file (required) + format: <id> <gene name> <chr> <start> <end> <strand> + -o output (required) + -p percentile threshold (number between 0 and 1, default 0.63) + this value is used as follows: all data are concatenated into one list, + and the pth percentile of that list is taken as the threshold. + Then, a probe is removed if its value is less than the threshold in ALL conditions. + p=1.0 means all probes are retained, p=0.0 means none are + -l log-transform the data (y or n, default n) + -v verbose (print progress to stdout: y or n, default y) +\n"; + +$in = ""; +$c = ""; +$a = ""; +$o = ""; +$p = 0.63; +$l = "n"; +$v = "y"; + +GetOptions('i=s' => \$in, + 'c=s' => \$c, + 'a=s' => \$a, + 'o=s' => \$o, + 'p=f' => \$p, + 'l=s' => \$l, + 'v=s' => \$v, +) || die "\n$options\n"; + +if( $in eq "" ){ + print "\nError: set a value for -i\n\n$options"; + exit; +} +if( $c eq "" ){ + print "\nError: set a value for -c\n\n$options"; + exit; +} +if( $a eq "" ){ + print "\nError: set a value for -a\n\n$options"; + exit; +} +if( $o eq "" ){ + print "\nError: set a value for -o\n\n$options"; + exit; +} + +if( $p < 0 || $p > 1 ){ + print "\nError: choose -p to be between 0 and 1\n\n$options"; + exit; +} +if( $l ne "y" && $l ne "n" ){ + print "\nError: choose -l to be y or n\n\n$options"; + exit; +} +if( $v ne "y" && $v ne "n" ){ + print "\nError: choose -v to be y or n\n\n$options"; + exit; +} + +################################################# +# SECTION 1 - read in file names and conditions # +################################################# + +# +# read in data list (title line of $in) and list of comparisons +# arrays: @names, @comps +# +open(IN,"$in") || die "Error: can't open file $in\n"; +$line = <IN>; +close(IN); +@names = (); +@parts = split(' ',$line); +if( substr($parts[0], 0, 1) ne "#" ){ + print "Error: no title line detected in file $in\n"; + exit; +} +for($i=1; $i<= $#parts; $i++){ + if( $parts[$i] !~ /#/ ){ + printf("Error: column %i (%s) does not have a replicate number\n", $i, $parts[$i]); + exit; + } + ($condition, $replicate) = split('#', $parts[$i]); + push(@names, [($condition, $replicate)] ); +} +#while($line = <IN>){ +# chomp($line); +# push(@files, $line); +# @parts = split('/',$line); +# ($base, $suff) = split('\.txt', $parts[$#parts]); +# push(@names, $base); +#} +#close(IN); +open(IN,"$c") || die "Error: can't open file $c\n"; +@comps = (); +while($line = <IN>){ + chomp($line); + @parts = split(' ',$line); + if( $#parts != 1 ){ + print "Error: file $c does not have 2 columns\n"; + exit; + } + $info1 = {}; + $info1->{name} = $parts[0]; + $info1->{namei} = 0; + $info2 = {}; + $info2->{name} = $parts[1]; + $info2->{namei} = 0; + push(@comps, [($info1, $info2)] ); +} +close(IN); + +# +# count how many conditions we have, and how many replicates of each +# arrays: @conds +# +@conds = (); +for($i=0; $i<= $#comps; $i++){ + for($j=0; $j< 2; $j++){ + $ok = 1; + $k=-1; + $end = $#conds; + while($k< $end && $ok == 1){ + $k++; + if( $conds[$k]->{name} eq $comps[$i][$j]->{name} ){ + $ok = 0; + } + } + if( $ok == 1 ){ + @empty1 = (); + $info = {}; + $info->{name} = $comps[$i][$j]->{name}; + $info->{reps} = [@empty1]; + $info->{nreps} = 0; + push(@conds, $info); + $comps[$i][$j]->{namei} = $#conds; + } + else{ + $comps[$i][$j]->{namei} = $k; + } + } +} +for($i=0; $i<= $#conds; $i++){ + for($j=0; $j<= $#names; $j++){ + if( $names[$j][0] eq $conds[$i]->{name} ){ + push(@{$conds[$i]->{reps}}, $j); + $conds[$i]->{nreps}++; + } + } +} + +# +# print summary to stdout +# +if( $v eq "y" ){ + printf("\nReading in %i conditions, %i total data columns:\n", $#conds+1, $#names+1); + for($i=0; $i<= $#conds; $i++){ + printf(" %s, %i replicates\n", $conds[$i]->{name}, $conds[$i]->{nreps}); + } + printf("\nMaking %i comparisons:\n", $#comps+1); + for($i=0; $i<= $#comps; $i++){ + printf(" %s vs %s\n", $comps[$i][0]->{name}, $comps[$i][1]->{name}); + } +} + +########################################################### +# SECTION 2 - read in annotation file and microarray data # +########################################################### + +if( $v eq "y" ){ + print "\nReading in annotation file and data\n"; +} + +# +# read in probe annotations +# arrays: @probes +# +open(IN,"$a") || die "Error: can't open file $a\n"; +@vals = (); +while($line = <IN>){ + chomp($line); + @parts = split(' ',$line); + if( $#parts != 5 ){ + print "Error: file $a does not have 6 columns\n"; + exit; + } + @empty1 = (); + @empty2 = (); + $info = {}; + $info->{id} = $parts[0]; + $info->{name} = $parts[1]; + $info->{chr} = $parts[2]; + $info->{start} = $parts[3]; + $info->{end} = $parts[4]; + $info->{strand} = $parts[5]; + $info->{vals} = [@empty1]; + $info->{classes} = [@empty2]; + $info->{use} = 0; + push(@vals, $info); +} +close(IN); +@probes = sort{ $a->{id} cmp $b->{id} } @vals; + +# +# read in data, sort by ids, and add to @probes +# record average for each file +# arrays: @averages +# +@averages = (); +open(IN,"$in"); +$line = <IN>; +@data = (); +while( $line = <IN> ){ + chomp($line); + @parts = split(' ',$line); + push(@data, [@parts] ); +} +@sdata = sort{ $a->[0] cmp $b->[0] } @data; +close(IN); + +$j=0; +$k=0; +$endd = $#sdata; +$endp = $#probes; +while( $j <= $endd && $k<= $endp ){ + if( $probes[$k]->{id} gt $sdata[$j][0] ){$j++;} + elsif( $probes[$k]->{id} lt $sdata[$j][0] ){ + $probes[$k]->{use} = 0; + $k++; + } + elsif( $probes[$k]->{id} eq $sdata[$j][0] ){ + for( $i=0; $i<= $#names; $i++){ + if( $l eq "n" ){push(@{$probes[$k]->{vals}}, $sdata[$j][$i+1] );} + if( $l eq "y" ){push(@{$probes[$k]->{vals}}, log($sdata[$j][$i+1])/log(2) );} + } + $probes[$k]->{use} = 1; + $j++; + $k++; + } + else{ + printf("Unknown line comparison\n"); + exit; + } +} +for($i=0; $i<= $#names; $i++){ + $avg=0; + # + # compute average + # + $avg = 0; + $tot = 0; + for($k=0; $k<= $#probes; $k++){ + if( $probes[$k]->{use} == 1 ){ + $avg+= $probes[$k]->{vals}[$i]; + $tot++; + } + } + $avg/= $tot; + push(@averages, $avg); +} + +if( $v eq "y" ){ + printf(" using %i probes\n", $tot); +} + +################################################################## +# SECTION 3 - determine threshold and label probes to be removed # +################################################################## + +if( $v eq "y" ){ + printf("\nDetermining threshold\n"); +} +# +# find threshold +# variables: $thresh +# +@allvals = (); +for($i=0; $i<= $#probes; $i++){ + for($j=0; $j<= $#names; $j++){ + if( $probes[$i]->{use} == 1 ){ + push(@allvals, ($probes[$i]->{vals}[$j])/$averages[$j]); + } + } +} + +@sallvals = sort{ $a <=> $b } @allvals; + +$end = (1-$p)*$#sallvals; +$i = sprintf("%i", $end); +$thresh = $sallvals[$i]; + +# +# label probes as on or off +# +$totuse = 0; +for($i=0; $i<= $#probes; $i++){ + $j=0; + $probes[$i]->{use} = 0; + while($j<= $#names && $probes[$i]->{use} == 0 ){ + if( ($probes[$i]->{vals}[$j])/$averages[$j] > $thresh ){ + $probes[$i]->{use} = 1; + $totuse++; + } + $j++; + } +} +if( $v eq "y" ){ + printf(" %i probes retained for analysis\n", $totuse); +} + +############################################# +# SECTION 4 - make discrete classifications # +############################################# + +if( $v eq "y" ){ + printf("\nClassifying probes\n"); +} +# +# for each probe, +# for each comparison, +# compute avg and stdev of first and second conditions +# apply rules for classification +# +for($i=0; $i<= $#probes; $i++){ + for($j=0; $j<= $#comps; $j++){ + $avg = (0,0); + $sd = (0,0); + for($k=0; $k< 2; $k++){ + $condi = $comps[$j][$k]->{namei}; + $avg[$k] = 0; + $sd[$k] = 0; + for($l=0; $l< $conds[$condi]->{nreps}; $l++){ + $filei = $conds[$condi]->{reps}[$l]; + $avg[$k]+= $probes[$i]->{vals}[$filei]; + } + $avg[$k] /= $conds[$condi]->{nreps}; + for($l=0; $l< $conds[$condi]->{nreps}; $l++){ + $filei = $conds[$condi]->{reps}[$l]; + $sd[$k]+= ($avg[$k]-$probes[$i]->{vals}[$filei])**2; + } + $sd[$k] = sqrt($sd[$k]/($conds[$condi]->{nreps})); + } + + $s = $sd[0] + $sd[1]; + # ++ category + if( $avg[1]-$avg[0] > 3*$s ){ + push( @{$probes[$i]->{classes}}, 2); + } + # + category + elsif( $avg[1]-$avg[0] > $s ){ + push(@{$probes[$i]->{classes}}, 1); + } + # -- category + elsif( $avg[0]-$avg[1] > 3*$s ){ + push(@{$probes[$i]->{classes}}, -2); + } + # 0 category + elsif( $avg[0]-$avg[1] > $s ){ + push(@{$probes[$i]->{classes}}, -1); + } + else{ + push(@{$probes[$i]->{classes}}, 0); + } + } +} + +############################ +# SECTION 5 - print result # +############################ + +if( $v eq "y" ){ + printf("\nPrinting result\n\n"); +} +open(OUT,">$o"); +$delim = " "; +for($i=0; $i<= $#probes; $i++){ + printf OUT ("%s%s%s%s%s%s%s%s%s%s%s", $probes[$i]->{id}, $delim, $probes[$i]->{name}, $delim, $probes[$i]->{chr}, $delim, $probes[$i]->{start}, $delim, $probes[$i]->{end}, $delim, $probes[$i]->{strand} ); + if( $probes[$i]->{use} == 1 ){ + for($j=0; $j<= $#comps; $j++){ + printf OUT ("%s%i", $delim, $probes[$i]->{classes}[$j]); + } + print OUT "\n"; + } + else{ + print OUT "$delim-1\n"; + } +} +close(OUT); + + + + + + + +