diff GALAXY_FILES/tools/EMBER/PreProcess_Expression_Data.pl @ 3:037c3edda16e

Uploaded
author mmaiensc
date Thu, 22 Mar 2012 13:49:52 -0400
parents 003f802d4c7d
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GALAXY_FILES/tools/EMBER/PreProcess_Expression_Data.pl	Thu Mar 22 13:49:52 2012 -0400
@@ -0,0 +1,413 @@
+#!/usr/bin/perl
+# prepare microarray data for use in EMBER w/ discretized behaviors
+
+use Getopt::Long;
+$|++;
+
+#
+# command line arguments
+#
+$options = "Usage: ./PreProcess_Expresssion_Data.pl <OPTIONS>
+	-i  microarray data (required)
+	    format: <id> <value 1> <value 2> ... <value N>
+            this requires a title line (starting with #):
+               #ID <condition 1><replicate 1> <condition 2><replicate 2> ...
+            where <condition> should match the conditions in input -c, 
+            and the <condition><replicate> pair tells what the data in the column is from
+	-c  list of comparisons (required)
+	    format: <condition1> <condition2>
+	-a  annotation file (required)
+	    format: <id> <gene name> <chr> <start> <end> <strand>
+	-o  output (required)
+	-p  percentile threshold (number between 0 and 1, default 0.63)
+	    this value is used as follows: all data are concatenated into one list,
+	    and the pth percentile of that list is taken as the threshold.
+	    Then, a probe is removed if its value is less than the threshold in ALL conditions.
+	    p=1.0 means all probes are retained, p=0.0 means none are
+	-l  log-transform the data (y or n, default n)
+	-v  verbose (print progress to stdout: y or n, default y)
+\n";
+
+$in = "";
+$c = "";
+$a = "";
+$o = "";
+$p = 0.63;
+$l = "n";
+$v = "y";
+
+GetOptions('i=s' => \$in,
+	   'c=s' => \$c,
+	   'a=s' => \$a,
+	   'o=s' => \$o,
+	   'p=f' => \$p,
+	   'l=s' => \$l,
+	   'v=s' => \$v,
+) || die "\n$options\n";
+
+if( $in eq "" ){
+	print "\nError: set a value for -i\n\n$options";
+	exit;
+}
+if( $c eq "" ){
+	print "\nError: set a value for -c\n\n$options";
+	exit;
+}
+if( $a eq "" ){
+	print "\nError: set a value for -a\n\n$options";
+	exit;
+}
+if( $o eq "" ){
+	print "\nError: set a value for -o\n\n$options";
+	exit;
+}
+
+if( $p < 0 || $p > 1 ){
+	print "\nError: choose -p to be between 0 and 1\n\n$options";
+	exit;
+}
+if( $l ne "y" && $l ne "n" ){
+	print "\nError: choose -l to be y or n\n\n$options";
+	exit;
+}
+if( $v ne "y" && $v ne "n" ){
+	print "\nError: choose -v to be y or n\n\n$options";
+	exit;
+}
+
+#################################################
+# SECTION 1 - read in file names and conditions #
+#################################################
+
+#
+# read in data list (title line of $in) and list of comparisons
+# arrays: @names, @comps
+#
+open(IN,"$in") || die "Error: can't open file $in\n";
+$line = <IN>;
+close(IN);
+@names = ();
+@parts = split(' ',$line);
+if( substr($parts[0], 0, 1) ne "#" ){
+	print "Error: no title line detected in file $in\n";
+	exit;
+}
+for($i=1; $i<= $#parts; $i++){
+	if( $parts[$i] !~ /#/ ){
+		printf("Error: column %i (%s) does not have a replicate number\n", $i, $parts[$i]);
+		exit;
+	}
+	($condition, $replicate) = split('#', $parts[$i]); 
+	push(@names, [($condition, $replicate)] );
+}
+#while($line = <IN>){
+#	chomp($line);
+#	push(@files, $line);
+#	@parts = split('/',$line);
+#	($base, $suff) = split('\.txt', $parts[$#parts]);
+#	push(@names, $base);
+#}
+#close(IN);
+open(IN,"$c") || die "Error: can't open file $c\n";
+@comps = ();
+while($line = <IN>){
+	chomp($line);
+	@parts = split(' ',$line);
+	if( $#parts != 1 ){
+		print "Error: file $c does not have 2 columns\n";
+		exit;
+	}
+	$info1 = {};
+	$info1->{name} = $parts[0];
+	$info1->{namei} = 0;
+	$info2 = {};
+	$info2->{name} = $parts[1];
+	$info2->{namei} = 0;
+	push(@comps, [($info1, $info2)] );
+}
+close(IN);
+
+#
+# count how many conditions we have, and how many replicates of each
+# arrays: @conds
+#
+@conds = ();
+for($i=0; $i<= $#comps; $i++){
+	for($j=0; $j< 2; $j++){
+		$ok = 1;
+		$k=-1;
+		$end = $#conds;
+		while($k< $end && $ok == 1){
+			$k++;
+			if( $conds[$k]->{name} eq $comps[$i][$j]->{name} ){
+				$ok = 0;
+			}
+		}
+		if( $ok == 1 ){
+			@empty1 = ();
+			$info = {};
+			$info->{name} = $comps[$i][$j]->{name};
+			$info->{reps} = [@empty1];
+			$info->{nreps} = 0;
+			push(@conds, $info);
+			$comps[$i][$j]->{namei} = $#conds;
+		}
+		else{
+			$comps[$i][$j]->{namei} = $k;
+		}
+	}
+}
+for($i=0; $i<= $#conds; $i++){
+	for($j=0; $j<= $#names; $j++){
+		if( $names[$j][0] eq $conds[$i]->{name} ){
+			push(@{$conds[$i]->{reps}}, $j);
+			$conds[$i]->{nreps}++;
+		}
+	}
+}
+
+#
+# print summary to stdout
+#
+if( $v eq "y" ){
+	printf("\nReading in %i conditions, %i total data columns:\n", $#conds+1, $#names+1);
+	for($i=0; $i<= $#conds; $i++){
+		printf("   %s, %i replicates\n", $conds[$i]->{name}, $conds[$i]->{nreps});
+	}
+	printf("\nMaking %i comparisons:\n", $#comps+1);
+	for($i=0; $i<= $#comps; $i++){
+		printf("   %s vs %s\n", $comps[$i][0]->{name}, $comps[$i][1]->{name});
+	}
+}
+
+###########################################################
+# SECTION 2 - read in annotation file and microarray data #
+###########################################################
+
+if( $v eq "y" ){
+	print "\nReading in annotation file and data\n";
+}
+
+#
+# read in probe annotations
+# arrays: @probes
+#
+open(IN,"$a") || die "Error: can't open file $a\n";
+@vals = ();
+while($line = <IN>){
+	chomp($line);
+	@parts = split(' ',$line);
+	if( $#parts != 5 ){
+		print "Error: file $a does not have 6 columns\n";
+		exit;
+	}
+	@empty1 = ();
+	@empty2 = ();
+	$info = {};
+	$info->{id} = $parts[0];
+	$info->{name} = $parts[1];
+	$info->{chr} = $parts[2];
+	$info->{start} = $parts[3];
+	$info->{end} = $parts[4];
+	$info->{strand} = $parts[5];
+	$info->{vals} = [@empty1];
+	$info->{classes} = [@empty2];
+	$info->{use} = 0;
+	push(@vals, $info);
+}
+close(IN);
+@probes = sort{ $a->{id} cmp $b->{id} } @vals;
+
+#
+# read in data, sort by ids, and add to @probes
+# record average for each file
+# arrays: @averages
+#
+@averages = ();
+open(IN,"$in");
+$line = <IN>;
+@data = ();
+while( $line = <IN> ){
+	chomp($line);
+	@parts = split(' ',$line);
+	push(@data, [@parts] );
+}
+@sdata = sort{ $a->[0] cmp $b->[0] } @data;
+close(IN);
+
+$j=0;
+$k=0;
+$endd = $#sdata;
+$endp = $#probes;
+while( $j <= $endd && $k<= $endp ){
+	if( $probes[$k]->{id} gt $sdata[$j][0] ){$j++;}
+	elsif( $probes[$k]->{id} lt $sdata[$j][0] ){
+		$probes[$k]->{use} = 0;
+		$k++;
+	}
+	elsif( $probes[$k]->{id} eq $sdata[$j][0] ){
+		for( $i=0; $i<= $#names; $i++){
+			if( $l eq "n" ){push(@{$probes[$k]->{vals}}, $sdata[$j][$i+1] );}
+			if( $l eq "y" ){push(@{$probes[$k]->{vals}}, log($sdata[$j][$i+1])/log(2) );}
+		}
+		$probes[$k]->{use} = 1;
+		$j++;
+		$k++;
+	}
+	else{
+		printf("Unknown line comparison\n");
+		exit;
+	}
+}
+for($i=0; $i<= $#names; $i++){
+	$avg=0;
+	#
+	# compute average
+	# 
+	$avg = 0;
+	$tot = 0;
+	for($k=0; $k<= $#probes; $k++){
+		if( $probes[$k]->{use} == 1 ){
+			$avg+= $probes[$k]->{vals}[$i];
+			$tot++;
+		}
+	}
+	$avg/= $tot;
+	push(@averages, $avg);
+}
+
+if( $v eq "y" ){
+	printf("   using %i probes\n", $tot);
+}
+
+##################################################################
+# SECTION 3 - determine threshold and label probes to be removed #
+##################################################################
+
+if( $v eq "y" ){
+	printf("\nDetermining threshold\n");
+}
+#
+# find threshold
+# variables: $thresh
+#
+@allvals = ();
+for($i=0; $i<= $#probes; $i++){
+	for($j=0; $j<= $#names; $j++){
+		if( $probes[$i]->{use} == 1 ){
+			push(@allvals, ($probes[$i]->{vals}[$j])/$averages[$j]);
+		}
+	}
+}
+
+@sallvals = sort{ $a <=> $b } @allvals;
+
+$end = (1-$p)*$#sallvals;
+$i = sprintf("%i", $end);
+$thresh = $sallvals[$i];
+
+#
+# label probes as on or off
+#
+$totuse = 0;
+for($i=0; $i<= $#probes; $i++){
+	$j=0;
+	$probes[$i]->{use} = 0;
+	while($j<= $#names && $probes[$i]->{use} == 0 ){
+		if( ($probes[$i]->{vals}[$j])/$averages[$j] > $thresh ){
+			$probes[$i]->{use} = 1;
+			$totuse++;
+		}
+		$j++;
+	}
+}
+if( $v eq "y" ){
+	printf("   %i probes retained for analysis\n", $totuse);
+}
+
+#############################################
+# SECTION 4 - make discrete classifications #
+#############################################
+
+if( $v eq "y" ){
+	printf("\nClassifying probes\n");
+}
+#
+# for each probe,
+#    for each comparison,
+#       compute avg and stdev of first and second conditions
+#       apply rules for classification
+#
+for($i=0; $i<= $#probes; $i++){
+	for($j=0; $j<= $#comps; $j++){
+		$avg = (0,0);
+		$sd = (0,0);
+		for($k=0; $k< 2; $k++){
+			$condi = $comps[$j][$k]->{namei};
+			$avg[$k] = 0;
+			$sd[$k] = 0;
+			for($l=0; $l< $conds[$condi]->{nreps}; $l++){
+				$filei = $conds[$condi]->{reps}[$l];
+				$avg[$k]+= $probes[$i]->{vals}[$filei];
+			}
+			$avg[$k] /= $conds[$condi]->{nreps};
+			for($l=0; $l< $conds[$condi]->{nreps}; $l++){
+				$filei = $conds[$condi]->{reps}[$l];
+				$sd[$k]+= ($avg[$k]-$probes[$i]->{vals}[$filei])**2;
+			}
+			$sd[$k] = sqrt($sd[$k]/($conds[$condi]->{nreps}));
+		}
+
+		$s = $sd[0] + $sd[1];
+		# ++ category
+		if( $avg[1]-$avg[0] > 3*$s ){
+			push( @{$probes[$i]->{classes}}, 2);
+		}
+		# + category
+		elsif( $avg[1]-$avg[0] > $s ){
+			push(@{$probes[$i]->{classes}}, 1);
+		}
+		# -- category
+		elsif( $avg[0]-$avg[1] > 3*$s ){
+			push(@{$probes[$i]->{classes}}, -2);
+		}
+		# 0 category
+		elsif( $avg[0]-$avg[1] > $s ){
+			push(@{$probes[$i]->{classes}}, -1);
+		}
+		else{
+			push(@{$probes[$i]->{classes}}, 0);
+		}
+	}
+}
+
+############################
+# SECTION 5 - print result #
+############################
+
+if( $v eq "y" ){
+	printf("\nPrinting result\n\n");
+}
+open(OUT,">$o");
+$delim = " ";
+for($i=0; $i<= $#probes; $i++){
+	printf OUT ("%s%s%s%s%s%s%s%s%s%s%s", $probes[$i]->{id}, $delim, $probes[$i]->{name}, $delim, $probes[$i]->{chr}, $delim, $probes[$i]->{start}, $delim, $probes[$i]->{end}, $delim, $probes[$i]->{strand} );
+	if( $probes[$i]->{use} == 1 ){
+		for($j=0; $j<= $#comps; $j++){
+			printf OUT ("%s%i", $delim, $probes[$i]->{classes}[$j]);
+		}
+		print OUT "\n";
+	}
+	else{
+		print OUT "$delim-1\n";
+	}
+}
+close(OUT);
+
+
+
+
+
+
+
+