diff GALAXY_FILES/tools/EMBER/Integrate_Data.pl @ 3:037c3edda16e

Uploaded
author mmaiensc
date Thu, 22 Mar 2012 13:49:52 -0400
parents 003f802d4c7d
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GALAXY_FILES/tools/EMBER/Integrate_Data.pl	Thu Mar 22 13:49:52 2012 -0400
@@ -0,0 +1,252 @@
+#!/usr/bin/perl
+# combine microarray pre-preprocessed data and binding data
+
+use Getopt::Long;
+$|++;
+
+#
+# command line arguments
+#
+$options = "Usage: ./Integrate_Data.pl <OPTIONS>
+	-b  binding data (required)
+	-bf binding data format (default 1)
+	     1 - .bed format (<chr> <pkposn> <other information>)
+	         first two columns are required, 
+		 <other information> may contain peak ids, enrichments, etc, and is optional but will be retained
+	     2 - Dinner group \"annotation\" format
+	-e  pre-processed expression data (required)
+	-o  output (required)
+	-d  distance (positive number, in kbp, default 100)
+	-dt distance type (1 = to gene boundaries, 2 = to TSS, default 1)
+	-v  verbose (print progress to stdout: y or n, default y)
+\n";
+
+$b  = "";
+$bf = 1;
+$e  = "";
+$o  = "";
+$d  = 100;
+$dt = "1";
+$v = "y";
+
+GetOptions('b=s'  => \$b,
+	   'bf=i' => \$bf,
+	   'e=s'  => \$e,
+	   'o=s'  => \$o,
+	   'd=f'  => \$d,
+	   'dt=i' => \$dt,
+	   'v=s'  => \$v,
+) || die "\n$options\n";
+
+if( $b eq "" ){
+	print "\nError: set a value for -b\n\n$options";
+	exit;
+}
+if( $bf != 1 && $bf != 2 ){
+	print "\nError: set -bf to be 1 or 2\n\n$options";
+	exit;
+}
+if( $e eq "" ){
+	print "\nError: set a value for -e\n\n$options";
+	exit;
+}
+if( $o eq "" ){
+	print "\nError: set a value for -o\n\n$options";
+	exit;
+}
+
+if( $d < 0 ){
+	print "\nError: choose -d to be positive\n\n$options";
+	exit;
+}
+if( $dt != 1 && $dt != 2 ){
+	print "\nError: choose -dt to be 1 or 2\n\n$options";
+	exit;
+}
+if( $v ne "y" && $v ne "n" ){
+	print "\nError: choose -v to be y or n\n\n$options";
+	exit;
+}
+
+#
+# read in binding data
+#
+if( $v eq "y" ){
+	print "\nReading in binding and expression data\n";
+}
+@peaks = ();
+open(IN,"$b") || die "Error: can't open file $b\n";
+while($line = <IN>){
+	if( $line !~ /#/ ){
+		chomp($line);
+		@parts = split(' ',$line);
+		$info = {};
+		if( $bf == 1 ){
+			$info->{chr} = $parts[0];
+			$info->{pk} = $parts[1];
+			$val = "";
+			if( $#parts >= 2 ){
+				$val = $parts[2];
+				for($i=3; $i<= $#parts; $i++){
+					$val = sprintf("%s %s", $val, $parts[$i]);
+				}
+			}
+			$info->{rest} = $val;
+			push(@peaks, $info);
+		}
+		if( $bf == 2 ){
+			if( $#parts == 19 ){
+				$info->{chr} = $parts[2];
+				$info->{pk} = $parts[3];
+				$info->{rest} = sprintf("%s %s %s", $parts[0], $parts[1], $parts[6]);
+				push(@peaks, $info);
+			}
+		}
+	}
+}
+close(IN);
+
+#
+# read in expression data
+#
+@expr = ();
+open(IN,"$e") || die "Error: can't open file $e\n";
+while($line = <IN>){
+	chomp($line);
+	@parts = split(' ',$line);
+	$info = {};
+	$info->{chr} = $parts[2];
+	$info->{start} = $parts[3];
+	$info->{end} = $parts[4];
+	$info->{strand} = $parts[5];
+	$info->{line} = $line;
+	push(@expr, $info);
+}
+close(IN);
+
+if( $v eq "y" ){
+	printf("   %i peaks, %i genes\n", $#peaks+1, $#expr+1);
+	printf("\nSorting peaks and genes, indexing chromosomes\n");
+}
+
+#
+# sort binding and expression data by chr to save some time
+#
+@speaks = sort{ $a->{chr} cmp $b->{chr} } @peaks;
+@sexpr = sort{ $a->{chr} cmp $b->{chr} } @expr;
+
+#
+# index starting elements for each chromosome in sexpr
+#
+@chrinds = ();
+$chr = $sexpr[0]->{chr};
+$info = {};
+$info->{chr} = $chr;
+$info->{start} = 0;
+for($i=0; $i<= $#sexpr; $i++){
+	if( $sexpr[$i]->{chr} ne $chr ){
+		$info->{end} = $i-1;
+		push(@chrinds, $info);
+		$info = {};
+		$chr = $sexpr[$i]->{chr};
+		$info->{chr} = $chr;
+		$info->{start} = $i;
+	}
+}
+$info->{end} = $i-1;
+push(@chrinds, $info);
+
+#
+# compile potential targets for each peak and print out
+#
+if( $v eq "y" ){
+	printf("\nFinding potential targets\n");
+}
+open(OUT,">$o");
+$count=0;
+for($i=0; $i<= $#speaks; $i++){
+	if( $v eq "y" ){
+		printf("\r   peak %i of %i", $i+1, $#speaks+1);
+	}
+	# print out peak info
+	printf OUT ("PEAK: %s %s %s\n", $speaks[$i]->{chr}, $speaks[$i]->{pk}, $speaks[$i]->{rest} );
+	
+	# find the right chromosome and search through all the genes on the same chromosome for possible matches
+	$j=0;
+	$end=$#chrinds;
+	while( $chrinds[$j]->{chr} ne $speaks[$i]->{chr} && $j<= $end ){$j++;}
+	if( $chrinds[$j]->{chr} eq $speaks[$i]->{chr} ){
+		$start = $chrinds[$j]->{start};
+		$end = $chrinds[$j]->{end};
+		for($k=$start; $k<= $end; $k++){
+			if( &distance( $i, $k ) <= $d ){
+				$count++;
+				printf OUT ("GENE: %s\n", $sexpr[$k]->{line});
+			}
+		}
+	}
+}
+close(OUT);
+if( $v eq "y" ){
+	print "\n\nGot $count potential targets\n\n";
+}
+
+exit;
+
+
+# compute distance between peak $_[0] and gene $_[1]
+sub distance{
+	my $dist;
+	my $tss;
+	my $ii;
+	my $jj;
+	$ii = $_[0];
+	$jj = $_[1];
+	
+	if( $speaks[$ii]->{chr} ne $sexpr[$jj]->{chr}){
+		print "\nError: somehow computing distance between peak/gene on different chromosomes\n";
+		print "   peak chr: $speaks[$ii]->{chr}, gene chr: $sexpr[$jj]->{chr}\n";
+		exit;
+	}
+	if( $dt == 1 ){
+		# distance to closest side of gene
+		if( $speaks[$ii]->{pk} < $sexpr[$jj]->{start} ){
+			$dist = $sexpr[$jj]->{start} - $speaks[$ii]->{pk};
+		}
+		elsif( $speaks[$ii]->{pk} > $sexpr[$jj]->{end} ){
+			$dist = $speaks[$ii]->{pk} - $sexpr[$jj]->{end};
+		}
+		else{
+			$dist = 0;
+		}
+	}
+	elsif( $dt == 2 ){
+		# distance to TSS
+		if( $sexpr[$jj]->{strand} == 1 ){
+			$tss = $sexpr[$jj]->{start};
+		}
+		elsif( $sexpr[$jj]->{strand} == -1 ){
+			$tss = $sexpr[$jj]->{end};
+		}
+		else{
+			print "\nError: did not have strand of +/-1 (got $sexpr[$jj]->{strand})\n";
+			exit;
+		}
+		$dist = abs($speaks[$ii]->{pk} - $tss);
+	}
+	else{
+		print "\nError: wrong value for dt ($dt)\n";
+		exit;
+	}
+	if( $dist < 0 ){
+		print "\nError: negative distance\n";
+		exit;
+	}
+	# convert to kbp
+	$dist/=1000;
+	return $dist;
+}
+
+
+
+