Mercurial > repos > mmaiensc > ember
diff GALAXY_FILES/tools/EMBER/Integrate_Data.pl @ 0:003f802d4c7d
Uploaded
author | mmaiensc |
---|---|
date | Wed, 29 Feb 2012 15:03:33 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GALAXY_FILES/tools/EMBER/Integrate_Data.pl Wed Feb 29 15:03:33 2012 -0500 @@ -0,0 +1,252 @@ +#!/usr/bin/perl +# combine microarray pre-preprocessed data and binding data + +use Getopt::Long; +$|++; + +# +# command line arguments +# +$options = "Usage: ./Integrate_Data.pl <OPTIONS> + -b binding data (required) + -bf binding data format (default 1) + 1 - .bed format (<chr> <pkposn> <other information>) + first two columns are required, + <other information> may contain peak ids, enrichments, etc, and is optional but will be retained + 2 - Dinner group \"annotation\" format + -e pre-processed expression data (required) + -o output (required) + -d distance (positive number, in kbp, default 100) + -dt distance type (1 = to gene boundaries, 2 = to TSS, default 1) + -v verbose (print progress to stdout: y or n, default y) +\n"; + +$b = ""; +$bf = 1; +$e = ""; +$o = ""; +$d = 100; +$dt = "1"; +$v = "y"; + +GetOptions('b=s' => \$b, + 'bf=i' => \$bf, + 'e=s' => \$e, + 'o=s' => \$o, + 'd=f' => \$d, + 'dt=i' => \$dt, + 'v=s' => \$v, +) || die "\n$options\n"; + +if( $b eq "" ){ + print "\nError: set a value for -b\n\n$options"; + exit; +} +if( $bf != 1 && $bf != 2 ){ + print "\nError: set -bf to be 1 or 2\n\n$options"; + exit; +} +if( $e eq "" ){ + print "\nError: set a value for -e\n\n$options"; + exit; +} +if( $o eq "" ){ + print "\nError: set a value for -o\n\n$options"; + exit; +} + +if( $d < 0 ){ + print "\nError: choose -d to be positive\n\n$options"; + exit; +} +if( $dt != 1 && $dt != 2 ){ + print "\nError: choose -dt to be 1 or 2\n\n$options"; + exit; +} +if( $v ne "y" && $v ne "n" ){ + print "\nError: choose -v to be y or n\n\n$options"; + exit; +} + +# +# read in binding data +# +if( $v eq "y" ){ + print "\nReading in binding and expression data\n"; +} +@peaks = (); +open(IN,"$b") || die "Error: can't open file $b\n"; +while($line = <IN>){ + if( $line !~ /#/ ){ + chomp($line); + @parts = split(' ',$line); + $info = {}; + if( $bf == 1 ){ + $info->{chr} = $parts[0]; + $info->{pk} = $parts[1]; + $val = ""; + if( $#parts >= 2 ){ + $val = $parts[2]; + for($i=3; $i<= $#parts; $i++){ + $val = sprintf("%s %s", $val, $parts[$i]); + } + } + $info->{rest} = $val; + push(@peaks, $info); + } + if( $bf == 2 ){ + if( $#parts == 19 ){ + $info->{chr} = $parts[2]; + $info->{pk} = $parts[3]; + $info->{rest} = sprintf("%s %s %s", $parts[0], $parts[1], $parts[6]); + push(@peaks, $info); + } + } + } +} +close(IN); + +# +# read in expression data +# +@expr = (); +open(IN,"$e") || die "Error: can't open file $e\n"; +while($line = <IN>){ + chomp($line); + @parts = split(' ',$line); + $info = {}; + $info->{chr} = $parts[2]; + $info->{start} = $parts[3]; + $info->{end} = $parts[4]; + $info->{strand} = $parts[5]; + $info->{line} = $line; + push(@expr, $info); +} +close(IN); + +if( $v eq "y" ){ + printf(" %i peaks, %i genes\n", $#peaks+1, $#expr+1); + printf("\nSorting peaks and genes, indexing chromosomes\n"); +} + +# +# sort binding and expression data by chr to save some time +# +@speaks = sort{ $a->{chr} cmp $b->{chr} } @peaks; +@sexpr = sort{ $a->{chr} cmp $b->{chr} } @expr; + +# +# index starting elements for each chromosome in sexpr +# +@chrinds = (); +$chr = $sexpr[0]->{chr}; +$info = {}; +$info->{chr} = $chr; +$info->{start} = 0; +for($i=0; $i<= $#sexpr; $i++){ + if( $sexpr[$i]->{chr} ne $chr ){ + $info->{end} = $i-1; + push(@chrinds, $info); + $info = {}; + $chr = $sexpr[$i]->{chr}; + $info->{chr} = $chr; + $info->{start} = $i; + } +} +$info->{end} = $i-1; +push(@chrinds, $info); + +# +# compile potential targets for each peak and print out +# +if( $v eq "y" ){ + printf("\nFinding potential targets\n"); +} +open(OUT,">$o"); +$count=0; +for($i=0; $i<= $#speaks; $i++){ + if( $v eq "y" ){ + printf("\r peak %i of %i", $i+1, $#speaks+1); + } + # print out peak info + printf OUT ("PEAK: %s %s %s\n", $speaks[$i]->{chr}, $speaks[$i]->{pk}, $speaks[$i]->{rest} ); + + # find the right chromosome and search through all the genes on the same chromosome for possible matches + $j=0; + $end=$#chrinds; + while( $chrinds[$j]->{chr} ne $speaks[$i]->{chr} && $j<= $end ){$j++;} + if( $chrinds[$j]->{chr} eq $speaks[$i]->{chr} ){ + $start = $chrinds[$j]->{start}; + $end = $chrinds[$j]->{end}; + for($k=$start; $k<= $end; $k++){ + if( &distance( $i, $k ) <= $d ){ + $count++; + printf OUT ("GENE: %s\n", $sexpr[$k]->{line}); + } + } + } +} +close(OUT); +if( $v eq "y" ){ + print "\n\nGot $count potential targets\n\n"; +} + +exit; + + +# compute distance between peak $_[0] and gene $_[1] +sub distance{ + my $dist; + my $tss; + my $ii; + my $jj; + $ii = $_[0]; + $jj = $_[1]; + + if( $speaks[$ii]->{chr} ne $sexpr[$jj]->{chr}){ + print "\nError: somehow computing distance between peak/gene on different chromosomes\n"; + print " peak chr: $speaks[$ii]->{chr}, gene chr: $sexpr[$jj]->{chr}\n"; + exit; + } + if( $dt == 1 ){ + # distance to closest side of gene + if( $speaks[$ii]->{pk} < $sexpr[$jj]->{start} ){ + $dist = $sexpr[$jj]->{start} - $speaks[$ii]->{pk}; + } + elsif( $speaks[$ii]->{pk} > $sexpr[$jj]->{end} ){ + $dist = $speaks[$ii]->{pk} - $sexpr[$jj]->{end}; + } + else{ + $dist = 0; + } + } + elsif( $dt == 2 ){ + # distance to TSS + if( $sexpr[$jj]->{strand} == 1 ){ + $tss = $sexpr[$jj]->{start}; + } + elsif( $sexpr[$jj]->{strand} == -1 ){ + $tss = $sexpr[$jj]->{end}; + } + else{ + print "\nError: did not have strand of +/-1 (got $sexpr[$jj]->{strand})\n"; + exit; + } + $dist = abs($speaks[$ii]->{pk} - $tss); + } + else{ + print "\nError: wrong value for dt ($dt)\n"; + exit; + } + if( $dist < 0 ){ + print "\nError: negative distance\n"; + exit; + } + # convert to kbp + $dist/=1000; + return $dist; +} + + + +