Mercurial > repos > mmaiensc > ember
view GALAXY_FILES/tools/EMBER/Integrate_Data.pl @ 4:e960969a92ae default tip
Uploaded
author | mmaiensc |
---|---|
date | Thu, 22 Mar 2012 14:07:11 -0400 |
parents | 003f802d4c7d |
children |
line wrap: on
line source
#!/usr/bin/perl # combine microarray pre-preprocessed data and binding data use Getopt::Long; $|++; # # command line arguments # $options = "Usage: ./Integrate_Data.pl <OPTIONS> -b binding data (required) -bf binding data format (default 1) 1 - .bed format (<chr> <pkposn> <other information>) first two columns are required, <other information> may contain peak ids, enrichments, etc, and is optional but will be retained 2 - Dinner group \"annotation\" format -e pre-processed expression data (required) -o output (required) -d distance (positive number, in kbp, default 100) -dt distance type (1 = to gene boundaries, 2 = to TSS, default 1) -v verbose (print progress to stdout: y or n, default y) \n"; $b = ""; $bf = 1; $e = ""; $o = ""; $d = 100; $dt = "1"; $v = "y"; GetOptions('b=s' => \$b, 'bf=i' => \$bf, 'e=s' => \$e, 'o=s' => \$o, 'd=f' => \$d, 'dt=i' => \$dt, 'v=s' => \$v, ) || die "\n$options\n"; if( $b eq "" ){ print "\nError: set a value for -b\n\n$options"; exit; } if( $bf != 1 && $bf != 2 ){ print "\nError: set -bf to be 1 or 2\n\n$options"; exit; } if( $e eq "" ){ print "\nError: set a value for -e\n\n$options"; exit; } if( $o eq "" ){ print "\nError: set a value for -o\n\n$options"; exit; } if( $d < 0 ){ print "\nError: choose -d to be positive\n\n$options"; exit; } if( $dt != 1 && $dt != 2 ){ print "\nError: choose -dt to be 1 or 2\n\n$options"; exit; } if( $v ne "y" && $v ne "n" ){ print "\nError: choose -v to be y or n\n\n$options"; exit; } # # read in binding data # if( $v eq "y" ){ print "\nReading in binding and expression data\n"; } @peaks = (); open(IN,"$b") || die "Error: can't open file $b\n"; while($line = <IN>){ if( $line !~ /#/ ){ chomp($line); @parts = split(' ',$line); $info = {}; if( $bf == 1 ){ $info->{chr} = $parts[0]; $info->{pk} = $parts[1]; $val = ""; if( $#parts >= 2 ){ $val = $parts[2]; for($i=3; $i<= $#parts; $i++){ $val = sprintf("%s %s", $val, $parts[$i]); } } $info->{rest} = $val; push(@peaks, $info); } if( $bf == 2 ){ if( $#parts == 19 ){ $info->{chr} = $parts[2]; $info->{pk} = $parts[3]; $info->{rest} = sprintf("%s %s %s", $parts[0], $parts[1], $parts[6]); push(@peaks, $info); } } } } close(IN); # # read in expression data # @expr = (); open(IN,"$e") || die "Error: can't open file $e\n"; while($line = <IN>){ chomp($line); @parts = split(' ',$line); $info = {}; $info->{chr} = $parts[2]; $info->{start} = $parts[3]; $info->{end} = $parts[4]; $info->{strand} = $parts[5]; $info->{line} = $line; push(@expr, $info); } close(IN); if( $v eq "y" ){ printf(" %i peaks, %i genes\n", $#peaks+1, $#expr+1); printf("\nSorting peaks and genes, indexing chromosomes\n"); } # # sort binding and expression data by chr to save some time # @speaks = sort{ $a->{chr} cmp $b->{chr} } @peaks; @sexpr = sort{ $a->{chr} cmp $b->{chr} } @expr; # # index starting elements for each chromosome in sexpr # @chrinds = (); $chr = $sexpr[0]->{chr}; $info = {}; $info->{chr} = $chr; $info->{start} = 0; for($i=0; $i<= $#sexpr; $i++){ if( $sexpr[$i]->{chr} ne $chr ){ $info->{end} = $i-1; push(@chrinds, $info); $info = {}; $chr = $sexpr[$i]->{chr}; $info->{chr} = $chr; $info->{start} = $i; } } $info->{end} = $i-1; push(@chrinds, $info); # # compile potential targets for each peak and print out # if( $v eq "y" ){ printf("\nFinding potential targets\n"); } open(OUT,">$o"); $count=0; for($i=0; $i<= $#speaks; $i++){ if( $v eq "y" ){ printf("\r peak %i of %i", $i+1, $#speaks+1); } # print out peak info printf OUT ("PEAK: %s %s %s\n", $speaks[$i]->{chr}, $speaks[$i]->{pk}, $speaks[$i]->{rest} ); # find the right chromosome and search through all the genes on the same chromosome for possible matches $j=0; $end=$#chrinds; while( $chrinds[$j]->{chr} ne $speaks[$i]->{chr} && $j<= $end ){$j++;} if( $chrinds[$j]->{chr} eq $speaks[$i]->{chr} ){ $start = $chrinds[$j]->{start}; $end = $chrinds[$j]->{end}; for($k=$start; $k<= $end; $k++){ if( &distance( $i, $k ) <= $d ){ $count++; printf OUT ("GENE: %s\n", $sexpr[$k]->{line}); } } } } close(OUT); if( $v eq "y" ){ print "\n\nGot $count potential targets\n\n"; } exit; # compute distance between peak $_[0] and gene $_[1] sub distance{ my $dist; my $tss; my $ii; my $jj; $ii = $_[0]; $jj = $_[1]; if( $speaks[$ii]->{chr} ne $sexpr[$jj]->{chr}){ print "\nError: somehow computing distance between peak/gene on different chromosomes\n"; print " peak chr: $speaks[$ii]->{chr}, gene chr: $sexpr[$jj]->{chr}\n"; exit; } if( $dt == 1 ){ # distance to closest side of gene if( $speaks[$ii]->{pk} < $sexpr[$jj]->{start} ){ $dist = $sexpr[$jj]->{start} - $speaks[$ii]->{pk}; } elsif( $speaks[$ii]->{pk} > $sexpr[$jj]->{end} ){ $dist = $speaks[$ii]->{pk} - $sexpr[$jj]->{end}; } else{ $dist = 0; } } elsif( $dt == 2 ){ # distance to TSS if( $sexpr[$jj]->{strand} == 1 ){ $tss = $sexpr[$jj]->{start}; } elsif( $sexpr[$jj]->{strand} == -1 ){ $tss = $sexpr[$jj]->{end}; } else{ print "\nError: did not have strand of +/-1 (got $sexpr[$jj]->{strand})\n"; exit; } $dist = abs($speaks[$ii]->{pk} - $tss); } else{ print "\nError: wrong value for dt ($dt)\n"; exit; } if( $dist < 0 ){ print "\nError: negative distance\n"; exit; } # convert to kbp $dist/=1000; return $dist; }