Mercurial > repos > fcaramia > methylation_analysis_bismark
diff methylation_analysis/methylation_by_region_converter.pl @ 4:282edadee017 draft
Uploaded
author | fcaramia |
---|---|
date | Mon, 03 Dec 2012 18:26:25 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/methylation_analysis/methylation_by_region_converter.pl Mon Dec 03 18:26:25 2012 -0500 @@ -0,0 +1,161 @@ +#!/usr/bin/perl + +# script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region + +# Created by Jason Ellul, Oct 2012 + + +use strict; +use warnings; +use Getopt::Std; +use File::Basename; +use Data::Dumper; +$| = 1; + +# Grab and set all options +my %OPTIONS = (s => "MethylationData"); +getopts('l:L:o:s:', \%OPTIONS); + +die qq( +Usage: methylation_by_region_converter.pl [OPTIONS] <bed file> <bedGraph 1> [<bedGraph 2> ...] + +OPTIONS: -o STR the name of the output file + -l STR filename of the log file [null] + -L STR append to an existing log file [null] + -s STR Sample Name [$OPTIONS{s}] +) if(@ARGV < 2); + +my $version = 0.1; +my $bed = shift @ARGV; +my @graphs = @ARGV; +my $Script = "methylation_by_region_converter.pl"; +my $now; + +# if log file specified +if(defined $OPTIONS{l}) { + open (FH, ">$OPTIONS{l}") or die "Couldn't create log file $OPTIONS{l}!\n"; + close (FH); + # Open the log file and redirect output to it + open (STDERR, ">>$OPTIONS{l}") or die "Couldn't write to log file $OPTIONS{l}!\n"; + my $now = localtime time; + print "Log File Created $now\n"; +} elsif(defined $OPTIONS{L}) { + #append to a log file + # Open the log file and redirect output to it + open (STDERR, ">>$OPTIONS{L}") or die "Couldn't append to log file $OPTIONS{L}!\n"; + my $now = localtime time; + print "Appending To Log File $now\n\n"; +} + +# print version of this script. +print STDERR "Using $Script version $version\n\n"; +print STDERR "Using region file: $bed\n"; +print STDERR "And bedgraphs: @graphs\n\n"; +my %Regions; +my @chr_order; + +# read in regions file +print STDERR "Reading Regions Bed File $bed ... "; +open(BED, "$bed") || die "$bed: $!"; +while(my $line = <BED>) { + chomp $line; + my @line_sp = split("\t", $line); + $line_sp[2]--; + + push @chr_order, $line_sp[0] unless defined $Regions{$line_sp[0]}; + push @{ $Regions{$line_sp[0]}{Start} }, $line_sp[1]; + push @{ $Regions{$line_sp[0]}{End} }, $line_sp[2]; + push @{ $Regions{$line_sp[0]}{Methylated} }, 0; + push @{ $Regions{$line_sp[0]}{Unmethylated} }, 0; +} +close(BED); +print STDERR "Done.\n"; + +# read in bedgraphs +foreach my $bedGraph (@graphs) { + print STDERR "Loading bedGraph File $bedGraph ... "; + open(GRAPH, $bedGraph) || die "$bedGraph: $!"; + my @lines = <GRAPH>; + close(GRAPH); + print STDERR "Done.\n\n"; + + print STDERR "Processing bedGraph File ... "; + foreach my $line (@lines) { + chomp $line; + my @line_sp = split("\t", $line); + $line_sp[1]++; + + # if the chromosome is in the regions file + if(defined $Regions{$line_sp[0]}) { + my $pos = binsearchregion($line_sp[1], \&cmpFunc, \@{ $Regions{$line_sp[0]}{Start} }, \@{ $Regions{$line_sp[0]}{End} }); + + # if the position is within a region + if($pos >= 0) { + ${ $Regions{$line_sp[0]}{Methylated} }[$pos] += $line_sp[4]; + ${ $Regions{$line_sp[0]}{Unmethylated} }[$pos] += $line_sp[5]; + } + } + } + print STDERR "Done.\n\n"; +} + +if(defined $OPTIONS{o}) { + open (STDOUT, ">$OPTIONS{o}") || die "$OPTIONS{o}: $!"; +} + +# calculate percent methylated and print +print STDERR "Printing output ... "; +print "#type=DNA_METHYLATION\n"; +print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n"; +foreach my $chr (@chr_order) { + for(my $region = 0; $region < @{ $Regions{$chr}{Start} }; $region++) { + my $total = ${ $Regions{$chr}{Methylated} }[$region] + ${ $Regions{$chr}{Unmethylated} }[$region]; + my $fract = sprintf("%.3f", ${ $Regions{$chr}{Methylated} }[$region] / $total) if $total; + print "$OPTIONS{s}\t$chr\t${ $Regions{$chr}{Start} }[$region]\t${ $Regions{$chr}{End} }[$region]\t" if $total; + print "${ $Regions{$chr}{Methylated} }[$region]\t${ $Regions{$chr}{Unmethylated} }[$region]\t$total\t$fract\n" if $total; + } +} +close(STDERR) if defined $OPTIONS{o}; +print STDERR "Done.\n"; + +sub binsearchregion { + my ($target, $cmp, $start, $end) = @_; + + my $posmin = 0; + my $posmax = $#{ $start }; + + return -1 if &$cmp (0, $start, $target) > 0; + return -1 if &$cmp ($#{ $end }, $end, $target) < 0; + + while (1) { + my $mid = int (($posmin + $posmax) / 2); + my $result = &$cmp ($mid, $start, $target); + + if ($result < 0) { + $posmin = $posmax, next if $mid == $posmin && $posmax != $posmin; + if($mid == $posmin) { + return -1 if &$cmp ($mid, $end, $target) < 0; + return $mid; + } + $posmin = $mid; + } elsif ($result > 0) { + $posmax = $posmin, next if $mid == $posmax && $posmax != $posmin; + if($mid == $posmax) { + $mid--; + return -1 if &$cmp ($mid, $end, $target) < 0; + return $mid; + } + $posmax = $mid; + } else { + return $mid; + } + } +} + +sub cmpFunc { + my ($index, $arrayRef, $target) = @_; + my $item = $$arrayRef[$index]; + + return $item <=> $target; +} +