view methylation_analysis/methylation_by_region_converter.pl @ 4:282edadee017 draft

Uploaded
author fcaramia
date Mon, 03 Dec 2012 18:26:25 -0500
parents
children
line wrap: on
line source

#!/usr/bin/perl

# script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region

# Created by Jason Ellul, Oct 2012


use strict;
use warnings;
use Getopt::Std;
use File::Basename;
use Data::Dumper;
$| = 1;

# Grab and set all options
my %OPTIONS = (s => "MethylationData");
getopts('l:L:o:s:', \%OPTIONS);

die qq(
Usage:   methylation_by_region_converter.pl [OPTIONS] <bed file> <bedGraph 1> [<bedGraph 2> ...]

OPTIONS:		-o	STR	the name of the output file
			-l	STR	filename of the log file [null]
			-L	STR	append to an existing log file [null]
			-s	STR	Sample Name [$OPTIONS{s}]
) if(@ARGV < 2);

my $version = 0.1;
my $bed = shift @ARGV;
my @graphs = @ARGV;
my $Script = "methylation_by_region_converter.pl";
my $now;

# if log file specified
if(defined $OPTIONS{l}) {
	open (FH, ">$OPTIONS{l}") or die "Couldn't create log file $OPTIONS{l}!\n";
	close (FH);
	# Open the log file and redirect output to it
	open (STDERR, ">>$OPTIONS{l}") or die "Couldn't write to log file $OPTIONS{l}!\n";
	my $now = localtime time;
	print "Log File Created $now\n";
} elsif(defined $OPTIONS{L}) {
	#append to a log file
	# Open the log file and redirect output to it
	open (STDERR, ">>$OPTIONS{L}") or die "Couldn't append to log file $OPTIONS{L}!\n";
	my $now = localtime time;
	print "Appending To Log File $now\n\n";
}

# print version of this script.
print STDERR "Using $Script version $version\n\n";
print STDERR "Using region file: $bed\n";
print STDERR "And bedgraphs: @graphs\n\n";
my %Regions;
my @chr_order;

# read in regions file
print STDERR "Reading Regions Bed File $bed ... ";
open(BED, "$bed") || die "$bed: $!";
while(my $line = <BED>) {
	chomp $line;
	my @line_sp = split("\t", $line);
	$line_sp[2]--;
	
	push @chr_order, $line_sp[0] unless defined $Regions{$line_sp[0]};		
	push @{ $Regions{$line_sp[0]}{Start} }, $line_sp[1];
	push @{ $Regions{$line_sp[0]}{End} }, $line_sp[2];
	push @{ $Regions{$line_sp[0]}{Methylated} }, 0;
	push @{ $Regions{$line_sp[0]}{Unmethylated} }, 0;
}
close(BED);
print STDERR "Done.\n";

# read in bedgraphs
foreach my $bedGraph (@graphs) {
	print STDERR "Loading bedGraph File $bedGraph ... ";
	open(GRAPH, $bedGraph) || die "$bedGraph: $!";
	my @lines = <GRAPH>;
	close(GRAPH);
	print STDERR "Done.\n\n";
	
	print STDERR "Processing bedGraph File ... ";
	foreach my $line (@lines) {
		chomp $line;
		my @line_sp = split("\t", $line);
		$line_sp[1]++;
		
		# if the chromosome is in the regions file
		if(defined $Regions{$line_sp[0]}) {
			my $pos = binsearchregion($line_sp[1], \&cmpFunc, \@{ $Regions{$line_sp[0]}{Start} }, \@{ $Regions{$line_sp[0]}{End} });
			
			# if the position is within a region
			if($pos >= 0) {
				${ $Regions{$line_sp[0]}{Methylated} }[$pos] += $line_sp[4];
				${ $Regions{$line_sp[0]}{Unmethylated} }[$pos] += $line_sp[5];
			}
		}
	}
	print STDERR "Done.\n\n";
}

if(defined $OPTIONS{o}) {
	open (STDOUT, ">$OPTIONS{o}") || die "$OPTIONS{o}: $!";
}

# calculate percent methylated and print
print STDERR "Printing output ... ";
print "#type=DNA_METHYLATION\n";
print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n";
foreach my $chr (@chr_order) {
	for(my $region = 0; $region < @{ $Regions{$chr}{Start} }; $region++) {
		my $total = ${ $Regions{$chr}{Methylated} }[$region] + ${ $Regions{$chr}{Unmethylated} }[$region];
		my $fract = sprintf("%.3f", ${ $Regions{$chr}{Methylated} }[$region] / $total) if $total;
		print "$OPTIONS{s}\t$chr\t${ $Regions{$chr}{Start} }[$region]\t${ $Regions{$chr}{End} }[$region]\t" if $total;
		print "${ $Regions{$chr}{Methylated} }[$region]\t${ $Regions{$chr}{Unmethylated} }[$region]\t$total\t$fract\n" if $total;
	}
}
close(STDERR) if defined $OPTIONS{o};
print STDERR "Done.\n";

sub binsearchregion {
        my ($target, $cmp, $start, $end) = @_;

        my $posmin = 0;
        my $posmax = $#{ $start };

        return -1 if &$cmp (0, $start, $target) > 0;
        return -1 if &$cmp ($#{ $end }, $end, $target) < 0;

        while (1) {
                my $mid = int (($posmin + $posmax) / 2);
                my $result = &$cmp ($mid, $start, $target);

                if ($result < 0) {
                        $posmin = $posmax, next if $mid == $posmin && $posmax != $posmin;
                        if($mid == $posmin) {
                                return -1 if &$cmp ($mid, $end, $target) < 0;
                                return $mid;
                        }
                        $posmin = $mid;
                } elsif ($result > 0) {
                        $posmax = $posmin, next if $mid == $posmax && $posmax != $posmin;
                        if($mid == $posmax) {
                                $mid--;
                                return -1 if &$cmp ($mid, $end, $target) < 0;
                                return $mid;
                        }
                        $posmax = $mid;
                } else {
                        return $mid;
                }
        }
}

sub cmpFunc {
	my ($index, $arrayRef, $target) = @_;
	my $item = $$arrayRef[$index];

	return $item <=> $target;
}