diff methylation_analysis/methylation_by_region_converter.pl @ 4:282edadee017 draft

Uploaded
author fcaramia
date Mon, 03 Dec 2012 18:26:25 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis/methylation_by_region_converter.pl	Mon Dec 03 18:26:25 2012 -0500
@@ -0,0 +1,161 @@
+#!/usr/bin/perl
+
+# script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region
+
+# Created by Jason Ellul, Oct 2012
+
+
+use strict;
+use warnings;
+use Getopt::Std;
+use File::Basename;
+use Data::Dumper;
+$| = 1;
+
+# Grab and set all options
+my %OPTIONS = (s => "MethylationData");
+getopts('l:L:o:s:', \%OPTIONS);
+
+die qq(
+Usage:   methylation_by_region_converter.pl [OPTIONS] <bed file> <bedGraph 1> [<bedGraph 2> ...]
+
+OPTIONS:		-o	STR	the name of the output file
+			-l	STR	filename of the log file [null]
+			-L	STR	append to an existing log file [null]
+			-s	STR	Sample Name [$OPTIONS{s}]
+) if(@ARGV < 2);
+
+my $version = 0.1;
+my $bed = shift @ARGV;
+my @graphs = @ARGV;
+my $Script = "methylation_by_region_converter.pl";
+my $now;
+
+# if log file specified
+if(defined $OPTIONS{l}) {
+	open (FH, ">$OPTIONS{l}") or die "Couldn't create log file $OPTIONS{l}!\n";
+	close (FH);
+	# Open the log file and redirect output to it
+	open (STDERR, ">>$OPTIONS{l}") or die "Couldn't write to log file $OPTIONS{l}!\n";
+	my $now = localtime time;
+	print "Log File Created $now\n";
+} elsif(defined $OPTIONS{L}) {
+	#append to a log file
+	# Open the log file and redirect output to it
+	open (STDERR, ">>$OPTIONS{L}") or die "Couldn't append to log file $OPTIONS{L}!\n";
+	my $now = localtime time;
+	print "Appending To Log File $now\n\n";
+}
+
+# print version of this script.
+print STDERR "Using $Script version $version\n\n";
+print STDERR "Using region file: $bed\n";
+print STDERR "And bedgraphs: @graphs\n\n";
+my %Regions;
+my @chr_order;
+
+# read in regions file
+print STDERR "Reading Regions Bed File $bed ... ";
+open(BED, "$bed") || die "$bed: $!";
+while(my $line = <BED>) {
+	chomp $line;
+	my @line_sp = split("\t", $line);
+	$line_sp[2]--;
+	
+	push @chr_order, $line_sp[0] unless defined $Regions{$line_sp[0]};		
+	push @{ $Regions{$line_sp[0]}{Start} }, $line_sp[1];
+	push @{ $Regions{$line_sp[0]}{End} }, $line_sp[2];
+	push @{ $Regions{$line_sp[0]}{Methylated} }, 0;
+	push @{ $Regions{$line_sp[0]}{Unmethylated} }, 0;
+}
+close(BED);
+print STDERR "Done.\n";
+
+# read in bedgraphs
+foreach my $bedGraph (@graphs) {
+	print STDERR "Loading bedGraph File $bedGraph ... ";
+	open(GRAPH, $bedGraph) || die "$bedGraph: $!";
+	my @lines = <GRAPH>;
+	close(GRAPH);
+	print STDERR "Done.\n\n";
+	
+	print STDERR "Processing bedGraph File ... ";
+	foreach my $line (@lines) {
+		chomp $line;
+		my @line_sp = split("\t", $line);
+		$line_sp[1]++;
+		
+		# if the chromosome is in the regions file
+		if(defined $Regions{$line_sp[0]}) {
+			my $pos = binsearchregion($line_sp[1], \&cmpFunc, \@{ $Regions{$line_sp[0]}{Start} }, \@{ $Regions{$line_sp[0]}{End} });
+			
+			# if the position is within a region
+			if($pos >= 0) {
+				${ $Regions{$line_sp[0]}{Methylated} }[$pos] += $line_sp[4];
+				${ $Regions{$line_sp[0]}{Unmethylated} }[$pos] += $line_sp[5];
+			}
+		}
+	}
+	print STDERR "Done.\n\n";
+}
+
+if(defined $OPTIONS{o}) {
+	open (STDOUT, ">$OPTIONS{o}") || die "$OPTIONS{o}: $!";
+}
+
+# calculate percent methylated and print
+print STDERR "Printing output ... ";
+print "#type=DNA_METHYLATION\n";
+print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n";
+foreach my $chr (@chr_order) {
+	for(my $region = 0; $region < @{ $Regions{$chr}{Start} }; $region++) {
+		my $total = ${ $Regions{$chr}{Methylated} }[$region] + ${ $Regions{$chr}{Unmethylated} }[$region];
+		my $fract = sprintf("%.3f", ${ $Regions{$chr}{Methylated} }[$region] / $total) if $total;
+		print "$OPTIONS{s}\t$chr\t${ $Regions{$chr}{Start} }[$region]\t${ $Regions{$chr}{End} }[$region]\t" if $total;
+		print "${ $Regions{$chr}{Methylated} }[$region]\t${ $Regions{$chr}{Unmethylated} }[$region]\t$total\t$fract\n" if $total;
+	}
+}
+close(STDERR) if defined $OPTIONS{o};
+print STDERR "Done.\n";
+
+sub binsearchregion {
+        my ($target, $cmp, $start, $end) = @_;
+
+        my $posmin = 0;
+        my $posmax = $#{ $start };
+
+        return -1 if &$cmp (0, $start, $target) > 0;
+        return -1 if &$cmp ($#{ $end }, $end, $target) < 0;
+
+        while (1) {
+                my $mid = int (($posmin + $posmax) / 2);
+                my $result = &$cmp ($mid, $start, $target);
+
+                if ($result < 0) {
+                        $posmin = $posmax, next if $mid == $posmin && $posmax != $posmin;
+                        if($mid == $posmin) {
+                                return -1 if &$cmp ($mid, $end, $target) < 0;
+                                return $mid;
+                        }
+                        $posmin = $mid;
+                } elsif ($result > 0) {
+                        $posmax = $posmin, next if $mid == $posmax && $posmax != $posmin;
+                        if($mid == $posmax) {
+                                $mid--;
+                                return -1 if &$cmp ($mid, $end, $target) < 0;
+                                return $mid;
+                        }
+                        $posmax = $mid;
+                } else {
+                        return $mid;
+                }
+        }
+}
+
+sub cmpFunc {
+	my ($index, $arrayRef, $target) = @_;
+	my $item = $$arrayRef[$index];
+
+	return $item <=> $target;
+}
+