Mercurial > repos > fcaramia > methylation_analysis_bismark
view methylation_analysis/methylation_by_region_converter.pl @ 6:4f09ae8138d1 draft
Uploaded
author | fcaramia |
---|---|
date | Mon, 03 Dec 2012 18:27:21 -0500 |
parents | 282edadee017 |
children |
line wrap: on
line source
#!/usr/bin/perl # script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region # Created by Jason Ellul, Oct 2012 use strict; use warnings; use Getopt::Std; use File::Basename; use Data::Dumper; $| = 1; # Grab and set all options my %OPTIONS = (s => "MethylationData"); getopts('l:L:o:s:', \%OPTIONS); die qq( Usage: methylation_by_region_converter.pl [OPTIONS] <bed file> <bedGraph 1> [<bedGraph 2> ...] OPTIONS: -o STR the name of the output file -l STR filename of the log file [null] -L STR append to an existing log file [null] -s STR Sample Name [$OPTIONS{s}] ) if(@ARGV < 2); my $version = 0.1; my $bed = shift @ARGV; my @graphs = @ARGV; my $Script = "methylation_by_region_converter.pl"; my $now; # if log file specified if(defined $OPTIONS{l}) { open (FH, ">$OPTIONS{l}") or die "Couldn't create log file $OPTIONS{l}!\n"; close (FH); # Open the log file and redirect output to it open (STDERR, ">>$OPTIONS{l}") or die "Couldn't write to log file $OPTIONS{l}!\n"; my $now = localtime time; print "Log File Created $now\n"; } elsif(defined $OPTIONS{L}) { #append to a log file # Open the log file and redirect output to it open (STDERR, ">>$OPTIONS{L}") or die "Couldn't append to log file $OPTIONS{L}!\n"; my $now = localtime time; print "Appending To Log File $now\n\n"; } # print version of this script. print STDERR "Using $Script version $version\n\n"; print STDERR "Using region file: $bed\n"; print STDERR "And bedgraphs: @graphs\n\n"; my %Regions; my @chr_order; # read in regions file print STDERR "Reading Regions Bed File $bed ... "; open(BED, "$bed") || die "$bed: $!"; while(my $line = <BED>) { chomp $line; my @line_sp = split("\t", $line); $line_sp[2]--; push @chr_order, $line_sp[0] unless defined $Regions{$line_sp[0]}; push @{ $Regions{$line_sp[0]}{Start} }, $line_sp[1]; push @{ $Regions{$line_sp[0]}{End} }, $line_sp[2]; push @{ $Regions{$line_sp[0]}{Methylated} }, 0; push @{ $Regions{$line_sp[0]}{Unmethylated} }, 0; } close(BED); print STDERR "Done.\n"; # read in bedgraphs foreach my $bedGraph (@graphs) { print STDERR "Loading bedGraph File $bedGraph ... "; open(GRAPH, $bedGraph) || die "$bedGraph: $!"; my @lines = <GRAPH>; close(GRAPH); print STDERR "Done.\n\n"; print STDERR "Processing bedGraph File ... "; foreach my $line (@lines) { chomp $line; my @line_sp = split("\t", $line); $line_sp[1]++; # if the chromosome is in the regions file if(defined $Regions{$line_sp[0]}) { my $pos = binsearchregion($line_sp[1], \&cmpFunc, \@{ $Regions{$line_sp[0]}{Start} }, \@{ $Regions{$line_sp[0]}{End} }); # if the position is within a region if($pos >= 0) { ${ $Regions{$line_sp[0]}{Methylated} }[$pos] += $line_sp[4]; ${ $Regions{$line_sp[0]}{Unmethylated} }[$pos] += $line_sp[5]; } } } print STDERR "Done.\n\n"; } if(defined $OPTIONS{o}) { open (STDOUT, ">$OPTIONS{o}") || die "$OPTIONS{o}: $!"; } # calculate percent methylated and print print STDERR "Printing output ... "; print "#type=DNA_METHYLATION\n"; print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n"; foreach my $chr (@chr_order) { for(my $region = 0; $region < @{ $Regions{$chr}{Start} }; $region++) { my $total = ${ $Regions{$chr}{Methylated} }[$region] + ${ $Regions{$chr}{Unmethylated} }[$region]; my $fract = sprintf("%.3f", ${ $Regions{$chr}{Methylated} }[$region] / $total) if $total; print "$OPTIONS{s}\t$chr\t${ $Regions{$chr}{Start} }[$region]\t${ $Regions{$chr}{End} }[$region]\t" if $total; print "${ $Regions{$chr}{Methylated} }[$region]\t${ $Regions{$chr}{Unmethylated} }[$region]\t$total\t$fract\n" if $total; } } close(STDERR) if defined $OPTIONS{o}; print STDERR "Done.\n"; sub binsearchregion { my ($target, $cmp, $start, $end) = @_; my $posmin = 0; my $posmax = $#{ $start }; return -1 if &$cmp (0, $start, $target) > 0; return -1 if &$cmp ($#{ $end }, $end, $target) < 0; while (1) { my $mid = int (($posmin + $posmax) / 2); my $result = &$cmp ($mid, $start, $target); if ($result < 0) { $posmin = $posmax, next if $mid == $posmin && $posmax != $posmin; if($mid == $posmin) { return -1 if &$cmp ($mid, $end, $target) < 0; return $mid; } $posmin = $mid; } elsif ($result > 0) { $posmax = $posmin, next if $mid == $posmax && $posmax != $posmin; if($mid == $posmax) { $mid--; return -1 if &$cmp ($mid, $end, $target) < 0; return $mid; } $posmax = $mid; } else { return $mid; } } } sub cmpFunc { my ($index, $arrayRef, $target) = @_; my $item = $$arrayRef[$index]; return $item <=> $target; }