Mercurial > repos > mvdbeek > damidseq_findpeaks
diff find_peaks @ 0:be830200e987 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_findpeaks commit 87c99a97cb2ce55640afdd2e55c8b3ae5ad99324
author | mvdbeek |
---|---|
date | Fri, 20 Apr 2018 04:34:17 -0400 |
parents | |
children | 7b0ca503bb95 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_peaks Fri Apr 20 04:34:17 2018 -0400 @@ -0,0 +1,633 @@ +#!/usr/bin/perl -w + +# Copyright © 2012-2015, Owen Marshall + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at +# your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +# USA + +use strict; +use File::Basename; +use 5.010; +$|++; + +my $version = "1.0.1"; + +print STDERR "\nfind_peaks v$version\nCopyright © 2012-15, Owen Marshall\n\n"; + +my %vars = ( + 'fdr' => 0.01, + 'min_count' => 2, + 'n' => 100, + 'frac' => 0, + 'min_quant' => 0.95, + 'step' => 0.01, + 'unified_peaks' => 'max', +); + +my %vars_details = ( + 'fdr' => 'False discovery rate value', + 'min_count' => 'Minimum number of fragments to consider as a peak', + 'n' => 'Number of iterations', + 'frac' => 'Number of random fragments to consider per iteration', + 'min_quant' => 'Minimum quantile for considering peaks', + 'step' => 'Stepping for quantiles', + 'unified_peaks' => "Method for calling peak overlaps (two options):\n\r'min': call minimum overlapping peak area\n\r'max': call maximum overlap as peak", +); + + +my @in_files; +process_cli(); + +# Time and date +my ($sec,$min,$hour,$mday,$mon,$year) = localtime(); +my $date = sprintf("%04d-%02d-%02d.%02d-%02d-%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec); + +help() unless @in_files; + +foreach my $fn (@in_files) { + my @in; + my @unified_peaks; + my @sig_peaks; + my @peakmins; + + my %peaks; + my %peak_count; + my %peak_count_real; + my %log_scores; + my %regression; + my %peak_fdr_cutoff; + my %fdr; + + # Output file names + my ($name,$dir,$ext) = fileparse($fn, qr/\.[^.]*/); + + # filenames + my $fn_base_date = "peak_analysis.".$name.".$date"; + my $base_dir = "$fn_base_date/"; + my $out = "$base_dir"."$name-FDR$vars{'fdr'}"; + my $out_peak_unified_track = $out.".peaks.gff"; + my $out_peaks = $out."_FDR-data"; + + # Load gff data files + load_gff(FILE=>$fn, IN_REF=>\@in); + + my $probes = @in; + + find_quants(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins); + find_randomised_peaks(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks); + + # Make directory + mkdir($base_dir); + + # Open peaks file for writing + open(OUTP, ">$out_peaks")|| die "Cannot open peak file for writing: $!\n"; + print OUTP "FDR peak call v$version\n\n"; + print OUTP "Input file: $fn\n"; + + calculate_regressions(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression); + + call_peaks_unified_redux(ITER=>1, REAL=>1, AREF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAKS=>\%peaks); + + calculate_fdr(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression); + + find_significant_peaks(PEAKMINS=>\@peakmins, SIG_PEAKS=>\@sig_peaks, PEAKS=>\%peaks, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr); + + make_unified_peaks(SIG_PEAKS=>\@sig_peaks, UNIFIED_PEAKS=>\@unified_peaks, OUT=>$out_peak_unified_track, TYPE=>$vars{'unified_peaks'}); + + print STDERR "$#unified_peaks peaks found.\n\n"; + + close OUTP; +} + +print STDERR "All done.\n\n"; +exit 0; + + +###################### +# Subroutines start here +# + +sub find_significant_peaks { + my (%opts) = @_; + + my $peakmins = $opts{PEAKMINS}; + my $peaks = $opts{PEAKS}; + my $sig_peaks = $opts{SIG_PEAKS}; + my $fdr = $opts{FDR}; + my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF}; + + # Generate significant peaks and unify peaks + print STDERR "Selecting significant peaks ... \n"; + + foreach my $pm (@$peakmins) { + for my $i (0 .. $#{$$peaks{$pm}}) { + my ($chr, $pstart, $pend, $mean_pscore, $pscore, $count, $size) = @{ $$peaks{$pm}[$i] }; + if ($count >= $$peak_fdr_cutoff{$pm}) { + push (@$sig_peaks, [ @{$$peaks{$pm}[$i]}, $$fdr{$pm}[$count] ]); + } + } + } + + print OUTP "\nNumber of peaks: $#$sig_peaks\n"; +} + +sub calculate_fdr { + my (%opts) = @_; + my $in_ref = $opts{IN_REF}; + my $peakmins = $opts{PEAKMINS_REF}; + my $peak_count = $opts{PEAK_COUNT}; + my $log_scores = $opts{LOG_SCORES}; + my $regression = $opts{REGRESSION}; + my $fdr = $opts{FDR}; + my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF}; + my $peak_count_real = $opts{PEAK_COUNT_REAL}; + + foreach my $pm (@$peakmins) { + # get regression variables + my ($m,$b) = @{$$regression{$pm}} if $$regression{$pm}[0]; + + for my $i (0 .. $#{$$peak_count_real{$pm}}) { + next unless $$peak_count_real{$pm}[$i]; + my $expect = 10**($m*$i + $b); + + my $real_count = $$peak_count_real{$pm}[$i]; + my $fdr_conservative = $expect/$real_count; + $$fdr{$pm}[$i]= $fdr_conservative; + } + } + + # print FDR rates + print OUTP "\n"; + + foreach my $pm (@$peakmins) { + print OUTP "Peak min = $pm\n"; + for my $c (0 .. $#{$$fdr{$pm}}) { + next unless defined($$fdr{$pm}[$c]); + print OUTP "Peak size: $c\tCount: $$peak_count_real{$pm}[$c]\tFDR: $$fdr{$pm}[$c]\n"; + $$peak_fdr_cutoff{$pm} = $c if (($$fdr{$pm}[$c]<$vars{'fdr'}) && (!$$peak_fdr_cutoff{$pm})); + } + $$peak_fdr_cutoff{$pm}||= 10**10; # clumsy hack to prevent errors + print OUTP "\n"; + } + + foreach my $pm (@$peakmins) { + print OUTP "Peak min $pm: peak cutoff size for alpha = $vars{'fdr'} was $$peak_fdr_cutoff{$pm}\n\n"; + } +} + +sub calculate_regressions { + my (%opts) = @_; + my $in_ref = $opts{IN_REF}; + my $peakmins = $opts{PEAKMINS_REF}; + my $peaks = $opts{PEAKS}; + my $peak_count = $opts{PEAK_COUNT}; + my $log_scores = $opts{LOG_SCORES}; + my $regression = $opts{REGRESSION}; + + my $in_num = @$in_ref; + + foreach my $pm (@$peakmins) { + print OUTP "Peak min = $pm\n"; + for my $c (0 .. $#{$$peak_count{$pm}}) { + my $peak_count_avg = $$peak_count{$pm}[$c]/$vars{'n'} if $$peak_count{$pm}[$c]; + next unless $peak_count_avg; + + if ($vars{'frac'}) { + $peak_count_avg = $peak_count_avg * $in_num/$vars{'frac'}; + } + $$log_scores{$pm}[$c] = log($peak_count_avg)/log(10); + print OUTP "Peak size: $c\tCount:$peak_count_avg\n"; + } + + # calculate exponential decay rates + # y= a+bx for log(y) + my ($sumx, $sumy, $sumxsq, $sumxy); + my $n=0; + for my $i (0 .. $#{$$peak_count{$pm}}) { + next unless $$peak_count{$pm}[$i]; + $n++; + $sumx += $i; + $sumy += $$log_scores{$pm}[$i]; + $sumxsq += $i ** 2; + $sumxy += $i * $$log_scores{$pm}[$i]; + } + + next unless $n > 1; + + my $mean_x = $sumx/$n; + my $mean_y = $sumy/$n; + my $mean_xsq = $sumxsq/$n; + my $mean_xy = $sumxy/$n; + + my $b = ($mean_xy - ($mean_x * $mean_y)) / ($mean_xsq - ($mean_x **2)); + my $a = $mean_y - ($b * $mean_x); + + # store values + $$regression{$pm}=[$b, $a]; + print OUTP "regression: log(y) = $b(x) + $a\n"; + + for my $i (0 .. $#{$$peak_count{$pm}}) { + next unless $$peak_count{$pm}[$i]; + my ($b,$a) = @{$$regression{$pm}}; + my $logval = ($b*$i + $a); + my $val = 10**$logval; + print OUTP "lin regress: $i\t$$log_scores{$pm}[$i]\t$logval\t$val\n"; + } + + print OUTP "\n"; + } +} + +sub find_randomised_peaks { + my (%opts) = @_; + my $in_ref = $opts{IN_REF}; + my $peakmins_ref = $opts{PEAKMINS_REF}; + my $peaks = $opts{PEAKS}; + my $peak_count = $opts{PEAK_COUNT}; + + print STDERR "Duplicating ... \n"; + my @inr = @$in_ref; + + # Call peaks on input file + print STDERR "Calling peaks on input file ...\n"; + + for my $iter (1 .. $vars{'n'}) { + #print STDERR "Iteration $iter ... \r"; + print STDERR "Iteration $iter: [shuffling] \r"; + + if ($vars{'frac'}) { + # only use a fraction of the array per rep + my @a = inside_out(\@inr, $vars{'frac'}); + call_peaks_unified_redux(ITER=>$iter, AREF=>\@a, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count); + } else { + shuffle(\@inr); + call_peaks_unified_redux(ITER=>$iter, AREF=>\@inr, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count); + } + } + +} + +sub call_peaks_unified_redux { + my (%opts) = @_; + my $iter = $opts{ITER}; + my $real = $opts{REAL}; + my $a = $opts{AREF}; + my $peakmins_ref = $opts{PEAKMINS_REF}; + my $peak_count_real = $opts{PEAK_COUNT_REAL}; + my $peaks = $opts{PEAKS}; + my $peak_count = $opts{PEAK_COUNT}; + + my ($pstart, $pend, $inpeak, $pscore, $count); + $pstart=$pend=$pscore=$inpeak=$count=0; + + my @tmp_peak; + my $total = $#$a; + + if ($real) { + print STDERR "Calling real peaks ... \r"; + } else { + print STDERR "Iteration $iter: [processing ...] \r"; + } + + my $old_chr=""; + foreach my $pm (@$peakmins_ref) { + for my $i (0 .. $total) { + my ($chr, $start, $end, $score) = @{ @$a[$i] }; + next unless $score; + + if ($real) { + unless ($chr eq $old_chr) { + # Next chromosome + # (Peaks can't carry over chromosomes, but we don't use this shortcut when randomly shuffling) + $pstart=$pend=$pscore=$inpeak=$count=0; + @tmp_peak = () if $real; + } + } + $old_chr = $chr if $real; + + unless ($inpeak) { + next unless $score >= $pm; + # record new peak + $pstart = $start; + $pend = $end; + $pscore = $score * ($end-$start)/1000; + $count++; + push @tmp_peak, $score if $real; + $inpeak = 1; + } else { + if ($score >= $pm) { + # still in peak + $count++; + + # Fragment score to deal with scoring peaks made from uneven sized fragments + my $fragment_score = $score * ($end-$start)/1000; + + push @tmp_peak, $score if $real; + $pscore += $fragment_score; + $pend = $end; + } else { + # out of a peak + if ($count >= $vars{'min_count'}) { + # record peak + if ($real) { + $$peak_count_real{$pm}[$count]++; + my $mean_pscore = sprintf('%0.2f',($pscore/(($pend-$pstart)/1000))); + push (@{$$peaks{$pm}},[($chr, $pstart, $pend, $mean_pscore, $pscore, $count, ($pend-$pstart))]); + } else { + $$peak_count{$pm}[$count]++; + } + } + + # reset + $pstart=$pend=$pscore=$inpeak=$count=0; + @tmp_peak = () if $real; + } + } + } + } +} + + +sub shuffle { + # Fisher-Yates shuffle (Knuth shuffle) + my ($array) = @_; + my $i = @$array; + while ( --$i ) { + my $j = int rand( $i+1 ); + @$array[$i,$j] = @$array[$j,$i]; + } +} + +sub inside_out { + my ($array, $frac) = @_; + my @a; + for my $i (0 .. $frac-1) { + my $j = int rand( $i+1 ); + if ($j != $i) { + $a[$i] = $a[$j] + } + $a[$j] = @$array[$i] + } + return @a; +} + +sub load_gff { + my (%opts) = @_; + my $fn = $opts{FILE}; + my $in_ref = $opts{IN_REF}; + + print STDERR "Reading input file: $fn ...\n"; + open (IN, "<$fn") || die "Unable to read $fn: $!\n"; + + my $i; + while (<IN>) { + $i++; + print STDERR "Read $i lines ...\r" if $i%10000 == 0; + chomp; + + my @line = split('\t'); + + my ($chr, $start, $end, $score); + if ($#line == 3) { + # bedgraph + ($chr, $start, $end, $score) = @line; + } else { + # GFF + ($chr, $start, $end, $score) = @line[0,3,4,5]; + } + + next unless $start; + + $score = 0 if $score eq "NA"; + + push (@$in_ref, [$chr, $start, $end, $score]); + } + + close (IN); + + print STDERR "Sorting ... \n"; + @$in_ref = sort { $a->[1] <=> $b->[1] } @$in_ref; + @$in_ref = sort { $a->[0] cmp $b->[0] } @$in_ref; +} + +sub find_quants { + my (%opts) = @_; + + my $in_ref = $opts{IN_REF}; + my $peakmins_ref = $opts{PEAKMINS_REF}; + + my %seg; + my @frags; + + my $total_coverage; + + foreach my $l (@$in_ref) { + my ($chr, $start, $end, $score) = @$l; + next unless $score; + $score = 0 if $score eq "NA"; + $total_coverage += $end-$start; + push @frags, $score; + } + + @frags = sort {$a <=> $b} @frags; + + print STDERR "Total coverage was $total_coverage bp\n"; + + my @quants; + for (my $q=0;$q<=1;$q+=$vars{'step'}) { + push @quants, [$q, int($q * @frags)] if $q > $vars{'min_quant'}; + } + + foreach (@quants) { + my $cut_off = @{$_}[0]; + my $score = $frags[@{$_}[1]]; + printf(" Quantile %0.2f: %0.2f\n",$cut_off,$score); + $seg{$cut_off} = $score; + } + + foreach my $c (sort {$a <=> $b} keys %seg) { + push (@$peakmins_ref, $seg{$c}); + } +} + +sub make_unified_peaks { + my (%opts) = @_; + my $ref = $opts{SIG_PEAKS}; + my $out_peak_unified_track = $opts{OUT}; + my $unified_peaks = $opts{UNIFIED_PEAKS}; + my $type = $opts{TYPE}; + my $total = @$ref; + + # Unify overlapping peaks, and make significant peaks file + my $skipped_peaks; + print STDERR "Combining significant peaks ...\n"; + + # unroll chromosomes for speed + foreach my $chr (uniq( map @{$_}[0], @$ref )) { + my @c = grep {@{$_}[0] eq $chr} @$ref; + + my @unified_peaks_chr; + foreach my $ar (@c) { + if (state $i++ % 100 == 0) { + my $pc = sprintf("%0.2f",($i*100)/$total); + print STDERR "$pc\% processed ...\r"; + } + + my ($chra, $start, $end, $score, $total_score, $count, $peaklen, $fdr) = @$ar; + + # next if @unified_peaks_chr already overlaps + next if grep { + @{$_}[3] < $end + && @{$_}[4] > $start + } @unified_peaks_chr; + + # Grab all elements that overlap + my @test = grep { + @{$_}[1] < $end + && @{$_}[2] > $start + } @c; + + for my $j (0 .. $#test) { + my ($chr1, $start1, $end1, $score1, $total_score1, $count1, $peaklen1, $fdr1) = @{$test[$j]}; + + next unless $start1 < $end; + next unless $end1 > $start; + + if ($type eq 'min') { + $start = max($start, $start1); + $end = min($end, $end1); + } else { + $start = min($start, $start1); + $end = max($end, $end1); + } + + $score = max($score, $score1); + $fdr = min($fdr, $fdr1); + } + + push @unified_peaks_chr, [($chr, '.', '.', $start, $end, $score, '.', '.', "FDR=$fdr")]; + } + + @$unified_peaks = (@$unified_peaks, @unified_peaks_chr); + } + + $total = $#$unified_peaks; + + print STDERR "Sorting unified peaks ...\n"; + @$unified_peaks = sort { $a->[3] <=> $b->[3] } @$unified_peaks; + @$unified_peaks = sort { $a->[0] cmp $b->[0] } @$unified_peaks; + + print STDERR "Writing unified peaks file ...\n"; + open(PEAKOUTUNI, ">$out_peak_unified_track") || die "Unable to open peak output track for writing: $!\n\n"; + for my $j (0 .. $#$unified_peaks) { + print PEAKOUTUNI join("\t", @{$$unified_peaks[$j]}), "\n"; + } +} + +sub max { + my ($max, @vars) = @_; + my $index=0; + $max||=0; + for my $i (0..$#vars) { + ($max, $index) = ($vars[$i], $i+1) if $vars[$i] > $max; + } + return $max; +} + +sub min { + my ($min, @vars) = @_; + my $index=0; + $min||=0; + for my $i (0..$#vars) { + ($min, $index) = ($vars[$i],$i+1) if $vars[$i] < $min; + } + return $min; +} + +sub uniq { + my %seen; + return grep { !$seen{$_}++ } @_; +} + + + +sub process_cli { + foreach (@ARGV) { + if (/--(.*)=(.*)/) { + unless (defined($vars{$1})) { + print STDERR "Did not understand $_ ...\n"; + help(); + } + my ($v, $opt) = ($1,$2); + $vars{$v} = $opt; + next; + } elsif (/--h[elp]*/) { + help(); + } elsif (/--(.*)/) { + print STDERR "Please add a parameter to $_ ...\n\n"; + exit 1; + } + push @in_files, $_; + } +} + + +sub help { + print STDOUT <<EOT; +Simple FDR random permutation peak caller +Usage: [options] [files in bedgraph or GFF format] + +Options: +EOT + + my $opt_len = 0; + foreach (keys %vars) { + my $l = length($_); + $opt_len = $l if $l > $opt_len; + } + + $opt_len+=2; + + my $cols= `tput cols` || 80; + + my ($v, $val, $def, $def_format); + my $help_format = "format STDOUT =\n" + .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-4) . "\n" + .'$v, $def_format'."\n" + .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-6) . "~~\n" + .'$v, $def_format'."\n" + .".\n"; + + eval $help_format; + die $@ if $@; + + foreach my $k (sort (keys %vars)) { + ($v, $val, $def) = ($k, $vars{$k}, $vars_details{$k}); + $def||=""; + $def_format = $val ? "$def\n\r[Current value: $val]" : $def; + $v = "--$v"; +# format = +# ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +#$v, $def_format +# ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ~~ +#$v, $def_format +#. + + write(); + + } + print STDOUT "\n"; + exit 1; +}