comparison find_peaks @ 0:be830200e987 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_findpeaks commit 87c99a97cb2ce55640afdd2e55c8b3ae5ad99324
author mvdbeek
date Fri, 20 Apr 2018 04:34:17 -0400
parents
children 7b0ca503bb95
comparison
equal deleted inserted replaced
-1:000000000000 0:be830200e987
1 #!/usr/bin/perl -w
2
3 # Copyright © 2012-2015, Owen Marshall
4
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or (at
8 # your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 # USA
19
20 use strict;
21 use File::Basename;
22 use 5.010;
23 $|++;
24
25 my $version = "1.0.1";
26
27 print STDERR "\nfind_peaks v$version\nCopyright © 2012-15, Owen Marshall\n\n";
28
29 my %vars = (
30 'fdr' => 0.01,
31 'min_count' => 2,
32 'n' => 100,
33 'frac' => 0,
34 'min_quant' => 0.95,
35 'step' => 0.01,
36 'unified_peaks' => 'max',
37 );
38
39 my %vars_details = (
40 'fdr' => 'False discovery rate value',
41 'min_count' => 'Minimum number of fragments to consider as a peak',
42 'n' => 'Number of iterations',
43 'frac' => 'Number of random fragments to consider per iteration',
44 'min_quant' => 'Minimum quantile for considering peaks',
45 'step' => 'Stepping for quantiles',
46 'unified_peaks' => "Method for calling peak overlaps (two options):\n\r'min': call minimum overlapping peak area\n\r'max': call maximum overlap as peak",
47 );
48
49
50 my @in_files;
51 process_cli();
52
53 # Time and date
54 my ($sec,$min,$hour,$mday,$mon,$year) = localtime();
55 my $date = sprintf("%04d-%02d-%02d.%02d-%02d-%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec);
56
57 help() unless @in_files;
58
59 foreach my $fn (@in_files) {
60 my @in;
61 my @unified_peaks;
62 my @sig_peaks;
63 my @peakmins;
64
65 my %peaks;
66 my %peak_count;
67 my %peak_count_real;
68 my %log_scores;
69 my %regression;
70 my %peak_fdr_cutoff;
71 my %fdr;
72
73 # Output file names
74 my ($name,$dir,$ext) = fileparse($fn, qr/\.[^.]*/);
75
76 # filenames
77 my $fn_base_date = "peak_analysis.".$name.".$date";
78 my $base_dir = "$fn_base_date/";
79 my $out = "$base_dir"."$name-FDR$vars{'fdr'}";
80 my $out_peak_unified_track = $out.".peaks.gff";
81 my $out_peaks = $out."_FDR-data";
82
83 # Load gff data files
84 load_gff(FILE=>$fn, IN_REF=>\@in);
85
86 my $probes = @in;
87
88 find_quants(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins);
89 find_randomised_peaks(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks);
90
91 # Make directory
92 mkdir($base_dir);
93
94 # Open peaks file for writing
95 open(OUTP, ">$out_peaks")|| die "Cannot open peak file for writing: $!\n";
96 print OUTP "FDR peak call v$version\n\n";
97 print OUTP "Input file: $fn\n";
98
99 calculate_regressions(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression);
100
101 call_peaks_unified_redux(ITER=>1, REAL=>1, AREF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAKS=>\%peaks);
102
103 calculate_fdr(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression);
104
105 find_significant_peaks(PEAKMINS=>\@peakmins, SIG_PEAKS=>\@sig_peaks, PEAKS=>\%peaks, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr);
106
107 make_unified_peaks(SIG_PEAKS=>\@sig_peaks, UNIFIED_PEAKS=>\@unified_peaks, OUT=>$out_peak_unified_track, TYPE=>$vars{'unified_peaks'});
108
109 print STDERR "$#unified_peaks peaks found.\n\n";
110
111 close OUTP;
112 }
113
114 print STDERR "All done.\n\n";
115 exit 0;
116
117
118 ######################
119 # Subroutines start here
120 #
121
122 sub find_significant_peaks {
123 my (%opts) = @_;
124
125 my $peakmins = $opts{PEAKMINS};
126 my $peaks = $opts{PEAKS};
127 my $sig_peaks = $opts{SIG_PEAKS};
128 my $fdr = $opts{FDR};
129 my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF};
130
131 # Generate significant peaks and unify peaks
132 print STDERR "Selecting significant peaks ... \n";
133
134 foreach my $pm (@$peakmins) {
135 for my $i (0 .. $#{$$peaks{$pm}}) {
136 my ($chr, $pstart, $pend, $mean_pscore, $pscore, $count, $size) = @{ $$peaks{$pm}[$i] };
137 if ($count >= $$peak_fdr_cutoff{$pm}) {
138 push (@$sig_peaks, [ @{$$peaks{$pm}[$i]}, $$fdr{$pm}[$count] ]);
139 }
140 }
141 }
142
143 print OUTP "\nNumber of peaks: $#$sig_peaks\n";
144 }
145
146 sub calculate_fdr {
147 my (%opts) = @_;
148 my $in_ref = $opts{IN_REF};
149 my $peakmins = $opts{PEAKMINS_REF};
150 my $peak_count = $opts{PEAK_COUNT};
151 my $log_scores = $opts{LOG_SCORES};
152 my $regression = $opts{REGRESSION};
153 my $fdr = $opts{FDR};
154 my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF};
155 my $peak_count_real = $opts{PEAK_COUNT_REAL};
156
157 foreach my $pm (@$peakmins) {
158 # get regression variables
159 my ($m,$b) = @{$$regression{$pm}} if $$regression{$pm}[0];
160
161 for my $i (0 .. $#{$$peak_count_real{$pm}}) {
162 next unless $$peak_count_real{$pm}[$i];
163 my $expect = 10**($m*$i + $b);
164
165 my $real_count = $$peak_count_real{$pm}[$i];
166 my $fdr_conservative = $expect/$real_count;
167 $$fdr{$pm}[$i]= $fdr_conservative;
168 }
169 }
170
171 # print FDR rates
172 print OUTP "\n";
173
174 foreach my $pm (@$peakmins) {
175 print OUTP "Peak min = $pm\n";
176 for my $c (0 .. $#{$$fdr{$pm}}) {
177 next unless defined($$fdr{$pm}[$c]);
178 print OUTP "Peak size: $c\tCount: $$peak_count_real{$pm}[$c]\tFDR: $$fdr{$pm}[$c]\n";
179 $$peak_fdr_cutoff{$pm} = $c if (($$fdr{$pm}[$c]<$vars{'fdr'}) && (!$$peak_fdr_cutoff{$pm}));
180 }
181 $$peak_fdr_cutoff{$pm}||= 10**10; # clumsy hack to prevent errors
182 print OUTP "\n";
183 }
184
185 foreach my $pm (@$peakmins) {
186 print OUTP "Peak min $pm: peak cutoff size for alpha = $vars{'fdr'} was $$peak_fdr_cutoff{$pm}\n\n";
187 }
188 }
189
190 sub calculate_regressions {
191 my (%opts) = @_;
192 my $in_ref = $opts{IN_REF};
193 my $peakmins = $opts{PEAKMINS_REF};
194 my $peaks = $opts{PEAKS};
195 my $peak_count = $opts{PEAK_COUNT};
196 my $log_scores = $opts{LOG_SCORES};
197 my $regression = $opts{REGRESSION};
198
199 my $in_num = @$in_ref;
200
201 foreach my $pm (@$peakmins) {
202 print OUTP "Peak min = $pm\n";
203 for my $c (0 .. $#{$$peak_count{$pm}}) {
204 my $peak_count_avg = $$peak_count{$pm}[$c]/$vars{'n'} if $$peak_count{$pm}[$c];
205 next unless $peak_count_avg;
206
207 if ($vars{'frac'}) {
208 $peak_count_avg = $peak_count_avg * $in_num/$vars{'frac'};
209 }
210 $$log_scores{$pm}[$c] = log($peak_count_avg)/log(10);
211 print OUTP "Peak size: $c\tCount:$peak_count_avg\n";
212 }
213
214 # calculate exponential decay rates
215 # y= a+bx for log(y)
216 my ($sumx, $sumy, $sumxsq, $sumxy);
217 my $n=0;
218 for my $i (0 .. $#{$$peak_count{$pm}}) {
219 next unless $$peak_count{$pm}[$i];
220 $n++;
221 $sumx += $i;
222 $sumy += $$log_scores{$pm}[$i];
223 $sumxsq += $i ** 2;
224 $sumxy += $i * $$log_scores{$pm}[$i];
225 }
226
227 next unless $n > 1;
228
229 my $mean_x = $sumx/$n;
230 my $mean_y = $sumy/$n;
231 my $mean_xsq = $sumxsq/$n;
232 my $mean_xy = $sumxy/$n;
233
234 my $b = ($mean_xy - ($mean_x * $mean_y)) / ($mean_xsq - ($mean_x **2));
235 my $a = $mean_y - ($b * $mean_x);
236
237 # store values
238 $$regression{$pm}=[$b, $a];
239 print OUTP "regression: log(y) = $b(x) + $a\n";
240
241 for my $i (0 .. $#{$$peak_count{$pm}}) {
242 next unless $$peak_count{$pm}[$i];
243 my ($b,$a) = @{$$regression{$pm}};
244 my $logval = ($b*$i + $a);
245 my $val = 10**$logval;
246 print OUTP "lin regress: $i\t$$log_scores{$pm}[$i]\t$logval\t$val\n";
247 }
248
249 print OUTP "\n";
250 }
251 }
252
253 sub find_randomised_peaks {
254 my (%opts) = @_;
255 my $in_ref = $opts{IN_REF};
256 my $peakmins_ref = $opts{PEAKMINS_REF};
257 my $peaks = $opts{PEAKS};
258 my $peak_count = $opts{PEAK_COUNT};
259
260 print STDERR "Duplicating ... \n";
261 my @inr = @$in_ref;
262
263 # Call peaks on input file
264 print STDERR "Calling peaks on input file ...\n";
265
266 for my $iter (1 .. $vars{'n'}) {
267 #print STDERR "Iteration $iter ... \r";
268 print STDERR "Iteration $iter: [shuffling] \r";
269
270 if ($vars{'frac'}) {
271 # only use a fraction of the array per rep
272 my @a = inside_out(\@inr, $vars{'frac'});
273 call_peaks_unified_redux(ITER=>$iter, AREF=>\@a, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count);
274 } else {
275 shuffle(\@inr);
276 call_peaks_unified_redux(ITER=>$iter, AREF=>\@inr, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count);
277 }
278 }
279
280 }
281
282 sub call_peaks_unified_redux {
283 my (%opts) = @_;
284 my $iter = $opts{ITER};
285 my $real = $opts{REAL};
286 my $a = $opts{AREF};
287 my $peakmins_ref = $opts{PEAKMINS_REF};
288 my $peak_count_real = $opts{PEAK_COUNT_REAL};
289 my $peaks = $opts{PEAKS};
290 my $peak_count = $opts{PEAK_COUNT};
291
292 my ($pstart, $pend, $inpeak, $pscore, $count);
293 $pstart=$pend=$pscore=$inpeak=$count=0;
294
295 my @tmp_peak;
296 my $total = $#$a;
297
298 if ($real) {
299 print STDERR "Calling real peaks ... \r";
300 } else {
301 print STDERR "Iteration $iter: [processing ...] \r";
302 }
303
304 my $old_chr="";
305 foreach my $pm (@$peakmins_ref) {
306 for my $i (0 .. $total) {
307 my ($chr, $start, $end, $score) = @{ @$a[$i] };
308 next unless $score;
309
310 if ($real) {
311 unless ($chr eq $old_chr) {
312 # Next chromosome
313 # (Peaks can't carry over chromosomes, but we don't use this shortcut when randomly shuffling)
314 $pstart=$pend=$pscore=$inpeak=$count=0;
315 @tmp_peak = () if $real;
316 }
317 }
318 $old_chr = $chr if $real;
319
320 unless ($inpeak) {
321 next unless $score >= $pm;
322 # record new peak
323 $pstart = $start;
324 $pend = $end;
325 $pscore = $score * ($end-$start)/1000;
326 $count++;
327 push @tmp_peak, $score if $real;
328 $inpeak = 1;
329 } else {
330 if ($score >= $pm) {
331 # still in peak
332 $count++;
333
334 # Fragment score to deal with scoring peaks made from uneven sized fragments
335 my $fragment_score = $score * ($end-$start)/1000;
336
337 push @tmp_peak, $score if $real;
338 $pscore += $fragment_score;
339 $pend = $end;
340 } else {
341 # out of a peak
342 if ($count >= $vars{'min_count'}) {
343 # record peak
344 if ($real) {
345 $$peak_count_real{$pm}[$count]++;
346 my $mean_pscore = sprintf('%0.2f',($pscore/(($pend-$pstart)/1000)));
347 push (@{$$peaks{$pm}},[($chr, $pstart, $pend, $mean_pscore, $pscore, $count, ($pend-$pstart))]);
348 } else {
349 $$peak_count{$pm}[$count]++;
350 }
351 }
352
353 # reset
354 $pstart=$pend=$pscore=$inpeak=$count=0;
355 @tmp_peak = () if $real;
356 }
357 }
358 }
359 }
360 }
361
362
363 sub shuffle {
364 # Fisher-Yates shuffle (Knuth shuffle)
365 my ($array) = @_;
366 my $i = @$array;
367 while ( --$i ) {
368 my $j = int rand( $i+1 );
369 @$array[$i,$j] = @$array[$j,$i];
370 }
371 }
372
373 sub inside_out {
374 my ($array, $frac) = @_;
375 my @a;
376 for my $i (0 .. $frac-1) {
377 my $j = int rand( $i+1 );
378 if ($j != $i) {
379 $a[$i] = $a[$j]
380 }
381 $a[$j] = @$array[$i]
382 }
383 return @a;
384 }
385
386 sub load_gff {
387 my (%opts) = @_;
388 my $fn = $opts{FILE};
389 my $in_ref = $opts{IN_REF};
390
391 print STDERR "Reading input file: $fn ...\n";
392 open (IN, "<$fn") || die "Unable to read $fn: $!\n";
393
394 my $i;
395 while (<IN>) {
396 $i++;
397 print STDERR "Read $i lines ...\r" if $i%10000 == 0;
398 chomp;
399
400 my @line = split('\t');
401
402 my ($chr, $start, $end, $score);
403 if ($#line == 3) {
404 # bedgraph
405 ($chr, $start, $end, $score) = @line;
406 } else {
407 # GFF
408 ($chr, $start, $end, $score) = @line[0,3,4,5];
409 }
410
411 next unless $start;
412
413 $score = 0 if $score eq "NA";
414
415 push (@$in_ref, [$chr, $start, $end, $score]);
416 }
417
418 close (IN);
419
420 print STDERR "Sorting ... \n";
421 @$in_ref = sort { $a->[1] <=> $b->[1] } @$in_ref;
422 @$in_ref = sort { $a->[0] cmp $b->[0] } @$in_ref;
423 }
424
425 sub find_quants {
426 my (%opts) = @_;
427
428 my $in_ref = $opts{IN_REF};
429 my $peakmins_ref = $opts{PEAKMINS_REF};
430
431 my %seg;
432 my @frags;
433
434 my $total_coverage;
435
436 foreach my $l (@$in_ref) {
437 my ($chr, $start, $end, $score) = @$l;
438 next unless $score;
439 $score = 0 if $score eq "NA";
440 $total_coverage += $end-$start;
441 push @frags, $score;
442 }
443
444 @frags = sort {$a <=> $b} @frags;
445
446 print STDERR "Total coverage was $total_coverage bp\n";
447
448 my @quants;
449 for (my $q=0;$q<=1;$q+=$vars{'step'}) {
450 push @quants, [$q, int($q * @frags)] if $q > $vars{'min_quant'};
451 }
452
453 foreach (@quants) {
454 my $cut_off = @{$_}[0];
455 my $score = $frags[@{$_}[1]];
456 printf(" Quantile %0.2f: %0.2f\n",$cut_off,$score);
457 $seg{$cut_off} = $score;
458 }
459
460 foreach my $c (sort {$a <=> $b} keys %seg) {
461 push (@$peakmins_ref, $seg{$c});
462 }
463 }
464
465 sub make_unified_peaks {
466 my (%opts) = @_;
467 my $ref = $opts{SIG_PEAKS};
468 my $out_peak_unified_track = $opts{OUT};
469 my $unified_peaks = $opts{UNIFIED_PEAKS};
470 my $type = $opts{TYPE};
471 my $total = @$ref;
472
473 # Unify overlapping peaks, and make significant peaks file
474 my $skipped_peaks;
475 print STDERR "Combining significant peaks ...\n";
476
477 # unroll chromosomes for speed
478 foreach my $chr (uniq( map @{$_}[0], @$ref )) {
479 my @c = grep {@{$_}[0] eq $chr} @$ref;
480
481 my @unified_peaks_chr;
482 foreach my $ar (@c) {
483 if (state $i++ % 100 == 0) {
484 my $pc = sprintf("%0.2f",($i*100)/$total);
485 print STDERR "$pc\% processed ...\r";
486 }
487
488 my ($chra, $start, $end, $score, $total_score, $count, $peaklen, $fdr) = @$ar;
489
490 # next if @unified_peaks_chr already overlaps
491 next if grep {
492 @{$_}[3] < $end
493 && @{$_}[4] > $start
494 } @unified_peaks_chr;
495
496 # Grab all elements that overlap
497 my @test = grep {
498 @{$_}[1] < $end
499 && @{$_}[2] > $start
500 } @c;
501
502 for my $j (0 .. $#test) {
503 my ($chr1, $start1, $end1, $score1, $total_score1, $count1, $peaklen1, $fdr1) = @{$test[$j]};
504
505 next unless $start1 < $end;
506 next unless $end1 > $start;
507
508 if ($type eq 'min') {
509 $start = max($start, $start1);
510 $end = min($end, $end1);
511 } else {
512 $start = min($start, $start1);
513 $end = max($end, $end1);
514 }
515
516 $score = max($score, $score1);
517 $fdr = min($fdr, $fdr1);
518 }
519
520 push @unified_peaks_chr, [($chr, '.', '.', $start, $end, $score, '.', '.', "FDR=$fdr")];
521 }
522
523 @$unified_peaks = (@$unified_peaks, @unified_peaks_chr);
524 }
525
526 $total = $#$unified_peaks;
527
528 print STDERR "Sorting unified peaks ...\n";
529 @$unified_peaks = sort { $a->[3] <=> $b->[3] } @$unified_peaks;
530 @$unified_peaks = sort { $a->[0] cmp $b->[0] } @$unified_peaks;
531
532 print STDERR "Writing unified peaks file ...\n";
533 open(PEAKOUTUNI, ">$out_peak_unified_track") || die "Unable to open peak output track for writing: $!\n\n";
534 for my $j (0 .. $#$unified_peaks) {
535 print PEAKOUTUNI join("\t", @{$$unified_peaks[$j]}), "\n";
536 }
537 }
538
539 sub max {
540 my ($max, @vars) = @_;
541 my $index=0;
542 $max||=0;
543 for my $i (0..$#vars) {
544 ($max, $index) = ($vars[$i], $i+1) if $vars[$i] > $max;
545 }
546 return $max;
547 }
548
549 sub min {
550 my ($min, @vars) = @_;
551 my $index=0;
552 $min||=0;
553 for my $i (0..$#vars) {
554 ($min, $index) = ($vars[$i],$i+1) if $vars[$i] < $min;
555 }
556 return $min;
557 }
558
559 sub uniq {
560 my %seen;
561 return grep { !$seen{$_}++ } @_;
562 }
563
564
565
566 sub process_cli {
567 foreach (@ARGV) {
568 if (/--(.*)=(.*)/) {
569 unless (defined($vars{$1})) {
570 print STDERR "Did not understand $_ ...\n";
571 help();
572 }
573 my ($v, $opt) = ($1,$2);
574 $vars{$v} = $opt;
575 next;
576 } elsif (/--h[elp]*/) {
577 help();
578 } elsif (/--(.*)/) {
579 print STDERR "Please add a parameter to $_ ...\n\n";
580 exit 1;
581 }
582 push @in_files, $_;
583 }
584 }
585
586
587 sub help {
588 print STDOUT <<EOT;
589 Simple FDR random permutation peak caller
590 Usage: [options] [files in bedgraph or GFF format]
591
592 Options:
593 EOT
594
595 my $opt_len = 0;
596 foreach (keys %vars) {
597 my $l = length($_);
598 $opt_len = $l if $l > $opt_len;
599 }
600
601 $opt_len+=2;
602
603 my $cols= `tput cols` || 80;
604
605 my ($v, $val, $def, $def_format);
606 my $help_format = "format STDOUT =\n"
607 .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-4) . "\n"
608 .'$v, $def_format'."\n"
609 .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-6) . "~~\n"
610 .'$v, $def_format'."\n"
611 .".\n";
612
613 eval $help_format;
614 die $@ if $@;
615
616 foreach my $k (sort (keys %vars)) {
617 ($v, $val, $def) = ($k, $vars{$k}, $vars_details{$k});
618 $def||="";
619 $def_format = $val ? "$def\n\r[Current value: $val]" : $def;
620 $v = "--$v";
621 # format =
622 # ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
623 #$v, $def_format
624 # ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ~~
625 #$v, $def_format
626 #.
627
628 write();
629
630 }
631 print STDOUT "\n";
632 exit 1;
633 }