Mercurial > repos > rnateam > graphprot_predict_profile
diff graphprot_predict_profile_wrapper.pl @ 0:215925e588c4 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 9bb5f3c8ed8e87ec5652b5bc8bf9c774d5534a1a
author | rnateam |
---|---|
date | Fri, 25 May 2018 12:17:44 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/graphprot_predict_profile_wrapper.pl Fri May 25 12:17:44 2018 -0400 @@ -0,0 +1,764 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Getopt::Long; +use Pod::Usage; +use Cwd qw(getcwd abs_path); +use List::Util qw(sum); + +=head1 NAME + +=head1 SYNOPSIS + +Galaxy wrapper script for GraphProt (-action predict_profile) to compute +the binding profile for a given model on a given set of sequences provided +in FASTA format. After profile prediction, average profiles get computed, +scores signified and binding peak regions extracted. The score signification +is done using the provided fitted GEV parameters, either from .params file +or manually set. If score threshold is set (-thr-sc), p-value assignment +will be skipped and set score threshold will be used to extract peak +regions. NOTE: Additional lines .params file are used to store and get +GEV parameters, as well as type of model (model_type: sequence|structure). +Also, this wrapper currently works for classification mode only. + +PARAMETERS: + + -help|h display help page + -fasta Input FASTA file (option -fasta) + -model Input .model file (option -model) + -params Input .params file + NOTE: uses .params file with additional + parameters + Manually set parameters (below) will override + found settings in .params file + -data-id Data ID (option -prefix) + +GraphProt model parameters (by default get from .params file): + + -onlyseq Set if model is a sequence model + -R GraphProt model R parameter + -D GraphProt model D parameter + -epochs GraphProt model epochs parameter + -lambda GraphProt model lambda parameter + -bitsize GraphProt model bitsize parameter + -abstraction GraphProt model RNAshapes abstraction level + parameter (set for structure models) + +Peak region extraction parameters: + + -thr-sc Score threshold for extracting peak regions + By default p-value of 0.05 is used. If no p-value + calculation possible, -thr-sc is used with default: 0 + -thr-p p-value threshold for extracting peak regions + By default, peak regions with p = 0.05 are extracted, + as well as p50 score peak regions (if info given) + Default: 0.05 + -merge-dist Maximum merge distance for nearby peak regions + Default: report all non-overlapping regions + -p50-out Output p50 score filtered peak regions BED file + default: false + +GEV distribution parameters: + + -distr-my GEV distribution my parameter for calculating p-values + from scores + -distr-sigma GEV distrubution sigma parameter for calculating + p-values from scores + -distr-xi GEV distribution xi parameter for calculating p-values + from scores + -ap-extlr Used average profile left right extension for + averaging scores, which were used for distribution + fitting. NOTE: usually a value of 5 was used for + for getting GEV distribution and parameters. If you + choose a different value here, calculated p-values + will be wrong! + default : 5 + + +=head1 DISCRIPTION + +5) Write manual +6) add output p50 file with NOTE + +6) put GP into rna_tools + +NOTE: +Additional lines .params file used to store and get gev parameters, as well +as type of model (model_type: sequence|structure). + +Example .params content: +epochs: 20 +lambda: 0.001 +R: 1 +D: 4 +bitsize: 14 +model_type: sequence +#ADDITIONAL MODEL PARAMETERS +ap_extlr: 5 +gev_my: -2.5408 +gev_sigma: 1.6444 +gev_xi: -0.1383 +p50_score: 6.51534 +p50_p_val: 0.0009059744 + +=cut + +############################ +# COMMAND LINE CHECKING. +############################ + +# Command line argument variables. +my ($i_help, $i_fasta, $i_model, $i_params, $i_data_id, $i_thr_sc, $i_thr_p, $i_max_merge_dist, $i_p50_out, $i_gp_r, $i_gp_d, $i_gp_epochs, $i_gp_lambda, $i_gp_bitsize, $i_gp_abstr, $i_gp_onlyseq, $i_distr_my, $i_distr_sigma, $i_distr_xi, $i_ap_extlr); + +# Parse the command line. +GetOptions ( "help|h" => \$i_help, + "fasta:s" => \$i_fasta, + "model:s" => \$i_model, + "params:s" => \$i_params, + "data-id:s" => \$i_data_id, + "thr-sc:f" => \$i_thr_sc, + "thr-p:f" => \$i_thr_p, + "p50-out" => \$i_p50_out, + "merge-dist:i" => \$i_max_merge_dist, + "R:i" => \$i_gp_r, + "D:i" => \$i_gp_d, + "epochs:i" => \$i_gp_epochs, + "lambda:f" => \$i_gp_lambda, + "bitsize:i" => \$i_gp_bitsize, + "abstr:i" => \$i_gp_abstr, + "onlyseq" => \$i_gp_onlyseq, + "distr-my:f" => \$i_distr_my, + "distr-sigma:f" => \$i_distr_sigma, + "distr-xi:f" => \$i_distr_xi, + "ap-extlr:i" => \$i_ap_extlr ) or pod2usage(1); + +# Check. +pod2usage(1) if $i_help; +($i_fasta and $i_model) or pod2usage "ERROR: -fasta, -model are mandatory"; +if ($i_distr_my or $i_distr_sigma or $i_distr_xi) { + ($i_distr_my and $i_distr_sigma and $i_distr_xi) or pod2usage "ERROR: expects all three distribution parameters to be set"; +} + +####################### +# SET PARAMETERS. +####################### + +# Prefix. +my $data_id = "GraphProt"; +if ($i_data_id) { + $data_id = $i_data_id; +} +my $thr_sc = 0; +if ($i_thr_sc) { + $thr_sc = $i_thr_sc; +} +my $thr_p = 0.05; +if ($i_thr_p) { + $thr_p = $i_thr_p; +} +my $ap_extlr = 5; +if ($i_ap_extlr) { + $ap_extlr = $i_ap_extlr; +} +my $max_merge_dist = 0; +if ($i_max_merge_dist) { + $max_merge_dist = $i_max_merge_dist; +} + +# Get parameters from .params file. +my %params; +if ($i_params) { + open(IN, $i_params) or die "Cannot open $i_params: $!"; + while(<IN>) { + next if ($_ =~ /^#/); + if ($_ =~ /(.+):\s(.+)/) { + $params{$1} = $2; + } + } + close IN; +} + +# Create GP parameter string. +my $params_string = ""; +# -onlyseq +my $model_type = "structure"; +if (exists $params{"model_type"}) { + if ($params{"model_type"} eq "sequence") { + $params_string .= " -onlyseq"; + $model_type = "sequence"; + } +} elsif ($i_gp_onlyseq) { + $params_string .= " -onlyseq"; + $model_type = "sequence"; +} +# -R +if ($i_gp_r) { + $params_string .= " -R $i_gp_r"; +} elsif (exists $params{"R"}) { + my $v = $params{"R"}; + $params_string .= " -R $v"; +} else { + die "ERROR: -R needs to be set"; +} +# -D +if ($i_gp_d) { + $params_string .= " -D $i_gp_d"; +} elsif (exists $params{"D"}) { + my $v = $params{"D"}; + $params_string .= " -D $v"; +} else { + die "ERROR: -D needs to be set"; +} +# -epochs +if ($i_gp_epochs) { + $params_string .= " -epochs $i_gp_epochs"; +} elsif (exists $params{"epochs"}) { + my $v = $params{"epochs"}; + $params_string .= " -epochs $v"; +} else { + die "ERROR: -epochs needs to be set"; +} +# -lambda +if ($i_gp_lambda) { + $params_string .= " -lambda $i_gp_lambda"; +} elsif (exists $params{"lambda"}) { + my $v = $params{"lambda"}; + $params_string .= " -lambda $v"; +} else { + die "ERROR: -lambda needs to be set"; +} +# -bitsize +if ($i_gp_bitsize) { + $params_string .= " -bitsize $i_gp_bitsize"; +} elsif (exists $params{"bitsize"}) { + my $v = $params{"bitsize"}; + $params_string .= " -bitsize $v"; +} else { + die "ERROR: -bitsize needs to be set"; +} +# -abstraction +if ($i_gp_abstr) { + $params_string .= " -bitsize $i_gp_abstr"; + die "ERROR: -abstraction set with -onlyseq" unless ($model_type eq "structure"); +} elsif (exists $params{"abstraction"}) { + my $v = $params{"abstraction"}; + $params_string .= " -abstraction $v"; + die "ERROR: -abstraction set with -onlyseq" unless ($model_type eq "structure"); +} + +# Distribution parameters. +my ($distr_my, $distr_sigma, $distr_xi); +if ($i_distr_my) { + $distr_my = $i_distr_my; +} elsif (exists $params{"gev_my"}) { + $distr_my = $params{"gev_my"}; +} +if ($i_distr_sigma) { + $distr_sigma = $i_distr_sigma; +} elsif (exists $params{"gev_sigma"}) { + $distr_sigma = $params{"gev_sigma"}; +} +if ($i_distr_xi) { + $distr_xi = $i_distr_xi; +} elsif (exists $params{"gev_xi"}) { + $distr_xi = $params{"gev_xi"}; +} +# Average profile extension parameter. +if (exists $params{"ap_extlr"}) { + $ap_extlr = $params{"ap_extlr"}; +} + +print STDOUT "model_type: $model_type\n"; +print STDOUT "ap_extlr: $ap_extlr\n"; + +if ($distr_my and $distr_sigma and $distr_xi) { + print STDOUT "distr_my: $distr_my\n"; + print STDOUT "distr_sigma: $distr_sigma\n"; + print STDOUT "distr_xi: $distr_xi\n"; +} +print STDOUT "max_merge_dist: $max_merge_dist\n"; + +# p50 filter score. +my $p50_sc; +if (exists $params{"p50_score"}) { + $p50_sc = $params{"p50_score"}; + print STDOUT "p50_score: $p50_sc\n"; +} +# p50 p-value. +my $p50_p; +if (exists $params{"p50_p_val"}) { + $p50_p = $params{"p50_p_val"}; + print STDOUT "p50_p_val: $p50_p\n"; +} + + + +################################## +# RUN GP PROFILE PREDICTION. +################################## + +# Read in FASTA file headers. +my @fasta_ids; +my $headers = qx/grep ">" $i_fasta/; +while ($headers =~ />(.+?)\n/g) { + push(@fasta_ids,$1); +} + +# Run GP profile prediction. +my $gp_call = "GraphProt.pl -action predict_profile -model $i_model -fasta $i_fasta -prefix $data_id $params_string"; +print STDOUT "GraphProt call: $gp_call\n"; +# &> profile_prediction.log +qx/$gp_call/; + + +#################################### +# CALCULATE AVERAGE PROFILE. +#################################### + +# Calculate .average_profile from GraphProt .profile file. +# Also add p-value column if distr parameters set. +my $add_p = 0; +if ($distr_my and $distr_sigma and $distr_xi) { + $add_p = 1; +} + +# Average_profile: 1-based, FASTA headers as IDs, using ap_extlr for averaging scores. +my $profile = "$data_id.profile"; +my $average_profile = "$data_id.average_profile"; +# Calculate window size. +my $win = $ap_extlr * 2 + 1; + +open (IN, $profile) or die "Cannot open $profile: $!\n"; +open (OUT, '>', $average_profile) or die "Cannot open $average_profile: $!"; + +# Old ID. +my $old_id = "-"; +# Current ID. +my $cur_id = "-"; +# Start position of the window. +my $pos_inc = 0; +# Score array. +my @scores; +# Input row counter. +my $c_in = 0; +# Output row counter. +my $c_out = 0; + +while (<IN>) { + + if ($_ =~ /(.+?)\t\d+?\t(.+?)\n/) { + + $cur_id = $1; + my $score = $2; + $c_in++; + + # Case: New refseq ID / new paragraph. + if ($cur_id ne $old_id) { + + # Print remaining entries at the end of former paragraph. + while (scalar(@scores) >= ($ap_extlr + 1)) { + # Calculate avg array score. + my $mean = array_mean(@scores); + $c_out++; + $pos_inc++; + if ($add_p) { + my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); + my $p = sprintf("%.8f", (1-$y)); + print OUT "$fasta_ids[$old_id]\t$pos_inc\t$mean\t$p\n"; + } else { + print OUT "$fasta_ids[$old_id]\t$pos_inc\t$mean\n"; + } + # Remove old score from beginning. + shift(@scores); + } + + # Reset for new paragraph. + $pos_inc = 0; + @scores = (); + $old_id = $cur_id; + # Save first score. + push(@scores, $score); + next; + + } + + # Push score in array for average calculation. + push(@scores, $score); + + # Case: array smaller $win. + if (scalar(@scores) < $win) { + # Subcase: array more than half size, start with calculation. + if (scalar(@scores) >= ($ap_extlr + 1)) { + # Calculate avg array score. + my $mean = array_mean(@scores); + $c_out++; + $pos_inc++; + if ($add_p) { + my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); + my $p = sprintf("%.8f", (1-$y)); + print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n"; + } else { + print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n"; + } + } + next; + } + + # Case: array has $win size. + if (scalar(@scores) == $win) { + # Calculate avg array score. + my $mean = array_mean(@scores); + $c_out++; + $pos_inc++; + if ($add_p) { + my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); + my $p = sprintf("%.8f", (1-$y)); + print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n"; + } else { + print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n"; + } + # Remove old score from beginning. + shift(@scores); + next; + } + } +} + +# Print remaining entries at the end of last section. +while (scalar(@scores) >= ($ap_extlr + 1)) { + # Calculate avg array score. + my $mean = array_mean(@scores); + $c_out++; + $pos_inc++; + if ($add_p) { + my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); + my $p = sprintf("%.8f", (1-$y)); + print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n"; + } else { + print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n"; + } + # Remove old score from beginning. + shift(@scores); +} +close IN; +close OUT; + +qx/rm -f $profile/; + + +######################### +# GET PEAK REGIONS. +######################### + + +my $p50_peaks_bed_out = $data_id . ".peak_regions_p50.bed"; +my $peaks_bed_out = $data_id . ".peak_regions.bed"; + +# If p-values were calculated, use set p-value to get peak regions. +if ($add_p) { + extract_peak_regions_from_p_values($average_profile, $peaks_bed_out, $max_merge_dist, $thr_p); + # If p50-out set. + if ($i_p50_out) { + # If p50 p-value present, also get peak regions file for this threshold. + if ($p50_p) { + extract_peak_regions_from_p_values($average_profile, $p50_peaks_bed_out, $max_merge_dist, $p50_p); + } else { + qx/touch $p50_peaks_bed_out/; + } + } +} else { + # If no p-values available, use score threshold for defining peak regions. + extract_peak_regions_from_scores($average_profile, $peaks_bed_out, $max_merge_dist, $thr_sc); + # If p50-out set. + if ($i_p50_out) { + # If p50 score present, also get peak regions file for this threshold. + if ($p50_sc) { + extract_peak_regions_from_scores($average_profile, $p50_peaks_bed_out, $max_merge_dist, $p50_sc); + } else { + qx/touch $p50_peaks_bed_out/; + } + } +} + +exit; + + + +################################################################################ +################################################################################ +# SUBROUTINES. +################################################################################ + + +sub array_mean { + + my $mean = sum(@_)/@_; + return sprintf("%.5f", $mean); + +} + + +################################################################################ + +sub extract_peak_regions_from_p_values { + + my ($average_profile, $peak_regions_bed_file, $max_merge_dist, $max_p) = @_; + + my $old_ref = "no"; + my $in = "N"; + my $ref_id; + my $start; + my $end; + my $best_p = 1; + my $best_sc = -1000; + my $region_s; + my $region_e; + my $zero_pos; + + my $temp_bed_file = "peak_regions.tmp.bed"; + + open (IN, $average_profile) or die "Cannot open $average_profile: $!"; + open (OUT, '>', $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; + + while (<IN>) { + chomp; + my ($ref, $s, $sc, $p) = (split /\t/)[0,1,2,3]; + # If file has zero-based positions. + if ($s == 0) { + $zero_pos = 1; + } + # If positions are one-based, make them zero-based. + unless ($zero_pos) { + $s -= 1; + } + # At transcript ends, if in positive region, write and reset. + if ($old_ref ne $ref) { + if ($in eq "Y") { + print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n"; + $in = "N"; + } + } + $old_ref = $ref; + # Deal with good p-value regions. + if ($p <= $max_p) { + # Start of a positive cluster. + if ($in eq "N") { + $start = $s; + $region_s = $s; + $region_e = $s + 1; + $end = $s + 1; + $best_p = $p; + $best_sc = $sc; + $ref_id = $ref; + $in = "Y"; + next; + # Inside a positive cluster. + } elsif ($in eq "Y") { + if ($p < $best_p) { + $start = $s; + $end = $s + 1; + $best_p = $p; + $best_sc = $sc; + $ref_id = $ref; + } + $region_e++; + next; + } + } else { + # If we were in positive cluster before. + if ($in eq "Y") { + print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n"; + $in = "N"; + } + } + } + # After last line processed. + if ($in eq "Y") { + print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n"; + $in = "N"; + } + close IN; + close OUT; + # If merge distance zero (i.e. end of one block is -1 from start of next block). + if ($max_merge_dist == 0) { + qx/cat $temp_bed_file > $peak_regions_bed_file/; + } else { + # Merge nearby regions. + open(IN, $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; + open(OUT, '>', $peak_regions_bed_file) or die "Cannot open $peak_regions_bed_file: $!"; + # For storing current block stats. + my $block_chr = 0; + my ($block_s, $block_e, $block_best_pos, $block_best_p, $block_best_sc); + while (<IN>) { + chomp; + my ($chr, $s, $e, $id) = (split /\t/)[0,1,2,3]; + my ($best_pos, $best_sc, $best_p) = (split /;/, $id); + if ($chr eq $block_chr) { + # If $block_e, $s within merge merge. + if ( ($s - $block_e) <= $max_merge_dist ) { + # Update block stats. + $block_e = $e; + if ($block_best_p > $best_p) { + $block_best_p = $best_p; + $block_best_sc = $best_sc; + $block_best_pos = $best_pos; + } + } else { + # If $e outside merge range, print block. + print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n"; + # Store new block. + ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc, $block_best_p) = ($chr, $s, $e, $best_pos, $best_sc, $best_p); + } + + } else { + # If new chromosome, print last block, otherwise it is the first block. + if ($block_chr) { + print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n"; + } + ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc, $block_best_p) = ($chr, $s, $e, $best_pos, $best_sc, $best_p); + } + } + # Print last block. + if ($block_chr) { + print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n"; + } + close OUT; + close IN; + } + qx/rm -f $temp_bed_file/; +} + + +################################################################################ + +sub extract_peak_regions_from_scores { + + my ($average_profile, $peak_regions_bed_file, $max_merge_dist, $min_sc) = @_; + + my $old_ref = "no"; + my $in = "N"; + my $ref_id; + my $start; + my $end; + my $best_sc = -1000; + my $region_s; + my $region_e; + my $zero_pos; + + my $temp_bed_file = "peak_regions.tmp.bed"; + + open (IN, $average_profile) or die "Cannot open $average_profile: $!"; + open (OUT, '>', $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; + + while (<IN>) { + chomp; + my ($ref, $s, $sc) = (split /\t/)[0,1,2]; + # If file has zero-based positions. + if ($s == 0) { + $zero_pos = 1; + } + # If positions are one-based, make them zero-based. + unless ($zero_pos) { + $s -= 1; + } + # At transcript ends, if in positive region, write and reset. + if ($old_ref ne $ref) { + if ($in eq "Y") { + print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n"; + $in = "N"; + } + } + $old_ref = $ref; + # Deal with positive regions. + if ($sc > $min_sc) { + # Start of a positive cluster. + if ($in eq "N") { + $start = $s; + $region_s = $s; + $region_e = $s + 1; + $end = $s + 1; + $best_sc = $sc; + $ref_id = $ref; + $in = "Y"; + next; + # Inside a positive cluster. + } elsif ($in eq "Y") { + if ($sc > $best_sc) { + $start = $s; + $end = $s + 1; + $best_sc = $sc; + $ref_id = $ref; + } + $region_e++; + next; + } + } else { + # If we were in positive cluster before. + if ($in eq "Y") { + print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n"; + $in = "N"; + } + } + } + # After last line processed. + if ($in eq "Y") { + print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n"; + $in = "N"; + } + close IN; + close OUT; + # If merge distance zero (i.e. end of one block is -1 from start of next block). + if ($max_merge_dist == 0) { + qx/cat $temp_bed_file > $peak_regions_bed_file/; + } else { + # Merge nearby regions. + open(IN, $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; + open(OUT, '>', $peak_regions_bed_file) or die "Cannot open $peak_regions_bed_file: $!"; + # For storing current block stats. + my $block_chr = 0; + my ($block_s, $block_e, $block_best_pos, $block_best_sc); + while (<IN>) { + chomp; + my ($chr, $s, $e, $id) = (split /\t/)[0,1,2,3]; + my ($best_pos, $best_sc) = (split /;/, $id); + if ($chr eq $block_chr) { + # If $block_e, $s within merge merge. + if ( ($s - $block_e) <= $max_merge_dist ) { + # Update block stats. + $block_e = $e; + if ($block_best_sc < $best_sc) { + $block_best_sc = $best_sc; + $block_best_pos = $best_pos; + } + } else { + # If $e outside merge range, print block. + print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n"; + # Store new block. + ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc) = ($chr, $s, $e, $best_pos, $best_sc); + } + + } else { + # If new chromosome, print last block, otherwise it is the first block. + if ($block_chr) { + print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n"; + } + ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc) = ($chr, $s, $e, $best_pos, $best_sc); + } + + } + # Print last block. + if ($block_chr) { + print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n"; + } + close OUT; + close IN; + } + qx/rm -f $temp_bed_file/; +} + + +################################################################################ + + +