Mercurial > repos > matces > carpet_toolsuite
diff carpet-src-1/tools/CARPET/PeakPeaker2.pl @ 0:cdd489d98766
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:50:41 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/carpet-src-1/tools/CARPET/PeakPeaker2.pl Tue Jun 07 16:50:41 2011 -0400 @@ -0,0 +1,432 @@ +#! /usr/bin/perl + +# Copyright 2009 Matteo Cesaroni, Lucilla Luzi +# +# This program is free software; ; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + + +use Getopt::Long; + +GetOptions ( + "help" => \$OPT{help}, + "in=s" => \$OPT{fname}, + "perc=s" => \$OPT{value}, + "fc=s" => \$OPT{fc_value}, + "log=s" => \$OPT{valore_log}, + "t=s" => \$OPT{type_peak}, + "num=s" => \$OPT{num_probe}, + "dist=s" => \$OPT{dist_max}, + "dist_peaks=s" => \$OPT{dist_max_peaks}, + "w=s"=> \$OPT{s_windows}, + "col3=s" => \$OPT{col3}, + "f_pv=s" => \$OPT{file_pv}, + "out=s"=> \$OPT{out} + +)|| printusage(); + +# opzioni da linea di comando +my $value=$OPT{value}; +my $fc_value=$OPT{fc_value}; +my $valore_log=$OPT{valore_log}; +my $type_peak=$OPT{type_peak}; +my $num_probe=$OPT{num_probe}; +my $dist_max=$OPT{dist_max}; +my $dist_max_peaks=$OPT{dist_max_peaks}; +my $fname=$OPT{fname}; +my $help=$OPT{help}; +my $s_windows=$OPT{s_windows}; +my $col3=$OPT{col3}; +my $outfile=$OPT{out}; +my $outfile_pv=$OPT{file_pv}, + +# "usage" se c'e' un help +$help and printusage(); + +# "usage" se non ci sono opzioni +if (!$s_windows || !$fname || (!$value && !$fc_value) || !$type_peak || !$num_probe || !$dist_max || (($type_peak eq "p") && !$valore_log)){ + &printusage() +}; + + +qx {sort -k 1,1 -k 4,4n $fname >$fname.sortato}; + + +open(FILE, "<$fname.sortato") or die "Cannot find file $fname.sortato\n"; +open(pvalue, ">$outfile_pv") or die "Cannot open file $outfile_pv: $!\n"; +#open(buonitutti, "> buoni_$fname") or die "Cannot find file $fname: $!\n"; +#loop th rought line-by-line until the end of the file and then push each line into an empty array, called @array# +print pvalue "#p-value track ($value) \n"; + + +@array= (); +$conto_tutti_sopra=0; +$conto_tutti=0; +while ($line = <FILE>){ + chomp ($line); + if ($line=~/track/g){next;} + if ($line=~/#/g){next;} + if ($line=~/^\s+$/g){next;} + push (@array,$line); + @value_perc=split("\t",$line); + push (@percentile,$value_perc[5]); + if ($value_perc[5]>=$fc_value){ + $conto_tutti_sopra++; + } + elsif ($value_perc[5]<$fc_value){ + $conto_tutti++; + } +} + +close (FILE); + +if (!$fc_value){ +@perc_ordine=sort(@percentile); +$position=(($value*($#perc_ordine+1))-1); +$valore_percentile=$perc_ordine[$position]; + #print "il percentile è $valore_percentile\n"; + #print "il max è $perc_ordine[$#perc_ordine]\n"; + #print "il min è $perc_ordine[1]\n"; +$probabilita=1-$value; +} +else { + $valore_percentile=$fc_value; + $tuttitutti=($conto_tutti+$conto_tutti_sopra); + #print"il numero totaledi probe è $tuttitutti\n"; + #print"il numero totale di probe sopra è $conto_tutti_sopra\n"; + $probabilita=$conto_tutti_sopra/($conto_tutti+$conto_tutti_sopra); + #print "la proba= $probabilita\n"; + } + +#print buonitutti"il percentile e $valore_percentile\n"; + + + +($one_ref,$two_ref)=&probe_cutoff(@array); + +if ($type_peak eq "p"){ + @forse_niente=&peak(@$one_ref); + } +elsif ($type_peak eq "s"){ + @forse_niente=&peak(@$two_ref); + } + +&double_peak(@forse_niente); + +# +# End process +# +close OUTFILE; +unlink "$fname.sortato"; +exit 0; + +################################################################################################ +######################## SUBROUTINE ############################### +################################################################################################ + +sub probe_cutoff +{ + +$c=-6; + +foreach $linea(@array){ + @subarray1 = split("\t",$linea); + $inizio=(((($subarray1[4]-$subarray1[3])/2)+$subarray1[3])-($s_windows/2)); #per modificare la windows cambia il 500 e il 1000 + $fine=$inizio+$s_windows; #windows 500 -->250 e 500 windows 1000 --> 500 e 1000 + $counter=0; + $counter_value=0; + + if($subarray1[5]>=$valore_percentile){ + push (@array_probe,[@subarray1]); + print buonitutti "$linea\n"; + } + + for($i=$c;$i<=$#array;$i++) { + if ($i<0){ + next; + } + @subarray = split("\t",$array[$i]); + if (($subarray[3]>=$inizio)&&($subarray[3]<$fine)&&("$subarray[0]" eq "$subarray1[0]")){ + $counter++; + if ($subarray[5]>= $valore_percentile){ + $counter_value++; + } + } + if (($subarray[3]>$fine)||("$subarray[0]" ne "$subarray1[0]")){ + $cazzo=$subarray[3]; + last; + } + + } +$c++; + if ($counter !=0){ + $chi_sq = (((($counter_value - ($probabilita*$counter))**2)/($probabilita*$counter))+(((($counter-$counter_value)-((1-$probabilita)*$counter))**2)/((1-$probabilita)*$counter))); + } + else{ + $chi_sq = NA; + } + +use Statistics::Distributions; +$chisprob=Statistics::Distributions::chisqrprob (1,$chi_sq); +if ($chisprob == 0){ + $log_10=100; +} +else { + $log_10 = -log($chisprob)/log(10); +} + +print pvalue "$subarray1[0]\t$subarray1[1]\tpv_$subarray1[2]\t$subarray1[3]\t$subarray1[4]\t$log_10\t$subarray1[6]\t$subarray1[7]\t$subarray1[5]\n"; + + if ($log_10>=$valore_log){ + #print"$subarray1[3],$inizio,$fine,$counter,$counter_value,$chi_sq,$chisprob,$log_10\n"; + @proviamo=($subarray1[0],$subarray1[1],$subarray1[2],$inizio,$fine,$log_10,$subarray1[6],$subarray1[7],$subarray1[5]); + push (@matrix_pvalue,[@proviamo]); + + } +} +@result_table=@matrix_pvalue; +@result_table2=@array_probe; +return(\@result_table,\@result_table2); + +} + + + + + +################################################################################################ + + + +sub peak +{ +@matrix_value=@_; + +$stop=$#matrix_value; + +for ($cio=0; $cio<=$stop;$cio++) { + @log_ratio=(); + @chilosa=(); + $inizio=$matrix_value[$cio][3]; + $somma=0; + $media=0; + $sommachilo=0; + $mediachilo=0; + $diviso=0; + push(@log_ratio,$matrix_value[$cio][5]); + if ($matrix_value[$cio][8]>=$valore_percentile){ + push(@chilosa,$matrix_value[$cio][8]); + } + for ($j=0; $j<=$stop;$j++){ + $distanza=($matrix_value[$cio+1][3]-$matrix_value[$cio][4]); + if (($distanza > $dist_max) || ($cio==$stop)||(!("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]"))){ + $j=$stop; + } + elsif (($distanza <= $dist_max)&&("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]")){ + $cio++; + push(@log_ratio,$matrix_value[$cio][5]); + $fine=$matrix_value[$cio][4]; + $inizio3=$inizio; + $j++; + if ($matrix_value[$cio][8]>=$valore_percentile){ + push(@chilosa,$matrix_value[$cio][8]); + } + } + + + } + $numeroprobes=($#chilosa+1); + if ((($#log_ratio+1)>=$num_probe) && (($type_peak eq "s") ||(($#chilosa+1)>=$num_probe))){ + foreach $n (@log_ratio) { + $somma+=$n; + } + if($type_peak eq "s"){ + @chilosa=@log_ratio; + } + foreach $nchilo (@chilosa) { + $sommachilo+=$nchilo; + } + $diviso=$#log_ratio+1; + $media = $somma/(@log_ratio); + $mediachilo=$sommachilo/(@chilosa); + #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo); + #$moltipli=(($#log_ratio+1)*$media); + #$somma_media=((sqrt($#log_ratio+1))+$media); + if ($type_peak eq "p"){ + $inizio3=int($inizio3+(($s_windows/2)-25)); + $fine=int($fine-(($s_windows/2)-25)); + } + @proviamo_double=($matrix_value[$cio][0],$matrix_value[$cio][1],$col3,$inizio3,$fine,$media,$matrix_value[$cio][6],$matrix_value[$cio][7],$mediachilo,$diviso); + push (@matrix_for_double,[@proviamo_double]); + + } +} + +@result_table=@matrix_for_double; +return(@result_table); +} + + +################################################################################################ + + + +sub double_peak +{ +@matrix_value=@_; + +$stop=$#matrix_value; +$i=0; +$j=0; +#print $stop; + +# +# Print output file +# +if ($outfile){ + open (OUTFILE, ">$outfile"); +} + +# +# print File Header +# +my $header=<<e0c6654; +\# infile: $fname +\# percentile value: $value=$valore_percentile +\# fold change: $fc_value +\# type of analysis: $type_peak +\# log(pval analysis): $valore_log +\# num (probe defining a peak): $num_probe +\# dist: $dist_max +\# window length: $s_windows +track name=$col3 description="peaks find" visibility=2 +e0c6654 + +if ($outfile){ + print OUTFILE "$header"; +}else{ + print "$header"; +} +if ($type_peak eq "p"){ + print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys =p-value (log=$valore_log), dist peaks=$dist_max_peaks"; +} +if ($type_peak eq "s"){ + print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys=score, dist peaks=$dist_max_peaks"; +} +for ($i=0; $i<=$stop;$i++) { + @log_ratio=(); + @log_ratio2=(); + @diviso2=(); + $inizio=$matrix_value[$i][3]; + $somma=0; + $somma2=0; + $media=0; + $media2=0; + $diviso2=0; + push(@log_ratio,$matrix_value[$i][5]); + push(@log_ratio2,$matrix_value[$i][8]); + push(@diviso2,$matrix_value[$i][9]); + #print "giro $i\n"; + for ($j=0; $j<=$stop;$j++){ + # if ("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"){ + $distanza=($matrix_value[$i+1][3]-$matrix_value[$i][4]); + #print "distanza fra i due $distanza\n"; + #exit; + if (($distanza > $dist_max_peaks) || ($i==$stop)||(!("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"))){ + $j=$stop; + $fine=$matrix_value[$i][4]; + } + elsif (($distanza <= $dist_max_peaks)&&("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]")){ + $i++; + push(@log_ratio,$matrix_value[$i][5]); + push(@log_ratio2,$matrix_value[$i][8]); + push(@diviso2,$matrix_value[$i][9]); + $fine=$matrix_value[$i][4]; + $j++; + } + + # } + } + foreach $n (@log_ratio) { + $somma+=$n; + } + foreach $n1 (@log_ratio2) { + $somma2+=$n1; + } + foreach $n2 (@diviso2) { + $diviso2+=$n2; + } + $media = $somma/(@log_ratio); + $media2 = $somma2/(@log_ratio2); + $somma_media=(sqrt($diviso2)+$media); + $somma_score=(sqrt($diviso2)+$media2); + $somma_media = sprintf "%.2f", $somma_media; + $somma_score = sprintf "%.2f", $somma_score; + #$somma_score = $somma_score.$i + #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo); + if ($outfile){ + print OUTFILE "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n"; + }else{ + print "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n"; + } + + +} +} + +#################################################################################################################################### + +sub printusage { + + print<<eoc22334; + + + + *************************************************** + :: N i m b l e G e n C h i p A n a l y s i s :: + *************************************************** + + + USAGE SUMMARY + --------------------------------------------------------------------------------- + This program utilizes NimbleGen ratio files in gff format as INPUT FILE and + provides a table of the computed picks in the same gff file format. + + + + --in [input filename] + --perc [percentile value, it is used to calculate the threshold rate based + on dataset distribution to filter out background ratio; i.e. 0.99] + OR + --fc [fold change value, it is used as fixed threshold to filter out + background ratio, i.e. 2] + --t [type of analysis; p, performs peaks determination based on p-value inference; + s, performs peaks determination based on a scoring system] + --log (required only for p-value based analysis) + [log2(p-value), cutoff integer to be used to identify a significant peak; i.e. 5] + --num [minimal number of consecutive probes used to define a peak; i.e. 3] + --dist [greatest nucleotide distance between two probes or between two peaks that + allow to consider the signals as belonging to the same peak] + --w [window length] + + --out [output filename (optional)] + ----------------------------------------------------------------------------------------- + EXAMPLES: + + perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500 +OR + perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500 + + ----------------------------------------------------------------------------------------- + +eoc22334 + exit 0; + +}