Mercurial > repos > matces > carpet_toolsuite
view carpet-src-1/tools/CARPET/PeakPeaker2.pl @ 1:78770028dcf1 default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:59:33 -0400 |
parents | cdd489d98766 |
children |
line wrap: on
line source
#! /usr/bin/perl # Copyright 2009 Matteo Cesaroni, Lucilla Luzi # # This program is free software; ; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation; either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. use Getopt::Long; GetOptions ( "help" => \$OPT{help}, "in=s" => \$OPT{fname}, "perc=s" => \$OPT{value}, "fc=s" => \$OPT{fc_value}, "log=s" => \$OPT{valore_log}, "t=s" => \$OPT{type_peak}, "num=s" => \$OPT{num_probe}, "dist=s" => \$OPT{dist_max}, "dist_peaks=s" => \$OPT{dist_max_peaks}, "w=s"=> \$OPT{s_windows}, "col3=s" => \$OPT{col3}, "f_pv=s" => \$OPT{file_pv}, "out=s"=> \$OPT{out} )|| printusage(); # opzioni da linea di comando my $value=$OPT{value}; my $fc_value=$OPT{fc_value}; my $valore_log=$OPT{valore_log}; my $type_peak=$OPT{type_peak}; my $num_probe=$OPT{num_probe}; my $dist_max=$OPT{dist_max}; my $dist_max_peaks=$OPT{dist_max_peaks}; my $fname=$OPT{fname}; my $help=$OPT{help}; my $s_windows=$OPT{s_windows}; my $col3=$OPT{col3}; my $outfile=$OPT{out}; my $outfile_pv=$OPT{file_pv}, # "usage" se c'e' un help $help and printusage(); # "usage" se non ci sono opzioni if (!$s_windows || !$fname || (!$value && !$fc_value) || !$type_peak || !$num_probe || !$dist_max || (($type_peak eq "p") && !$valore_log)){ &printusage() }; qx {sort -k 1,1 -k 4,4n $fname >$fname.sortato}; open(FILE, "<$fname.sortato") or die "Cannot find file $fname.sortato\n"; open(pvalue, ">$outfile_pv") or die "Cannot open file $outfile_pv: $!\n"; #open(buonitutti, "> buoni_$fname") or die "Cannot find file $fname: $!\n"; #loop th rought line-by-line until the end of the file and then push each line into an empty array, called @array# print pvalue "#p-value track ($value) \n"; @array= (); $conto_tutti_sopra=0; $conto_tutti=0; while ($line = <FILE>){ chomp ($line); if ($line=~/track/g){next;} if ($line=~/#/g){next;} if ($line=~/^\s+$/g){next;} push (@array,$line); @value_perc=split("\t",$line); push (@percentile,$value_perc[5]); if ($value_perc[5]>=$fc_value){ $conto_tutti_sopra++; } elsif ($value_perc[5]<$fc_value){ $conto_tutti++; } } close (FILE); if (!$fc_value){ @perc_ordine=sort(@percentile); $position=(($value*($#perc_ordine+1))-1); $valore_percentile=$perc_ordine[$position]; #print "il percentile è $valore_percentile\n"; #print "il max è $perc_ordine[$#perc_ordine]\n"; #print "il min è $perc_ordine[1]\n"; $probabilita=1-$value; } else { $valore_percentile=$fc_value; $tuttitutti=($conto_tutti+$conto_tutti_sopra); #print"il numero totaledi probe è $tuttitutti\n"; #print"il numero totale di probe sopra è $conto_tutti_sopra\n"; $probabilita=$conto_tutti_sopra/($conto_tutti+$conto_tutti_sopra); #print "la proba= $probabilita\n"; } #print buonitutti"il percentile e $valore_percentile\n"; ($one_ref,$two_ref)=&probe_cutoff(@array); if ($type_peak eq "p"){ @forse_niente=&peak(@$one_ref); } elsif ($type_peak eq "s"){ @forse_niente=&peak(@$two_ref); } &double_peak(@forse_niente); # # End process # close OUTFILE; unlink "$fname.sortato"; exit 0; ################################################################################################ ######################## SUBROUTINE ############################### ################################################################################################ sub probe_cutoff { $c=-6; foreach $linea(@array){ @subarray1 = split("\t",$linea); $inizio=(((($subarray1[4]-$subarray1[3])/2)+$subarray1[3])-($s_windows/2)); #per modificare la windows cambia il 500 e il 1000 $fine=$inizio+$s_windows; #windows 500 -->250 e 500 windows 1000 --> 500 e 1000 $counter=0; $counter_value=0; if($subarray1[5]>=$valore_percentile){ push (@array_probe,[@subarray1]); print buonitutti "$linea\n"; } for($i=$c;$i<=$#array;$i++) { if ($i<0){ next; } @subarray = split("\t",$array[$i]); if (($subarray[3]>=$inizio)&&($subarray[3]<$fine)&&("$subarray[0]" eq "$subarray1[0]")){ $counter++; if ($subarray[5]>= $valore_percentile){ $counter_value++; } } if (($subarray[3]>$fine)||("$subarray[0]" ne "$subarray1[0]")){ $cazzo=$subarray[3]; last; } } $c++; if ($counter !=0){ $chi_sq = (((($counter_value - ($probabilita*$counter))**2)/($probabilita*$counter))+(((($counter-$counter_value)-((1-$probabilita)*$counter))**2)/((1-$probabilita)*$counter))); } else{ $chi_sq = NA; } use Statistics::Distributions; $chisprob=Statistics::Distributions::chisqrprob (1,$chi_sq); if ($chisprob == 0){ $log_10=100; } else { $log_10 = -log($chisprob)/log(10); } print pvalue "$subarray1[0]\t$subarray1[1]\tpv_$subarray1[2]\t$subarray1[3]\t$subarray1[4]\t$log_10\t$subarray1[6]\t$subarray1[7]\t$subarray1[5]\n"; if ($log_10>=$valore_log){ #print"$subarray1[3],$inizio,$fine,$counter,$counter_value,$chi_sq,$chisprob,$log_10\n"; @proviamo=($subarray1[0],$subarray1[1],$subarray1[2],$inizio,$fine,$log_10,$subarray1[6],$subarray1[7],$subarray1[5]); push (@matrix_pvalue,[@proviamo]); } } @result_table=@matrix_pvalue; @result_table2=@array_probe; return(\@result_table,\@result_table2); } ################################################################################################ sub peak { @matrix_value=@_; $stop=$#matrix_value; for ($cio=0; $cio<=$stop;$cio++) { @log_ratio=(); @chilosa=(); $inizio=$matrix_value[$cio][3]; $somma=0; $media=0; $sommachilo=0; $mediachilo=0; $diviso=0; push(@log_ratio,$matrix_value[$cio][5]); if ($matrix_value[$cio][8]>=$valore_percentile){ push(@chilosa,$matrix_value[$cio][8]); } for ($j=0; $j<=$stop;$j++){ $distanza=($matrix_value[$cio+1][3]-$matrix_value[$cio][4]); if (($distanza > $dist_max) || ($cio==$stop)||(!("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]"))){ $j=$stop; } elsif (($distanza <= $dist_max)&&("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]")){ $cio++; push(@log_ratio,$matrix_value[$cio][5]); $fine=$matrix_value[$cio][4]; $inizio3=$inizio; $j++; if ($matrix_value[$cio][8]>=$valore_percentile){ push(@chilosa,$matrix_value[$cio][8]); } } } $numeroprobes=($#chilosa+1); if ((($#log_ratio+1)>=$num_probe) && (($type_peak eq "s") ||(($#chilosa+1)>=$num_probe))){ foreach $n (@log_ratio) { $somma+=$n; } if($type_peak eq "s"){ @chilosa=@log_ratio; } foreach $nchilo (@chilosa) { $sommachilo+=$nchilo; } $diviso=$#log_ratio+1; $media = $somma/(@log_ratio); $mediachilo=$sommachilo/(@chilosa); #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo); #$moltipli=(($#log_ratio+1)*$media); #$somma_media=((sqrt($#log_ratio+1))+$media); if ($type_peak eq "p"){ $inizio3=int($inizio3+(($s_windows/2)-25)); $fine=int($fine-(($s_windows/2)-25)); } @proviamo_double=($matrix_value[$cio][0],$matrix_value[$cio][1],$col3,$inizio3,$fine,$media,$matrix_value[$cio][6],$matrix_value[$cio][7],$mediachilo,$diviso); push (@matrix_for_double,[@proviamo_double]); } } @result_table=@matrix_for_double; return(@result_table); } ################################################################################################ sub double_peak { @matrix_value=@_; $stop=$#matrix_value; $i=0; $j=0; #print $stop; # # Print output file # if ($outfile){ open (OUTFILE, ">$outfile"); } # # print File Header # my $header=<<e0c6654; \# infile: $fname \# percentile value: $value=$valore_percentile \# fold change: $fc_value \# type of analysis: $type_peak \# log(pval analysis): $valore_log \# num (probe defining a peak): $num_probe \# dist: $dist_max \# window length: $s_windows track name=$col3 description="peaks find" visibility=2 e0c6654 if ($outfile){ print OUTFILE "$header"; }else{ print "$header"; } if ($type_peak eq "p"){ print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys =p-value (log=$valore_log), dist peaks=$dist_max_peaks"; } if ($type_peak eq "s"){ print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys=score, dist peaks=$dist_max_peaks"; } for ($i=0; $i<=$stop;$i++) { @log_ratio=(); @log_ratio2=(); @diviso2=(); $inizio=$matrix_value[$i][3]; $somma=0; $somma2=0; $media=0; $media2=0; $diviso2=0; push(@log_ratio,$matrix_value[$i][5]); push(@log_ratio2,$matrix_value[$i][8]); push(@diviso2,$matrix_value[$i][9]); #print "giro $i\n"; for ($j=0; $j<=$stop;$j++){ # if ("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"){ $distanza=($matrix_value[$i+1][3]-$matrix_value[$i][4]); #print "distanza fra i due $distanza\n"; #exit; if (($distanza > $dist_max_peaks) || ($i==$stop)||(!("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"))){ $j=$stop; $fine=$matrix_value[$i][4]; } elsif (($distanza <= $dist_max_peaks)&&("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]")){ $i++; push(@log_ratio,$matrix_value[$i][5]); push(@log_ratio2,$matrix_value[$i][8]); push(@diviso2,$matrix_value[$i][9]); $fine=$matrix_value[$i][4]; $j++; } # } } foreach $n (@log_ratio) { $somma+=$n; } foreach $n1 (@log_ratio2) { $somma2+=$n1; } foreach $n2 (@diviso2) { $diviso2+=$n2; } $media = $somma/(@log_ratio); $media2 = $somma2/(@log_ratio2); $somma_media=(sqrt($diviso2)+$media); $somma_score=(sqrt($diviso2)+$media2); $somma_media = sprintf "%.2f", $somma_media; $somma_score = sprintf "%.2f", $somma_score; #$somma_score = $somma_score.$i #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo); if ($outfile){ print OUTFILE "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n"; }else{ print "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n"; } } } #################################################################################################################################### sub printusage { print<<eoc22334; *************************************************** :: N i m b l e G e n C h i p A n a l y s i s :: *************************************************** USAGE SUMMARY --------------------------------------------------------------------------------- This program utilizes NimbleGen ratio files in gff format as INPUT FILE and provides a table of the computed picks in the same gff file format. --in [input filename] --perc [percentile value, it is used to calculate the threshold rate based on dataset distribution to filter out background ratio; i.e. 0.99] OR --fc [fold change value, it is used as fixed threshold to filter out background ratio, i.e. 2] --t [type of analysis; p, performs peaks determination based on p-value inference; s, performs peaks determination based on a scoring system] --log (required only for p-value based analysis) [log2(p-value), cutoff integer to be used to identify a significant peak; i.e. 5] --num [minimal number of consecutive probes used to define a peak; i.e. 3] --dist [greatest nucleotide distance between two probes or between two peaks that allow to consider the signals as belonging to the same peak] --w [window length] --out [output filename (optional)] ----------------------------------------------------------------------------------------- EXAMPLES: perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500 OR perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500 ----------------------------------------------------------------------------------------- eoc22334 exit 0; }