view carpet-src-1/tools/CARPET/PeakPeaker2.pl @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children
line wrap: on
line source

#! /usr/bin/perl

# Copyright 2009 Matteo Cesaroni, Lucilla Luzi
#
# This program is free software; ; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


use Getopt::Long;

GetOptions (
			"help"	=> \$OPT{help},
			"in=s" => \$OPT{fname},
			"perc=s" => \$OPT{value},
			"fc=s" => \$OPT{fc_value},
			"log=s" => \$OPT{valore_log},
			"t=s" => \$OPT{type_peak},
			"num=s" => \$OPT{num_probe},
			"dist=s" => \$OPT{dist_max},
			"dist_peaks=s" => \$OPT{dist_max_peaks},
			"w=s"=> \$OPT{s_windows},
			"col3=s" => \$OPT{col3},
			"f_pv=s" => \$OPT{file_pv},
			"out=s"=> \$OPT{out}

)|| printusage();

# opzioni da linea di comando
my $value=$OPT{value};
my $fc_value=$OPT{fc_value};
my $valore_log=$OPT{valore_log};
my $type_peak=$OPT{type_peak};
my $num_probe=$OPT{num_probe};
my $dist_max=$OPT{dist_max};
my $dist_max_peaks=$OPT{dist_max_peaks};
my $fname=$OPT{fname};
my $help=$OPT{help};
my $s_windows=$OPT{s_windows};
my $col3=$OPT{col3};
my $outfile=$OPT{out};
my $outfile_pv=$OPT{file_pv},

# "usage" se c'e' un help
$help and printusage();

# "usage" se non ci sono opzioni
if (!$s_windows || !$fname || (!$value && !$fc_value) || !$type_peak || !$num_probe || !$dist_max || (($type_peak eq "p") && !$valore_log)){
	&printusage()
};


qx {sort -k 1,1 -k 4,4n $fname >$fname.sortato};


open(FILE, "<$fname.sortato") or die "Cannot find file $fname.sortato\n";
open(pvalue, ">$outfile_pv") or die "Cannot open file $outfile_pv: $!\n";
#open(buonitutti, "> buoni_$fname") or die "Cannot find file $fname: $!\n";
#loop th rought line-by-line until the end of the file and then push each line into an empty array, called @array#
print pvalue "#p-value track ($value) \n";


@array= ();
$conto_tutti_sopra=0;
$conto_tutti=0;
while ($line = <FILE>){
	chomp ($line);
	if ($line=~/track/g){next;}
    if ($line=~/#/g){next;}
    if ($line=~/^\s+$/g){next;}
	push (@array,$line);
	@value_perc=split("\t",$line);
	push (@percentile,$value_perc[5]);
	if ($value_perc[5]>=$fc_value){
		$conto_tutti_sopra++;
		}
	elsif ($value_perc[5]<$fc_value){
		$conto_tutti++;
		}
}

close (FILE);

if (!$fc_value){
@perc_ordine=sort(@percentile);
$position=(($value*($#perc_ordine+1))-1);
$valore_percentile=$perc_ordine[$position];
 #print "il percentile è $valore_percentile\n";
 #print "il max è $perc_ordine[$#perc_ordine]\n";
 #print "il min è $perc_ordine[1]\n";
$probabilita=1-$value;
}
else {
	$valore_percentile=$fc_value;
	$tuttitutti=($conto_tutti+$conto_tutti_sopra);
	#print"il numero totaledi probe è $tuttitutti\n";
	#print"il numero totale di probe sopra  è $conto_tutti_sopra\n";
	$probabilita=$conto_tutti_sopra/($conto_tutti+$conto_tutti_sopra);
	#print "la proba= $probabilita\n";
	}

#print buonitutti"il percentile e $valore_percentile\n";



($one_ref,$two_ref)=&probe_cutoff(@array);

if ($type_peak eq "p"){
	@forse_niente=&peak(@$one_ref);
	}
elsif ($type_peak eq "s"){
	@forse_niente=&peak(@$two_ref);
	}

&double_peak(@forse_niente);

#
# End process
#
close OUTFILE;
unlink "$fname.sortato";
exit 0;

################################################################################################
########################		            SUBROUTINE	             ###############################
################################################################################################

sub probe_cutoff
{

$c=-6;

foreach $linea(@array){
	@subarray1 = split("\t",$linea);
	$inizio=(((($subarray1[4]-$subarray1[3])/2)+$subarray1[3])-($s_windows/2));  #per modificare la windows cambia il 500 e il 1000
	$fine=$inizio+$s_windows;																								#windows 500 -->250 e 500   windows 1000 --> 500 e 1000
	$counter=0;
	$counter_value=0;

	if($subarray1[5]>=$valore_percentile){
		push (@array_probe,[@subarray1]);
		print buonitutti "$linea\n";
	}

	for($i=$c;$i<=$#array;$i++) {
			if ($i<0){
				next;
				}
			@subarray = split("\t",$array[$i]);
			if (($subarray[3]>=$inizio)&&($subarray[3]<$fine)&&("$subarray[0]" eq "$subarray1[0]")){
					$counter++;
					if ($subarray[5]>= $valore_percentile){
						$counter_value++;
					}
				}
				if (($subarray[3]>$fine)||("$subarray[0]" ne "$subarray1[0]")){
							$cazzo=$subarray[3];
							last;
				}

	}
$c++;
	if ($counter !=0){
		$chi_sq = (((($counter_value - ($probabilita*$counter))**2)/($probabilita*$counter))+(((($counter-$counter_value)-((1-$probabilita)*$counter))**2)/((1-$probabilita)*$counter)));
	}
	else{
		$chi_sq = NA;
	}

use Statistics::Distributions;
$chisprob=Statistics::Distributions::chisqrprob (1,$chi_sq);
if ($chisprob == 0){
	$log_10=100;
}
else {
	$log_10 = -log($chisprob)/log(10);	
}

print pvalue "$subarray1[0]\t$subarray1[1]\tpv_$subarray1[2]\t$subarray1[3]\t$subarray1[4]\t$log_10\t$subarray1[6]\t$subarray1[7]\t$subarray1[5]\n";

	if ($log_10>=$valore_log){
	 	#print"$subarray1[3],$inizio,$fine,$counter,$counter_value,$chi_sq,$chisprob,$log_10\n";
		@proviamo=($subarray1[0],$subarray1[1],$subarray1[2],$inizio,$fine,$log_10,$subarray1[6],$subarray1[7],$subarray1[5]);
		push (@matrix_pvalue,[@proviamo]);

	}
}
@result_table=@matrix_pvalue;
@result_table2=@array_probe;
return(\@result_table,\@result_table2);

}





################################################################################################



sub peak
{
@matrix_value=@_;

$stop=$#matrix_value;

for ($cio=0; $cio<=$stop;$cio++) {
	 		@log_ratio=();
	 		@chilosa=();
			$inizio=$matrix_value[$cio][3];
			$somma=0;
			$media=0;
			$sommachilo=0;
			$mediachilo=0;
			$diviso=0;
			push(@log_ratio,$matrix_value[$cio][5]);
			if ($matrix_value[$cio][8]>=$valore_percentile){
							push(@chilosa,$matrix_value[$cio][8]);
			}
			for ($j=0; $j<=$stop;$j++){
						$distanza=($matrix_value[$cio+1][3]-$matrix_value[$cio][4]);
						if (($distanza > $dist_max) || ($cio==$stop)||(!("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]"))){
								$j=$stop;
								}
						elsif (($distanza <= $dist_max)&&("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]")){
							$cio++;
							push(@log_ratio,$matrix_value[$cio][5]);
							$fine=$matrix_value[$cio][4];
							$inizio3=$inizio;
							$j++;
							if ($matrix_value[$cio][8]>=$valore_percentile){
									push(@chilosa,$matrix_value[$cio][8]);
							}
						}


			}
			$numeroprobes=($#chilosa+1);
			if ((($#log_ratio+1)>=$num_probe) && (($type_peak eq "s") ||(($#chilosa+1)>=$num_probe))){
				foreach $n (@log_ratio) {
					$somma+=$n;
				}
				if($type_peak eq "s"){
					@chilosa=@log_ratio;
				}
				foreach $nchilo (@chilosa) {
					$sommachilo+=$nchilo;
				}
				$diviso=$#log_ratio+1;
				$media = $somma/(@log_ratio);
				$mediachilo=$sommachilo/(@chilosa);
				#$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo);
				#$moltipli=(($#log_ratio+1)*$media);
				#$somma_media=((sqrt($#log_ratio+1))+$media);
				if ($type_peak eq "p"){
					$inizio3=int($inizio3+(($s_windows/2)-25));
					$fine=int($fine-(($s_windows/2)-25));
				}
				@proviamo_double=($matrix_value[$cio][0],$matrix_value[$cio][1],$col3,$inizio3,$fine,$media,$matrix_value[$cio][6],$matrix_value[$cio][7],$mediachilo,$diviso);
				push (@matrix_for_double,[@proviamo_double]);

			}
}

@result_table=@matrix_for_double;
return(@result_table);
}


################################################################################################



sub double_peak
{
@matrix_value=@_;

$stop=$#matrix_value;
$i=0;
$j=0;
#print $stop;

#
# Print output file
#
if ($outfile){
	open (OUTFILE, ">$outfile");
}

#
# print File Header
#
my $header=<<e0c6654;
\# infile: $fname
\# percentile value: $value=$valore_percentile
\# fold change: $fc_value
\# type of analysis: $type_peak
\# log(pval analysis): $valore_log
\# num (probe defining a peak): $num_probe
\# dist: $dist_max
\# window length: $s_windows
track name=$col3 description="peaks find" visibility=2
e0c6654

if ($outfile){
	print OUTFILE "$header";
}else{
	print "$header";
}
if ($type_peak eq "p"){
	print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys =p-value (log=$valore_log), dist peaks=$dist_max_peaks";
}
if ($type_peak eq "s"){
	print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys=score, dist peaks=$dist_max_peaks";
}
for ($i=0; $i<=$stop;$i++) {
			@log_ratio=();
			@log_ratio2=();
			@diviso2=();
			$inizio=$matrix_value[$i][3];
			$somma=0;
			$somma2=0;
			$media=0;
			$media2=0;
			$diviso2=0;
			push(@log_ratio,$matrix_value[$i][5]);
			push(@log_ratio2,$matrix_value[$i][8]);
			push(@diviso2,$matrix_value[$i][9]);
			#print "giro $i\n";
			for ($j=0; $j<=$stop;$j++){
				#	if ("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"){
						$distanza=($matrix_value[$i+1][3]-$matrix_value[$i][4]);
						#print "distanza fra i due $distanza\n";
						#exit;
						if (($distanza > $dist_max_peaks) || ($i==$stop)||(!("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"))){
								$j=$stop;
								$fine=$matrix_value[$i][4];
						}
						elsif (($distanza <= $dist_max_peaks)&&("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]")){
							$i++;
							push(@log_ratio,$matrix_value[$i][5]);
							push(@log_ratio2,$matrix_value[$i][8]);
							push(@diviso2,$matrix_value[$i][9]);
							$fine=$matrix_value[$i][4];
							$j++;
						}

				# 	}
			}
				foreach $n (@log_ratio) {
					$somma+=$n;
				}
				foreach $n1 (@log_ratio2) {
					$somma2+=$n1;
				}
				foreach $n2 (@diviso2) {
					$diviso2+=$n2;
				}
				$media = $somma/(@log_ratio);
				$media2 = $somma2/(@log_ratio2);
				$somma_media=(sqrt($diviso2)+$media);
				$somma_score=(sqrt($diviso2)+$media2);
				$somma_media = sprintf "%.2f", $somma_media;
				$somma_score = sprintf "%.2f", $somma_score;
				#$somma_score = $somma_score.$i
					#$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo);
				if ($outfile){
					print OUTFILE "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n";
				}else{
					print "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n";
				}


}
}

####################################################################################################################################

sub printusage {

	print<<eoc22334;



	***************************************************
	:: N i m b l e G e n   C h i p   A n a l y s i s ::
	***************************************************


	 USAGE SUMMARY
     ---------------------------------------------------------------------------------
	This program utilizes NimbleGen ratio files in gff format as INPUT FILE and
	provides a table of the computed picks in the same gff file format.



	--in     [input filename]
	  --perc [percentile value, it is used to calculate the threshold rate based
	          on dataset distribution to filter out background ratio; i.e. 0.99]
	  OR
	  --fc   [fold change value, it is used as fixed threshold to filter out
	          background ratio, i.e. 2]
	--t      [type of analysis; p, performs peaks determination based on p-value inference;
	                            s, performs peaks determination based on a scoring system]
	    --log  (required only for p-value based analysis)
	         [log2(p-value), cutoff integer to be used to identify a significant peak; i.e. 5]
	--num    [minimal number of consecutive probes used to define a peak; i.e. 3]
	--dist   [greatest nucleotide distance between two probes or between two peaks that
	          allow to consider the signals as belonging to the same peak]
	--w      [window length]

	--out    [output filename (optional)]
     -----------------------------------------------------------------------------------------
	 EXAMPLES:

	 perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500
OR
	 perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500

     -----------------------------------------------------------------------------------------

eoc22334
	exit 0;

}