diff carpet-src-1/tools/CARPET/PeakPeaker2.pl @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/carpet-src-1/tools/CARPET/PeakPeaker2.pl	Tue Jun 07 16:50:41 2011 -0400
@@ -0,0 +1,432 @@
+#! /usr/bin/perl
+
+# Copyright 2009 Matteo Cesaroni, Lucilla Luzi
+#
+# This program is free software; ; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+
+use Getopt::Long;
+
+GetOptions (
+			"help"	=> \$OPT{help},
+			"in=s" => \$OPT{fname},
+			"perc=s" => \$OPT{value},
+			"fc=s" => \$OPT{fc_value},
+			"log=s" => \$OPT{valore_log},
+			"t=s" => \$OPT{type_peak},
+			"num=s" => \$OPT{num_probe},
+			"dist=s" => \$OPT{dist_max},
+			"dist_peaks=s" => \$OPT{dist_max_peaks},
+			"w=s"=> \$OPT{s_windows},
+			"col3=s" => \$OPT{col3},
+			"f_pv=s" => \$OPT{file_pv},
+			"out=s"=> \$OPT{out}
+
+)|| printusage();
+
+# opzioni da linea di comando
+my $value=$OPT{value};
+my $fc_value=$OPT{fc_value};
+my $valore_log=$OPT{valore_log};
+my $type_peak=$OPT{type_peak};
+my $num_probe=$OPT{num_probe};
+my $dist_max=$OPT{dist_max};
+my $dist_max_peaks=$OPT{dist_max_peaks};
+my $fname=$OPT{fname};
+my $help=$OPT{help};
+my $s_windows=$OPT{s_windows};
+my $col3=$OPT{col3};
+my $outfile=$OPT{out};
+my $outfile_pv=$OPT{file_pv},
+
+# "usage" se c'e' un help
+$help and printusage();
+
+# "usage" se non ci sono opzioni
+if (!$s_windows || !$fname || (!$value && !$fc_value) || !$type_peak || !$num_probe || !$dist_max || (($type_peak eq "p") && !$valore_log)){
+	&printusage()
+};
+
+
+qx {sort -k 1,1 -k 4,4n $fname >$fname.sortato};
+
+
+open(FILE, "<$fname.sortato") or die "Cannot find file $fname.sortato\n";
+open(pvalue, ">$outfile_pv") or die "Cannot open file $outfile_pv: $!\n";
+#open(buonitutti, "> buoni_$fname") or die "Cannot find file $fname: $!\n";
+#loop th rought line-by-line until the end of the file and then push each line into an empty array, called @array#
+print pvalue "#p-value track ($value) \n";
+
+
+@array= ();
+$conto_tutti_sopra=0;
+$conto_tutti=0;
+while ($line = <FILE>){
+	chomp ($line);
+	if ($line=~/track/g){next;}
+    if ($line=~/#/g){next;}
+    if ($line=~/^\s+$/g){next;}
+	push (@array,$line);
+	@value_perc=split("\t",$line);
+	push (@percentile,$value_perc[5]);
+	if ($value_perc[5]>=$fc_value){
+		$conto_tutti_sopra++;
+		}
+	elsif ($value_perc[5]<$fc_value){
+		$conto_tutti++;
+		}
+}
+
+close (FILE);
+
+if (!$fc_value){
+@perc_ordine=sort(@percentile);
+$position=(($value*($#perc_ordine+1))-1);
+$valore_percentile=$perc_ordine[$position];
+ #print "il percentile è $valore_percentile\n";
+ #print "il max è $perc_ordine[$#perc_ordine]\n";
+ #print "il min è $perc_ordine[1]\n";
+$probabilita=1-$value;
+}
+else {
+	$valore_percentile=$fc_value;
+	$tuttitutti=($conto_tutti+$conto_tutti_sopra);
+	#print"il numero totaledi probe è $tuttitutti\n";
+	#print"il numero totale di probe sopra  è $conto_tutti_sopra\n";
+	$probabilita=$conto_tutti_sopra/($conto_tutti+$conto_tutti_sopra);
+	#print "la proba= $probabilita\n";
+	}
+
+#print buonitutti"il percentile e $valore_percentile\n";
+
+
+
+($one_ref,$two_ref)=&probe_cutoff(@array);
+
+if ($type_peak eq "p"){
+	@forse_niente=&peak(@$one_ref);
+	}
+elsif ($type_peak eq "s"){
+	@forse_niente=&peak(@$two_ref);
+	}
+
+&double_peak(@forse_niente);
+
+#
+# End process
+#
+close OUTFILE;
+unlink "$fname.sortato";
+exit 0;
+
+################################################################################################
+########################		            SUBROUTINE	             ###############################
+################################################################################################
+
+sub probe_cutoff
+{
+
+$c=-6;
+
+foreach $linea(@array){
+	@subarray1 = split("\t",$linea);
+	$inizio=(((($subarray1[4]-$subarray1[3])/2)+$subarray1[3])-($s_windows/2));  #per modificare la windows cambia il 500 e il 1000
+	$fine=$inizio+$s_windows;																								#windows 500 -->250 e 500   windows 1000 --> 500 e 1000
+	$counter=0;
+	$counter_value=0;
+
+	if($subarray1[5]>=$valore_percentile){
+		push (@array_probe,[@subarray1]);
+		print buonitutti "$linea\n";
+	}
+
+	for($i=$c;$i<=$#array;$i++) {
+			if ($i<0){
+				next;
+				}
+			@subarray = split("\t",$array[$i]);
+			if (($subarray[3]>=$inizio)&&($subarray[3]<$fine)&&("$subarray[0]" eq "$subarray1[0]")){
+					$counter++;
+					if ($subarray[5]>= $valore_percentile){
+						$counter_value++;
+					}
+				}
+				if (($subarray[3]>$fine)||("$subarray[0]" ne "$subarray1[0]")){
+							$cazzo=$subarray[3];
+							last;
+				}
+
+	}
+$c++;
+	if ($counter !=0){
+		$chi_sq = (((($counter_value - ($probabilita*$counter))**2)/($probabilita*$counter))+(((($counter-$counter_value)-((1-$probabilita)*$counter))**2)/((1-$probabilita)*$counter)));
+	}
+	else{
+		$chi_sq = NA;
+	}
+
+use Statistics::Distributions;
+$chisprob=Statistics::Distributions::chisqrprob (1,$chi_sq);
+if ($chisprob == 0){
+	$log_10=100;
+}
+else {
+	$log_10 = -log($chisprob)/log(10);	
+}
+
+print pvalue "$subarray1[0]\t$subarray1[1]\tpv_$subarray1[2]\t$subarray1[3]\t$subarray1[4]\t$log_10\t$subarray1[6]\t$subarray1[7]\t$subarray1[5]\n";
+
+	if ($log_10>=$valore_log){
+	 	#print"$subarray1[3],$inizio,$fine,$counter,$counter_value,$chi_sq,$chisprob,$log_10\n";
+		@proviamo=($subarray1[0],$subarray1[1],$subarray1[2],$inizio,$fine,$log_10,$subarray1[6],$subarray1[7],$subarray1[5]);
+		push (@matrix_pvalue,[@proviamo]);
+
+	}
+}
+@result_table=@matrix_pvalue;
+@result_table2=@array_probe;
+return(\@result_table,\@result_table2);
+
+}
+
+
+
+
+
+################################################################################################
+
+
+
+sub peak
+{
+@matrix_value=@_;
+
+$stop=$#matrix_value;
+
+for ($cio=0; $cio<=$stop;$cio++) {
+	 		@log_ratio=();
+	 		@chilosa=();
+			$inizio=$matrix_value[$cio][3];
+			$somma=0;
+			$media=0;
+			$sommachilo=0;
+			$mediachilo=0;
+			$diviso=0;
+			push(@log_ratio,$matrix_value[$cio][5]);
+			if ($matrix_value[$cio][8]>=$valore_percentile){
+							push(@chilosa,$matrix_value[$cio][8]);
+			}
+			for ($j=0; $j<=$stop;$j++){
+						$distanza=($matrix_value[$cio+1][3]-$matrix_value[$cio][4]);
+						if (($distanza > $dist_max) || ($cio==$stop)||(!("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]"))){
+								$j=$stop;
+								}
+						elsif (($distanza <= $dist_max)&&("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]")){
+							$cio++;
+							push(@log_ratio,$matrix_value[$cio][5]);
+							$fine=$matrix_value[$cio][4];
+							$inizio3=$inizio;
+							$j++;
+							if ($matrix_value[$cio][8]>=$valore_percentile){
+									push(@chilosa,$matrix_value[$cio][8]);
+							}
+						}
+
+
+			}
+			$numeroprobes=($#chilosa+1);
+			if ((($#log_ratio+1)>=$num_probe) && (($type_peak eq "s") ||(($#chilosa+1)>=$num_probe))){
+				foreach $n (@log_ratio) {
+					$somma+=$n;
+				}
+				if($type_peak eq "s"){
+					@chilosa=@log_ratio;
+				}
+				foreach $nchilo (@chilosa) {
+					$sommachilo+=$nchilo;
+				}
+				$diviso=$#log_ratio+1;
+				$media = $somma/(@log_ratio);
+				$mediachilo=$sommachilo/(@chilosa);
+				#$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo);
+				#$moltipli=(($#log_ratio+1)*$media);
+				#$somma_media=((sqrt($#log_ratio+1))+$media);
+				if ($type_peak eq "p"){
+					$inizio3=int($inizio3+(($s_windows/2)-25));
+					$fine=int($fine-(($s_windows/2)-25));
+				}
+				@proviamo_double=($matrix_value[$cio][0],$matrix_value[$cio][1],$col3,$inizio3,$fine,$media,$matrix_value[$cio][6],$matrix_value[$cio][7],$mediachilo,$diviso);
+				push (@matrix_for_double,[@proviamo_double]);
+
+			}
+}
+
+@result_table=@matrix_for_double;
+return(@result_table);
+}
+
+
+################################################################################################
+
+
+
+sub double_peak
+{
+@matrix_value=@_;
+
+$stop=$#matrix_value;
+$i=0;
+$j=0;
+#print $stop;
+
+#
+# Print output file
+#
+if ($outfile){
+	open (OUTFILE, ">$outfile");
+}
+
+#
+# print File Header
+#
+my $header=<<e0c6654;
+\# infile: $fname
+\# percentile value: $value=$valore_percentile
+\# fold change: $fc_value
+\# type of analysis: $type_peak
+\# log(pval analysis): $valore_log
+\# num (probe defining a peak): $num_probe
+\# dist: $dist_max
+\# window length: $s_windows
+track name=$col3 description="peaks find" visibility=2
+e0c6654
+
+if ($outfile){
+	print OUTFILE "$header";
+}else{
+	print "$header";
+}
+if ($type_peak eq "p"){
+	print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys =p-value (log=$valore_log), dist peaks=$dist_max_peaks";
+}
+if ($type_peak eq "s"){
+	print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys=score, dist peaks=$dist_max_peaks";
+}
+for ($i=0; $i<=$stop;$i++) {
+			@log_ratio=();
+			@log_ratio2=();
+			@diviso2=();
+			$inizio=$matrix_value[$i][3];
+			$somma=0;
+			$somma2=0;
+			$media=0;
+			$media2=0;
+			$diviso2=0;
+			push(@log_ratio,$matrix_value[$i][5]);
+			push(@log_ratio2,$matrix_value[$i][8]);
+			push(@diviso2,$matrix_value[$i][9]);
+			#print "giro $i\n";
+			for ($j=0; $j<=$stop;$j++){
+				#	if ("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"){
+						$distanza=($matrix_value[$i+1][3]-$matrix_value[$i][4]);
+						#print "distanza fra i due $distanza\n";
+						#exit;
+						if (($distanza > $dist_max_peaks) || ($i==$stop)||(!("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"))){
+								$j=$stop;
+								$fine=$matrix_value[$i][4];
+						}
+						elsif (($distanza <= $dist_max_peaks)&&("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]")){
+							$i++;
+							push(@log_ratio,$matrix_value[$i][5]);
+							push(@log_ratio2,$matrix_value[$i][8]);
+							push(@diviso2,$matrix_value[$i][9]);
+							$fine=$matrix_value[$i][4];
+							$j++;
+						}
+
+				# 	}
+			}
+				foreach $n (@log_ratio) {
+					$somma+=$n;
+				}
+				foreach $n1 (@log_ratio2) {
+					$somma2+=$n1;
+				}
+				foreach $n2 (@diviso2) {
+					$diviso2+=$n2;
+				}
+				$media = $somma/(@log_ratio);
+				$media2 = $somma2/(@log_ratio2);
+				$somma_media=(sqrt($diviso2)+$media);
+				$somma_score=(sqrt($diviso2)+$media2);
+				$somma_media = sprintf "%.2f", $somma_media;
+				$somma_score = sprintf "%.2f", $somma_score;
+				#$somma_score = $somma_score.$i
+					#$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo);
+				if ($outfile){
+					print OUTFILE "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n";
+				}else{
+					print "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n";
+				}
+
+
+}
+}
+
+####################################################################################################################################
+
+sub printusage {
+
+	print<<eoc22334;
+
+
+
+	***************************************************
+	:: N i m b l e G e n   C h i p   A n a l y s i s ::
+	***************************************************
+
+
+	 USAGE SUMMARY
+     ---------------------------------------------------------------------------------
+	This program utilizes NimbleGen ratio files in gff format as INPUT FILE and
+	provides a table of the computed picks in the same gff file format.
+
+
+
+	--in     [input filename]
+	  --perc [percentile value, it is used to calculate the threshold rate based
+	          on dataset distribution to filter out background ratio; i.e. 0.99]
+	  OR
+	  --fc   [fold change value, it is used as fixed threshold to filter out
+	          background ratio, i.e. 2]
+	--t      [type of analysis; p, performs peaks determination based on p-value inference;
+	                            s, performs peaks determination based on a scoring system]
+	    --log  (required only for p-value based analysis)
+	         [log2(p-value), cutoff integer to be used to identify a significant peak; i.e. 5]
+	--num    [minimal number of consecutive probes used to define a peak; i.e. 3]
+	--dist   [greatest nucleotide distance between two probes or between two peaks that
+	          allow to consider the signals as belonging to the same peak]
+	--w      [window length]
+
+	--out    [output filename (optional)]
+     -----------------------------------------------------------------------------------------
+	 EXAMPLES:
+
+	 perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500
+OR
+	 perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500
+
+     -----------------------------------------------------------------------------------------
+
+eoc22334
+	exit 0;
+
+}