Mercurial > repos > matces > carpet_toolsuite

#! /usr/bin/perl

# Copyright 2009 Matteo Cesaroni, Lucilla Luzi
#
# This program is free software; ; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


#prende tutte le probes che mecciano almeno in parte dentro gli esoni

use Statistics::PointEstimation;
use Statistics::TTest;
#use Statistics::Test::WilcoxonRankSum;
use Getopt::Long;

GetOptions (
			"help"	=> \$OPT{help},
			"ann=s" => \$OPT{ann},
			"wt=s" => \$OPT{wt},
			"treat=s" => \$OPT{treat},
			"out=s" =>  \$OPT{out},
			"pv=s" => \$OPT{prob},
			"rc=s" => \$OPT{raw_cut},
			"fc=s" => \$OPT{fc_cut},
			"exon=s" => \$OPT{exon},
			"opt=s" => \$OPT{option},
			"norm=s" =>\$OPT{norm_q},
			"sum_met=s" =>\$OPT{sum_met},
			"fdr=s" =>\$OPT{fdr},
			"log=s" =>\$OPT{log_t},
)|| printusage();

# opzioni da linea di comando
my $file_ann=$OPT{ann};
my $file_wt=$OPT{wt};
my $file_treat=$OPT{treat};
my $file_output=$OPT{out};
my $pv_cut=$OPT{prob};
my $media_cut=$OPT{raw_cut};
my $FC_cut=$OPT{fc_cut};
my $tipo=$OPT{exon};
my $option=$OPT{option};
my $normalization_q=$OPT{norm_q};
my $sum_meth=$OPT{sum_met};
my $corretction=$OPT{fdr};
my $data_log=$OPT{log_t};

$help and printusage();


if($option eq "comp"){
if ($sum_meth eq "both"){
$header=<<e0c6654;
\# annotation_file: $file_ann
\# wt_file: $file_wt
\# treat_file: $file_treat
\# p-value cutoff: $pv_cut
\# raw data cutoff cutoff: $media_cut
\# summary method: $sum_meth
\# fold change cutoff: $FC_cut
\# type of analysis: $tipo
\# headers: name RefSeq Chr txStart txEnd strand mean_chip1 mean_chip2 FC_mean median_chip1 median_chip2 FC_median num_probes_in_gene p-value FDR(q-value)
e0c6654

}
if ($sum_meth eq "mean"){
$header=<<e0c6654;
\# annotation_file: $file_ann
\# wt_file: $file_wt
\# treat_file: $file_treat
\# p-value cutoff: $pv_cut
\# raw data cutoff cutoff: $media_cut
\# summary method: $sum_meth
\# fold change cutoff: $FC_cut
\# type of analysis: $tipo
\# headers: name RefSeq Chr txStart txEnd strand mean_chip1 mean_chip2 FC_mean num_probes_in_gene p-value FDR(q-value)
e0c6654

}
if ($sum_meth eq "median"){
$header=<<e0c6654;
\# annotation_file: $file_ann
\# wt_file: $file_wt
\# treat_file: $file_treat
\# p-value cutoff: $pv_cut
\# raw data cutoff cutoff: $media_cut
\# summary method: $sum_meth
\# fold change cutoff: $FC_cut
\# type of analysis: $tipo
\# headers: name RefSeq Chr txStart txEnd strand median_chip1 median_chip2 FC_median num_probes_in_gene p-value FDR(q-value)
e0c6654

}


# "usage" se non ci sono opzioni
if (!$file_ann || !$file_wt || !$file_treat) {
	&printusage()
}

#if ($pv_cut == 1){$pv_cut=0.9999;}
if (!$media_cut){$media_cut=0;}
if (!$FC_cut){$FC_cut=0;}

my @r1=();
my @r2=();

#my $confident=(1-$pv_cut)*100;
#inserire nell'ordine: tabella annotazione, tabella valori wt, tabella valori treated, p-value cutoff, raw signal cutoff


open (annotation,"<$file_ann") || die "file_ann not open:$!\n";
open (wt,"<$file_wt") || die "$file_wt not open:$!\n";
open (treated,"<$file_treat") || die "$file_treat not open:$!\n";
open (output,">$file_output") || die "bed_$file_wt not open:$!\n";

print output "$header";

print "Comparison --> probe_selection=$tipo, Normalization=$normalization_q, summary method=$sum_meth Filters: p-value=$pv_cut, raw value=$media_cut, FC=$FC_cut";

while (defined (my $line_down = <annotation>)) {
   chomp $line_down;
   my @tmp_down = split("\t", $line_down);
   my @probe_match=split("\,", $tmp_down[9]);
   foreach $probes (@probe_match){
   		my @coord=split("-", $probes);
   		$tab_tot{$tmp_down[0]}{"$coord[0]\t$coord[1]"}.="$tmp_down[1]\n$tmp_down[7]\n$tmp_down[0]\n$tmp_down[2]\n$tmp_down[3]\n$tmp_down[4]\n$tmp_down[5]\n$tmp_down[6]\n";
   		push(@ciclo,"$tmp_down[0]\t$coord[0]\t$coord[1]");
   }
}

while (defined (my $line_down = <wt>)) {
   chomp $line_down;
   my @tmp_down = split("\t", $line_down);
   chomp $tmp_down[0];
   $tab_tot_wt{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5];
}


while (defined (my $line_down = <treated>)) {
   chomp $line_down;
   my @tmp_down = split("\t", $line_down);
   chomp $tmp_down[0];
   $tab_tot_treat{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5];
}

if($normalization_q eq "yes"){

	@sort_wt=sort{$tab_tot_wt{$a} <=> $tab_tot_wt{$b}} keys %tab_tot_wt;
	@sort_treat=sort{$tab_tot_treat{$a} <=> $tab_tot_treat{$b}} (keys %tab_tot_treat);

	for($i=0;$i<=$#sort_wt;$i++){
		$media=($tab_tot_wt{$sort_wt[$i]}+$tab_tot_treat{$sort_treat[$i]})/2;
		$tab_tot_value{$sort_wt[$i]}.="$media\n";
	}

	for($i=0;$i<=$#sort_treat;$i++){
		$media=($tab_tot_wt{$sort_wt[$i]}+$tab_tot_treat{$sort_treat[$i]})/2;
		$tab_tot_value{$sort_treat[$i]}.="$media\n";
	}

}


foreach $key_value(@ciclo){

			@keys_values=split("\t",$key_value);
			@array1= split("\n",$tab_tot{$keys_values[0]}{"$keys_values[1]\t$keys_values[2]"});
			if($normalization_q eq "yes"){
				@array2= split("\n",$tab_tot_value{"$keys_values[1]\t$keys_values[2]"});
				$value1=$array2[0];
				$value2=$array2[1];
			}
			else{
				$value1=$tab_tot_wt{"$keys_values[1]\t$keys_values[2]"};
				$value2=$tab_tot_treat{"$keys_values[1]\t$keys_values[2]"};
			}
			if ($tipo eq "internal_exon"){
				$prendo = "^exon";
			}
			if ($tipo eq "all_exon"){
				$prendo = "exon";
			}
			if ($tipo eq "last_exon"){
				$prendo = "exon last";
			}

			if ((!($array1[0] eq "")) && ($array1[1] =~ /$prendo/g))  {
				if ($data_log eq "no_log"){
					$log1=log($value1)/log(2);
					$log2=log($value2)/log(2);
				}
				if ($data_log eq "log"){
					$log1=$value1;
					$log2=$value2;
				}
				$tab_gene_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n";
				$tab_gene_tratted{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log2\n";
			}

}

foreach $key (keys %tab_gene_wt) {
	@r1=split("\n",$tab_gene_wt{$key});
	@r2=split("\n",$tab_gene_tratted{$key});

	sort {$a <=> $b} (@r1);
	sort {$a <=> $b} (@r2);

	if ($#r1>0){
		my $ttest = new Statistics::TTest;
 		$ttest->set_significance(95);
 		$ttest->load_data(\@r1,\@r2);
    	my $s1=$ttest->{s1};
    	my $s2=$ttest->{s2};
		$media1=$s1->{mean};
		$media2=$s2->{mean};
		$total_probes=$#r1+1;
		$potenza_a=$media2-$media1;
		$FCa=((2)**$potenza_a);
    	if( (@r1 % 2) == 1 ) {
        	$median1 = $r1[((@r1+1) / 2)-1];
        	$median2 = $r2[((@r2+1) / 2)-1];
    	} else {
        	$median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2;
        	$median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2;
	    }
		$potenza_m=$median2-$median1;
		$FCm=((2)**$potenza_m);


		if ($FCa<1){
			$FCa=(-(1/$FCa));
		}


		if ($FCm<1){
			$FCm=(-(1/$FCm));
		}
		if ($sum_meth eq "mean"){
			if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCa))){
				next;
			}
			$p_value{"$key\t$media1\t$media2\t$FCa\t$total_probes"}=$ttest->{t_prob};

			#print output "$key\t$media1\t$media2\t$FCa\t",$ttest->{t_prob},"\t$total_probes\n";
		}
		if ($sum_meth eq "median"){
			if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCm))){
				next;
			}
			#print output "$key\t$median1\t$median2\t$FCm\t",$ttest->{t_prob},"\t$total_probes\n";
			$p_value{"$key\t$median1\t$median2\t$FCm\t$total_probes"}=$ttest->{t_prob};
		}
		if ($sum_meth eq "both"){
			if (($media1<$media_cut) && ($media2<$media_cut)){
				next;
			}
			#print output "$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t",$ttest->{t_prob},"\t$total_probes\n";
			$p_value{"$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t$total_probes"}=$ttest->{t_prob};
		}
	}
	if ($#r1==0){
		$media1=$r1[0];
		$media2=$r2[0];
		$median1=$r1[0];
		$median2=$r2[0];
		$potenza=$media2-$media1;
		$FCa=((2)**$potenza);
		$FCm=((2)**$potenza);
		if ($FCa<1){
			$FCa=(-(1/$FCa));
		}
		if ($FCm<1){
			$FCm=(-(1/$FCm));
		}

		if ($sum_meth eq "both"){
			if (($media1<$media_cut) && ($media2<$media_cut)){
				next;
			}
			$p_value{"$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t$total_probes"}=1;
		}
		if ($sum_meth eq "mean"){
			if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCa))){
				next;
			}
			$p_value{"$key\t$media1\t$media2\t$FCa\t$total_probes"}=1;
		}
		if ($sum_meth eq "median"){
			if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCm))){
				next;
			}
			$p_value{"$key\t$median1\t$median2\t$FCm\t$total_probes"}=1;
		}
	}
}

@sort_pvalue=sort{$p_value{$a} <=> $p_value{$b}} keys %p_value;

if($corretction eq "yes"){
	for($i=0;$i<=$#sort_pvalue;$i++){
		$qvalue=$p_value{$sort_pvalue[$i]}*(($#sort_pvalue+1)/($i+1));
		push(@QVALUE, $qvalue);
	}

	@qvalue_sort = sort {$a <=> $b} @QVALUE;

	for($i=0;$i<=$#sort_pvalue;$i++){
		#$FDR=((($i+1)*0.05)/($#sort_pvalue+1));
		$qvalue=shift(@qvalue_sort);
		if($qvalue>1){$qvalue=1;}
		if($pv_cut>=$qvalue){
			print output "$sort_pvalue[$i]\t$p_value{$sort_pvalue[$i]}\t$qvalue\n";
		}
	}
}
if($corretction eq "no"){
	for($i=0;$i<=$#sort_pvalue;$i++){
		if($pv_cut>=$qvalue){
			print output "$sort_pvalue[$i]\t$p_value{$sort_pvalue[$i]}\n";
		}
	}
}


}
if($option eq "expr"){
if ($sum_meth eq "median"){
$header=<<e0c6654;
\# annotation_file: $file_ann
\# expression_file: $file_wt
\# type of analysis: $tipo
\# summary method: $sum_meth
\# headers: name RefSeq Chr txStart txEnd strand median_chip num_probes_in_gene
e0c6654

}
if ($sum_meth eq "mean"){
$header=<<e0c6654;
\# annotation_file: $file_ann
\# expression_file: $file_wt
\# type of analysis: $tipo
\# summary method: $sum_meth
\# headers: name RefSeq Chr txStart txEnd strand mean_chip num_probes_in_gene
e0c6654

}
if ($sum_meth eq "both"){
$header=<<e0c6654;
\# annotation_file: $file_ann
\# expression_file: $file_wt
\# type of analysis: $tipo
\# summary method: $sum_meth
\# headers: name RefSeq Chr txStart txEnd strand mean_chip median_chip num_probes_in_gene
e0c6654

}
	if (!$file_ann || !$file_wt) {
		&printusage()
	}
my @r1=();
my @r2=();


open (annotation,"<$file_ann") || die "file_ann not open:$!\n";
open (wt,"<$file_wt") || die "$file_wt not open:$!\n";
open (output,">$file_output") || die "bed_$file_wt not open:$!\n";

print output "$header";

print "Expression --> probe_selection=$tipo summary method=$sum_meth";

while (defined (my $line_down = <annotation>)) {
   chomp $line_down;
   my @tmp_down = split("\t", $line_down);
   my @probe_match=split("\,", $tmp_down[9]);
   foreach $probes (@probe_match){
   		my @coord=split("-", $probes);
   		$tab_tot{$tmp_down[0]}{"$coord[0]\t$coord[1]"}.="$tmp_down[1]\n$tmp_down[7]\n$tmp_down[0]\n$tmp_down[2]\n$tmp_down[3]\n$tmp_down[4]\n$tmp_down[5]\n$tmp_down[6]\n";
   		push(@ciclo,"$tmp_down[0]\t$coord[0]\t$coord[1]");
   }
}

while (defined (my $line_down = <wt>)) {
   chomp $line_down;
   $line_down=~ s/ //g;
   my @tmp_down = split("\t", $line_down);
   chomp $tmp_down[0];
   $tab_tot_wt{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5];
}
#@sort_wt=sort{$tab_tot_wt{$a} <=> $tab_tot_wt{$b}} keys %tab_tot_wt;


#print "ciclo = $ciclo[0]\t$ciclo[1]\n";

foreach $key_value(@ciclo){

			@keys_values=split("\t",$key_value);
			@array1= split("\n",$tab_tot{$keys_values[0]}{"$keys_values[1]\t$keys_values[2]"});
			@array2= split("\n",$tab_tot_wt{"$keys_values[1]\t$keys_values[2]"});

			if ($tipo eq "internal_exon"){
				$prendo = "^exon";
			}
			if ($tipo eq "all_exon"){
				$prendo = "exon";
			}
			if ($tipo eq "last_exon"){
				$prendo = "exon last";
			}

			if ((!($array1[0] eq "")) && ($array1[1] =~ /$prendo/g))  {

				if ($data_log eq "no_log"){
					$log1=log($array2[0])/log(2);
				}
				if ($data_log eq "log"){
					$log1=$array2[0];
				}

				$tab_gene_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n";
			}
			if ((!($array1[0] eq "")) && ($array1[1] =~ /intron/g))  {
				if ($data_log eq "no_log"){
					$log1=log($array2[0])/log(2);
				}
				if ($data_log eq "log"){
					$log1=$array2[0];
				}
				$tab_intron_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n";
			}

}


foreach $key (keys %tab_gene_wt) {
	@r1=split("\n",$tab_gene_wt{$key});
	if (exists $tab_intron_wt{$key}){
		@r2=split("\n",$tab_intron_wt{$key});
		sort {$a <=> $b} (@r2);
	}else{
		@r2=("no");
	}
	sort {$a <=> $b} (@r1);
	if ($#r2>0) {
		if ($#r1>0) {
			my $ttest = new Statistics::TTest;
 			$ttest->set_significance(95);
 			$ttest->load_data(\@r1,\@r2);
    		my $s1=$ttest->{s1};
    		my $s2=$ttest->{s2};
			$media1=$s1->{mean};
			$media2=$s2->{mean};
			$total_probes=$#r1+1;
			$total_probes2=$#r2+1;
    		if( (@r1 % 2) == 1 ) {
        		$median1 = $r1[((@r1+1) / 2)-1];
    		} else {
        		$median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2;
	    	}
	    	if( (@r2 % 2) == 1 ) {
        		$median2 = $r2[((@r2+1) / 2)-1];
    		} else {
        		$median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2;
	    	}
	    	if($media1>=$media2){
	    		$pvalue=$ttest->{t_prob};
	    	}
	    	if($media1<$media2){
	    		$pvalue=1;
	    	}
	    	if ($sum_meth eq "mean"){
				print output "$key\t$media1\t$media2\t$pvalue\t$total_probes\t$total_probes2\n";
				#print output "$key\t$media1\t$total_probes\n";
			}
			if ($sum_meth eq "median"){
				print output "$key\t$median1\t$median2\t$pvalue\t$total_probes\t$total_probes2\n";
				#print output "$key\t$median1\t$total_probes\n";
			}
			if ($sum_meth eq "both"){
				print output "$key\t$media1\t$media2\t$median1\t$median2\t$pvalue\t$total_probes\t$total_probes2\n";
				#print output "$key\t$media1\t$median1\t$total_probes\n";
			}
	    }
	    if ($#r1==0) {
			@r1=@r2;
			my $ttest = new Statistics::TTest;
 			$ttest->set_significance(95);
 			$ttest->load_data(\@r1,\@r2);
    		my $s2=$ttest->{s2};
			$media2=$s2->{mean};
			$total_probes=1;
			$total_probes2=$#r2+1;
			$media1=$r1[0];
			$median1=$r1[0];
			if( (@r2 % 2) == 1 ) {
        		$median2 = $r2[((@r2+1) / 2)-1];
    		} else {
        		$median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2;
	    	}
	    	if ($sum_meth eq "mean"){
				print output "$key\t$media1\t$media2\tNA\t$total_probes\t$total_probes2\n";
				#print output "$key\t$media1\t$total_probes\n";
			}
			if ($sum_meth eq "median"){
				print output "$key\t$median1\t$median2\tNA\t$total_probes\t$total_probes2\n";
				#print output "$key\t$median1\t$total_probes\n";
			}
			if ($sum_meth eq "both"){
				print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t$total_probes\t$total_probes2\n";
				#print output "$key\t$media1\t$median1\t$total_probes\n";
			}
		}


	}

	if ($r2[0] eq "no") {
		if ($#r1>0) {
			@r2=@r1;
			my $ttest = new Statistics::TTest;
 			$ttest->set_significance(95);
 			$ttest->load_data(\@r1,\@r2);
    		my $s1=$ttest->{s1};
			$media1=$s1->{mean};
			$total_probes=$#r1+1;
    		if( (@r1 % 2) == 1 ) {
        		$median1 = $r1[((@r1+1) / 2)-1];
    		} else {
        		$median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2;
	   		}
	   	}
	   	if ($#r==0){
	   		$media1=$r1[0];
			$median1=$r1[0];
			$total_probes=$#r1+1;
	   	}
		$media2=0;
		$median2=0;
		if ($sum_meth eq "both"){
			print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t$total_probes\t0\n";
			#print output "$key\t$media1\t$median1\t$total_probes\n";
		}
		if ($sum_meth eq "mean"){
			print output "$key\t$media1\t$media2\tNA\t$total_probes\t0\n";
			#print output "$key\t$media1\t$total_probes\n";
		}
		if ($sum_meth eq "median"){
			print output "$key\t$median1\t$median2\tNA\t$total_probes\t0\n";
			#print output "$key\t$median1\t$total_probes\n";
		}
	}
	if (($#r2 == 0) && ($r2[0] ne "no") && ($#r1==0)){
		$media1=$r1[0];
		$media2=$r2[0];
		$median1=$r1[0];
		$median2=$r2[0];
		if ($sum_meth eq "both"){
			print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t1\t1\n";
			#print output "$key\t$media1\t$median1\t1\n";
		}
		if ($sum_meth eq "mean"){
			print output "$key\t$media1\t$media2\tNA\t1\t1\n";
			#print output "$key\t$media1\t1\n";
		}
		if ($sum_meth eq "median"){
			print output "$key\t$median1\t$median2\tNA\t1\t1\n";
			#print output "$key\t$median1\t1\n";
		}
	}
}


}


####################################################################################################################################

sub printusage {

	print<<eoc22334;


	***************************************************************
	:: N i m b l e G e n   E x p r e s s i o n   A n a l y s i s ::
	***************************************************************


	 USAGE SUMMARY
     ---------------------------------------------------------------------------------
	This program utilizes NimbleGen ratio files in gff format as INPUT FILE and
	provides a table of the computed picks in the same gff file format.


	--ann annotation file
	--wt wild type file
	--treat treated file
	--pv p-value cut-off
	--rc raw signal cut-off
	--fc fold change cut-toff

     -----------------------------------------------------------------------------------------
	 EXAMPLES:

	 perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500
OR
	 perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500

     -----------------------------------------------------------------------------------------

eoc22334
	exit 0;

}
author	matces
date	Tue, 07 Jun 2011 16:50:41 -0400
parents
children