Mercurial > repos > matces > carpet_toolsuite
view carpet-src-1/tools/CARPET/calcolo_p_v4_norm_intron.pl @ 0:cdd489d98766
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:50:41 -0400 |
parents | |
children |
line wrap: on
line source
#! /usr/bin/perl # Copyright 2009 Matteo Cesaroni, Lucilla Luzi # # This program is free software; ; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation; either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. #prende tutte le probes che mecciano almeno in parte dentro gli esoni use Statistics::PointEstimation; use Statistics::TTest; #use Statistics::Test::WilcoxonRankSum; use Getopt::Long; GetOptions ( "help" => \$OPT{help}, "ann=s" => \$OPT{ann}, "wt=s" => \$OPT{wt}, "treat=s" => \$OPT{treat}, "out=s" => \$OPT{out}, "pv=s" => \$OPT{prob}, "rc=s" => \$OPT{raw_cut}, "fc=s" => \$OPT{fc_cut}, "exon=s" => \$OPT{exon}, "opt=s" => \$OPT{option}, "norm=s" =>\$OPT{norm_q}, "sum_met=s" =>\$OPT{sum_met}, "fdr=s" =>\$OPT{fdr}, "log=s" =>\$OPT{log_t}, )|| printusage(); # opzioni da linea di comando my $file_ann=$OPT{ann}; my $file_wt=$OPT{wt}; my $file_treat=$OPT{treat}; my $file_output=$OPT{out}; my $pv_cut=$OPT{prob}; my $media_cut=$OPT{raw_cut}; my $FC_cut=$OPT{fc_cut}; my $tipo=$OPT{exon}; my $option=$OPT{option}; my $normalization_q=$OPT{norm_q}; my $sum_meth=$OPT{sum_met}; my $corretction=$OPT{fdr}; my $data_log=$OPT{log_t}; $help and printusage(); if($option eq "comp"){ if ($sum_meth eq "both"){ $header=<<e0c6654; \# annotation_file: $file_ann \# wt_file: $file_wt \# treat_file: $file_treat \# p-value cutoff: $pv_cut \# raw data cutoff cutoff: $media_cut \# summary method: $sum_meth \# fold change cutoff: $FC_cut \# type of analysis: $tipo \# headers: name RefSeq Chr txStart txEnd strand mean_chip1 mean_chip2 FC_mean median_chip1 median_chip2 FC_median num_probes_in_gene p-value FDR(q-value) e0c6654 } if ($sum_meth eq "mean"){ $header=<<e0c6654; \# annotation_file: $file_ann \# wt_file: $file_wt \# treat_file: $file_treat \# p-value cutoff: $pv_cut \# raw data cutoff cutoff: $media_cut \# summary method: $sum_meth \# fold change cutoff: $FC_cut \# type of analysis: $tipo \# headers: name RefSeq Chr txStart txEnd strand mean_chip1 mean_chip2 FC_mean num_probes_in_gene p-value FDR(q-value) e0c6654 } if ($sum_meth eq "median"){ $header=<<e0c6654; \# annotation_file: $file_ann \# wt_file: $file_wt \# treat_file: $file_treat \# p-value cutoff: $pv_cut \# raw data cutoff cutoff: $media_cut \# summary method: $sum_meth \# fold change cutoff: $FC_cut \# type of analysis: $tipo \# headers: name RefSeq Chr txStart txEnd strand median_chip1 median_chip2 FC_median num_probes_in_gene p-value FDR(q-value) e0c6654 } # "usage" se non ci sono opzioni if (!$file_ann || !$file_wt || !$file_treat) { &printusage() } #if ($pv_cut == 1){$pv_cut=0.9999;} if (!$media_cut){$media_cut=0;} if (!$FC_cut){$FC_cut=0;} my @r1=(); my @r2=(); #my $confident=(1-$pv_cut)*100; #inserire nell'ordine: tabella annotazione, tabella valori wt, tabella valori treated, p-value cutoff, raw signal cutoff open (annotation,"<$file_ann") || die "file_ann not open:$!\n"; open (wt,"<$file_wt") || die "$file_wt not open:$!\n"; open (treated,"<$file_treat") || die "$file_treat not open:$!\n"; open (output,">$file_output") || die "bed_$file_wt not open:$!\n"; print output "$header"; print "Comparison --> probe_selection=$tipo, Normalization=$normalization_q, summary method=$sum_meth Filters: p-value=$pv_cut, raw value=$media_cut, FC=$FC_cut"; while (defined (my $line_down = <annotation>)) { chomp $line_down; my @tmp_down = split("\t", $line_down); my @probe_match=split("\,", $tmp_down[9]); foreach $probes (@probe_match){ my @coord=split("-", $probes); $tab_tot{$tmp_down[0]}{"$coord[0]\t$coord[1]"}.="$tmp_down[1]\n$tmp_down[7]\n$tmp_down[0]\n$tmp_down[2]\n$tmp_down[3]\n$tmp_down[4]\n$tmp_down[5]\n$tmp_down[6]\n"; push(@ciclo,"$tmp_down[0]\t$coord[0]\t$coord[1]"); } } while (defined (my $line_down = <wt>)) { chomp $line_down; my @tmp_down = split("\t", $line_down); chomp $tmp_down[0]; $tab_tot_wt{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5]; } while (defined (my $line_down = <treated>)) { chomp $line_down; my @tmp_down = split("\t", $line_down); chomp $tmp_down[0]; $tab_tot_treat{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5]; } if($normalization_q eq "yes"){ @sort_wt=sort{$tab_tot_wt{$a} <=> $tab_tot_wt{$b}} keys %tab_tot_wt; @sort_treat=sort{$tab_tot_treat{$a} <=> $tab_tot_treat{$b}} (keys %tab_tot_treat); for($i=0;$i<=$#sort_wt;$i++){ $media=($tab_tot_wt{$sort_wt[$i]}+$tab_tot_treat{$sort_treat[$i]})/2; $tab_tot_value{$sort_wt[$i]}.="$media\n"; } for($i=0;$i<=$#sort_treat;$i++){ $media=($tab_tot_wt{$sort_wt[$i]}+$tab_tot_treat{$sort_treat[$i]})/2; $tab_tot_value{$sort_treat[$i]}.="$media\n"; } } foreach $key_value(@ciclo){ @keys_values=split("\t",$key_value); @array1= split("\n",$tab_tot{$keys_values[0]}{"$keys_values[1]\t$keys_values[2]"}); if($normalization_q eq "yes"){ @array2= split("\n",$tab_tot_value{"$keys_values[1]\t$keys_values[2]"}); $value1=$array2[0]; $value2=$array2[1]; } else{ $value1=$tab_tot_wt{"$keys_values[1]\t$keys_values[2]"}; $value2=$tab_tot_treat{"$keys_values[1]\t$keys_values[2]"}; } if ($tipo eq "internal_exon"){ $prendo = "^exon"; } if ($tipo eq "all_exon"){ $prendo = "exon"; } if ($tipo eq "last_exon"){ $prendo = "exon last"; } if ((!($array1[0] eq "")) && ($array1[1] =~ /$prendo/g)) { if ($data_log eq "no_log"){ $log1=log($value1)/log(2); $log2=log($value2)/log(2); } if ($data_log eq "log"){ $log1=$value1; $log2=$value2; } $tab_gene_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n"; $tab_gene_tratted{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log2\n"; } } foreach $key (keys %tab_gene_wt) { @r1=split("\n",$tab_gene_wt{$key}); @r2=split("\n",$tab_gene_tratted{$key}); sort {$a <=> $b} (@r1); sort {$a <=> $b} (@r2); if ($#r1>0){ my $ttest = new Statistics::TTest; $ttest->set_significance(95); $ttest->load_data(\@r1,\@r2); my $s1=$ttest->{s1}; my $s2=$ttest->{s2}; $media1=$s1->{mean}; $media2=$s2->{mean}; $total_probes=$#r1+1; $potenza_a=$media2-$media1; $FCa=((2)**$potenza_a); if( (@r1 % 2) == 1 ) { $median1 = $r1[((@r1+1) / 2)-1]; $median2 = $r2[((@r2+1) / 2)-1]; } else { $median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2; $median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2; } $potenza_m=$median2-$median1; $FCm=((2)**$potenza_m); if ($FCa<1){ $FCa=(-(1/$FCa)); } if ($FCm<1){ $FCm=(-(1/$FCm)); } if ($sum_meth eq "mean"){ if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCa))){ next; } $p_value{"$key\t$media1\t$media2\t$FCa\t$total_probes"}=$ttest->{t_prob}; #print output "$key\t$media1\t$media2\t$FCa\t",$ttest->{t_prob},"\t$total_probes\n"; } if ($sum_meth eq "median"){ if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCm))){ next; } #print output "$key\t$median1\t$median2\t$FCm\t",$ttest->{t_prob},"\t$total_probes\n"; $p_value{"$key\t$median1\t$median2\t$FCm\t$total_probes"}=$ttest->{t_prob}; } if ($sum_meth eq "both"){ if (($media1<$media_cut) && ($media2<$media_cut)){ next; } #print output "$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t",$ttest->{t_prob},"\t$total_probes\n"; $p_value{"$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t$total_probes"}=$ttest->{t_prob}; } } if ($#r1==0){ $media1=$r1[0]; $media2=$r2[0]; $median1=$r1[0]; $median2=$r2[0]; $potenza=$media2-$media1; $FCa=((2)**$potenza); $FCm=((2)**$potenza); if ($FCa<1){ $FCa=(-(1/$FCa)); } if ($FCm<1){ $FCm=(-(1/$FCm)); } if ($sum_meth eq "both"){ if (($media1<$media_cut) && ($media2<$media_cut)){ next; } $p_value{"$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t$total_probes"}=1; } if ($sum_meth eq "mean"){ if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCa))){ next; } $p_value{"$key\t$media1\t$media2\t$FCa\t$total_probes"}=1; } if ($sum_meth eq "median"){ if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCm))){ next; } $p_value{"$key\t$median1\t$median2\t$FCm\t$total_probes"}=1; } } } @sort_pvalue=sort{$p_value{$a} <=> $p_value{$b}} keys %p_value; if($corretction eq "yes"){ for($i=0;$i<=$#sort_pvalue;$i++){ $qvalue=$p_value{$sort_pvalue[$i]}*(($#sort_pvalue+1)/($i+1)); push(@QVALUE, $qvalue); } @qvalue_sort = sort {$a <=> $b} @QVALUE; for($i=0;$i<=$#sort_pvalue;$i++){ #$FDR=((($i+1)*0.05)/($#sort_pvalue+1)); $qvalue=shift(@qvalue_sort); if($qvalue>1){$qvalue=1;} if($pv_cut>=$qvalue){ print output "$sort_pvalue[$i]\t$p_value{$sort_pvalue[$i]}\t$qvalue\n"; } } } if($corretction eq "no"){ for($i=0;$i<=$#sort_pvalue;$i++){ if($pv_cut>=$qvalue){ print output "$sort_pvalue[$i]\t$p_value{$sort_pvalue[$i]}\n"; } } } } if($option eq "expr"){ if ($sum_meth eq "median"){ $header=<<e0c6654; \# annotation_file: $file_ann \# expression_file: $file_wt \# type of analysis: $tipo \# summary method: $sum_meth \# headers: name RefSeq Chr txStart txEnd strand median_chip num_probes_in_gene e0c6654 } if ($sum_meth eq "mean"){ $header=<<e0c6654; \# annotation_file: $file_ann \# expression_file: $file_wt \# type of analysis: $tipo \# summary method: $sum_meth \# headers: name RefSeq Chr txStart txEnd strand mean_chip num_probes_in_gene e0c6654 } if ($sum_meth eq "both"){ $header=<<e0c6654; \# annotation_file: $file_ann \# expression_file: $file_wt \# type of analysis: $tipo \# summary method: $sum_meth \# headers: name RefSeq Chr txStart txEnd strand mean_chip median_chip num_probes_in_gene e0c6654 } if (!$file_ann || !$file_wt) { &printusage() } my @r1=(); my @r2=(); open (annotation,"<$file_ann") || die "file_ann not open:$!\n"; open (wt,"<$file_wt") || die "$file_wt not open:$!\n"; open (output,">$file_output") || die "bed_$file_wt not open:$!\n"; print output "$header"; print "Expression --> probe_selection=$tipo summary method=$sum_meth"; while (defined (my $line_down = <annotation>)) { chomp $line_down; my @tmp_down = split("\t", $line_down); my @probe_match=split("\,", $tmp_down[9]); foreach $probes (@probe_match){ my @coord=split("-", $probes); $tab_tot{$tmp_down[0]}{"$coord[0]\t$coord[1]"}.="$tmp_down[1]\n$tmp_down[7]\n$tmp_down[0]\n$tmp_down[2]\n$tmp_down[3]\n$tmp_down[4]\n$tmp_down[5]\n$tmp_down[6]\n"; push(@ciclo,"$tmp_down[0]\t$coord[0]\t$coord[1]"); } } while (defined (my $line_down = <wt>)) { chomp $line_down; $line_down=~ s/ //g; my @tmp_down = split("\t", $line_down); chomp $tmp_down[0]; $tab_tot_wt{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5]; } #@sort_wt=sort{$tab_tot_wt{$a} <=> $tab_tot_wt{$b}} keys %tab_tot_wt; #print "ciclo = $ciclo[0]\t$ciclo[1]\n"; foreach $key_value(@ciclo){ @keys_values=split("\t",$key_value); @array1= split("\n",$tab_tot{$keys_values[0]}{"$keys_values[1]\t$keys_values[2]"}); @array2= split("\n",$tab_tot_wt{"$keys_values[1]\t$keys_values[2]"}); if ($tipo eq "internal_exon"){ $prendo = "^exon"; } if ($tipo eq "all_exon"){ $prendo = "exon"; } if ($tipo eq "last_exon"){ $prendo = "exon last"; } if ((!($array1[0] eq "")) && ($array1[1] =~ /$prendo/g)) { if ($data_log eq "no_log"){ $log1=log($array2[0])/log(2); } if ($data_log eq "log"){ $log1=$array2[0]; } $tab_gene_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n"; } if ((!($array1[0] eq "")) && ($array1[1] =~ /intron/g)) { if ($data_log eq "no_log"){ $log1=log($array2[0])/log(2); } if ($data_log eq "log"){ $log1=$array2[0]; } $tab_intron_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n"; } } foreach $key (keys %tab_gene_wt) { @r1=split("\n",$tab_gene_wt{$key}); if (exists $tab_intron_wt{$key}){ @r2=split("\n",$tab_intron_wt{$key}); sort {$a <=> $b} (@r2); }else{ @r2=("no"); } sort {$a <=> $b} (@r1); if ($#r2>0) { if ($#r1>0) { my $ttest = new Statistics::TTest; $ttest->set_significance(95); $ttest->load_data(\@r1,\@r2); my $s1=$ttest->{s1}; my $s2=$ttest->{s2}; $media1=$s1->{mean}; $media2=$s2->{mean}; $total_probes=$#r1+1; $total_probes2=$#r2+1; if( (@r1 % 2) == 1 ) { $median1 = $r1[((@r1+1) / 2)-1]; } else { $median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2; } if( (@r2 % 2) == 1 ) { $median2 = $r2[((@r2+1) / 2)-1]; } else { $median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2; } if($media1>=$media2){ $pvalue=$ttest->{t_prob}; } if($media1<$media2){ $pvalue=1; } if ($sum_meth eq "mean"){ print output "$key\t$media1\t$media2\t$pvalue\t$total_probes\t$total_probes2\n"; #print output "$key\t$media1\t$total_probes\n"; } if ($sum_meth eq "median"){ print output "$key\t$median1\t$median2\t$pvalue\t$total_probes\t$total_probes2\n"; #print output "$key\t$median1\t$total_probes\n"; } if ($sum_meth eq "both"){ print output "$key\t$media1\t$media2\t$median1\t$median2\t$pvalue\t$total_probes\t$total_probes2\n"; #print output "$key\t$media1\t$median1\t$total_probes\n"; } } if ($#r1==0) { @r1=@r2; my $ttest = new Statistics::TTest; $ttest->set_significance(95); $ttest->load_data(\@r1,\@r2); my $s2=$ttest->{s2}; $media2=$s2->{mean}; $total_probes=1; $total_probes2=$#r2+1; $media1=$r1[0]; $median1=$r1[0]; if( (@r2 % 2) == 1 ) { $median2 = $r2[((@r2+1) / 2)-1]; } else { $median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2; } if ($sum_meth eq "mean"){ print output "$key\t$media1\t$media2\tNA\t$total_probes\t$total_probes2\n"; #print output "$key\t$media1\t$total_probes\n"; } if ($sum_meth eq "median"){ print output "$key\t$median1\t$median2\tNA\t$total_probes\t$total_probes2\n"; #print output "$key\t$median1\t$total_probes\n"; } if ($sum_meth eq "both"){ print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t$total_probes\t$total_probes2\n"; #print output "$key\t$media1\t$median1\t$total_probes\n"; } } } if ($r2[0] eq "no") { if ($#r1>0) { @r2=@r1; my $ttest = new Statistics::TTest; $ttest->set_significance(95); $ttest->load_data(\@r1,\@r2); my $s1=$ttest->{s1}; $media1=$s1->{mean}; $total_probes=$#r1+1; if( (@r1 % 2) == 1 ) { $median1 = $r1[((@r1+1) / 2)-1]; } else { $median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2; } } if ($#r==0){ $media1=$r1[0]; $median1=$r1[0]; $total_probes=$#r1+1; } $media2=0; $median2=0; if ($sum_meth eq "both"){ print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t$total_probes\t0\n"; #print output "$key\t$media1\t$median1\t$total_probes\n"; } if ($sum_meth eq "mean"){ print output "$key\t$media1\t$media2\tNA\t$total_probes\t0\n"; #print output "$key\t$media1\t$total_probes\n"; } if ($sum_meth eq "median"){ print output "$key\t$median1\t$median2\tNA\t$total_probes\t0\n"; #print output "$key\t$median1\t$total_probes\n"; } } if (($#r2 == 0) && ($r2[0] ne "no") && ($#r1==0)){ $media1=$r1[0]; $media2=$r2[0]; $median1=$r1[0]; $median2=$r2[0]; if ($sum_meth eq "both"){ print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t1\t1\n"; #print output "$key\t$media1\t$median1\t1\n"; } if ($sum_meth eq "mean"){ print output "$key\t$media1\t$media2\tNA\t1\t1\n"; #print output "$key\t$media1\t1\n"; } if ($sum_meth eq "median"){ print output "$key\t$median1\t$median2\tNA\t1\t1\n"; #print output "$key\t$median1\t1\n"; } } } } #################################################################################################################################### sub printusage { print<<eoc22334; *************************************************************** :: N i m b l e G e n E x p r e s s i o n A n a l y s i s :: *************************************************************** USAGE SUMMARY --------------------------------------------------------------------------------- This program utilizes NimbleGen ratio files in gff format as INPUT FILE and provides a table of the computed picks in the same gff file format. --ann annotation file --wt wild type file --treat treated file --pv p-value cut-off --rc raw signal cut-off --fc fold change cut-toff ----------------------------------------------------------------------------------------- EXAMPLES: perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500 OR perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500 ----------------------------------------------------------------------------------------- eoc22334 exit 0; }