Mercurial > repos > matces > carpet_toolsuite
diff carpet-src-1/tools/CARPET/calcolo_p_v4_norm_intron.pl @ 0:cdd489d98766
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:50:41 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/carpet-src-1/tools/CARPET/calcolo_p_v4_norm_intron.pl Tue Jun 07 16:50:41 2011 -0400 @@ -0,0 +1,636 @@ +#! /usr/bin/perl + +# Copyright 2009 Matteo Cesaroni, Lucilla Luzi +# +# This program is free software; ; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + + + +#prende tutte le probes che mecciano almeno in parte dentro gli esoni + +use Statistics::PointEstimation; +use Statistics::TTest; +#use Statistics::Test::WilcoxonRankSum; +use Getopt::Long; + +GetOptions ( + "help" => \$OPT{help}, + "ann=s" => \$OPT{ann}, + "wt=s" => \$OPT{wt}, + "treat=s" => \$OPT{treat}, + "out=s" => \$OPT{out}, + "pv=s" => \$OPT{prob}, + "rc=s" => \$OPT{raw_cut}, + "fc=s" => \$OPT{fc_cut}, + "exon=s" => \$OPT{exon}, + "opt=s" => \$OPT{option}, + "norm=s" =>\$OPT{norm_q}, + "sum_met=s" =>\$OPT{sum_met}, + "fdr=s" =>\$OPT{fdr}, + "log=s" =>\$OPT{log_t}, +)|| printusage(); + +# opzioni da linea di comando +my $file_ann=$OPT{ann}; +my $file_wt=$OPT{wt}; +my $file_treat=$OPT{treat}; +my $file_output=$OPT{out}; +my $pv_cut=$OPT{prob}; +my $media_cut=$OPT{raw_cut}; +my $FC_cut=$OPT{fc_cut}; +my $tipo=$OPT{exon}; +my $option=$OPT{option}; +my $normalization_q=$OPT{norm_q}; +my $sum_meth=$OPT{sum_met}; +my $corretction=$OPT{fdr}; +my $data_log=$OPT{log_t}; + +$help and printusage(); + + +if($option eq "comp"){ +if ($sum_meth eq "both"){ +$header=<<e0c6654; +\# annotation_file: $file_ann +\# wt_file: $file_wt +\# treat_file: $file_treat +\# p-value cutoff: $pv_cut +\# raw data cutoff cutoff: $media_cut +\# summary method: $sum_meth +\# fold change cutoff: $FC_cut +\# type of analysis: $tipo +\# headers: name RefSeq Chr txStart txEnd strand mean_chip1 mean_chip2 FC_mean median_chip1 median_chip2 FC_median num_probes_in_gene p-value FDR(q-value) +e0c6654 + +} +if ($sum_meth eq "mean"){ +$header=<<e0c6654; +\# annotation_file: $file_ann +\# wt_file: $file_wt +\# treat_file: $file_treat +\# p-value cutoff: $pv_cut +\# raw data cutoff cutoff: $media_cut +\# summary method: $sum_meth +\# fold change cutoff: $FC_cut +\# type of analysis: $tipo +\# headers: name RefSeq Chr txStart txEnd strand mean_chip1 mean_chip2 FC_mean num_probes_in_gene p-value FDR(q-value) +e0c6654 + +} +if ($sum_meth eq "median"){ +$header=<<e0c6654; +\# annotation_file: $file_ann +\# wt_file: $file_wt +\# treat_file: $file_treat +\# p-value cutoff: $pv_cut +\# raw data cutoff cutoff: $media_cut +\# summary method: $sum_meth +\# fold change cutoff: $FC_cut +\# type of analysis: $tipo +\# headers: name RefSeq Chr txStart txEnd strand median_chip1 median_chip2 FC_median num_probes_in_gene p-value FDR(q-value) +e0c6654 + +} + + + +# "usage" se non ci sono opzioni +if (!$file_ann || !$file_wt || !$file_treat) { + &printusage() +} + +#if ($pv_cut == 1){$pv_cut=0.9999;} +if (!$media_cut){$media_cut=0;} +if (!$FC_cut){$FC_cut=0;} + +my @r1=(); +my @r2=(); + +#my $confident=(1-$pv_cut)*100; +#inserire nell'ordine: tabella annotazione, tabella valori wt, tabella valori treated, p-value cutoff, raw signal cutoff + + +open (annotation,"<$file_ann") || die "file_ann not open:$!\n"; +open (wt,"<$file_wt") || die "$file_wt not open:$!\n"; +open (treated,"<$file_treat") || die "$file_treat not open:$!\n"; +open (output,">$file_output") || die "bed_$file_wt not open:$!\n"; + +print output "$header"; + +print "Comparison --> probe_selection=$tipo, Normalization=$normalization_q, summary method=$sum_meth Filters: p-value=$pv_cut, raw value=$media_cut, FC=$FC_cut"; + +while (defined (my $line_down = <annotation>)) { + chomp $line_down; + my @tmp_down = split("\t", $line_down); + my @probe_match=split("\,", $tmp_down[9]); + foreach $probes (@probe_match){ + my @coord=split("-", $probes); + $tab_tot{$tmp_down[0]}{"$coord[0]\t$coord[1]"}.="$tmp_down[1]\n$tmp_down[7]\n$tmp_down[0]\n$tmp_down[2]\n$tmp_down[3]\n$tmp_down[4]\n$tmp_down[5]\n$tmp_down[6]\n"; + push(@ciclo,"$tmp_down[0]\t$coord[0]\t$coord[1]"); + } +} + +while (defined (my $line_down = <wt>)) { + chomp $line_down; + my @tmp_down = split("\t", $line_down); + chomp $tmp_down[0]; + $tab_tot_wt{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5]; +} + + +while (defined (my $line_down = <treated>)) { + chomp $line_down; + my @tmp_down = split("\t", $line_down); + chomp $tmp_down[0]; + $tab_tot_treat{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5]; +} + +if($normalization_q eq "yes"){ + + @sort_wt=sort{$tab_tot_wt{$a} <=> $tab_tot_wt{$b}} keys %tab_tot_wt; + @sort_treat=sort{$tab_tot_treat{$a} <=> $tab_tot_treat{$b}} (keys %tab_tot_treat); + + for($i=0;$i<=$#sort_wt;$i++){ + $media=($tab_tot_wt{$sort_wt[$i]}+$tab_tot_treat{$sort_treat[$i]})/2; + $tab_tot_value{$sort_wt[$i]}.="$media\n"; + } + + for($i=0;$i<=$#sort_treat;$i++){ + $media=($tab_tot_wt{$sort_wt[$i]}+$tab_tot_treat{$sort_treat[$i]})/2; + $tab_tot_value{$sort_treat[$i]}.="$media\n"; + } + +} + + +foreach $key_value(@ciclo){ + + @keys_values=split("\t",$key_value); + @array1= split("\n",$tab_tot{$keys_values[0]}{"$keys_values[1]\t$keys_values[2]"}); + if($normalization_q eq "yes"){ + @array2= split("\n",$tab_tot_value{"$keys_values[1]\t$keys_values[2]"}); + $value1=$array2[0]; + $value2=$array2[1]; + } + else{ + $value1=$tab_tot_wt{"$keys_values[1]\t$keys_values[2]"}; + $value2=$tab_tot_treat{"$keys_values[1]\t$keys_values[2]"}; + } + if ($tipo eq "internal_exon"){ + $prendo = "^exon"; + } + if ($tipo eq "all_exon"){ + $prendo = "exon"; + } + if ($tipo eq "last_exon"){ + $prendo = "exon last"; + } + + if ((!($array1[0] eq "")) && ($array1[1] =~ /$prendo/g)) { + if ($data_log eq "no_log"){ + $log1=log($value1)/log(2); + $log2=log($value2)/log(2); + } + if ($data_log eq "log"){ + $log1=$value1; + $log2=$value2; + } + $tab_gene_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n"; + $tab_gene_tratted{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log2\n"; + } + +} + +foreach $key (keys %tab_gene_wt) { + @r1=split("\n",$tab_gene_wt{$key}); + @r2=split("\n",$tab_gene_tratted{$key}); + + sort {$a <=> $b} (@r1); + sort {$a <=> $b} (@r2); + + if ($#r1>0){ + my $ttest = new Statistics::TTest; + $ttest->set_significance(95); + $ttest->load_data(\@r1,\@r2); + my $s1=$ttest->{s1}; + my $s2=$ttest->{s2}; + $media1=$s1->{mean}; + $media2=$s2->{mean}; + $total_probes=$#r1+1; + $potenza_a=$media2-$media1; + $FCa=((2)**$potenza_a); + if( (@r1 % 2) == 1 ) { + $median1 = $r1[((@r1+1) / 2)-1]; + $median2 = $r2[((@r2+1) / 2)-1]; + } else { + $median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2; + $median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2; + } + $potenza_m=$median2-$median1; + $FCm=((2)**$potenza_m); + + + if ($FCa<1){ + $FCa=(-(1/$FCa)); + } + + + if ($FCm<1){ + $FCm=(-(1/$FCm)); + } + if ($sum_meth eq "mean"){ + if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCa))){ + next; + } + $p_value{"$key\t$media1\t$media2\t$FCa\t$total_probes"}=$ttest->{t_prob}; + + #print output "$key\t$media1\t$media2\t$FCa\t",$ttest->{t_prob},"\t$total_probes\n"; + } + if ($sum_meth eq "median"){ + if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCm))){ + next; + } + #print output "$key\t$median1\t$median2\t$FCm\t",$ttest->{t_prob},"\t$total_probes\n"; + $p_value{"$key\t$median1\t$median2\t$FCm\t$total_probes"}=$ttest->{t_prob}; + } + if ($sum_meth eq "both"){ + if (($media1<$media_cut) && ($media2<$media_cut)){ + next; + } + #print output "$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t",$ttest->{t_prob},"\t$total_probes\n"; + $p_value{"$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t$total_probes"}=$ttest->{t_prob}; + } + } + if ($#r1==0){ + $media1=$r1[0]; + $media2=$r2[0]; + $median1=$r1[0]; + $median2=$r2[0]; + $potenza=$media2-$media1; + $FCa=((2)**$potenza); + $FCm=((2)**$potenza); + if ($FCa<1){ + $FCa=(-(1/$FCa)); + } + if ($FCm<1){ + $FCm=(-(1/$FCm)); + } + + if ($sum_meth eq "both"){ + if (($media1<$media_cut) && ($media2<$media_cut)){ + next; + } + $p_value{"$key\t$media1\t$media2\t$FCa\t$median1\t$median2\t$FCm\t$total_probes"}=1; + } + if ($sum_meth eq "mean"){ + if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCa))){ + next; + } + $p_value{"$key\t$media1\t$media2\t$FCa\t$total_probes"}=1; + } + if ($sum_meth eq "median"){ + if ((($media1<$media_cut) && ($media2<$media_cut)) || ($FC_cut>=abs($FCm))){ + next; + } + $p_value{"$key\t$median1\t$median2\t$FCm\t$total_probes"}=1; + } + } +} + +@sort_pvalue=sort{$p_value{$a} <=> $p_value{$b}} keys %p_value; + +if($corretction eq "yes"){ + for($i=0;$i<=$#sort_pvalue;$i++){ + $qvalue=$p_value{$sort_pvalue[$i]}*(($#sort_pvalue+1)/($i+1)); + push(@QVALUE, $qvalue); + } + + @qvalue_sort = sort {$a <=> $b} @QVALUE; + + for($i=0;$i<=$#sort_pvalue;$i++){ + #$FDR=((($i+1)*0.05)/($#sort_pvalue+1)); + $qvalue=shift(@qvalue_sort); + if($qvalue>1){$qvalue=1;} + if($pv_cut>=$qvalue){ + print output "$sort_pvalue[$i]\t$p_value{$sort_pvalue[$i]}\t$qvalue\n"; + } + } +} +if($corretction eq "no"){ + for($i=0;$i<=$#sort_pvalue;$i++){ + if($pv_cut>=$qvalue){ + print output "$sort_pvalue[$i]\t$p_value{$sort_pvalue[$i]}\n"; + } + } +} + + + + + +} +if($option eq "expr"){ +if ($sum_meth eq "median"){ +$header=<<e0c6654; +\# annotation_file: $file_ann +\# expression_file: $file_wt +\# type of analysis: $tipo +\# summary method: $sum_meth +\# headers: name RefSeq Chr txStart txEnd strand median_chip num_probes_in_gene +e0c6654 + +} +if ($sum_meth eq "mean"){ +$header=<<e0c6654; +\# annotation_file: $file_ann +\# expression_file: $file_wt +\# type of analysis: $tipo +\# summary method: $sum_meth +\# headers: name RefSeq Chr txStart txEnd strand mean_chip num_probes_in_gene +e0c6654 + +} +if ($sum_meth eq "both"){ +$header=<<e0c6654; +\# annotation_file: $file_ann +\# expression_file: $file_wt +\# type of analysis: $tipo +\# summary method: $sum_meth +\# headers: name RefSeq Chr txStart txEnd strand mean_chip median_chip num_probes_in_gene +e0c6654 + +} + if (!$file_ann || !$file_wt) { + &printusage() + } +my @r1=(); +my @r2=(); + + +open (annotation,"<$file_ann") || die "file_ann not open:$!\n"; +open (wt,"<$file_wt") || die "$file_wt not open:$!\n"; +open (output,">$file_output") || die "bed_$file_wt not open:$!\n"; + +print output "$header"; + +print "Expression --> probe_selection=$tipo summary method=$sum_meth"; + +while (defined (my $line_down = <annotation>)) { + chomp $line_down; + my @tmp_down = split("\t", $line_down); + my @probe_match=split("\,", $tmp_down[9]); + foreach $probes (@probe_match){ + my @coord=split("-", $probes); + $tab_tot{$tmp_down[0]}{"$coord[0]\t$coord[1]"}.="$tmp_down[1]\n$tmp_down[7]\n$tmp_down[0]\n$tmp_down[2]\n$tmp_down[3]\n$tmp_down[4]\n$tmp_down[5]\n$tmp_down[6]\n"; + push(@ciclo,"$tmp_down[0]\t$coord[0]\t$coord[1]"); + } +} + +while (defined (my $line_down = <wt>)) { + chomp $line_down; + $line_down=~ s/ //g; + my @tmp_down = split("\t", $line_down); + chomp $tmp_down[0]; + $tab_tot_wt{"$tmp_down[0]\t$tmp_down[3]"}=$tmp_down[5]; +} +#@sort_wt=sort{$tab_tot_wt{$a} <=> $tab_tot_wt{$b}} keys %tab_tot_wt; + + +#print "ciclo = $ciclo[0]\t$ciclo[1]\n"; + +foreach $key_value(@ciclo){ + + @keys_values=split("\t",$key_value); + @array1= split("\n",$tab_tot{$keys_values[0]}{"$keys_values[1]\t$keys_values[2]"}); + @array2= split("\n",$tab_tot_wt{"$keys_values[1]\t$keys_values[2]"}); + + if ($tipo eq "internal_exon"){ + $prendo = "^exon"; + } + if ($tipo eq "all_exon"){ + $prendo = "exon"; + } + if ($tipo eq "last_exon"){ + $prendo = "exon last"; + } + + if ((!($array1[0] eq "")) && ($array1[1] =~ /$prendo/g)) { + + if ($data_log eq "no_log"){ + $log1=log($array2[0])/log(2); + } + if ($data_log eq "log"){ + $log1=$array2[0]; + } + + $tab_gene_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n"; + } + if ((!($array1[0] eq "")) && ($array1[1] =~ /intron/g)) { + if ($data_log eq "no_log"){ + $log1=log($array2[0])/log(2); + } + if ($data_log eq "log"){ + $log1=$array2[0]; + } + $tab_intron_wt{"$array1[0]\t$array1[2]\t$array1[3]\t$array1[4]\t$array1[5]\t$array1[6]"}.="$log1\n"; + } + +} + + +foreach $key (keys %tab_gene_wt) { + @r1=split("\n",$tab_gene_wt{$key}); + if (exists $tab_intron_wt{$key}){ + @r2=split("\n",$tab_intron_wt{$key}); + sort {$a <=> $b} (@r2); + }else{ + @r2=("no"); + } + sort {$a <=> $b} (@r1); + if ($#r2>0) { + if ($#r1>0) { + my $ttest = new Statistics::TTest; + $ttest->set_significance(95); + $ttest->load_data(\@r1,\@r2); + my $s1=$ttest->{s1}; + my $s2=$ttest->{s2}; + $media1=$s1->{mean}; + $media2=$s2->{mean}; + $total_probes=$#r1+1; + $total_probes2=$#r2+1; + if( (@r1 % 2) == 1 ) { + $median1 = $r1[((@r1+1) / 2)-1]; + } else { + $median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2; + } + if( (@r2 % 2) == 1 ) { + $median2 = $r2[((@r2+1) / 2)-1]; + } else { + $median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2; + } + if($media1>=$media2){ + $pvalue=$ttest->{t_prob}; + } + if($media1<$media2){ + $pvalue=1; + } + if ($sum_meth eq "mean"){ + print output "$key\t$media1\t$media2\t$pvalue\t$total_probes\t$total_probes2\n"; + #print output "$key\t$media1\t$total_probes\n"; + } + if ($sum_meth eq "median"){ + print output "$key\t$median1\t$median2\t$pvalue\t$total_probes\t$total_probes2\n"; + #print output "$key\t$median1\t$total_probes\n"; + } + if ($sum_meth eq "both"){ + print output "$key\t$media1\t$media2\t$median1\t$median2\t$pvalue\t$total_probes\t$total_probes2\n"; + #print output "$key\t$media1\t$median1\t$total_probes\n"; + } + } + if ($#r1==0) { + @r1=@r2; + my $ttest = new Statistics::TTest; + $ttest->set_significance(95); + $ttest->load_data(\@r1,\@r2); + my $s2=$ttest->{s2}; + $media2=$s2->{mean}; + $total_probes=1; + $total_probes2=$#r2+1; + $media1=$r1[0]; + $median1=$r1[0]; + if( (@r2 % 2) == 1 ) { + $median2 = $r2[((@r2+1) / 2)-1]; + } else { + $median2 = ($r2[(@r2 / 2)-1] + $r2[@r2 / 2]) / 2; + } + if ($sum_meth eq "mean"){ + print output "$key\t$media1\t$media2\tNA\t$total_probes\t$total_probes2\n"; + #print output "$key\t$media1\t$total_probes\n"; + } + if ($sum_meth eq "median"){ + print output "$key\t$median1\t$median2\tNA\t$total_probes\t$total_probes2\n"; + #print output "$key\t$median1\t$total_probes\n"; + } + if ($sum_meth eq "both"){ + print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t$total_probes\t$total_probes2\n"; + #print output "$key\t$media1\t$median1\t$total_probes\n"; + } + } + + + } + + if ($r2[0] eq "no") { + if ($#r1>0) { + @r2=@r1; + my $ttest = new Statistics::TTest; + $ttest->set_significance(95); + $ttest->load_data(\@r1,\@r2); + my $s1=$ttest->{s1}; + $media1=$s1->{mean}; + $total_probes=$#r1+1; + if( (@r1 % 2) == 1 ) { + $median1 = $r1[((@r1+1) / 2)-1]; + } else { + $median1 = ($r1[(@r1 / 2)-1] + $r1[@r1 / 2]) / 2; + } + } + if ($#r==0){ + $media1=$r1[0]; + $median1=$r1[0]; + $total_probes=$#r1+1; + } + $media2=0; + $median2=0; + if ($sum_meth eq "both"){ + print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t$total_probes\t0\n"; + #print output "$key\t$media1\t$median1\t$total_probes\n"; + } + if ($sum_meth eq "mean"){ + print output "$key\t$media1\t$media2\tNA\t$total_probes\t0\n"; + #print output "$key\t$media1\t$total_probes\n"; + } + if ($sum_meth eq "median"){ + print output "$key\t$median1\t$median2\tNA\t$total_probes\t0\n"; + #print output "$key\t$median1\t$total_probes\n"; + } + } + if (($#r2 == 0) && ($r2[0] ne "no") && ($#r1==0)){ + $media1=$r1[0]; + $media2=$r2[0]; + $median1=$r1[0]; + $median2=$r2[0]; + if ($sum_meth eq "both"){ + print output "$key\t$media1\t$media2\t$median1\t$median2\tNA\t1\t1\n"; + #print output "$key\t$media1\t$median1\t1\n"; + } + if ($sum_meth eq "mean"){ + print output "$key\t$media1\t$media2\tNA\t1\t1\n"; + #print output "$key\t$media1\t1\n"; + } + if ($sum_meth eq "median"){ + print output "$key\t$median1\t$median2\tNA\t1\t1\n"; + #print output "$key\t$median1\t1\n"; + } + } +} + + + + +} + + + + + +#################################################################################################################################### + +sub printusage { + + print<<eoc22334; + + + + *************************************************************** + :: N i m b l e G e n E x p r e s s i o n A n a l y s i s :: + *************************************************************** + + + USAGE SUMMARY + --------------------------------------------------------------------------------- + This program utilizes NimbleGen ratio files in gff format as INPUT FILE and + provides a table of the computed picks in the same gff file format. + + + --ann annotation file + --wt wild type file + --treat treated file + --pv p-value cut-off + --rc raw signal cut-off + --fc fold change cut-toff + + ----------------------------------------------------------------------------------------- + EXAMPLES: + + perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500 +OR + perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500 + + ----------------------------------------------------------------------------------------- + +eoc22334 + exit 0; + +} + + +