Mercurial > repos > big-tiandm > sirna_plant
diff phased_siRNA.pl @ 0:07745c0958dd draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 18 Sep 2014 21:40:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phased_siRNA.pl Thu Sep 18 21:40:25 2014 -0400 @@ -0,0 +1,254 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +#use Math::Cephes qw(:hypergeometrics); + +my %opts; +GetOptions(\%opts,"i=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file + +while (my $aline=<IN>) { + chomp $aline; + if ($aline=~/^\#/) { + print OUT $aline,"\tp-value\n"; + next; + } + my @tmp=split/\t/,$aline; + my @pos=split/:|-/,$tmp[0]; + $tmp[1]=~s/nt//; + my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]); + + print OUT $aline,"\t",$pv,"\n"; +} +close IN; +close OUT; + +sub phase{ + my ($tagL,$start,$end,$tags)=@_; + my @tmp=split/\;/,$tags; + my %tag; + for (my $i=0;$i<@tmp;$i++) { + my @aa=split/\,/,$tmp[$i]; + next if($aa[1]-$aa[0]+1 != $tagL); +# $tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+"); +# $tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-"); + $tag{$aa[0]}+=$aa[3] if($aa[2] eq "+"); + $tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-"); + } + + my $pv=&pvalue2(\%tag,$tagL,$start,$end); + + return $pv; +} + +sub pvalue2{ + my ($tag,$tagL,$start,$end)=@_; + + my $p=1; my $pp=1; + foreach my $ccs(keys %{$tag}){ + my $n=0; + my $k=0; + my $K=0; + my $N=0; + + my $cor= $ccs; + my $ss=$cor; + my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end; + + my $max=0; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i; + if (defined $$tag{$x}) + { + if ($max<$$tag{$x}) {$max=$$tag{$x};} + $n +=$$tag{$x}; + $N++; + } + } + for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i; + if (defined $$tag{$x}) + { + $k +=$$tag{$x}; + $K++; + } + } + + + return $p if($K<3); + return $p if($max/$n>0.8); + + my $pn=0; + next if($n==$k); + $pn=10*$k/($n-$k)+1; + $pn = $pn ** ($K-2); + $pn = log($pn); + if ($p<$pn) { + $p=$pn; + } + + } + + return $p; + +} + +sub pvalue{ + my ($tag,$tagL,$start,$end)=@_; + + my $p=1; + foreach my $ccs(keys %{$tag}){ + my $n=-1; + my $k=-1; + + my ($cor, $str)=split(/,/, $ccs); + if ($str eq "+") # small RNAs on the Watson strand + { + my $ss=$cor; + my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i.","."+"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i.","."+"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + + for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand + { + my $x=$j.","."-"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + + for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand + { + my $x=$j.","."-"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + } + + elsif ($str eq "-") # small RNAs on the Crick strand + { + my $ee=$cor; + my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i.","."-"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i.","."-"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + + for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand + { + my $x=$j.","."+"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand + { + my $x=$j.","."+"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + } + + next if($k<3); + + my $pn=0; my $N=$tagL*11*2-1; my $M=21; + for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k + { + my $c=1; + my $rr=1; + my $rw=1; + + for (my $j=0; $j<=$w-1; $j++) + { + $c=$c*($M-$j)/($j+1); + } + for (my $x=0; $x<=$n-$w-1; $x++) + { + $rr=$rr*($N-$M-$x)/($x+1); + } + for (my $y=0; $y<=$n-1; $y++) + { + $rw=$rw*($y+1)/($N-$y); + } + my $pr=$c*$rr*$rw; + + $pn=$pn+$pr; + } + + $p=$pn<$p ? $pn :$p; + + if ($p<0.001) #select and output small RNA clusters with p<0.001 + + { + + return $p; + + } + + } + return $p; +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-o output file +-h help +USAGE +exit(1); +} +