diff phased_siRNA.pl @ 0:07745c0958dd draft

Uploaded
author big-tiandm
date Thu, 18 Sep 2014 21:40:25 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phased_siRNA.pl	Thu Sep 18 21:40:25 2014 -0400
@@ -0,0 +1,254 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2013/7/19
+#Modified:
+#Description: 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+#use Math::Cephes qw(:hypergeometrics); 
+
+my %opts;
+GetOptions(\%opts,"i=s","o=s","h");
+if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+
+while (my $aline=<IN>) {
+	chomp $aline;
+	if ($aline=~/^\#/) {
+		print OUT $aline,"\tp-value\n";
+		next;
+	}
+	my @tmp=split/\t/,$aline;
+	my @pos=split/:|-/,$tmp[0];
+	$tmp[1]=~s/nt//;
+	my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]);
+	
+	print OUT $aline,"\t",$pv,"\n";
+}
+close IN;
+close OUT;
+
+sub phase{
+	my ($tagL,$start,$end,$tags)=@_;
+	my @tmp=split/\;/,$tags;
+	my %tag;
+	for (my $i=0;$i<@tmp;$i++) {
+		my @aa=split/\,/,$tmp[$i];
+		next if($aa[1]-$aa[0]+1 != $tagL);
+#		$tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+");
+#		$tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-");
+		$tag{$aa[0]}+=$aa[3] if($aa[2] eq "+");
+		$tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-");
+	}
+
+	my $pv=&pvalue2(\%tag,$tagL,$start,$end);
+
+	return $pv;
+}
+
+sub pvalue2{
+	my ($tag,$tagL,$start,$end)=@_;
+
+	my $p=1; my $pp=1;
+	foreach my $ccs(keys %{$tag}){
+		my $n=0;
+		my $k=0;
+		my $K=0;
+		my $N=0;
+
+		my $cor= $ccs;
+		my $ss=$cor;
+		my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end;
+		
+		my $max=0;
+		for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+		{
+			my $x=$i;
+			if (defined $$tag{$x})
+			{
+			if ($max<$$tag{$x}) {$max=$$tag{$x};}
+			$n +=$$tag{$x};
+			$N++;
+			}
+		}
+		for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+		{
+			my $x=$i;
+			if (defined $$tag{$x})
+			{
+			$k +=$$tag{$x};
+			$K++;
+			}
+		}
+
+
+		return $p if($K<3);
+		return $p if($max/$n>0.8);
+
+		my $pn=0; 
+		next if($n==$k);
+		$pn=10*$k/($n-$k)+1;
+		$pn = $pn ** ($K-2);
+		$pn = log($pn);
+		if ($p<$pn) {
+			$p=$pn;
+		}
+
+	}
+	
+	return $p;
+
+}
+
+sub pvalue{
+	my ($tag,$tagL,$start,$end)=@_;
+
+	my $p=1;
+	foreach my $ccs(keys %{$tag}){
+		my $n=-1;
+		my $k=-1;
+
+		my ($cor, $str)=split(/,/, $ccs);
+		if ($str eq "+") # small RNAs on the Watson strand
+		{
+			my $ss=$cor;
+			my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end;
+			for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+			{
+				my $x=$i.","."+";
+				if (defined $$tag{$x})
+				{
+				$n=$n+1;
+				}
+			}
+			for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+			{
+				my $x=$i.","."+";
+				if (defined $$tag{$x})
+				{
+				$k=$k+1;
+				}
+			}
+
+			for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand
+			{
+				my $x=$j.","."-";
+				if (defined $$tag{$x})
+				{
+				$n=$n+1;
+				}
+			}
+
+			for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand
+			{
+				my $x=$j.","."-";
+				if (defined $$tag{$x})
+				{
+				$k=$k+1;
+				}
+			}
+		}
+
+		elsif ($str eq "-") # small RNAs on the Crick strand
+		{
+			my $ee=$cor;
+			my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start;
+			for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
+			{
+				my $x=$i.","."-";
+				if (defined $$tag{$x})
+				{
+				$n=$n+1;
+				}
+			}
+			for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
+			{
+				my $x=$i.","."-";
+				if (defined $$tag{$x})
+				{
+				$k=$k+1;
+				}
+			}
+
+			for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand
+			{
+				my $x=$j.","."+";
+				if (defined $$tag{$x})
+				{
+				$n=$n+1;
+				}
+			}
+			for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand
+			{
+				my $x=$j.","."+";
+				if (defined $$tag{$x})
+				{
+				$k=$k+1;
+				}
+			}
+		}
+
+		next if($k<3);
+
+		my $pn=0; my $N=$tagL*11*2-1; my $M=21;
+		for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k
+		{
+			my $c=1;
+			my $rr=1;
+			my $rw=1;
+
+			for (my $j=0; $j<=$w-1; $j++)
+			{
+				$c=$c*($M-$j)/($j+1);
+			}
+			for (my $x=0; $x<=$n-$w-1; $x++)
+			{
+				$rr=$rr*($N-$M-$x)/($x+1);
+			}
+			for (my $y=0; $y<=$n-1; $y++)
+			{
+				$rw=$rw*($y+1)/($N-$y);
+			}
+			my $pr=$c*$rr*$rw;
+			
+			$pn=$pn+$pr;
+		}
+
+		$p=$pn<$p ? $pn :$p;
+
+		if ($p<0.001) #select and output small RNA clusters with p<0.001
+
+		{
+
+			return $p;
+
+		}
+
+	}
+	return $p;
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o
+options:
+-i input file
+-o output file
+-h help
+USAGE
+exit(1);
+}
+