view phased_siRNA.pl @ 23:cad6a1dfb174 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:11:49 -0500
parents 07745c0958dd
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2013/7/19
#Modified:
#Description: 
my $version=1.00;

use strict;
use Getopt::Long;
#use Math::Cephes qw(:hypergeometrics); 

my %opts;
GetOptions(\%opts,"i=s","o=s","h");
if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
&usage;
}

my $filein=$opts{'i'};
my $fileout=$opts{'o'};

open IN,"<$filein"; #input file  
open OUT,">$fileout"; #output file  

while (my $aline=<IN>) {
	chomp $aline;
	if ($aline=~/^\#/) {
		print OUT $aline,"\tp-value\n";
		next;
	}
	my @tmp=split/\t/,$aline;
	my @pos=split/:|-/,$tmp[0];
	$tmp[1]=~s/nt//;
	my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]);
	
	print OUT $aline,"\t",$pv,"\n";
}
close IN;
close OUT;

sub phase{
	my ($tagL,$start,$end,$tags)=@_;
	my @tmp=split/\;/,$tags;
	my %tag;
	for (my $i=0;$i<@tmp;$i++) {
		my @aa=split/\,/,$tmp[$i];
		next if($aa[1]-$aa[0]+1 != $tagL);
#		$tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+");
#		$tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-");
		$tag{$aa[0]}+=$aa[3] if($aa[2] eq "+");
		$tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-");
	}

	my $pv=&pvalue2(\%tag,$tagL,$start,$end);

	return $pv;
}

sub pvalue2{
	my ($tag,$tagL,$start,$end)=@_;

	my $p=1; my $pp=1;
	foreach my $ccs(keys %{$tag}){
		my $n=0;
		my $k=0;
		my $K=0;
		my $N=0;

		my $cor= $ccs;
		my $ss=$cor;
		my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end;
		
		my $max=0;
		for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
		{
			my $x=$i;
			if (defined $$tag{$x})
			{
			if ($max<$$tag{$x}) {$max=$$tag{$x};}
			$n +=$$tag{$x};
			$N++;
			}
		}
		for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
		{
			my $x=$i;
			if (defined $$tag{$x})
			{
			$k +=$$tag{$x};
			$K++;
			}
		}


		return $p if($K<3);
		return $p if($max/$n>0.8);

		my $pn=0; 
		next if($n==$k);
		$pn=10*$k/($n-$k)+1;
		$pn = $pn ** ($K-2);
		$pn = log($pn);
		if ($p<$pn) {
			$p=$pn;
		}

	}
	
	return $p;

}

sub pvalue{
	my ($tag,$tagL,$start,$end)=@_;

	my $p=1;
	foreach my $ccs(keys %{$tag}){
		my $n=-1;
		my $k=-1;

		my ($cor, $str)=split(/,/, $ccs);
		if ($str eq "+") # small RNAs on the Watson strand
		{
			my $ss=$cor;
			my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end;
			for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
			{
				my $x=$i.","."+";
				if (defined $$tag{$x})
				{
				$n=$n+1;
				}
			}
			for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
			{
				my $x=$i.","."+";
				if (defined $$tag{$x})
				{
				$k=$k+1;
				}
			}

			for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand
			{
				my $x=$j.","."-";
				if (defined $$tag{$x})
				{
				$n=$n+1;
				}
			}

			for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand
			{
				my $x=$j.","."-";
				if (defined $$tag{$x})
				{
				$k=$k+1;
				}
			}
		}

		elsif ($str eq "-") # small RNAs on the Crick strand
		{
			my $ee=$cor;
			my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start;
			for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
			{
				my $x=$i.","."-";
				if (defined $$tag{$x})
				{
				$n=$n+1;
				}
			}
			for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
			{
				my $x=$i.","."-";
				if (defined $$tag{$x})
				{
				$k=$k+1;
				}
			}

			for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand
			{
				my $x=$j.","."+";
				if (defined $$tag{$x})
				{
				$n=$n+1;
				}
			}
			for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand
			{
				my $x=$j.","."+";
				if (defined $$tag{$x})
				{
				$k=$k+1;
				}
			}
		}

		next if($k<3);

		my $pn=0; my $N=$tagL*11*2-1; my $M=21;
		for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k
		{
			my $c=1;
			my $rr=1;
			my $rw=1;

			for (my $j=0; $j<=$w-1; $j++)
			{
				$c=$c*($M-$j)/($j+1);
			}
			for (my $x=0; $x<=$n-$w-1; $x++)
			{
				$rr=$rr*($N-$M-$x)/($x+1);
			}
			for (my $y=0; $y<=$n-1; $y++)
			{
				$rw=$rw*($y+1)/($N-$y);
			}
			my $pr=$c*$rr*$rw;
			
			$pn=$pn+$pr;
		}

		$p=$pn<$p ? $pn :$p;

		if ($p<0.001) #select and output small RNA clusters with p<0.001

		{

			return $p;

		}

	}
	return $p;
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o
options:
-i input file
-o output file
-h help
USAGE
exit(1);
}