Mercurial > repos > big-tiandm > sirna_plant

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-5-29
#Modified:
#Description:
my $version=1.00;

use strict;
use Getopt::Long;
use File::Basename;

my %opts;
GetOptions(\%opts,"i=s","format=s","o=s","h");
if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments
&usage;
}
my ($config,$prepath,$rfampath,$genomepath,$clusterpath,$annotatepath,$degpath);
my ($predir,$rfamdir,$genomedir,$clusterdir,$annotatedir,$degdir);
open IN,"<$opts{i}";
$config=<IN>; chomp $config;
$prepath=<IN>; chomp $prepath;
$rfampath=<IN>;chomp $rfampath;
$genomepath=<IN>; chomp $genomepath;
$clusterpath=<IN>; chomp $clusterpath;
$annotatepath=<IN>; chomp $annotatepath;
$degpath=<IN>; chomp $degpath;
close IN;
my @tmp=split/\//,$prepath;
$predir=$tmp[-1];
@tmp=split/\//,$rfampath;
$rfamdir=$tmp[-1];
@tmp=split/\//,$genomepath;
$genomedir=$tmp[-1];
@tmp=split/\//,$clusterpath;
$clusterdir=$tmp[-1];
@tmp=split/\//,$annotatepath;
$annotatedir=$tmp[-1];
@tmp=split/\//,$degpath;
$degdir=$tmp[-1];

my $dir=dirname($opts{'o'});

open OUT ,">$opts{'o'}";
print OUT "<HTML>\n  <HEAD>\n  <TITLE> Analysis Report </TITLE>\n </HEAD>
 <BODY bgcolor=\"lightgray\">\n  <h1 align=\"center\">\n    <font face=\"黑体\">\n	<b>Small RNA Analysis Report</b>\n  </font>\n  </h1>
  <h2>1. Sequence No. and quality</h2>
  <h3>1.1 Sequece No.</h3>
";

### raw data no
open IN,"<$config";
my @files;my @marks; my @rawNo;
while (my $aline=<IN>) {
	chomp $aline;
	my @tmp=split/\t/,$aline;
	push @files,$tmp[0];

	my $no=`less $tmp[0] |wc -l `;
	chomp $no;
	if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") {
		$no=$no/4;
	}
	else{
		$no=$no/2;
	}
	push @rawNo,$no;

	push @marks,$tmp[1];
}
close IN;

### preprocess
unless ($prepath=~/\/$/) {
	$prepath .="/";
}

my @trimNo;my @collapse;
my $collapsefile=$prepath."collapse_reads.fa";
open IN,"<$collapsefile";
while (my $aline=<IN>) {
	chomp $aline;
	<IN>;
	$aline=~/:([\d|_]+)_x(\d+)$/;
	my @lng=split/_/,$1;
	for (my $i=0;$i<@lng;$i++) {
		if ($lng[$i]>0) {
			$trimNo[$i] +=$lng[$i];
			$collapse[$i] ++;
		}
	}
}
close IN;

my @cleanR;my @cleanT;
my $clean=$prepath."collapse_reads_18-40.fa";
open IN,"<$clean";
while (my $aline=<IN>) {
	chomp $aline;
	<IN>;
	$aline=~/:([\d|_]+)_x(\d+)$/;
	my @lng=split/_/,$1;
	for (my $i=0;$i<@lng;$i++) {
		if ($lng[$i]>0) {
			$cleanR[$i] +=$lng[$i];
			$cleanT[$i] ++;
		}
	}
}
close IN;

my @filterR;my @filterT;
my $filter=$prepath."collapse_reads_out.fa";
open IN,"<$filter";
while (my $aline=<IN>) {
	chomp $aline;
	<IN>;
	$aline=~/:([\d|_]+)_x(\d+)$/;
	my @lng=split/_/,$1;
	for (my $i=0;$i<@lng;$i++) {
		if ($lng[$i]>0) {
			$filterR[$i] +=$lng[$i];
			$filterT[$i] ++;
		}
	}
}
close IN;


print OUT "<table border=\"1\">
<tr align=\"center\">
<th>&nbsp;</th>
";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
print  OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Raw Reads No. </th>
";
foreach  (@rawNo) {
	print OUT "<td> $_ </td>\n";
}
print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Reads No. After Trimed 3\' adapter </th>
";
foreach  (@trimNo) {
	print OUT "<td> $_ </td>\n";
}
print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Unique Tags No. </th>
";
foreach  (@collapse) {
	print OUT "<td> $_ </td>\n";
}
print OUT  "</tr>
<tr align=\"center\">
<th align=\"left\">Clean Reads No. </th>
";
foreach  (@cleanR) {
	print OUT "<td> $_ </td>\n";
}
print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Clean Tags No. </th>
";
foreach  (@cleanT) {
	print OUT "<td> $_ </td>\n";
}
print OUT  "</tr>
<tr align=\"center\">
<th align=\"left\">Filter Reads No. \(reads count \>3\) </th>
";
foreach  (@filterR) {
	print OUT "<td> $_ </td>\n";
}
print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Filter Tags No. \(reads count \>3\) </th>
";
foreach  (@filterT) {
	print OUT "<td> $_ </td>\n";
}
print OUT "</tr>\n</table>";
print OUT "<p>
Note:<br />
The raw data file path is: <b>$files[0]</b><br />
";
for (my $i=1;$i<@files;$i++) {
	print OUT "&nbsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;<b>$files[$i]</b><br />";
}
print OUT "The collapsed file path is: <b>$collapsefile</b><br />
The clean data file path is: <b>$clean</b><br />
The filter (remain total reads>3) data file path is: <b>$filter</b><br />
</p>
<h2> 1. Sequence length count</h2>
<h3> 1.1 Reads length</h3>
";

print OUT "<img src=\"./$predir/Reads_length_after_count_filter.png\" alt=\"Reads_length_after_count_filter.png\" width=\"400\" height=\"300\"/>
<h3> 1.2 Tags length count</h3>
<img src=\"./$predir/Tags_length_after_count_filter.png\" alt=\"Tags_length_after_count_filter.png\" width=\"400\" height=\"300\"/>
<p> Note:<br />The sequence length data: <a href=\"./$predir/reads_length_distribution_after_count_filter.txt\"> length file</a>
</p>
";

#### rfam
unless ($rfampath=~/\/$/) {
	$rfampath .="/";
}
unless ($genomepath=~/\/$/) {
	$genomepath .="/";
}
print OUT "<h2>2. Rfam non-miRNA annotation</h2>
<h3>2.1 Reads count</h3>
<table border=\"1\">
<tr align=\"center\">
";

my @rfamR; my @rfamT;
my $tag=1;
open IN,"<$dir/rfam_match/rfam_non-miRNA_annotation.txt";
while (my $aline=<IN>) {
	chomp $aline;
	$tag=0 if($aline=~/tags\s+number/);
	next if($aline=~/^\#/);
	next if($aline=~/^\s*$/);
	my @tmp=split/\s+/,$aline;
	if($tag == 1){push @rfamR,[@tmp];}
	else{push @rfamT,[@tmp];}
}
close IN;


print OUT "<th>RNA Name</th>\n";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
for (my $i=0;$i<@rfamR;$i++) {
	print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">$rfamR[$i][0]</th>
	";
	for (my $j=1;$j<@{$rfamR[$i]} ;$j++) {
	print OUT "<td> $rfamR[$i][$j]</td>\n";
	}
}

print OUT "</tr>\n</table>
  <h3>2.2 Tags count</h3>
  <table border=\"1\">
   <tr align=\"center\">
   <th>RNA Name</th>\n";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
for (my $i=0;$i<@rfamT;$i++) {
	print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">$rfamT[$i][0]</th>
	";
	for (my $j=1;$j<@{$rfamT[$i]} ;$j++) {
	print OUT "<td> $rfamT[$i][$j]</td>\n";
	}
}
print OUT "</tr>\n</table>
<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>";
print OUT "<b>rfam_mapped.bwt</b></p>";

open IN,"<$dir/genome_match/genome_mapped.bwt";
my @genome_r_u;
my @genome_r_m;
my @genome_t_u;
my @genome_t_m;
my $tags_map_number=0;
while (my $aline=<IN>) {
	chomp $aline;
	my @temp=split/\t/,$aline;
	if ($temp[6]==0) {
		$aline=~/:([\d|_]+)_x(\d+)/;
		my @lng=split/_/,$1;
		for (my $i=0;$i<@lng;$i++) {
			if ($lng[$i]>0) {
				$genome_r_u[$i] +=$lng[$i];
				$genome_t_u[$i] ++;
			}
		}
		$tags_map_number++;
	}
	if ($temp[6]>0) {
		$aline=~/:([\d|_]+)_x(\d+)/;
		my @lng=split/_/,$1;
		for (my $i=0;$i<@lng;$i++) {
			if ($lng[$i]>0) {
				$genome_r_m[$i] +=$lng[$i];
				$genome_t_m[$i] ++;
			}
		}
		for (my $i=0;$i<$temp[6] ;$i++) {
			my $next=<IN>;
		}
		$tags_map_number++;
	}
}
close IN;
#<h3>3.1 Reads count</h3>
#<table border=\"1\">
#<tr align=\"center\">
print OUT "<h2>3. genome mapping result</h2>
<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">Map</th>\n
";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">Uniq Map Reads No.</th>
";
for (my $i=0;$i<@genome_r_u ;$i++) {
	print OUT "<td> $genome_r_u[$i]</td>\n";
}

print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">Uniq Map Tags No.</th>
";
for (my $i=0;$i<@genome_t_u ;$i++) {
	print OUT "<td> $genome_t_u[$i]</td>\n";
}

print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">Multiple Map Reads No.</th>
";
for (my $i=0;$i<@genome_r_m ;$i++) {
	print OUT "<td> $genome_r_m[$i]</td>\n";
}

print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">Multiple Map Tags No.</th>
";
for (my $i=0;$i<@genome_t_m ;$i++) {
	print OUT "<td> $genome_t_m[$i]</td>\n";
}

print OUT "</tr>\n</table>
<p>Note:<br />The genome mapping results is: <b>$genomepath</b>";
print OUT "<b>genome_mapped.bwt</b></p>";

my $cluster="$clusterpath/sample_reads.cluster";
my $cluster_number=`less $cluster |wc -l `;
$cluster_number=$cluster_number-1;
my (%cluster_length,@exp,@rpkm);
my @exp_range=qw(0 1--9 10--99 100--999 1000--9999 10000--99999 100000--**);
my @rpkm_range=qw(0 0--0.25 0.25--0.5 0.5--1 1.0--5.0 5.0--10 10--50 50--100 100--500 500--1000 1000--**);

open IN,"<$cluster";
while (my $aline=<IN>) {
	next if($aline=~/^\"/);
	chomp $aline;
	my @temp=split/\t/,$aline;
	my @id=split/:|-/,$temp[0];
	$cluster_length{$id[2]-$id[1]+1}++;
	for (my $i=0;$i<@marks ;$i++) {
		if ($temp[$i+3] == 0) {$exp[$i][0]++;}
		elsif ($temp[$i+3]>0 && $temp[$i+3]<= 10  ){$exp[$i][1]++;}
		elsif ($temp[$i+3]>10 && $temp[$i+3]<=100){$exp[$i][2]++;}
		elsif ($temp[$i+3]>100 && $temp[$i+3]<=1000){$exp[$i][3]++;}
		elsif ($temp[$i+3]>1000 && $temp[$i+3]<=10000){$exp[$i][4]++;}
		elsif ($temp[$i+3]>10000 && $temp[$i+3]<=100000){$exp[$i][5]++;}
		elsif ($temp[$i+3]>100000){$exp[$i][6]++;}
	}
}
close IN;

my $cluster_rpkm="$clusterpath/sample_rpkm.cluster";
open IN,"<$cluster_rpkm";
while (my $aline=<IN>) {
	next if($aline=~/^\#/);
	chomp $aline;
	my @temp=split/\t/,$aline;
	for (my $i=0;$i<@marks ;$i++) {
		if ($temp[$i+3]==0) {$rpkm[$i][0]++;}
		elsif($temp[$i+3]>0 && $temp[$i+3]<=0.25){$rpkm[$i][1]++;}
		elsif($temp[$i+3]>0.25 && $temp[$i+3]<=0.5){$rpkm[$i][2]++;}
		elsif($temp[$i+3]>0.5 && $temp[$i+3]<=1){$rpkm[$i][3]++;}
		elsif($temp[$i+3]>1 && $temp[$i+3]<=5){$rpkm[$i][4]++;}
		elsif($temp[$i+3]>5 && $temp[$i+3]<=10){$rpkm[$i][5]++;}
		elsif($temp[$i+3]>10 && $temp[$i+3]<=50){$rpkm[$i][6]++;}
		elsif($temp[$i+3]>50 && $temp[$i+3]<=100){$rpkm[$i][7]++;}
		elsif($temp[$i+3]>100 && $temp[$i+3]<=500){$rpkm[$i][8]++;}
		elsif($temp[$i+3]>500 && $temp[$i+3]<=1000){$rpkm[$i][9]++;}
		else{$rpkm[$i][10]++;}
	}
}

close IN;

my $cluster_length_file="$clusterpath/cluster_length.txt";
open LEN,">$cluster_length_file";
print LEN "\#length\tcluster_number\n";
foreach my $key (sort keys %cluster_length) {
	print LEN "$key\t$cluster_length{$key}\n";
}
close LEN;
print OUT "<h2>4. cluster result</h2>
<h3>4.1 Cluster count</h3>
<table border=\"1\">
<tr align=\"center\">
<th align=\"left\"> </th>
<td>Merged samples</td></tr>
<tr align=\"center\">
<th align=\"left\">Tags number</th>
<td>$tags_map_number</td></tr>
<tr align=\"center\">
<th align=\"left\">Cluster number</th>
<td>$cluster_number</td></tr>\n</table>
";

print OUT "<h3>4.2 Cluster length</h3>
<p> Note:<br />The clusters length data: <a href=\"./$clusterdir/cluster_length.txt\"> length file</a>
</p>
";
print OUT "<h3>4.3 Quantify</h3>
<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">Reads Range</th>\n
";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
for (my $i=0;$i<@exp_range;$i++) {
	print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">$exp_range[$i]</th>
	";
	for (my $j=0;$j<@marks ;$j++) {
		if (!(defined($exp[$i][$j]))) {
			print OUT "<td> 0</td>\n";
		}
		else{print OUT "<td> $exp[$i][$j]</td>\n";}
	}
}
print OUT "</tr>\n</table>";

print OUT "\n<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">RPKM Range</th>\n
";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
for (my $i=0;$i<@rpkm_range;$i++) {
	print  OUT "</tr>
	<tr align=\"center\">
	<th align=\"left\">$rpkm_range[$i]</th>
	";
	for (my $j=0;$j<@marks ;$j++) {
		if (!(defined($rpkm[$i][$j]))) {
			print OUT "<td> 0</td>\n";
		}
		else{print OUT "<td> $rpkm[$i][$j]</td>\n";}
	}
}
print OUT "</tr>\n</table>";

my $annotate="$annotatepath/sample_c_p.anno";
my (%posit,%repeat,%nat1,%nat2);
my (@phase,@long,@repeat,@nat);
for (my $j=0;$j<@marks ;$j++) {
	$phase[$j]=0;
	$long[$j]=0;
	$repeat[$j]=0;
	$nat[$j]=0;
}
open ANNO,"<$annotate";
while (my $aline=<ANNO>) {
	next if($aline=~/^\#/);
	chomp $aline;
	my @temp=split/\t/,$aline;
	for (my $i=3+@marks+6;$i<@temp;$i++) {
		my @posit=split/\;/,$temp[$i];
		for (my $j=0;$j<@marks ;$j++) {
			if ($temp[3+$j]>0) {
				$posit{$posit[0]}[$j]++;
			}
			else{
				if (!(defined($posit{$posit[0]}[$j]))) {
					$posit{$posit[0]}[$j]=0;
				}
			}
		}
	}
	for (my $j=0;$j<@marks ;$j++) {
		if ($temp[3+$j]>0) {
			if ($temp[6] eq "phase") {
				$phase[$j]++;
			}
			if ($temp[7] eq "long") {
				$long[$j]++;
			}
			if ($temp[8] ne "\/") {
				$repeat[$j]++;
				my @rr=split/\;/,$temp[8];
				foreach  (@rr) {
					$repeat{$_}[$j]++;
				}
			}
			if ($temp[9] ne "\/") {
				$nat[$j]++;
				my @nn1=split/\;/,$temp[9];
				my @nn2=split/\;/,$temp[10];
				for (my $k=0;$k<@nn1 ;$k++) {
					$nat1{$nn1[$k]}[$j]++;
					$nat2{$nn2[$k]}[$j]++;
				}
			}
		}
	}
}
close ANNO;

print OUT "<h2>5. Cluster Annotate</h2>
<h3>5.1 Cluster genome position annotate</h3>
<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">clusters number</th>\n
";

foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
foreach my $key (sort keys %posit) {
	print  OUT "</tr>
		<tr align=\"center\">
		<th align=\"left\">$key</th>
	";
	foreach  (@{$posit{$key}}) {
		print OUT "<td> $_</td>\n";
	}
}
print OUT "</tr>\n</table>";
print OUT "<p>
Note:<br />
One cluster mybe annotate to multiple genes<br />
";

print OUT "<h3>5.2 Cluster source mechanism annotate</h3>
<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">clusters number</th>\n
";

foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Phase</th>\n
";
foreach  (@phase) {
	print OUT "<td> $_ </td>\n";
}

print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Long</th>\n
";
foreach  (@long) {
	print OUT "<td> $_ </td>\n";
}

print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Repeat</th>\n
";
foreach  (@repeat) {
	print OUT "<td> $_ </td>\n";
}

print OUT "</tr>
<tr align=\"center\">
<th align=\"left\">Nat</th>\n
";
foreach  (@nat) {
	print OUT "<td> $_ </td>\n";
}
print OUT "</tr>\n</table>";

print OUT "<p>
Repeat subclass annotate:
";

print OUT "<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">Repeat subclass</th>\n
";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}

foreach my $key (sort keys %repeat) {
	print  OUT "</tr>
		<tr align=\"center\">
		<th align=\"left\">$key</th>
	";
	for (my $i=0;$i<@marks ;$i++) {
		if (defined($repeat{$key}[$i])) {
			print OUT "<td> $repeat{$key}[$i] </td>\n";
		}
		else{print OUT "<td> 0 </td>\n";}
	}
}
print OUT "</tr>\n</table>";


print OUT "<p>
Nat subclass1 annotate:
";

print OUT "<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">Nat subclass1</th>\n
";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
foreach my $key (sort keys %nat1) {
	print  OUT "</tr>
		<tr align=\"center\">
		<th align=\"left\">$key</th>
	";
	for (my $i=0;$i<@marks ;$i++) {
		if (defined($nat1{$key}[$i])) {
			print OUT "<td> $nat1{$key}[$i] </td>\n";
		}
		else{print OUT "<td> 0 </td>\n";}
	}
}
print OUT "</tr>\n</table>";

print OUT "<p>
Nat subclass2 annotate:
";

print OUT "<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">Nat subclass2</th>\n
";
foreach  (@marks) {
	print OUT "<th> $_ </th>\n";
}
foreach my $key (sort keys %nat2) {
	print  OUT "</tr>
		<tr align=\"center\">
		<th align=\"left\">$key</th>
	";
	for (my $i=0;$i<@marks ;$i++) {
		if (defined($nat2{$key}[$i])) {
			print OUT "<td> $nat2{$key}[$i] </td>\n";
		}
		else{print OUT "<td> 0 </td>\n";}
	}
}
print OUT "</tr>\n</table>";
print OUT "<p>
Note:<br />
One cluster mybe annotate to multiple repeats or nats<br />
";

my $deg_file=`ls $degpath`;
chomp $deg_file;
my @deg_file=split/\n/,$deg_file;
my %deg;
foreach  (@deg_file) {
	my $output="$degpath/$_/output_score.txt";
	open IN,"<$output";
	$deg{$_}[0]=0;
	$deg{$_}[1]=0;
	$deg{$_}[2]=0;
	while (my $aline=<IN>) {
		next if ($aline=~/^\"/);
		chomp $aline;
		my @temp=split/\t/,$aline;
		if ($temp[9] eq "TRUE") {
			$deg{$_}[0]++;
			if ($temp[4] >0) {
				$deg{$_}[1]++;
			}
			if ($temp[4] <0) {
				$deg{$_}[2]++;
			}
		}
	}
	close IN;
}


print OUT "<h2>6. DEG</h2>
<table border=\"1\">
<tr align=\"center\">
<th align=\"left\">Genes number</th>\n
<th> DEG </th>\n
<th> UP </th>\n
<th> DOWN </th>\n
";

foreach my $key (sort keys %deg) {
	print  OUT "</tr>
		<tr align=\"center\">
		<th align=\"left\">$key</th>
	";
	for (my $i=0;$i<@{$deg{$key}} ;$i++) {
		print OUT "<td> $deg{$key}[$i] </td>\n";
	}
}
print OUT "</tr>\n</table>";

print OUT "
 </BODY>
</HTML>
";
close OUT;


sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -o
options:
-i
-format
-o output file
-h help
USAGE
exit(1);
}
author	big-tiandm
date	Wed, 29 Oct 2014 04:18:44 -0400
parents	07745c0958dd
children	0e4b6b0c6e9d