Mercurial > repos > big-tiandm > mirplant2
diff html_preprocess.pl @ 47:c75593f79aa9 draft
Uploaded
author | big-tiandm |
---|---|
date | Wed, 03 Dec 2014 01:54:29 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html_preprocess.pl Wed Dec 03 01:54:29 2014 -0500 @@ -0,0 +1,388 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-5-29 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","format=s","min=i","max=i","o=s","h"); +if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments +&usage; +} +my ($config,$prepath,$rfampath,$knownpath,$genomepath,$novelpath); +my ($predir,$rfamdir,$knowndir,$genomedir,$noveldir); +open IN,"<$opts{i}"; +$config=<IN>; chomp $config; +$prepath=<IN>; chomp $prepath; +$genomepath=<IN>; chomp $genomepath; +$rfampath=<IN>; +close IN; +my @tmp=split/\//,$prepath; +$predir=$tmp[-1]; +@tmp=split/\//,$genomepath; +$genomedir=$tmp[-1]; + +my $dir=dirname($opts{'o'}); + +open OUT ,">$opts{'o'}"; +print OUT "<HTML>\n <HEAD>\n <TITLE> Analysis Report </TITLE>\n </HEAD> + <BODY bgcolor=\"lightgray\">\n <h1 align=\"center\">\n <font face=\"ºÚÌå\">\n <b>Preprocess Report</b>\n </font>\n </h1> + <h2>1. Sequence No. and quality</h2> + <h3>1.1 Sequece No.</h3> +"; + +### raw data no +open IN,"<$config"; +my @files;my @marks; my @rawNo; +while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @files,$tmp[0]; + + my $no=`less $tmp[0] |wc -l `; + chomp $no; + if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") { + $no=$no/4; + } + else{ + $no=$no/2; + } + push @rawNo,$no; + + push @marks,$tmp[1]; +} +close IN; + +### preprocess +unless ($prepath=~/\/$/) { + $prepath .="/"; +} + +my @trimNo;my @collapse; +my $collapsefile=$prepath."collapse_reads.fa"; +open IN,"<$collapsefile"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $trimNo[$i] +=$lng[$i]; + $collapse[$i] ++; + } + } +} +close IN; + +my @cleanR;my @cleanT; +my $clean=$prepath."collapse_reads_$opts{min}_$opts{max}.fa"; +open IN,"<$clean"; +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $cleanR[$i] +=$lng[$i]; + $cleanT[$i] ++; + } + } +} +close IN; + +print OUT "<table border=\"1\"> +<tr align=\"center\"> +<th> </th> +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Raw Reads No. </th> +"; +foreach (@rawNo) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Reads No. After Trimed 3\' adapter </th> +"; +foreach (@trimNo) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Unique Tags No. </th> +"; +foreach (@collapse) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Clean Reads No. </th> +"; +foreach (@cleanR) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Clean Tags No. </th> +"; +foreach (@cleanT) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr>\n</table>"; +print OUT "<p> +Note:<br /> +The raw data file path is: <b>$files[0]</b><br /> +"; +for (my $i=1;$i<@files;$i++) { + print OUT "          <b>$files[$i]</b><br />"; +} +print OUT "The collapsed file path is: <b>$collapsefile</b><br /> +The clean data file path is: <b>$clean</b><br /> +</p> +<h2> 1. Sequence length count</h2> +<h3> 1.1 Reads length count </h3> +"; +print OUT "\n"; + +my (%length); my $key="Tags Length"; +open IN,"<$prepath/reads_length_distribution.txt"; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\s*$/); + if ($aline=~/^Reads/) { $key="Reads Length";} + my @tmp=split/\t/,$aline; + my @array=split/\s/,$tmp[1]; + push @{$length{$key}},[$tmp[0],@array]; +} +close IN; + +print OUT "<table border=\"1\"> +<tr align=\"center\">"; +my $hashkey="Reads Length"; +foreach (@{$length{$hashkey}[0]}) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr>"; + +for (my $i=1;$i<@{$length{$hashkey}};$i++) { + print OUT "<tr align=\"center\"> + <th >$length{$hashkey}[$i][0] </th> + "; + for(my $j=1;$j<@{$length{$hashkey}[$i]};$j++) { + print OUT "<td> $length{$hashkey}[$i][$j] </td>\n"; + } + print OUT "</tr>\n"; +} +print OUT "</table>\n"; + +print OUT "<h3> 1.2 Tags length count </h3>"; + +print OUT "<table border=\"1\"> +<tr align=\"center\">"; +$hashkey="Tags Length"; +foreach (@{$length{$hashkey}[0]}) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr>"; + +for (my $i=1;$i<@{$length{$hashkey}};$i++) { + print OUT "<tr align=\"center\"> + <th > $length{$hashkey}[$i][0] </th> + "; + for(my $j=1;$j<@{$length{$hashkey}[$i]};$j++) { + print OUT "<td> $length{$hashkey}[$i][$j] </td>\n"; + } + print OUT "</tr>\n"; +} + +print OUT "</table>\n"; + +print OUT "<h2> 2. Sequence length distribution </h2>"; +my $length=$prepath."length.html"; +open IN,"<$length"; +while (my $aline=<IN>) { + chomp $aline; + print OUT "$aline\n"; +} + +#print OUT "<p> Note:<br />The sequence length data: <a href=\"./$predir/reads_length_distribution.txt\"> length file</a> +#</p> +#"; + + + + +####genome map +#unless ($genomedir=~/\/$/) { +# $genomedir .="/"; +#} + +print OUT "<h2>2. Genome Alignment Result</h2> +<h3>2.1 Mapping count</h3> +"; + +open IN,"<$genomepath/genome_mapped.fa"; +my (@gread,@gtag); +while (my $aline=<IN>) { + chomp $aline; + <IN>; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @sss=split/_/,$1; + for (my $i=0;$i<@sss;$i++) { + if ($sss[$i]>0) { + $gread[$i] +=$sss[$i]; + $gtag[$i] ++; + } + } +} +close IN; + +print OUT "<table border=\"1\"> +<tr align=\"center\"> +<th> </th> +"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Genome Mapped Reads No. </th> +"; +foreach (@gread) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Genome Mapped Reads Percent </th> +"; + +for (my $i=0;$i<@gread;$i++) { + my $per=sprintf ("%.2f",$gread[$i]/$cleanR[$i]*100); + print OUT "<td> $per\%</td>\n"; +} + +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Genome Mapped Tags No. </th> +"; +foreach (@gtag) { + print OUT "<td> $_ </td>\n"; +} +print OUT "</tr> +<tr align=\"center\"> +<th align=\"left\">Genome Mapped Tags Percent </th> +"; + +for (my $i=0;$i<@gtag;$i++) { + my $per=sprintf ("%.2f",$gtag[$i]/$cleanT[$i]*100); + print OUT "<td> $per\%</td>\n"; +} +print OUT "</tr>\n</table>"; +print OUT "<p> +Note:<br /> +The genome mapped bwt file path is: <b>$genomedir/genome_mapped.bwt</b><br /> +The genome mapped FASTA file path is: <b>$genomedir/genome_mapped.fa</b> +<br /> +"; + + + +#### rfam +if(defined $rfampath && $rfampath=~/rfam_match/){ +chomp $rfampath; +@tmp=split/\//,$rfampath; +$rfamdir=$tmp[-1]; + +unless ($rfampath=~/\/$/) { + $rfampath .="/"; +} +print OUT "<h2>3. Rfam non-miRNA annotation</h2> +<h3>3.1 Reads count</h3> +<table border=\"1\"> +<tr align=\"center\"> +"; + +my @rfamR; my @rfamT; +my $tag=1; +open IN,"<$dir/rfam_non-miRNA_annotation.txt"; +while (my $aline=<IN>) { + chomp $aline; + $tag=0 if($aline=~/tags\s+number/); + next if($aline=~/^\#/); + next if($aline=~/^\s*$/); + my @tmp=split/\s+/,$aline; + if($tag == 1){push @rfamR,[@tmp];} + else{push @rfamT,[@tmp];} +} +close IN; + + +print OUT "<th>RNA Name</th>\n"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rfamR;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rfamR[$i][0]</th> + "; + for (my $j=1;$j<@{$rfamR[$i]} ;$j++) { + print OUT "<td> $rfamR[$i][$j]</td>\n"; + } +} + +print OUT "</tr>\n</table> + <h3>3.2 Tags count</h3> + <table border=\"1\"> + <tr align=\"center\"> + <th>RNA Name</th>\n"; +foreach (@marks) { + print OUT "<th> $_ </th>\n"; +} +for (my $i=0;$i<@rfamT;$i++) { + print OUT "</tr> + <tr align=\"center\"> + <th align=\"left\">$rfamT[$i][0]</th> + "; + for (my $j=1;$j<@{$rfamT[$i]} ;$j++) { + print OUT "<td> $rfamT[$i][$j]</td>\n"; + } +} +print OUT "</tr>\n</table> +<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>"; +print OUT "<b>rfam_mapped.bwt</b></p> +"; +} + + +print OUT " + </BODY> +</HTML> +"; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -o +options: +-o output file +-h help +USAGE +exit(1); +} +