Mercurial > repos > bigrna > gpsrna
diff filterReadsByCount.pl @ 0:87fe81de0931 draft default tip
Uploaded
author | bigrna |
---|---|
date | Sun, 04 Jan 2015 02:47:25 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filterReadsByCount.pl Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2010-01 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","o=s","mark:s","h"); +if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample"; +my @mark=split /\#/,$mark; + +open OUT,">$opts{o}"; +open IN,"<$opts{i}"; +my %hash;my %reads; +while (my $aline=<IN>) { + chomp $aline; + my $seq=<IN>; + chomp $seq; + if($aline=~/:([\d|_]+)_x(\d+)$/){ + if ($2>3) { + my @ss=split/_/,$1; + for (my $i=0;$i<@ss;$i++) { + $hash{length($seq)}[$i]++ if($ss[$i]>0); + $hash{length($seq)}[$i] +=0 if($ss[$i]==0); + $reads{length($seq)}[$i]+=$ss[$i]; + } + print OUT "$aline\n$seq\n"; + } + } +} +close IN; +close OUT; + +my $dir=dirname($opts{'o'}); +chdir $dir; +my $lengthfile=$dir."/reads_length_distribution_after_count_filter.txt"; +open OUT, ">$lengthfile"; +open R,">$dir/length_distribution_after_count_filter.R"; + +print OUT "Tags length\t@mark\n"; + +my $samNo=@mark; +my $avalue=""; +my @length=sort{$a<=>$b} keys %hash; +foreach (@length) { + print OUT $_,"\t@{$hash{$_}}\n"; + my $vv=join ", ",@{$hash{$_}}; + $avalue .="$vv,"; +} +$avalue =~s/,$//; +my $lengths=join ",",@length; +my $marks=join "\",\"",@mark; + +print R "a<-c($avalue) +b<-matrix(a,ncol=$samNo,byrow=T) +cl<-colors() +names=c($lengths) +legends=c(\"$marks\") +png(\"Tags_length_after_count_filter.png\",width=800,height=600) +barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") +abline(h=0) +dev.off() + +"; +$avalue=""; +print OUT "\nReads length\t@mark\n"; +foreach (@length) { + print OUT $_,"\t@{$reads{$_}}\n"; + my $vv=join ", ", @{$reads{$_}}; + $avalue .= "$vv,"; +} +$avalue =~s/,$//; + +print R "a<-c($avalue)\n +b<-matrix(a,ncol=$samNo,byrow=T) + +png(\"Reads_length_after_count_filter.png\",width=800,height=600) +barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") +abline(h=0) +dev.off() + +"; +close OUT; +close R; + +system ("R CMD BATCH $dir/length_distribution_after_count_filter.R"); + +#system ("rm $dir/length_distribution.R"); +#system ("rm $dir/length_distribution.Rout"); +#system ("rm $dir/.RData"); +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -min -max -mark +options: + +-i input file +-o output file +-mark string #sample name eg: samA#samB#samC +-h help +USAGE +exit(1); +} +