view filterReadsByLength.pl @ 34:915b62d31f9d draft

Uploaded
author big-tiandm
date Thu, 31 Jul 2014 03:08:00 -0400
parents 0a69f39fa9ff
children ca05d68aca13
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2010-01
#Modified:
#Description:  
my $version=1.00;

use strict;
use Getopt::Long;
use File::Basename;

my %opts;
GetOptions(\%opts,"i=s","min=i","max=i","o=s","mark:s","h");
if (!(defined $opts{i} and defined $opts{o} and defined $opts{min} and defined $opts{max}) || defined $opts{h}) { #necessary arguments
&usage;
}

my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
my @mark=split /,/,$mark;


open OUT,">$opts{o}";
open IN,"<$opts{i}";
my %hash;my %reads;
while (my $aline=<IN>) {
	chomp $aline;
	my $seq=<IN>;
	chomp $seq;
	
	if($aline=~/:([\d|_]+)_x(\d+)$/){
		my @ss=split/_/,$1;
		for (my $i=0;$i<@ss;$i++) {
			$hash{length($seq)}[$i]++ if($ss[$i]>0);
			$hash{length($seq)}[$i] +=0 if($ss[$i]>0);
			$reads{length($seq)}[$i]+=$ss[$i];
		}
	}
	#else{$reads{length($seq)}+=1;}
	if (length ($seq)>=$opts{'min'}  && length ($seq) <=$opts{'max'}) {
		print OUT "$aline\n$seq\n";
	}
}
close IN;
close OUT;

my $dir=dirname($opts{'o'});
chdir $dir;
my $lengthfile=$dir."/reads_length_distribution.txt";
open OUT, ">$lengthfile";
open R,">$dir/length_distribution.R";

print OUT "Tags length\t@mark\n";

my $samNo=@mark;
my $avalue="";
my @length=sort{$a<=>$b} keys %hash;
foreach  (@length) {
	print OUT $_,"\t@{$hash{$_}}\n";
	my $vv=join ", ",@{$hash{$_}};
	$avalue .="$vv,";
}
$avalue =~s/,$//;
my $lengths=join ",",@length;
my $marks=join "\",\"",@mark;

print R  "a<-c($avalue)
b<-matrix(a,ncol=$samNo,byrow=T)
cl<-colors()
names=c($lengths)
legends=c(\"$marks\")
png(\"Tags_length.png\",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
abline(h=0)
dev.off()

";
$avalue="";
print OUT "\nReads length\t@mark\n";
foreach  (@length) {
	print OUT $_,"\t@{$reads{$_}}\n";
	my $vv=join ", ", @{$reads{$_}};
	$avalue .= "$vv,";
}
$avalue =~s/,$//;

print R  "a<-c($avalue)\n
b<-matrix(a,ncol=$samNo,byrow=T)

png(\"Reads_length.png\",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
abline(h=0)
dev.off()

";
close OUT;
close R;

system ("R CMD BATCH $dir/length_distribution.R");

#system ("rm $dir/length_distribution.R");
#system ("rm $dir/length_distribution.Rout");
#system ("rm $dir/.RData");
sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o  -min -max -mark
options:

-i input file
-o output file
-min reads min length.
-max reads max length.
-mark string #sample name eg: samA,samB,samC
-h help
USAGE
exit(1);
}