diff filterReadsByLength.pl @ 29:253b51f72993 draft

Uploaded
author big-tiandm
date Thu, 31 Jul 2014 03:06:43 -0400
parents 0a69f39fa9ff
children ca05d68aca13
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filterReadsByLength.pl	Thu Jul 31 03:06:43 2014 -0400
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2010-01
+#Modified:
+#Description:  
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my %opts;
+GetOptions(\%opts,"i=s","min=i","max=i","o=s","mark:s","h");
+if (!(defined $opts{i} and defined $opts{o} and defined $opts{min} and defined $opts{max}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample";
+my @mark=split /,/,$mark;
+
+
+open OUT,">$opts{o}";
+open IN,"<$opts{i}";
+my %hash;my %reads;
+while (my $aline=<IN>) {
+	chomp $aline;
+	my $seq=<IN>;
+	chomp $seq;
+	
+	if($aline=~/:([\d|_]+)_x(\d+)$/){
+		my @ss=split/_/,$1;
+		for (my $i=0;$i<@ss;$i++) {
+			$hash{length($seq)}[$i]++ if($ss[$i]>0);
+			$hash{length($seq)}[$i] +=0 if($ss[$i]>0);
+			$reads{length($seq)}[$i]+=$ss[$i];
+		}
+	}
+	#else{$reads{length($seq)}+=1;}
+	if (length ($seq)>=$opts{'min'}  && length ($seq) <=$opts{'max'}) {
+		print OUT "$aline\n$seq\n";
+	}
+}
+close IN;
+close OUT;
+
+my $dir=dirname($opts{'o'});
+chdir $dir;
+my $lengthfile=$dir."/reads_length_distribution.txt";
+open OUT, ">$lengthfile";
+open R,">$dir/length_distribution.R";
+
+print OUT "Tags length\t@mark\n";
+
+my $samNo=@mark;
+my $avalue="";
+my @length=sort{$a<=>$b} keys %hash;
+foreach  (@length) {
+	print OUT $_,"\t@{$hash{$_}}\n";
+	my $vv=join ", ",@{$hash{$_}};
+	$avalue .="$vv,";
+}
+$avalue =~s/,$//;
+my $lengths=join ",",@length;
+my $marks=join "\",\"",@mark;
+
+print R  "a<-c($avalue)
+b<-matrix(a,ncol=$samNo,byrow=T)
+cl<-colors()
+names=c($lengths)
+legends=c(\"$marks\")
+png(\"Tags_length.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+$avalue="";
+print OUT "\nReads length\t@mark\n";
+foreach  (@length) {
+	print OUT $_,"\t@{$reads{$_}}\n";
+	my $vv=join ", ", @{$reads{$_}};
+	$avalue .= "$vv,";
+}
+$avalue =~s/,$//;
+
+print R  "a<-c($avalue)\n
+b<-matrix(a,ncol=$samNo,byrow=T)
+
+png(\"Reads_length.png\",width=800,height=600)
+barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\")
+abline(h=0)
+dev.off()
+
+";
+close OUT;
+close R;
+
+system ("R CMD BATCH $dir/length_distribution.R");
+
+#system ("rm $dir/length_distribution.R");
+#system ("rm $dir/length_distribution.Rout");
+#system ("rm $dir/.RData");
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o  -min -max -mark
+options:
+
+-i input file
+-o output file
+-min reads min length.
+-max reads max length.
+-mark string #sample name eg: samA,samB,samC
+-h help
+USAGE
+exit(1);
+}
+