Mercurial > repos > big-tiandm > mirplant2
view filterReadsByLength.pl @ 39:2a5b751228a6 draft
Uploaded
author | big-tiandm |
---|---|
date | Tue, 28 Oct 2014 01:32:32 -0400 |
parents | 0a69f39fa9ff |
children | ca05d68aca13 |
line wrap: on
line source
#!/usr/bin/perl -w #Filename: #Author: Tian Dongmei #Email: tiandm@big.ac.cn #Date: 2010-01 #Modified: #Description: my $version=1.00; use strict; use Getopt::Long; use File::Basename; my %opts; GetOptions(\%opts,"i=s","min=i","max=i","o=s","mark:s","h"); if (!(defined $opts{i} and defined $opts{o} and defined $opts{min} and defined $opts{max}) || defined $opts{h}) { #necessary arguments &usage; } my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample"; my @mark=split /,/,$mark; open OUT,">$opts{o}"; open IN,"<$opts{i}"; my %hash;my %reads; while (my $aline=<IN>) { chomp $aline; my $seq=<IN>; chomp $seq; if($aline=~/:([\d|_]+)_x(\d+)$/){ my @ss=split/_/,$1; for (my $i=0;$i<@ss;$i++) { $hash{length($seq)}[$i]++ if($ss[$i]>0); $hash{length($seq)}[$i] +=0 if($ss[$i]>0); $reads{length($seq)}[$i]+=$ss[$i]; } } #else{$reads{length($seq)}+=1;} if (length ($seq)>=$opts{'min'} && length ($seq) <=$opts{'max'}) { print OUT "$aline\n$seq\n"; } } close IN; close OUT; my $dir=dirname($opts{'o'}); chdir $dir; my $lengthfile=$dir."/reads_length_distribution.txt"; open OUT, ">$lengthfile"; open R,">$dir/length_distribution.R"; print OUT "Tags length\t@mark\n"; my $samNo=@mark; my $avalue=""; my @length=sort{$a<=>$b} keys %hash; foreach (@length) { print OUT $_,"\t@{$hash{$_}}\n"; my $vv=join ", ",@{$hash{$_}}; $avalue .="$vv,"; } $avalue =~s/,$//; my $lengths=join ",",@length; my $marks=join "\",\"",@mark; print R "a<-c($avalue) b<-matrix(a,ncol=$samNo,byrow=T) cl<-colors() names=c($lengths) legends=c(\"$marks\") png(\"Tags_length.png\",width=800,height=600) barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") abline(h=0) dev.off() "; $avalue=""; print OUT "\nReads length\t@mark\n"; foreach (@length) { print OUT $_,"\t@{$reads{$_}}\n"; my $vv=join ", ", @{$reads{$_}}; $avalue .= "$vv,"; } $avalue =~s/,$//; print R "a<-c($avalue)\n b<-matrix(a,ncol=$samNo,byrow=T) png(\"Reads_length.png\",width=800,height=600) barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") abline(h=0) dev.off() "; close OUT; close R; system ("R CMD BATCH $dir/length_distribution.R"); #system ("rm $dir/length_distribution.R"); #system ("rm $dir/length_distribution.Rout"); #system ("rm $dir/.RData"); sub usage{ print <<"USAGE"; Version $version Usage: $0 -i -o -min -max -mark options: -i input file -o output file -min reads min length. -max reads max length. -mark string #sample name eg: samA,samB,samC -h help USAGE exit(1); }