annotate filterReadsByLength.pl @ 31:7321a6f82492 draft

Uploaded
author big-tiandm
date Thu, 31 Jul 2014 03:07:14 -0400
parents 0a69f39fa9ff
children ca05d68aca13
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
1 #!/usr/bin/perl -w
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
2 #Filename:
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
3 #Author: Tian Dongmei
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
4 #Email: tiandm@big.ac.cn
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
5 #Date: 2010-01
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
6 #Modified:
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
7 #Description:
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
# Script version shown in the usage banner. Stored as a string because the
# numeric literal 1.00 stringifies to "1", which made the banner read
# "Version 1" instead of "Version 1.00".
my $version = '1.00';
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
9
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
10 use strict;
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
11 use Getopt::Long;
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
12 use File::Basename;
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
13
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
# Command-line options, keyed by option name:
#   i     input file                       (required, string)
#   o     output file                      (required, string)
#   min   minimum read length to keep      (required, integer)
#   max   maximum read length to keep      (required, integer)
#   mark  comma-separated sample names     (optional, value may be omitted)
#   h     print the usage text
my %opts;

# GetOptions returns false when it meets an unknown option or a value of the
# wrong type. Treat that the same as a missing required argument instead of
# carrying on with a partially filled %opts.
my $parsed_ok = GetOptions(\%opts, "i=s", "min=i", "max=i", "o=s", "mark:s", "h");

my @missing = grep { !defined $opts{$_} } qw(i o min max);

if (!$parsed_ok || @missing || defined $opts{h}) {
    usage();    # prints the help text and exits
}
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
19
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
# Column labels for the length report. When -mark is absent, or given without
# a value, fall back to a single column called "Sample".
my $mark = (defined $opts{'mark'} && length $opts{'mark'}) ? $opts{'mark'} : "Sample";
my @mark = split /,/, $mark;

# Statistics keyed by sequence length; each value is an array with one slot
# per count field found in the record headers:
#   %hash   number of records whose count in that slot is greater than zero
#   %reads  sum of the counts in that slot
my %hash;
my %reads;

# Open the input first so that a missing input file does not truncate an
# existing output file.
open my $in_fh,  '<', $opts{i} or die "Cannot open input file '$opts{i}': $!\n";
open my $out_fh, '>', $opts{o} or die "Cannot open output file '$opts{o}': $!\n";

# The input is read as two-line records: a header line followed by the
# sequence line.
while (my $aline = <$in_fh>) {
    chomp $aline;

    my $seq = <$in_fh>;
    if (!defined $seq) {
        warn "Ignoring header without a sequence line at end of '$opts{i}': $aline\n";
        last;
    }
    chomp $seq;
    my $seq_len = length $seq;

    # Headers of the form "...:<n1>_<n2>_..._x<total>" carry one count per
    # slot (presumably one per sample -- the report labels the slots with
    # the -mark names). The character class is [\d_]; the original [\d|_]
    # also let a literal '|' through.
    if ($aline =~ /:([\d_]+)_x(\d+)$/) {
        my @ss = split /_/, $1;
        for my $i (0 .. $#ss) {
            my $count = $ss[$i] || 0;    # empty field counts as zero

            # Always touch the slot so that zero-count entries are stored as
            # an explicit 0. An undefined slot would become an empty
            # argument in the generated R vector.
            $hash{$seq_len}[$i]  += ($count > 0 ? 1 : 0);
            $reads{$seq_len}[$i] += $count;
        }
    }

    # Records are copied to the output only when the length is inside the
    # requested window; the statistics above cover every record.
    if ($seq_len >= $opts{'min'} && $seq_len <= $opts{'max'}) {
        print {$out_fh} "$aline\n$seq\n";
    }
}

close $in_fh;
# Buffered write errors only surface at close time.
close $out_fh or die "Error while writing '$opts{o}': $!\n";
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
47
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
# Everything below is written into the directory of the -o file. The R script
# creates its PNG files with relative names, so the working directory is
# switched there first.
my $dir = dirname($opts{'o'});
chdir $dir or die "Cannot change to output directory '$dir': $!\n";

# After the chdir the files are addressed relative to the new working
# directory. Prefixing them with $dir again pointed at "$dir/$dir/..."
# whenever $dir was a relative path other than ".".
my $lengthfile = "reads_length_distribution.txt";
my $rscript    = "length_distribution.R";

my $samNo  = @mark;
my @length = sort { $a <=> $b } keys %hash;

# Turn one stored row into a list of plain numbers: undefined cells become 0
# and rows shorter than the number of sample labels are padded with 0, so
# every row fills the ncol=$samNo matrix built in R.
my $row_values = sub {
    my ($row) = @_;
    my @cells = $row ? @{$row} : ();
    push @cells, (0) x ($samNo - @cells) if @cells < $samNo;
    return map { defined $_ ? $_ : 0 } @cells;
};

open my $txt_fh, '>', $lengthfile or die "Cannot write '$dir/$lengthfile': $!\n";
open my $r_fh,   '>', $rscript    or die "Cannot write '$dir/$rscript': $!\n";

# --- distinct sequences ("tags") per length --------------------------------
print {$txt_fh} "Tags length\t@mark\n";
my @tag_cells;
for my $len (@length) {
    my @row = $row_values->($hash{$len});
    print {$txt_fh} $len, "\t@row\n";
    push @tag_cells, join(", ", @row);
}
my $avalue  = join ",", @tag_cells;
my $lengths = join ",", @length;

# Sample names end up inside R string literals; escape quotes and backslashes.
my @r_marks = map { (my $m = $_) =~ s/(["\\])/\\$1/g; $m } @mark;
my $marks   = join "\",\"", @r_marks;

# args.legend has to be a list of arguments for legend(); a bare string has
# no names and is ignored by barplot, which left the legend at its default
# position.
print {$r_fh} <<"END_TAGS_R";
a<-c($avalue)
b<-matrix(a,ncol=$samNo,byrow=T)
cl<-colors()
names=c($lengths)
legends=c("$marks")
png("Tags_length.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main="Tags Length Distribution",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=list(x="topleft"))
abline(h=0)
dev.off()

END_TAGS_R

# --- summed read counts per length -----------------------------------------
print {$txt_fh} "\nReads length\t@mark\n";
my @read_cells;
for my $len (@length) {
    my @row = $row_values->($reads{$len});
    print {$txt_fh} $len, "\t@row\n";
    push @read_cells, join(", ", @row);
}
$avalue = join ",", @read_cells;

# Reuses cl, names and legends defined by the first part of the R script.
print {$r_fh} <<"END_READS_R";
a<-c($avalue)

b<-matrix(a,ncol=$samNo,byrow=T)

png("Reads_length.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:$samNo],main="Reads Length Distribution",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=list(x="topleft"))
abline(h=0)
dev.off()

END_READS_R

close $txt_fh or die "Error while writing '$dir/$lengthfile': $!\n";
close $r_fh   or die "Error while writing '$dir/$rscript': $!\n";

# List form bypasses the shell. A failing R run only costs the plots, so it
# is reported but does not abort: the filtered reads and the text report have
# already been written.
my $status = system('R', 'CMD', 'BATCH', $rscript);
if ($status == -1) {
    warn "Could not run R: $!; plots were not generated\n";
}
elsif ($status != 0) {
    warn "R CMD BATCH $rscript failed (wait status $status); plots were not generated\n";
}
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
101
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
102 #system ("rm $dir/length_distribution.R");
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
103 #system ("rm $dir/length_distribution.Rout");
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
104 #system ("rm $dir/.RData");
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
# Print the version banner and the option summary to the currently selected
# output handle, then terminate the program with exit status 1.
# Takes no arguments and never returns.
sub usage {
    my @help_lines = (
        "Version $version",
        "Usage:",
        "$0 -i -o -min -max -mark",
        "options:",
        "",
        "-i input file",
        "-o output file",
        "-min reads min length.",
        "-max reads max length.",
        "-mark string #sample name eg: samA,samB,samC",
        "-h help",
    );

    print join("\n", @help_lines) . "\n";

    exit 1;
}
0a69f39fa9ff Uploaded
big-tiandm
parents:
diff changeset
121