annotate filterReadsByCount.pl @ 51:3202911efdae draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 01:14:28 -0500
parents 7b5a48b972e9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
50
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2010-01
#Modified:
#Description:
#  Reads the -i file as two-line records (a header line followed by a
#  sequence line). A record is kept when its header ends in
#  ":<c1>_<c2>_..._x<total>" and <total> is greater than the count cutoff.
#  Kept records are copied to the -o file. For every sequence length the
#  script tabulates, per "_"-separated count column, how many kept records
#  have a non-zero count ("Tags") and the sum of the counts ("Reads").
#  The tables go to reads_length_distribution_after_count_filter.txt in the
#  directory of the -o file; an R script that draws both tables as bar
#  plots is written next to it and run with "R CMD BATCH".
my $version=1.00;

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use File::Spec;

my %opts;
GetOptions(\%opts,"i=s","o=s","mark:s","h");
if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
    usage();
}

# Column labels, separated by '#'. "-mark" with no value yields an empty
# string, which would give zero columns, so treat it like a missing option.
my $mark = (defined $opts{'mark'} && length $opts{'mark'}) ? $opts{'mark'} : "Sample";
my @mark = split /\#/, $mark;

# A record is kept only when its total count is strictly greater than this.
my $count_cutoff = 3;

# Open the input first so a missing input does not truncate the output file.
open my $in_fh, '<', $opts{i}
    or die "Cannot open input file '$opts{i}': $!\n";
open my $out_fh, '>', $opts{o}
    or die "Cannot open output file '$opts{o}': $!\n";

my %hash;     # sequence length => [ per-column number of records with count > 0 ]
my %reads;    # sequence length => [ per-column sum of counts ]
while (my $header = <$in_fh>) {
    chomp $header;
    my $seq = <$in_fh>;
    last unless defined $seq;    # dangling header line at EOF: no sequence to pair with
    chomp $seq;

    # NOTE(review): inside a character class '|' is a literal pipe, so
    # [\d|_] also accepts '|'. Kept unchanged so the set of matching
    # headers stays exactly as before.
    next unless $header =~ /:([\d|_]+)_x(\d+)$/;
    my ($column_counts, $total) = ($1, $2);
    next unless $total > $count_cutoff;

    my $len    = length $seq;
    my @counts = split /_/, $column_counts;
    for my $i (0 .. $#counts) {
        # Adding 0 still creates the slot, so every column is printed later.
        $hash{$len}[$i]  += ($counts[$i] > 0 ? 1 : 0);
        $reads{$len}[$i] += $counts[$i];
    }
    print {$out_fh} "$header\n$seq\n";
}
close $in_fh;
close $out_fh or die "Cannot close output file '$opts{o}': $!\n";

# Resolve the output directory to an absolute path BEFORE chdir; otherwise a
# relative directory would be applied twice ("dir/dir/...") after the chdir.
my $dir = File::Spec->rel2abs(dirname($opts{'o'}));
chdir $dir or die "Cannot chdir to '$dir': $!\n";    # R writes its PNG/.Rout files into the cwd

my $lengthfile = $dir."/reads_length_distribution_after_count_filter.txt";
my $rscript    = "$dir/length_distribution_after_count_filter.R";
open my $len_fh, '>', $lengthfile
    or die "Cannot open '$lengthfile': $!\n";
open my $r_fh, '>', $rscript
    or die "Cannot open '$rscript': $!\n";

my $samNo   = @mark;
my @length  = sort { $a <=> $b } keys %hash;
my $lengths = join ",", @length;
my $marks   = join "\",\"", @mark;

# ---- Tags: number of distinct kept records per length and column ----
print {$len_fh} "Tags length\t@mark\n";
for my $len (@length) {
    print {$len_fh} $len, "\t@{$hash{$len}}\n";
}
# Row-major vector for R's matrix(..., byrow=T): one row per length.
my $avalue = join ",", map { join ", ", @{ $hash{$_} } } @length;

# args.legend must be a list for barplot() to apply the legend position.
print {$r_fh} <<"END_R";
a<-c(${avalue})
b<-matrix(a,ncol=${samNo},byrow=T)
cl<-colors()
names=c(${lengths})
legends=c("${marks}")
png("Tags_length_after_count_filter.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:${samNo}],main="Tags Length Distribution After Count Filter",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=list(x="topleft"))
abline(h=0)
dev.off()

END_R

# ---- Reads: summed counts per length and column ----
print {$len_fh} "\nReads length\t@mark\n";
for my $len (@length) {
    print {$len_fh} $len, "\t@{$reads{$len}}\n";
}
$avalue = join ",", map { join ", ", @{ $reads{$_} } } @length;

# Reuses cl, names and legends defined by the first part of the R script.
print {$r_fh} <<"END_R";
a<-c(${avalue})

b<-matrix(a,ncol=${samNo},byrow=T)

png("Reads_length_after_count_filter.png",width=800,height=600)
barplot(t(b),beside=TRUE,col=cl[1:${samNo}],main="Reads Length Distribution After Count Filter",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=list(x="topleft"))
abline(h=0)
dev.off()

END_R

close $len_fh or die "Cannot close '$lengthfile': $!\n";
close $r_fh   or die "Cannot close '$rscript': $!\n";

# List form avoids the shell, so directories containing spaces or shell
# metacharacters work. Plotting stays best-effort: a failure only warns.
system('R', 'CMD', 'BATCH', $rscript) == 0
    or warn "R CMD BATCH failed for '$rscript' (status $?)\n";

#system ("rm $dir/length_distribution.R");
#system ("rm $dir/length_distribution.Rout");
#system ("rm $dir/.RData");
# Print the version and command-line help to STDOUT, then terminate the
# program with exit status 1. Takes no arguments and never returns.
# Reads the file-level $version.
sub usage{
    # The synopsis lists only the options GetOptions actually declares
    # (i, o, mark, h); the previous text advertised -min/-max, which do not exist.
    print <<"USAGE";
Version $version
Usage:
$0 -i <input> -o <output> [-mark <names>] [-h]
options:

-i input file
-o output file
-mark string #sample name eg: samA#samB#samC
-h help
USAGE
    exit(1);
}