annotate collapseReads2Tags.pl @ 13:a8c011dc575b draft

Uploaded
author big-tiandm
date Wed, 29 Oct 2014 04:19:42 -0400
parents 07745c0958dd
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
1 #!/usr/bin/perl -w
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
2 #Filename:
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
3 #Author: Tian Dongmei
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
4 #Email: tiandm@big.ac.cn
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
5 #Date: 2014-3-20
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
6 #Modified:
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
7 #Description: fastq file form reads cluster(the same sequence in the same cluster)
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
8 my $version=1.00;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
9
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
10 use strict;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
11 use Getopt::Long;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
12
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
13 my %opts;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
14 GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h");
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
15 if (!(defined $opts{o} and defined $opts{'format'}) || defined $opts{h}) { #necessary arguments
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
16 &usage;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
17 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
18 my @filein=@{$opts{i}} if(defined $opts{i});
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
19 my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
20 my $fileout=$opts{'o'};
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
21 my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
22 my %hash;##分块存放原始序列
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
23
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
24 my $format=$opts{'format'};
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
25 if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
26 die "Parameter -format is error!\n";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
27 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
28
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
29 my ($qualT,$qualV);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
30 if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) { #quality filter
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
31 my @temp=split /:/,$opts{'qual'};
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
32 $qualT=$temp[0];
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
33 $qualV=$temp[1];
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
34
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
35 for (my $i=0;$i<@filein;$i++) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
36 open IN,"<$filein[$i]";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
37 while (my $aline=<IN>) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
38 my $seq=<IN>;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
39 my $n=<IN>;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
40 my $qv=<IN>;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
41 my $tag=&qvcheck($qv,$qualT,$qualV);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
42 next if(!$tag);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
43 my $str=substr($seq,0,6);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
44 $hash{$str}[$i].=$seq;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
45 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
46 close IN;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
47 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
48 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
49 elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
50 for (my $i=0;$i<@filein;$i++) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
51 open IN,"<$filein[$i]";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
52 while (my $aline=<IN>) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
53 my $seq=<IN>;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
54 my $n=<IN>;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
55 my $qv=<IN>;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
56 my $str=substr($seq,0,6);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
57 $hash{$str}[$i].=$seq;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
58 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
59 close IN;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
60 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
61
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
62 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
63 elsif($format eq "fasta" || $format eq "fa"){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
64 for (my $i=0;$i<@filein;$i++) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
65 open IN,"<$filein[$i]";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
66 while (my $aline=<IN>) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
67 my $seq=<IN>;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
68 my $str=substr($seq,0,6);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
69 $hash{$str}[$i].=$seq;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
70 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
71 close IN;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
72 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
73 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
74
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
75 open OUT,">$fileout"; #output file
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
76 my $count=0;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
77 foreach my $key (keys %hash) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
78 my %cluster;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
79 for (my $i=0;$i<@filein;$i++) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
80 next if(!(defined $hash{$key}[$i]));
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
81 my @tmp=split/\n/,$hash{$key}[$i];
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
82 foreach (@tmp) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
83 $cluster{$_}[$i]++;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
84 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
85 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
86
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
87 foreach my $seq (keys %cluster) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
88 my $exp=""; my $ee=0;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
89 for (my $i=0;$i<@filein;$i++) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
90 if (defined $cluster{$seq}[$i]) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
91 $exp.="_$cluster{$seq}[$i]";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
92 $ee+=$cluster{$seq}[$i];
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
93 }else{
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
94 $exp.="_0";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
95 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
96 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
97 $count+=$ee;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
98 $exp=~s/^_//;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
99 print OUT ">$name","_$count:$exp","_x$ee\n$seq\n";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
100 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
101 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
102 close OUT;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
103
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
104
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
105 sub qvcheck{
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
106 my ($str,$t,$v)=@_;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
107 my $qv=0;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
108 if($t eq "mean"){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
109 $qv=&getMeanQuality($str);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
110 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
111 elsif($t eq "min"){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
112 $qv=&getMinQuality($str);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
113 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
114 if ($qv<$v) {
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
115 return 0;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
116 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
117 return 1;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
118 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
119
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
120 sub getMeanQuality(){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
121 chomp $_[0];
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
122 my @bases = split(//,$_[0]);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
123 my $sum = 0;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
124 for(my $i = 0; $i <= $#bases; $i++){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
125 my $num = ord($bases[$i]) - $pq;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
126 $sum += $num;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
127 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
128
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
129 return $sum/($#bases+1);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
130
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
131 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
132
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
133 ###
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
134 ### This function gives back the Q-value of the worst base
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
135 sub getMinQuality(){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
136 chomp $_[0];
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
137 my @bases = split(//,$_[0]);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
138 my $worst = 1000;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
139 for(my $i = 0; $i <= $#bases; $i++){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
140 # printf ("base: $bases[$i] --> %d\n",ord($bases[$i]));
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
141 my $num = ord($bases[$i]) - $pq;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
142 if($num < $worst){
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
143 $worst = $num;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
144 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
145 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
146 return $worst;
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
147 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
148
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
149 sub usage{
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
150 print <<"USAGE";
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
151 Version $version
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
152 Usage:
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
153 $0 -i -format -mark -qual -qv -o
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
154 options:
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
155 -i input file#fastq file ##can be multiple -i file1 -i file2 ...
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
156 -mark string#quary name,default is "seq"
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
157 -o output file
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
158 -format string # fastq|fasta|fq|fa
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
159
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
160 -qual #reads filter
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
161 eg:(min:value/mean:value)
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
162 This parameter just for solexa reads.
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
163 If the input files are solid and needs filter,please do filter first .
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
164
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
165 -qv integer #Phred quality64/33,default 33
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
166 -h help
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
167 USAGE
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
168 exit(1);
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
169 }
07745c0958dd Uploaded
big-tiandm
parents:
diff changeset
170