Mercurial > repos > bigrna > gpsrna
comparison collapseReads2Tags.pl @ 0:87fe81de0931 draft default tip
Uploaded
author | bigrna |
---|---|
date | Sun, 04 Jan 2015 02:47:25 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:87fe81de0931 |
---|---|
1 #!/usr/bin/perl -w | |
2 #Filename: | |
3 #Author: Tian Dongmei | |
4 #Email: tiandm@big.ac.cn | |
5 #Date: 2014-3-20 | |
6 #Modified: | |
7 #Description: fastq file form reads cluster(the same sequence in the same cluster) | |
8 my $version=1.00; | |
9 | |
10 use strict; | |
11 use Getopt::Long; | |
12 | |
13 my %opts; | |
14 GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h"); | |
15 if (!(defined $opts{o} and defined $opts{'format'}) || defined $opts{h}) { #necessary arguments | |
16 &usage; | |
17 } | |
18 my @filein=@{$opts{i}} if(defined $opts{i}); | |
19 my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq"; | |
20 my $fileout=$opts{'o'}; | |
21 my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33; | |
22 my %hash;##分块存放原始序列 | |
23 | |
24 my $format=$opts{'format'}; | |
25 if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { | |
26 die "Parameter -format is error!\n"; | |
27 } | |
28 | |
29 my ($qualT,$qualV); | |
30 if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) { #quality filter | |
31 my @temp=split /:/,$opts{'qual'}; | |
32 $qualT=$temp[0]; | |
33 $qualV=$temp[1]; | |
34 | |
35 for (my $i=0;$i<@filein;$i++) { | |
36 open IN,"<$filein[$i]"; | |
37 while (my $aline=<IN>) { | |
38 my $seq=<IN>; | |
39 my $n=<IN>; | |
40 my $qv=<IN>; | |
41 my $tag=&qvcheck($qv,$qualT,$qualV); | |
42 next if(!$tag); | |
43 my $str=substr($seq,0,6); | |
44 $hash{$str}[$i].=$seq; | |
45 } | |
46 close IN; | |
47 } | |
48 } | |
49 elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads | |
50 for (my $i=0;$i<@filein;$i++) { | |
51 open IN,"<$filein[$i]"; | |
52 while (my $aline=<IN>) { | |
53 my $seq=<IN>; | |
54 my $n=<IN>; | |
55 my $qv=<IN>; | |
56 my $str=substr($seq,0,6); | |
57 $hash{$str}[$i].=$seq; | |
58 } | |
59 close IN; | |
60 } | |
61 | |
62 } | |
63 elsif($format eq "fasta" || $format eq "fa"){ | |
64 for (my $i=0;$i<@filein;$i++) { | |
65 open IN,"<$filein[$i]"; | |
66 while (my $aline=<IN>) { | |
67 my $seq=<IN>; | |
68 my $str=substr($seq,0,6); | |
69 $hash{$str}[$i].=$seq; | |
70 } | |
71 close IN; | |
72 } | |
73 } | |
74 | |
75 open OUT,">$fileout"; #output file | |
76 my $count=0; | |
77 foreach my $key (keys %hash) { | |
78 my %cluster; | |
79 for (my $i=0;$i<@filein;$i++) { | |
80 next if(!(defined $hash{$key}[$i])); | |
81 my @tmp=split/\n/,$hash{$key}[$i]; | |
82 foreach (@tmp) { | |
83 $cluster{$_}[$i]++; | |
84 } | |
85 } | |
86 | |
87 foreach my $seq (keys %cluster) { | |
88 my $exp=""; my $ee=0; | |
89 for (my $i=0;$i<@filein;$i++) { | |
90 if (defined $cluster{$seq}[$i]) { | |
91 $exp.="_$cluster{$seq}[$i]"; | |
92 $ee+=$cluster{$seq}[$i]; | |
93 }else{ | |
94 $exp.="_0"; | |
95 } | |
96 } | |
97 $count+=$ee; | |
98 $exp=~s/^_//; | |
99 print OUT ">$name","_$count:$exp","_x$ee\n$seq\n"; | |
100 } | |
101 } | |
102 close OUT; | |
103 | |
104 | |
105 sub qvcheck{ | |
106 my ($str,$t,$v)=@_; | |
107 my $qv=0; | |
108 if($t eq "mean"){ | |
109 $qv=&getMeanQuality($str); | |
110 } | |
111 elsif($t eq "min"){ | |
112 $qv=&getMinQuality($str); | |
113 } | |
114 if ($qv<$v) { | |
115 return 0; | |
116 } | |
117 return 1; | |
118 } | |
119 | |
120 sub getMeanQuality(){ | |
121 chomp $_[0]; | |
122 my @bases = split(//,$_[0]); | |
123 my $sum = 0; | |
124 for(my $i = 0; $i <= $#bases; $i++){ | |
125 my $num = ord($bases[$i]) - $pq; | |
126 $sum += $num; | |
127 } | |
128 | |
129 return $sum/($#bases+1); | |
130 | |
131 } | |
132 | |
133 ### | |
134 ### This function gives back the Q-value of the worst base | |
135 sub getMinQuality(){ | |
136 chomp $_[0]; | |
137 my @bases = split(//,$_[0]); | |
138 my $worst = 1000; | |
139 for(my $i = 0; $i <= $#bases; $i++){ | |
140 # printf ("base: $bases[$i] --> %d\n",ord($bases[$i])); | |
141 my $num = ord($bases[$i]) - $pq; | |
142 if($num < $worst){ | |
143 $worst = $num; | |
144 } | |
145 } | |
146 return $worst; | |
147 } | |
148 | |
149 sub usage{ | |
150 print <<"USAGE"; | |
151 Version $version | |
152 Usage: | |
153 $0 -i -format -mark -qual -qv -o | |
154 options: | |
155 -i input file#fastq file ##can be multiple -i file1 -i file2 ... | |
156 -mark string#quary name,default is "seq" | |
157 -o output file | |
158 -format string # fastq|fasta|fq|fa | |
159 | |
160 -qual #reads filter | |
161 eg:(min:value/mean:value) | |
162 This parameter just for solexa reads. | |
163 If the input files are solid and needs filter,please do filter first . | |
164 | |
165 -qv integer #Phred quality64/33,default 33 | |
166 -h help | |
167 USAGE | |
168 exit(1); | |
169 } | |
170 |