0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # This version has an option of counting only the pairs with a single mate distance. (version 11).
|
|
4 # This version has fixed the problem in filtering max_insert_length. (version 11).
|
|
5 # This version does not print out the additional tab in the iNIR file. (version 11).
|
|
6 # This version has data type option for parsing refseq headers. (version 10)
|
|
7 # This version fixed the max_insert_lendis option name from max_insert_length. (version 10)
|
|
8 # This version can take multiple bowtieout files separated by comma. (version 10)
|
|
9 # This version can take option of max insert lendis (version 9)
|
|
10 # This version can take sample name (column title) as option, if -f is selected. (version 8)
|
|
11 # This version fixed the problem of double-counting cases that are strand-flipped. (version 7)
|
|
12 # This version has an option of counting all reads, gNIR or iNIR. (version 7)
|
|
13 # This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6)
|
|
14 # This version has an option of using a single bowtie file name instead of bowtie directory. (version 6)
|
|
15 # This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5)
|
|
16 # This version fixed the problem of going into infinite loop when run with one sample. (Version 3)
|
|
17 # This version is verbose. (version 2)
|
|
18
|
|
19 use Getopt::Long;
|
|
20 my $verbose=0;
|
|
21 my $LINEBIN=1000000;
|
|
22
|
|
23 my $bowtie = 'd'; # default is directory.
|
|
24 my $count_opt = 'gReadcount';
|
|
25 my $samplename;
|
|
26 my $max_insert_length;
|
|
27 my $readlength;
|
|
28 my $datatype='E';
|
|
29 my $uniq_d; # reads with single mate distance only (applicable only for PE)
|
|
30 &GetOptions ( 'v' => sub { $verbose=1 },
|
|
31 'vv' => sub { $verbose=2 },
|
|
32 'f' => sub { $bowtie='f'}, # read bowtie file instead of bowtiu directory
|
|
33 'gNIR' => sub { $count_opt = 'gNIR' },
|
|
34 'iNIR' => sub { $count_opt = 'iNIR' },
|
|
35 's|sample=s' => \$samplename,
|
|
36 'm|max_insert_length=i' => \$max_insert_length,
|
|
37 'l|readlength=i' => \$readlength,
|
|
38 'd|datatype=s' => \$datatype,
|
|
39 'u|uniq_d' => \$uniq_d
|
|
40 );
|
|
41
|
|
42 if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning: -s|--sample option is ignored, when -f is not selected.\n"; }
|
|
43 if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; }
|
|
44
|
|
45
|
|
46 if(@ARGV<3) { &help; exit; }
|
|
47
|
|
48
|
|
49 my $gene2NM = shift @ARGV;
|
|
50 my $bowtieoutdir = shift @ARGV;
|
|
51 my $PE = shift @ARGV;
|
|
52
|
|
53 if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; }
|
|
54
|
|
55
|
|
56 if($bowtie eq 'd'){
|
|
57 @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`;
|
|
58 if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); }
|
|
59 }
|
|
60 else {
|
|
61 push @bowtiefiles, (split/,/,$bowtieoutdir);
|
|
62 }
|
|
63
|
|
64
|
|
65 open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n";
|
|
66 while(<GENE2NM>){
|
|
67 chomp;
|
|
68 my ($g,$t) = split/\t/;
|
|
69 $t2g{$t}=$g;
|
|
70 push @{$g2t{$g}}, $t;
|
|
71 }
|
|
72 close GENE2NM;
|
|
73 if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); }
|
|
74
|
|
75
|
|
76
|
|
77
|
|
78
|
|
79 for my $i (0..$#bowtiefiles) {
|
|
80
|
|
81 my $bowtieout = $bowtiefiles[$i];
|
|
82 my %mapped_genes =();
|
|
83 my %mapped_tr =();
|
|
84 my %mapped_insert_lengths =();
|
|
85
|
|
86 if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; }
|
|
87
|
|
88
|
|
89 my $prev_head=''; my $prev_tr='';
|
|
90 open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n";
|
|
91 while(<$BOWTIE>){
|
|
92 chomp;
|
|
93 my ($head,$tr,$pos1) = (split/\t/)[0,2,3];
|
|
94 if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }}
|
|
95
|
|
96
|
|
97 my $insert_length=0; ## default, for SE.
|
|
98 if($PE==1){
|
|
99 chomp(my $secondline = <$BOWTIE>);
|
|
100 my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
|
|
101 $head = &min($head,$head2);
|
|
102 $insert_length = $pos2 - $pos1 + $readlength;
|
|
103 if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } }
|
|
104 }
|
|
105
|
|
106
|
|
107 if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; } } # print only at first line
|
|
108
|
|
109
|
|
110 if($count_opt eq 'gNIR'){
|
|
111 if($head eq $prev_head) {
|
|
112 $mapped_tr { $tr } = 1;
|
|
113 $mapped_tr { $prev_tr } = 1;
|
|
114 $mapped_genes { $t2g{$tr} } = 1;
|
|
115 $mapped_genes { $t2g{$prev_tr} } = 1;
|
|
116 }
|
|
117 else {
|
|
118 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
|
|
119 ## count reads that are unique to the gene and common to all of its isoforms
|
|
120 %mapped_genes =();
|
|
121 %mapped_tr = ();
|
|
122 $mapped_tr { $tr } = 1;
|
|
123 $mapped_genes { $t2g{$tr} } = 1;
|
|
124 }
|
|
125
|
|
126 }
|
|
127 elsif($count_opt eq 'iNIR'){
|
|
128 if($head eq $prev_head) {
|
|
129 $mapped_tr { $tr } = 1;
|
|
130 $mapped_tr { $prev_tr } = 1;
|
|
131 }
|
|
132 else {
|
|
133 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript
|
|
134 %mapped_tr = ();
|
|
135 $mapped_tr { $tr } = 1;
|
|
136 }
|
|
137 }
|
|
138 else {
|
|
139 if($head eq $prev_head) {
|
|
140 $mapped_genes { $t2g{$tr} } = 1;
|
|
141 $mapped_genes { $t2g{$prev_tr} } = 1;
|
|
142 $mapped_insert_lengths { "$insert_length" } =1;
|
|
143 $mapped_insert_lengths { "$prev_insert_length" } =1;
|
|
144 }
|
|
145 else {
|
|
146 if(scalar(keys(%mapped_genes)) == 1) {
|
|
147 my ($gene) = keys %mapped_genes;
|
|
148 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
|
|
149 else { $count{$gene}[$i]++; }
|
|
150 } ## count reads that are unique to the gene
|
|
151 %mapped_genes =();
|
|
152 %mapped_insert_lengths =();
|
|
153 $mapped_genes { $t2g{$tr} } = 1;
|
|
154 $mapped_insert_lengths { "$insert_length" } =1;
|
|
155 }
|
|
156 }
|
|
157
|
|
158
|
|
159 $prev_head = $head;
|
|
160 $prev_tr = $tr;
|
|
161 $prev_insert_length = $insert_length;
|
|
162
|
|
163 if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } }
|
|
164
|
|
165 }
|
|
166 close $BOWTIE;
|
|
167
|
|
168
|
|
169 # last line
|
|
170 if($count_opt eq 'gNIR'){
|
|
171 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
|
|
172 ## count reads that are unique to the gene and common to all of its isoforms
|
|
173 }
|
|
174 elsif($count_opt eq 'iNIR'){
|
|
175 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript
|
|
176 }
|
|
177 else{
|
|
178 if(scalar(keys(%mapped_genes)) == 1) {
|
|
179 my ($gene) = keys %mapped_genes;
|
|
180 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
|
|
181 else { $count{$gene}[$i]++; }
|
|
182 } ## count reads that are unique to the gene
|
|
183 }
|
|
184
|
|
185 }
|
|
186
|
|
187
|
|
188 $"="\t";
|
|
189
|
|
190
|
|
191 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; }
|
|
192 if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); }
|
|
193
|
|
194
|
|
195 $minlen=1000000;
|
|
196 for my $file (@bowtiefiles){
|
|
197 $minlen = length($file) if length($file)<$minlen;
|
|
198 }
|
|
199
|
|
200
|
|
201 # find longest common suffix for bowtiefiles
|
|
202 my %suffs;
|
|
203 my $k=0;
|
|
204 do {
|
|
205 %suffs=();
|
|
206 $k++;
|
|
207 for my $file (@bowtiefiles){
|
|
208 $suff = substr($file,length($file)-$k);
|
|
209 $suffs{$suff}=1;
|
|
210 }
|
|
211 } while ( scalar(keys(%suffs))==1 && $k<$minlen);
|
|
212
|
|
213 my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1));
|
|
214 if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; }
|
|
215
|
|
216 # title is between "/" and common suffix.
|
|
217 my @title=();
|
|
218 for my $file (@bowtiefiles){
|
|
219 if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; }
|
|
220 else { push @title, $file; }
|
|
221 }
|
|
222 if(defined($samplename)) { @title=(); push @title, $samplename; }
|
|
223
|
|
224
|
|
225 if($verbose) { print STDERR "Printing to file...\n"; }
|
|
226
|
|
227 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){
|
|
228 print "gene\t@title\n";
|
|
229 for my $gene (sort keys %count){
|
|
230 print "$gene";
|
|
231 for my $i (0..$#bowtiefiles){
|
|
232 if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; }
|
|
233 else { print "\t0"; }
|
|
234 }
|
|
235 print "\n";
|
|
236 }
|
|
237 }
|
|
238 else { ## iNIR
|
|
239 print "gene\ttranscript\t@title\n";
|
|
240 for my $tr (sort keys %count){
|
|
241 print "$t2g{$tr}\t$tr";
|
|
242 for my $i (0..$#bowtiefiles){
|
|
243 if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; }
|
|
244 else { print "\t0"; }
|
|
245 }
|
|
246 print "\n";
|
|
247 }
|
|
248 }
|
|
249
|
|
250 if($verbose) { print STDERR "done.\n"; }
|
|
251
|
|
252
|
|
253
|
|
254 ## subroutines ##
|
|
255
|
|
256
|
|
257
|
|
258 sub help {
|
|
259 print STDERR<<EOF
|
|
260
|
|
261 usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount
|
|
262
|
|
263 Options:
|
|
264 -v : verbose
|
|
265 -vv : more verbose
|
|
266 -f : use bowtie file name instead of bowtie directory name.
|
|
267 ex) $0 -f gene2NM bowtieoutfile PE > output.readcount
|
|
268 ex2) $0 gene2NM bowtieoutdir PE > output.readcount
|
|
269 --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR)
|
|
270 --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR)
|
|
271 -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used.
|
|
272 -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored)
|
|
273 -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used.
|
|
274 -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E).
|
|
275 -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount)
|
|
276 EOF
|
|
277 }
|
|
278
|
|
279 sub min {
|
|
280 if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
|
|
281 }
|
|
282
|
|
283
|