Mercurial > repos > chawhwa > neuma
comparison NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c44c43d185ef |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # This version has an option of counting only the pairs with a single mate distance. (version 11). | |
4 # This version has fixed the problem in filtering max_insert_length. (version 11). | |
5 # This version does not print out the additional tab in the iNIR file. (version 11). | |
6 # This version has data type option for parsing refseq headers. (version 10) | |
7 # This version fixed the max_insert_lendis option name from max_insert_length. (version 10) | |
8 # This version can take multiple bowtieout files separated by comma. (version 10) | |
9 # This version can take option of max insert lendis (version 9) | |
10 # This version can take sample name (column title) as option, if -f is selected. (version 8) | |
11 # This version fixed the problem of double-counting cases that are strand-flipped. (version 7) | |
12 # This version has an option of counting all reads, gNIR or iNIR. (version 7) | |
13 # This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6) | |
14 # This version has an option of using a single bowtie file name instead of bowtie directory. (version 6) | |
15 # This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5) | |
16 # This version fixed the problem of going into infinite loop when run with one sample. (Version 3) | |
17 # This version is verbose. (version 2) | |
18 | |
19 use Getopt::Long; | |
20 my $verbose=0; | |
21 my $LINEBIN=1000000; | |
22 | |
23 my $bowtie = 'd'; # default is directory. | |
24 my $count_opt = 'gReadcount'; | |
25 my $samplename; | |
26 my $max_insert_length; | |
27 my $readlength; | |
28 my $datatype='E'; | |
29 my $uniq_d; # reads with single mate distance only (applicable only for PE) | |
30 &GetOptions ( 'v' => sub { $verbose=1 }, | |
31 'vv' => sub { $verbose=2 }, | |
32 'f' => sub { $bowtie='f'}, # read bowtie file instead of bowtiu directory | |
33 'gNIR' => sub { $count_opt = 'gNIR' }, | |
34 'iNIR' => sub { $count_opt = 'iNIR' }, | |
35 's|sample=s' => \$samplename, | |
36 'm|max_insert_length=i' => \$max_insert_length, | |
37 'l|readlength=i' => \$readlength, | |
38 'd|datatype=s' => \$datatype, | |
39 'u|uniq_d' => \$uniq_d | |
40 ); | |
41 | |
42 if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning: -s|--sample option is ignored, when -f is not selected.\n"; } | |
43 if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; } | |
44 | |
45 | |
46 if(@ARGV<3) { &help; exit; } | |
47 | |
48 | |
49 my $gene2NM = shift @ARGV; | |
50 my $bowtieoutdir = shift @ARGV; | |
51 my $PE = shift @ARGV; | |
52 | |
53 if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; } | |
54 | |
55 | |
56 if($bowtie eq 'd'){ | |
57 @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`; | |
58 if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); } | |
59 } | |
60 else { | |
61 push @bowtiefiles, (split/,/,$bowtieoutdir); | |
62 } | |
63 | |
64 | |
65 open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n"; | |
66 while(<GENE2NM>){ | |
67 chomp; | |
68 my ($g,$t) = split/\t/; | |
69 $t2g{$t}=$g; | |
70 push @{$g2t{$g}}, $t; | |
71 } | |
72 close GENE2NM; | |
73 if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); } | |
74 | |
75 | |
76 | |
77 | |
78 | |
79 for my $i (0..$#bowtiefiles) { | |
80 | |
81 my $bowtieout = $bowtiefiles[$i]; | |
82 my %mapped_genes =(); | |
83 my %mapped_tr =(); | |
84 my %mapped_insert_lengths =(); | |
85 | |
86 if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; } | |
87 | |
88 | |
89 my $prev_head=''; my $prev_tr=''; | |
90 open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n"; | |
91 while(<$BOWTIE>){ | |
92 chomp; | |
93 my ($head,$tr,$pos1) = (split/\t/)[0,2,3]; | |
94 if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }} | |
95 | |
96 | |
97 my $insert_length=0; ## default, for SE. | |
98 if($PE==1){ | |
99 chomp(my $secondline = <$BOWTIE>); | |
100 my ($head2,$pos2) = (split/\t/,$secondline)[0,3]; | |
101 $head = &min($head,$head2); | |
102 $insert_length = $pos2 - $pos1 + $readlength; | |
103 if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } } | |
104 } | |
105 | |
106 | |
107 if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; } } # print only at first line | |
108 | |
109 | |
110 if($count_opt eq 'gNIR'){ | |
111 if($head eq $prev_head) { | |
112 $mapped_tr { $tr } = 1; | |
113 $mapped_tr { $prev_tr } = 1; | |
114 $mapped_genes { $t2g{$tr} } = 1; | |
115 $mapped_genes { $t2g{$prev_tr} } = 1; | |
116 } | |
117 else { | |
118 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }} | |
119 ## count reads that are unique to the gene and common to all of its isoforms | |
120 %mapped_genes =(); | |
121 %mapped_tr = (); | |
122 $mapped_tr { $tr } = 1; | |
123 $mapped_genes { $t2g{$tr} } = 1; | |
124 } | |
125 | |
126 } | |
127 elsif($count_opt eq 'iNIR'){ | |
128 if($head eq $prev_head) { | |
129 $mapped_tr { $tr } = 1; | |
130 $mapped_tr { $prev_tr } = 1; | |
131 } | |
132 else { | |
133 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript | |
134 %mapped_tr = (); | |
135 $mapped_tr { $tr } = 1; | |
136 } | |
137 } | |
138 else { | |
139 if($head eq $prev_head) { | |
140 $mapped_genes { $t2g{$tr} } = 1; | |
141 $mapped_genes { $t2g{$prev_tr} } = 1; | |
142 $mapped_insert_lengths { "$insert_length" } =1; | |
143 $mapped_insert_lengths { "$prev_insert_length" } =1; | |
144 } | |
145 else { | |
146 if(scalar(keys(%mapped_genes)) == 1) { | |
147 my ($gene) = keys %mapped_genes; | |
148 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } } | |
149 else { $count{$gene}[$i]++; } | |
150 } ## count reads that are unique to the gene | |
151 %mapped_genes =(); | |
152 %mapped_insert_lengths =(); | |
153 $mapped_genes { $t2g{$tr} } = 1; | |
154 $mapped_insert_lengths { "$insert_length" } =1; | |
155 } | |
156 } | |
157 | |
158 | |
159 $prev_head = $head; | |
160 $prev_tr = $tr; | |
161 $prev_insert_length = $insert_length; | |
162 | |
163 if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } } | |
164 | |
165 } | |
166 close $BOWTIE; | |
167 | |
168 | |
169 # last line | |
170 if($count_opt eq 'gNIR'){ | |
171 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }} | |
172 ## count reads that are unique to the gene and common to all of its isoforms | |
173 } | |
174 elsif($count_opt eq 'iNIR'){ | |
175 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript | |
176 } | |
177 else{ | |
178 if(scalar(keys(%mapped_genes)) == 1) { | |
179 my ($gene) = keys %mapped_genes; | |
180 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } } | |
181 else { $count{$gene}[$i]++; } | |
182 } ## count reads that are unique to the gene | |
183 } | |
184 | |
185 } | |
186 | |
187 | |
188 $"="\t"; | |
189 | |
190 | |
191 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; } | |
192 if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); } | |
193 | |
194 | |
195 $minlen=1000000; | |
196 for my $file (@bowtiefiles){ | |
197 $minlen = length($file) if length($file)<$minlen; | |
198 } | |
199 | |
200 | |
201 # find longest common suffix for bowtiefiles | |
202 my %suffs; | |
203 my $k=0; | |
204 do { | |
205 %suffs=(); | |
206 $k++; | |
207 for my $file (@bowtiefiles){ | |
208 $suff = substr($file,length($file)-$k); | |
209 $suffs{$suff}=1; | |
210 } | |
211 } while ( scalar(keys(%suffs))==1 && $k<$minlen); | |
212 | |
213 my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1)); | |
214 if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; } | |
215 | |
216 # title is between "/" and common suffix. | |
217 my @title=(); | |
218 for my $file (@bowtiefiles){ | |
219 if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; } | |
220 else { push @title, $file; } | |
221 } | |
222 if(defined($samplename)) { @title=(); push @title, $samplename; } | |
223 | |
224 | |
225 if($verbose) { print STDERR "Printing to file...\n"; } | |
226 | |
227 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){ | |
228 print "gene\t@title\n"; | |
229 for my $gene (sort keys %count){ | |
230 print "$gene"; | |
231 for my $i (0..$#bowtiefiles){ | |
232 if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; } | |
233 else { print "\t0"; } | |
234 } | |
235 print "\n"; | |
236 } | |
237 } | |
238 else { ## iNIR | |
239 print "gene\ttranscript\t@title\n"; | |
240 for my $tr (sort keys %count){ | |
241 print "$t2g{$tr}\t$tr"; | |
242 for my $i (0..$#bowtiefiles){ | |
243 if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; } | |
244 else { print "\t0"; } | |
245 } | |
246 print "\n"; | |
247 } | |
248 } | |
249 | |
250 if($verbose) { print STDERR "done.\n"; } | |
251 | |
252 | |
253 | |
254 ## subroutines ## | |
255 | |
256 | |
257 | |
258 sub help { | |
259 print STDERR<<EOF | |
260 | |
261 usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount | |
262 | |
263 Options: | |
264 -v : verbose | |
265 -vv : more verbose | |
266 -f : use bowtie file name instead of bowtie directory name. | |
267 ex) $0 -f gene2NM bowtieoutfile PE > output.readcount | |
268 ex2) $0 gene2NM bowtieoutdir PE > output.readcount | |
269 --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR) | |
270 --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR) | |
271 -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used. | |
272 -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored) | |
273 -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used. | |
274 -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E). | |
275 -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount) | |
276 EOF | |
277 } | |
278 | |
279 sub min { | |
280 if($_[0] lt $_[1]) { $_[0] } else { $_[1] }; | |
281 } | |
282 | |
283 |