comparison NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c44c43d185ef
1 #!/usr/bin/perl
2
3 # This version has an option of counting only the pairs with a single mate distance. (version 11).
4 # This version has fixed the problem in filtering max_insert_length. (version 11).
5 # This version does not print out the additional tab in the iNIR file. (version 11).
6 # This version has data type option for parsing refseq headers. (version 10)
7 # This version fixed the max_insert_lendis option name from max_insert_length. (version 10)
8 # This version can take multiple bowtieout files separated by comma. (version 10)
9 # This version can take option of max insert lendis (version 9)
10 # This version can take sample name (column title) as option, if -f is selected. (version 8)
11 # This version fixed the problem of double-counting cases that are strand-flipped. (version 7)
12 # This version has an option of counting all reads, gNIR or iNIR. (version 7)
13 # This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6)
14 # This version has an option of using a single bowtie file name instead of bowtie directory. (version 6)
15 # This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5)
16 # This version fixed the problem of going into infinite loop when run with one sample. (Version 3)
17 # This version is verbose. (version 2)
18
19 use Getopt::Long;
20 my $verbose=0;
21 my $LINEBIN=1000000;
22
23 my $bowtie = 'd'; # default is directory.
24 my $count_opt = 'gReadcount';
25 my $samplename;
26 my $max_insert_length;
27 my $readlength;
28 my $datatype='E';
29 my $uniq_d; # reads with single mate distance only (applicable only for PE)
30 &GetOptions ( 'v' => sub { $verbose=1 },
31 'vv' => sub { $verbose=2 },
32 'f' => sub { $bowtie='f'}, # read bowtie file instead of bowtiu directory
33 'gNIR' => sub { $count_opt = 'gNIR' },
34 'iNIR' => sub { $count_opt = 'iNIR' },
35 's|sample=s' => \$samplename,
36 'm|max_insert_length=i' => \$max_insert_length,
37 'l|readlength=i' => \$readlength,
38 'd|datatype=s' => \$datatype,
39 'u|uniq_d' => \$uniq_d
40 );
41
42 if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning: -s|--sample option is ignored, when -f is not selected.\n"; }
43 if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; }
44
45
46 if(@ARGV<3) { &help; exit; }
47
48
49 my $gene2NM = shift @ARGV;
50 my $bowtieoutdir = shift @ARGV;
51 my $PE = shift @ARGV;
52
53 if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; }
54
55
56 if($bowtie eq 'd'){
57 @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`;
58 if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); }
59 }
60 else {
61 push @bowtiefiles, (split/,/,$bowtieoutdir);
62 }
63
64
65 open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n";
66 while(<GENE2NM>){
67 chomp;
68 my ($g,$t) = split/\t/;
69 $t2g{$t}=$g;
70 push @{$g2t{$g}}, $t;
71 }
72 close GENE2NM;
73 if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); }
74
75
76
77
78
79 for my $i (0..$#bowtiefiles) {
80
81 my $bowtieout = $bowtiefiles[$i];
82 my %mapped_genes =();
83 my %mapped_tr =();
84 my %mapped_insert_lengths =();
85
86 if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; }
87
88
89 my $prev_head=''; my $prev_tr='';
90 open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n";
91 while(<$BOWTIE>){
92 chomp;
93 my ($head,$tr,$pos1) = (split/\t/)[0,2,3];
94 if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }}
95
96
97 my $insert_length=0; ## default, for SE.
98 if($PE==1){
99 chomp(my $secondline = <$BOWTIE>);
100 my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
101 $head = &min($head,$head2);
102 $insert_length = $pos2 - $pos1 + $readlength;
103 if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } }
104 }
105
106
107 if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; } } # print only at first line
108
109
110 if($count_opt eq 'gNIR'){
111 if($head eq $prev_head) {
112 $mapped_tr { $tr } = 1;
113 $mapped_tr { $prev_tr } = 1;
114 $mapped_genes { $t2g{$tr} } = 1;
115 $mapped_genes { $t2g{$prev_tr} } = 1;
116 }
117 else {
118 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
119 ## count reads that are unique to the gene and common to all of its isoforms
120 %mapped_genes =();
121 %mapped_tr = ();
122 $mapped_tr { $tr } = 1;
123 $mapped_genes { $t2g{$tr} } = 1;
124 }
125
126 }
127 elsif($count_opt eq 'iNIR'){
128 if($head eq $prev_head) {
129 $mapped_tr { $tr } = 1;
130 $mapped_tr { $prev_tr } = 1;
131 }
132 else {
133 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript
134 %mapped_tr = ();
135 $mapped_tr { $tr } = 1;
136 }
137 }
138 else {
139 if($head eq $prev_head) {
140 $mapped_genes { $t2g{$tr} } = 1;
141 $mapped_genes { $t2g{$prev_tr} } = 1;
142 $mapped_insert_lengths { "$insert_length" } =1;
143 $mapped_insert_lengths { "$prev_insert_length" } =1;
144 }
145 else {
146 if(scalar(keys(%mapped_genes)) == 1) {
147 my ($gene) = keys %mapped_genes;
148 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
149 else { $count{$gene}[$i]++; }
150 } ## count reads that are unique to the gene
151 %mapped_genes =();
152 %mapped_insert_lengths =();
153 $mapped_genes { $t2g{$tr} } = 1;
154 $mapped_insert_lengths { "$insert_length" } =1;
155 }
156 }
157
158
159 $prev_head = $head;
160 $prev_tr = $tr;
161 $prev_insert_length = $insert_length;
162
163 if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } }
164
165 }
166 close $BOWTIE;
167
168
169 # last line
170 if($count_opt eq 'gNIR'){
171 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
172 ## count reads that are unique to the gene and common to all of its isoforms
173 }
174 elsif($count_opt eq 'iNIR'){
175 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript
176 }
177 else{
178 if(scalar(keys(%mapped_genes)) == 1) {
179 my ($gene) = keys %mapped_genes;
180 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
181 else { $count{$gene}[$i]++; }
182 } ## count reads that are unique to the gene
183 }
184
185 }
186
187
188 $"="\t";
189
190
191 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; }
192 if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); }
193
194
195 $minlen=1000000;
196 for my $file (@bowtiefiles){
197 $minlen = length($file) if length($file)<$minlen;
198 }
199
200
201 # find longest common suffix for bowtiefiles
202 my %suffs;
203 my $k=0;
204 do {
205 %suffs=();
206 $k++;
207 for my $file (@bowtiefiles){
208 $suff = substr($file,length($file)-$k);
209 $suffs{$suff}=1;
210 }
211 } while ( scalar(keys(%suffs))==1 && $k<$minlen);
212
213 my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1));
214 if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; }
215
216 # title is between "/" and common suffix.
217 my @title=();
218 for my $file (@bowtiefiles){
219 if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; }
220 else { push @title, $file; }
221 }
222 if(defined($samplename)) { @title=(); push @title, $samplename; }
223
224
225 if($verbose) { print STDERR "Printing to file...\n"; }
226
227 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){
228 print "gene\t@title\n";
229 for my $gene (sort keys %count){
230 print "$gene";
231 for my $i (0..$#bowtiefiles){
232 if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; }
233 else { print "\t0"; }
234 }
235 print "\n";
236 }
237 }
238 else { ## iNIR
239 print "gene\ttranscript\t@title\n";
240 for my $tr (sort keys %count){
241 print "$t2g{$tr}\t$tr";
242 for my $i (0..$#bowtiefiles){
243 if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; }
244 else { print "\t0"; }
245 }
246 print "\n";
247 }
248 }
249
250 if($verbose) { print STDERR "done.\n"; }
251
252
253
254 ## subroutines ##
255
256
257
258 sub help {
259 print STDERR<<EOF
260
261 usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount
262
263 Options:
264 -v : verbose
265 -vv : more verbose
266 -f : use bowtie file name instead of bowtie directory name.
267 ex) $0 -f gene2NM bowtieoutfile PE > output.readcount
268 ex2) $0 gene2NM bowtieoutdir PE > output.readcount
269 --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR)
270 --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR)
271 -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used.
272 -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored)
273 -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used.
274 -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E).
275 -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount)
276 EOF
277 }
278
279 sub min {
280 if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
281 }
282
283