annotate NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
1 #!/usr/bin/perl
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
2
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
3 # This version has an option of counting only the pairs with a single mate distance. (version 11).
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
4 # This version has fixed the problem in filtering max_insert_length. (version 11).
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
5 # This version does not print out the additional tab in the iNIR file. (version 11).
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
6 # This version has data type option for parsing refseq headers. (version 10)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
7 # This version fixed the max_insert_lendis option name from max_insert_length. (version 10)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
8 # This version can take multiple bowtieout files separated by comma. (version 10)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
9 # This version can take option of max insert lendis (version 9)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
10 # This version can take sample name (column title) as option, if -f is selected. (version 8)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
11 # This version fixed the problem of double-counting cases that are strand-flipped. (version 7)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
12 # This version has an option of counting all reads, gNIR or iNIR. (version 7)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
13 # This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
14 # This version has an option of using a single bowtie file name instead of bowtie directory. (version 6)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
15 # This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
16 # This version fixed the problem of going into infinite loop when run with one sample. (Version 3)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
17 # This version is verbose. (version 2)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
18
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
19 use Getopt::Long;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
20 my $verbose=0;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
21 my $LINEBIN=1000000;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
22
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
23 my $bowtie = 'd'; # default is directory.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
24 my $count_opt = 'gReadcount';
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
25 my $samplename;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
26 my $max_insert_length;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
27 my $readlength;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
28 my $datatype='E';
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
29 my $uniq_d; # reads with single mate distance only (applicable only for PE)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
30 &GetOptions ( 'v' => sub { $verbose=1 },
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
31 'vv' => sub { $verbose=2 },
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
32 'f' => sub { $bowtie='f'}, # read bowtie file instead of bowtiu directory
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
33 'gNIR' => sub { $count_opt = 'gNIR' },
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
34 'iNIR' => sub { $count_opt = 'iNIR' },
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
35 's|sample=s' => \$samplename,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
36 'm|max_insert_length=i' => \$max_insert_length,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
37 'l|readlength=i' => \$readlength,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
38 'd|datatype=s' => \$datatype,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
39 'u|uniq_d' => \$uniq_d
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
40 );
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
41
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
42 if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning: -s|--sample option is ignored, when -f is not selected.\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
43 if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
44
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
45
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
46 if(@ARGV<3) { &help; exit; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
47
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
48
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
49 my $gene2NM = shift @ARGV;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
50 my $bowtieoutdir = shift @ARGV;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
51 my $PE = shift @ARGV;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
52
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
53 if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
54
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
55
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
56 if($bowtie eq 'd'){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
57 @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
58 if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
59 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
60 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
61 push @bowtiefiles, (split/,/,$bowtieoutdir);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
62 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
63
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
64
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
65 open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
66 while(<GENE2NM>){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
67 chomp;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
68 my ($g,$t) = split/\t/;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
69 $t2g{$t}=$g;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
70 push @{$g2t{$g}}, $t;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
71 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
72 close GENE2NM;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
73 if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
74
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
75
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
76
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
77
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
78
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
79 for my $i (0..$#bowtiefiles) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
80
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
81 my $bowtieout = $bowtiefiles[$i];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
82 my %mapped_genes =();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
83 my %mapped_tr =();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
84 my %mapped_insert_lengths =();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
85
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
86 if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
87
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
88
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
89 my $prev_head=''; my $prev_tr='';
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
90 open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
91 while(<$BOWTIE>){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
92 chomp;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
93 my ($head,$tr,$pos1) = (split/\t/)[0,2,3];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
94 if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }}
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
95
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
96
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
97 my $insert_length=0; ## default, for SE.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
98 if($PE==1){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
99 chomp(my $secondline = <$BOWTIE>);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
100 my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
101 $head = &min($head,$head2);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
102 $insert_length = $pos2 - $pos1 + $readlength;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
103 if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
104 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
105
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
106
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
107 if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; } } # print only at first line
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
108
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
109
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
110 if($count_opt eq 'gNIR'){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
111 if($head eq $prev_head) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
112 $mapped_tr { $tr } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
113 $mapped_tr { $prev_tr } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
114 $mapped_genes { $t2g{$tr} } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
115 $mapped_genes { $t2g{$prev_tr} } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
116 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
117 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
118 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
119 ## count reads that are unique to the gene and common to all of its isoforms
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
120 %mapped_genes =();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
121 %mapped_tr = ();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
122 $mapped_tr { $tr } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
123 $mapped_genes { $t2g{$tr} } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
124 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
125
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
126 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
127 elsif($count_opt eq 'iNIR'){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
128 if($head eq $prev_head) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
129 $mapped_tr { $tr } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
130 $mapped_tr { $prev_tr } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
131 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
132 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
133 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
134 %mapped_tr = ();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
135 $mapped_tr { $tr } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
136 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
137 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
138 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
139 if($head eq $prev_head) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
140 $mapped_genes { $t2g{$tr} } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
141 $mapped_genes { $t2g{$prev_tr} } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
142 $mapped_insert_lengths { "$insert_length" } =1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
143 $mapped_insert_lengths { "$prev_insert_length" } =1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
144 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
145 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
146 if(scalar(keys(%mapped_genes)) == 1) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
147 my ($gene) = keys %mapped_genes;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
148 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
149 else { $count{$gene}[$i]++; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
150 } ## count reads that are unique to the gene
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
151 %mapped_genes =();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
152 %mapped_insert_lengths =();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
153 $mapped_genes { $t2g{$tr} } = 1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
154 $mapped_insert_lengths { "$insert_length" } =1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
155 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
156 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
157
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
158
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
159 $prev_head = $head;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
160 $prev_tr = $tr;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
161 $prev_insert_length = $insert_length;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
162
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
163 if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
164
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
165 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
166 close $BOWTIE;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
167
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
168
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
169 # last line
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
170 if($count_opt eq 'gNIR'){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
171 if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
172 ## count reads that are unique to the gene and common to all of its isoforms
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
173 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
174 elsif($count_opt eq 'iNIR'){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
175 if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
176 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
177 else{
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
178 if(scalar(keys(%mapped_genes)) == 1) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
179 my ($gene) = keys %mapped_genes;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
180 if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
181 else { $count{$gene}[$i]++; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
182 } ## count reads that are unique to the gene
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
183 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
184
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
185 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
186
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
187
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
188 $"="\t";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
189
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
190
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
191 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
192 if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
193
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
194
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
195 $minlen=1000000;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
196 for my $file (@bowtiefiles){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
197 $minlen = length($file) if length($file)<$minlen;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
198 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
199
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
200
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
201 # find longest common suffix for bowtiefiles
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
202 my %suffs;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
203 my $k=0;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
204 do {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
205 %suffs=();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
206 $k++;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
207 for my $file (@bowtiefiles){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
208 $suff = substr($file,length($file)-$k);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
209 $suffs{$suff}=1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
210 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
211 } while ( scalar(keys(%suffs))==1 && $k<$minlen);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
212
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
213 my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1));
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
214 if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
215
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
216 # title is between "/" and common suffix.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
217 my @title=();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
218 for my $file (@bowtiefiles){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
219 if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
220 else { push @title, $file; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
221 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
222 if(defined($samplename)) { @title=(); push @title, $samplename; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
223
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
224
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
225 if($verbose) { print STDERR "Printing to file...\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
226
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
227 if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
228 print "gene\t@title\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
229 for my $gene (sort keys %count){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
230 print "$gene";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
231 for my $i (0..$#bowtiefiles){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
232 if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
233 else { print "\t0"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
234 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
235 print "\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
236 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
237 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
238 else { ## iNIR
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
239 print "gene\ttranscript\t@title\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
240 for my $tr (sort keys %count){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
241 print "$t2g{$tr}\t$tr";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
242 for my $i (0..$#bowtiefiles){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
243 if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
244 else { print "\t0"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
245 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
246 print "\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
247 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
248 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
249
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
250 if($verbose) { print STDERR "done.\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
251
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
252
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
253
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
254 ## subroutines ##
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
255
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
256
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
257
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
258 sub help {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
259 print STDERR<<EOF
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
260
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
261 usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
262
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
263 Options:
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
264 -v : verbose
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
265 -vv : more verbose
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
266 -f : use bowtie file name instead of bowtie directory name.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
267 ex) $0 -f gene2NM bowtieoutfile PE > output.readcount
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
268 ex2) $0 gene2NM bowtieoutdir PE > output.readcount
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
269 --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
270 --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
271 -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
272 -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
273 -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used.
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
274 -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E).
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
275 -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
276 EOF
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
277 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
278
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
279 sub min {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
280 if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
281 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
282
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
283