0
|
1 #!/usr/bin/perl
|
|
2 use Getopt::Long;
|
|
3
|
|
4 # This version has an option of filtering only exclusively-CDS-mapped reads (useful for ribosome profiling analysis). The option name for delete_original is not cahnged. (version 3)
|
|
5 # This version fixed the problem of not being able to merge when the order of the two mates were flipped. (version 2)
|
|
6
|
|
7
|
|
8 my $delete_original;
|
|
9 my $newbowtiefile; ## output bowtiefile
|
|
10 my $mmcol=4;
|
|
11 my $datatype='E';
|
|
12 &GetOptions ( 'rm|delete_original' => \$delete_original,
|
|
13 'o=s' => \$newbowtiefile,
|
|
14 'mmcol=i' => \$mmcol, ## column for mismatch info (0-based)
|
|
15 'd|datatype=s' => \$datatype # refseq or ensembl ('refseq' if names must be parsed)
|
|
16 );
|
|
17
|
|
18 if(@ARGV<2) { &help; exit; }
|
|
19
|
|
20 my $bowtiefile = shift @ARGV;
|
|
21 my $PE = shift @ARGV; # 1 for paired-end, 0 for single-end
|
|
22
|
|
23 if($datatype !~/^[RE]$/ ) { die "ERROR: -d (--datatype) must be specified to be either R (Refseq) or E (Ensembl) (default)\n"; }
|
|
24
|
|
25 if(!defined($newbowtiefile)){
|
|
26 ($newbowtiefile) = $bowtiefile=~/(.+)\.bowtieout/;
|
|
27 $newbowtiefile .= ".best.bowtieout";
|
|
28 }
|
|
29
|
|
30
|
|
31
|
|
32 my $prev_head1=''; my $prev_head2='';
|
|
33 my $totmm=100000; # just some huge number
|
|
34 my @mapping=();
|
|
35 open $IN, $bowtiefile or die "Can't open bowtiefile $bowtiefile\n";
|
|
36 open $OUT, ">$newbowtiefile" or die "Can't write to output bowtiefile $newbowtiefile\n";
|
|
37 while(<$IN>){
|
|
38 chomp;
|
|
39 my ($head1,$strand1,$tr,$pos1,$mmstr1) = (split/\t/)[0,1,2,3,$mmcol];
|
|
40 my $mm1 = &parse_mmstr($mmstr1);
|
|
41 if($PE==1) {
|
|
42 chomp(my $secondline = <$IN>);
|
|
43 ($head2,$strand2,$pos2,$mmstr2) = (split/\t/,$secondline)[0,1,3,$mmcol];
|
|
44 my $mm2 = &parse_mmstr($mmstr2);
|
|
45 }
|
|
46
|
|
47 if($PE==1){
|
|
48 if($prev_head1 eq $head1 || $prev_head2 eq $head1) {
|
|
49 if($totmm>$mm1+$mm2) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; $totmm=$mm1+$mm2; }
|
|
50 elsif($totma==$mm1+$mm2) { push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; }
|
|
51 }
|
|
52 else {
|
|
53 &print_mapping_PE(\@mapping,$OUT, $prev_head1,$prev_head2);
|
|
54 @mapping=();
|
|
55 push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2];
|
|
56 $totmm=$mm1+$mm2;
|
|
57 }
|
|
58 }
|
|
59 else {
|
|
60 if($prev_head1 eq $head1) {
|
|
61 if($totmm>$mm1) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1]; $totmm=$mm1; }
|
|
62 elsif($totmm==$mm1) { push @mapping,[$tr,$strand1,$pos1,$mmstr1]; }
|
|
63 }
|
|
64 else {
|
|
65 &print_mapping_SE(\@mapping,$OUT, $prev_head1);
|
|
66 @mapping=();
|
|
67 push @mapping,[$tr,$strand1,$pos1,$mmstr1];
|
|
68 $totmm=$mm1;
|
|
69 }
|
|
70 }
|
|
71
|
|
72 $prev_head1= $head1;
|
|
73 if($PE==1) { $prev_head2 = $head2; }
|
|
74 }
|
|
75
|
|
76
|
|
77 ## last line
|
|
78 if($PE==1){
|
|
79 &print_mapping_PE(\@mapping,$OUT, $prev_head1, $prev_head2);
|
|
80 }
|
|
81 else {
|
|
82 &print_mapping_SE(\@mapping,$OUT, $prev_head1);
|
|
83 }
|
|
84
|
|
85 close $OUT;
|
|
86 close $IN;
|
|
87
|
|
88
|
|
89
|
|
90 ## delete original bowtieout file
|
|
91 if(defined $delete_original) { `rm -f $bowtiefile`; }
|
|
92
|
|
93
|
|
94
|
|
95
|
|
96
|
|
97 ## subroutines @@
|
|
98
|
|
99
|
|
100 sub parse_mmstr {
|
|
101 if(!defined $_[0]){ return 0; }
|
|
102 else {
|
|
103 return(scalar(split/,/,$_[0]));
|
|
104 }
|
|
105 }
|
|
106
|
|
107
|
|
108 sub print_mapping_PE {
|
|
109 my $pMapping = shift @_;
|
|
110 my $OUT = shift @_;
|
|
111 my $prev_head1 = shift @_;
|
|
112 my $prev_head2 = shift @_;
|
|
113
|
|
114 if($prev_head1 ne ''){
|
|
115 for my $i (0..$#$pMapping){
|
|
116 my ($tr,$strand1,$pos1,$mm1,$strand2,$pos2,$mm2) = @{$pMapping->[$i]};
|
|
117 print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
|
|
118 print $OUT "$prev_head2\t$strand2\t$tr\t$pos2\t$mm2\n";
|
|
119 }
|
|
120 }
|
|
121 }
|
|
122
|
|
123
|
|
124
|
|
125 sub print_mapping_SE {
|
|
126 my $pMapping = shift @_;
|
|
127 my $OUT = shift @_;
|
|
128 my $prev_head1 = shift @_;
|
|
129
|
|
130 if($prev_head1 ne ''){
|
|
131 for my $i (0..$#$pMapping){
|
|
132 my ($tr,$strand1,$pos1,$mm1) = @{$pMapping->[$i]};
|
|
133 print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
|
|
134 }
|
|
135 }
|
|
136 }
|
|
137
|
|
138
|
|
139
|
|
140 sub help {
|
|
141 print STDERR<<EOF
|
|
142
|
|
143 usage: $0 <options> bowtieout_file PE(1/0)
|
|
144
|
|
145 Options
|
|
146 --rm|--delete_original : delete the original bowtieout file
|
|
147 -o <outfilename> : output bowtieout file name (default : originalfiletitle.best.bowtieout / originalfiletitle.best.cdsonly.bowtieout (if --cdsonly is used) )
|
|
148 --mmcol : the column number (0-based) for mismatch information. (default : 4)
|
|
149 -d | datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if the transcript names are in Refseq format (gi|xxx|xxx|name). This option affects only when --cdsonly is used. (default : E)
|
|
150 EOF
|
|
151 }
|
|
152
|
|
153
|