annotate NEUMA-1.2.1/filter.best.from.bowtieout.3.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
1 #!/usr/bin/perl
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
2 use Getopt::Long;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
3
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
4 # This version has an option of filtering only exclusively-CDS-mapped reads (useful for ribosome profiling analysis). The option name for delete_original is not cahnged. (version 3)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
5 # This version fixed the problem of not being able to merge when the order of the two mates were flipped. (version 2)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
6
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
7
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
8 my $delete_original;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
9 my $newbowtiefile; ## output bowtiefile
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
10 my $mmcol=4;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
11 my $datatype='E';
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
12 &GetOptions ( 'rm|delete_original' => \$delete_original,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
13 'o=s' => \$newbowtiefile,
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
14 'mmcol=i' => \$mmcol, ## column for mismatch info (0-based)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
15 'd|datatype=s' => \$datatype # refseq or ensembl ('refseq' if names must be parsed)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
16 );
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
17
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
18 if(@ARGV<2) { &help; exit; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
19
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
20 my $bowtiefile = shift @ARGV;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
21 my $PE = shift @ARGV; # 1 for paired-end, 0 for single-end
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
22
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
23 if($datatype !~/^[RE]$/ ) { die "ERROR: -d (--datatype) must be specified to be either R (Refseq) or E (Ensembl) (default)\n"; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
24
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
25 if(!defined($newbowtiefile)){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
26 ($newbowtiefile) = $bowtiefile=~/(.+)\.bowtieout/;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
27 $newbowtiefile .= ".best.bowtieout";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
28 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
29
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
30
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
31
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
32 my $prev_head1=''; my $prev_head2='';
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
33 my $totmm=100000; # just some huge number
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
34 my @mapping=();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
35 open $IN, $bowtiefile or die "Can't open bowtiefile $bowtiefile\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
36 open $OUT, ">$newbowtiefile" or die "Can't write to output bowtiefile $newbowtiefile\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
37 while(<$IN>){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
38 chomp;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
39 my ($head1,$strand1,$tr,$pos1,$mmstr1) = (split/\t/)[0,1,2,3,$mmcol];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
40 my $mm1 = &parse_mmstr($mmstr1);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
41 if($PE==1) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
42 chomp(my $secondline = <$IN>);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
43 ($head2,$strand2,$pos2,$mmstr2) = (split/\t/,$secondline)[0,1,3,$mmcol];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
44 my $mm2 = &parse_mmstr($mmstr2);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
45 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
46
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
47 if($PE==1){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
48 if($prev_head1 eq $head1 || $prev_head2 eq $head1) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
49 if($totmm>$mm1+$mm2) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; $totmm=$mm1+$mm2; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
50 elsif($totma==$mm1+$mm2) { push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
51 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
52 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
53 &print_mapping_PE(\@mapping,$OUT, $prev_head1,$prev_head2);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
54 @mapping=();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
55 push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
56 $totmm=$mm1+$mm2;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
57 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
58 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
59 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
60 if($prev_head1 eq $head1) {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
61 if($totmm>$mm1) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1]; $totmm=$mm1; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
62 elsif($totmm==$mm1) { push @mapping,[$tr,$strand1,$pos1,$mmstr1]; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
63 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
64 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
65 &print_mapping_SE(\@mapping,$OUT, $prev_head1);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
66 @mapping=();
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
67 push @mapping,[$tr,$strand1,$pos1,$mmstr1];
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
68 $totmm=$mm1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
69 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
70 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
71
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
72 $prev_head1= $head1;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
73 if($PE==1) { $prev_head2 = $head2; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
74 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
75
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
76
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
77 ## last line
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
78 if($PE==1){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
79 &print_mapping_PE(\@mapping,$OUT, $prev_head1, $prev_head2);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
80 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
81 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
82 &print_mapping_SE(\@mapping,$OUT, $prev_head1);
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
83 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
84
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
85 close $OUT;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
86 close $IN;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
87
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
88
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
89
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
90 ## delete original bowtieout file
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
91 if(defined $delete_original) { `rm -f $bowtiefile`; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
92
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
93
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
94
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
95
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
96
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
97 ## subroutines @@
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
98
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
99
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
100 sub parse_mmstr {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
101 if(!defined $_[0]){ return 0; }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
102 else {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
103 return(scalar(split/,/,$_[0]));
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
104 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
105 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
106
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
107
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
108 sub print_mapping_PE {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
109 my $pMapping = shift @_;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
110 my $OUT = shift @_;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
111 my $prev_head1 = shift @_;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
112 my $prev_head2 = shift @_;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
113
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
114 if($prev_head1 ne ''){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
115 for my $i (0..$#$pMapping){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
116 my ($tr,$strand1,$pos1,$mm1,$strand2,$pos2,$mm2) = @{$pMapping->[$i]};
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
117 print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
118 print $OUT "$prev_head2\t$strand2\t$tr\t$pos2\t$mm2\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
119 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
120 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
121 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
122
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
123
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
124
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
125 sub print_mapping_SE {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
126 my $pMapping = shift @_;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
127 my $OUT = shift @_;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
128 my $prev_head1 = shift @_;
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
129
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
130 if($prev_head1 ne ''){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
131 for my $i (0..$#$pMapping){
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
132 my ($tr,$strand1,$pos1,$mm1) = @{$pMapping->[$i]};
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
133 print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
134 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
135 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
136 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
137
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
138
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
139
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
140 sub help {
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
141 print STDERR<<EOF
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
142
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
143 usage: $0 <options> bowtieout_file PE(1/0)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
144
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
145 Options
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
146 --rm|--delete_original : delete the original bowtieout file
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
147 -o <outfilename> : output bowtieout file name (default : originalfiletitle.best.bowtieout / originalfiletitle.best.cdsonly.bowtieout (if --cdsonly is used) )
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
148 --mmcol : the column number (0-based) for mismatch information. (default : 4)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
149 -d | datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if the transcript names are in Refseq format (gi|xxx|xxx|name). This option affects only when --cdsonly is used. (default : E)
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
150 EOF
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
151 }
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
152
c44c43d185ef NEUMA-1.2.1 Uploaded
chawhwa
parents:
diff changeset
153