Mercurial > repos > chawhwa > neuma
comparison NEUMA-1.2.1/filter.best.from.bowtieout.3.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c44c43d185ef |
---|---|
1 #!/usr/bin/perl | |
2 use Getopt::Long; | |
3 | |
4 # This version has an option of filtering only exclusively-CDS-mapped reads (useful for ribosome profiling analysis). The option name for delete_original is not cahnged. (version 3) | |
5 # This version fixed the problem of not being able to merge when the order of the two mates were flipped. (version 2) | |
6 | |
7 | |
8 my $delete_original; | |
9 my $newbowtiefile; ## output bowtiefile | |
10 my $mmcol=4; | |
11 my $datatype='E'; | |
12 &GetOptions ( 'rm|delete_original' => \$delete_original, | |
13 'o=s' => \$newbowtiefile, | |
14 'mmcol=i' => \$mmcol, ## column for mismatch info (0-based) | |
15 'd|datatype=s' => \$datatype # refseq or ensembl ('refseq' if names must be parsed) | |
16 ); | |
17 | |
18 if(@ARGV<2) { &help; exit; } | |
19 | |
20 my $bowtiefile = shift @ARGV; | |
21 my $PE = shift @ARGV; # 1 for paired-end, 0 for single-end | |
22 | |
23 if($datatype !~/^[RE]$/ ) { die "ERROR: -d (--datatype) must be specified to be either R (Refseq) or E (Ensembl) (default)\n"; } | |
24 | |
25 if(!defined($newbowtiefile)){ | |
26 ($newbowtiefile) = $bowtiefile=~/(.+)\.bowtieout/; | |
27 $newbowtiefile .= ".best.bowtieout"; | |
28 } | |
29 | |
30 | |
31 | |
32 my $prev_head1=''; my $prev_head2=''; | |
33 my $totmm=100000; # just some huge number | |
34 my @mapping=(); | |
35 open $IN, $bowtiefile or die "Can't open bowtiefile $bowtiefile\n"; | |
36 open $OUT, ">$newbowtiefile" or die "Can't write to output bowtiefile $newbowtiefile\n"; | |
37 while(<$IN>){ | |
38 chomp; | |
39 my ($head1,$strand1,$tr,$pos1,$mmstr1) = (split/\t/)[0,1,2,3,$mmcol]; | |
40 my $mm1 = &parse_mmstr($mmstr1); | |
41 if($PE==1) { | |
42 chomp(my $secondline = <$IN>); | |
43 ($head2,$strand2,$pos2,$mmstr2) = (split/\t/,$secondline)[0,1,3,$mmcol]; | |
44 my $mm2 = &parse_mmstr($mmstr2); | |
45 } | |
46 | |
47 if($PE==1){ | |
48 if($prev_head1 eq $head1 || $prev_head2 eq $head1) { | |
49 if($totmm>$mm1+$mm2) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; $totmm=$mm1+$mm2; } | |
50 elsif($totma==$mm1+$mm2) { push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; } | |
51 } | |
52 else { | |
53 &print_mapping_PE(\@mapping,$OUT, $prev_head1,$prev_head2); | |
54 @mapping=(); | |
55 push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; | |
56 $totmm=$mm1+$mm2; | |
57 } | |
58 } | |
59 else { | |
60 if($prev_head1 eq $head1) { | |
61 if($totmm>$mm1) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1]; $totmm=$mm1; } | |
62 elsif($totmm==$mm1) { push @mapping,[$tr,$strand1,$pos1,$mmstr1]; } | |
63 } | |
64 else { | |
65 &print_mapping_SE(\@mapping,$OUT, $prev_head1); | |
66 @mapping=(); | |
67 push @mapping,[$tr,$strand1,$pos1,$mmstr1]; | |
68 $totmm=$mm1; | |
69 } | |
70 } | |
71 | |
72 $prev_head1= $head1; | |
73 if($PE==1) { $prev_head2 = $head2; } | |
74 } | |
75 | |
76 | |
77 ## last line | |
78 if($PE==1){ | |
79 &print_mapping_PE(\@mapping,$OUT, $prev_head1, $prev_head2); | |
80 } | |
81 else { | |
82 &print_mapping_SE(\@mapping,$OUT, $prev_head1); | |
83 } | |
84 | |
85 close $OUT; | |
86 close $IN; | |
87 | |
88 | |
89 | |
90 ## delete original bowtieout file | |
91 if(defined $delete_original) { `rm -f $bowtiefile`; } | |
92 | |
93 | |
94 | |
95 | |
96 | |
97 ## subroutines @@ | |
98 | |
99 | |
100 sub parse_mmstr { | |
101 if(!defined $_[0]){ return 0; } | |
102 else { | |
103 return(scalar(split/,/,$_[0])); | |
104 } | |
105 } | |
106 | |
107 | |
108 sub print_mapping_PE { | |
109 my $pMapping = shift @_; | |
110 my $OUT = shift @_; | |
111 my $prev_head1 = shift @_; | |
112 my $prev_head2 = shift @_; | |
113 | |
114 if($prev_head1 ne ''){ | |
115 for my $i (0..$#$pMapping){ | |
116 my ($tr,$strand1,$pos1,$mm1,$strand2,$pos2,$mm2) = @{$pMapping->[$i]}; | |
117 print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n"; | |
118 print $OUT "$prev_head2\t$strand2\t$tr\t$pos2\t$mm2\n"; | |
119 } | |
120 } | |
121 } | |
122 | |
123 | |
124 | |
125 sub print_mapping_SE { | |
126 my $pMapping = shift @_; | |
127 my $OUT = shift @_; | |
128 my $prev_head1 = shift @_; | |
129 | |
130 if($prev_head1 ne ''){ | |
131 for my $i (0..$#$pMapping){ | |
132 my ($tr,$strand1,$pos1,$mm1) = @{$pMapping->[$i]}; | |
133 print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n"; | |
134 } | |
135 } | |
136 } | |
137 | |
138 | |
139 | |
140 sub help { | |
141 print STDERR<<EOF | |
142 | |
143 usage: $0 <options> bowtieout_file PE(1/0) | |
144 | |
145 Options | |
146 --rm|--delete_original : delete the original bowtieout file | |
147 -o <outfilename> : output bowtieout file name (default : originalfiletitle.best.bowtieout / originalfiletitle.best.cdsonly.bowtieout (if --cdsonly is used) ) | |
148 --mmcol : the column number (0-based) for mismatch information. (default : 4) | |
149 -d | datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if the transcript names are in Refseq format (gi|xxx|xxx|name). This option affects only when --cdsonly is used. (default : E) | |
150 EOF | |
151 } | |
152 | |
153 |