Mercurial > repos > chawhwa > neuma
diff NEUMA-1.2.1/filter.best.from.bowtieout.3.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NEUMA-1.2.1/filter.best.from.bowtieout.3.pl Thu Aug 08 00:46:13 2013 -0400 @@ -0,0 +1,153 @@ +#!/usr/bin/perl +use Getopt::Long; + +# This version has an option of filtering only exclusively-CDS-mapped reads (useful for ribosome profiling analysis). The option name for delete_original is not cahnged. (version 3) +# This version fixed the problem of not being able to merge when the order of the two mates were flipped. (version 2) + + +my $delete_original; +my $newbowtiefile; ## output bowtiefile +my $mmcol=4; +my $datatype='E'; +&GetOptions ( 'rm|delete_original' => \$delete_original, + 'o=s' => \$newbowtiefile, + 'mmcol=i' => \$mmcol, ## column for mismatch info (0-based) + 'd|datatype=s' => \$datatype # refseq or ensembl ('refseq' if names must be parsed) +); + +if(@ARGV<2) { &help; exit; } + +my $bowtiefile = shift @ARGV; +my $PE = shift @ARGV; # 1 for paired-end, 0 for single-end + +if($datatype !~/^[RE]$/ ) { die "ERROR: -d (--datatype) must be specified to be either R (Refseq) or E (Ensembl) (default)\n"; } + +if(!defined($newbowtiefile)){ + ($newbowtiefile) = $bowtiefile=~/(.+)\.bowtieout/; + $newbowtiefile .= ".best.bowtieout"; +} + + + +my $prev_head1=''; my $prev_head2=''; +my $totmm=100000; # just some huge number +my @mapping=(); +open $IN, $bowtiefile or die "Can't open bowtiefile $bowtiefile\n"; +open $OUT, ">$newbowtiefile" or die "Can't write to output bowtiefile $newbowtiefile\n"; +while(<$IN>){ + chomp; + my ($head1,$strand1,$tr,$pos1,$mmstr1) = (split/\t/)[0,1,2,3,$mmcol]; + my $mm1 = &parse_mmstr($mmstr1); + if($PE==1) { + chomp(my $secondline = <$IN>); + ($head2,$strand2,$pos2,$mmstr2) = (split/\t/,$secondline)[0,1,3,$mmcol]; + my $mm2 = &parse_mmstr($mmstr2); + } + + if($PE==1){ + if($prev_head1 eq $head1 || $prev_head2 eq $head1) { + if($totmm>$mm1+$mm2) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; $totmm=$mm1+$mm2; } + elsif($totma==$mm1+$mm2) { push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; } + } + else { + &print_mapping_PE(\@mapping,$OUT, $prev_head1,$prev_head2); + @mapping=(); + push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; + $totmm=$mm1+$mm2; + } + } + else { + if($prev_head1 eq $head1) { + if($totmm>$mm1) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1]; $totmm=$mm1; } + elsif($totmm==$mm1) { push @mapping,[$tr,$strand1,$pos1,$mmstr1]; } + } + else { + &print_mapping_SE(\@mapping,$OUT, $prev_head1); + @mapping=(); + push @mapping,[$tr,$strand1,$pos1,$mmstr1]; + $totmm=$mm1; + } + } + + $prev_head1= $head1; + if($PE==1) { $prev_head2 = $head2; } +} + + +## last line +if($PE==1){ + &print_mapping_PE(\@mapping,$OUT, $prev_head1, $prev_head2); +} +else { + &print_mapping_SE(\@mapping,$OUT, $prev_head1); +} + +close $OUT; +close $IN; + + + +## delete original bowtieout file +if(defined $delete_original) { `rm -f $bowtiefile`; } + + + + + +## subroutines @@ + + +sub parse_mmstr { + if(!defined $_[0]){ return 0; } + else { + return(scalar(split/,/,$_[0])); + } +} + + +sub print_mapping_PE { + my $pMapping = shift @_; + my $OUT = shift @_; + my $prev_head1 = shift @_; + my $prev_head2 = shift @_; + + if($prev_head1 ne ''){ + for my $i (0..$#$pMapping){ + my ($tr,$strand1,$pos1,$mm1,$strand2,$pos2,$mm2) = @{$pMapping->[$i]}; + print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n"; + print $OUT "$prev_head2\t$strand2\t$tr\t$pos2\t$mm2\n"; + } + } +} + + + +sub print_mapping_SE { + my $pMapping = shift @_; + my $OUT = shift @_; + my $prev_head1 = shift @_; + + if($prev_head1 ne ''){ + for my $i (0..$#$pMapping){ + my ($tr,$strand1,$pos1,$mm1) = @{$pMapping->[$i]}; + print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n"; + } + } +} + + + +sub help { + print STDERR<<EOF + +usage: $0 <options> bowtieout_file PE(1/0) + +Options + --rm|--delete_original : delete the original bowtieout file + -o <outfilename> : output bowtieout file name (default : originalfiletitle.best.bowtieout / originalfiletitle.best.cdsonly.bowtieout (if --cdsonly is used) ) + --mmcol : the column number (0-based) for mismatch information. (default : 4) + -d | datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if the transcript names are in Refseq format (gi|xxx|xxx|name). This option affects only when --cdsonly is used. (default : E) +EOF +} + +