view NEUMA-1.2.1/filter.best.from.bowtieout.3.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use Getopt::Long;

# This version has an option of filtering only exclusively-CDS-mapped reads (useful for ribosome profiling analysis). The option name for delete_original is not cahnged. (version 3)
# This version fixed the problem of not being able to merge when the order of the two mates were flipped. (version 2)


my $delete_original;
my $newbowtiefile;  ## output bowtiefile
my $mmcol=4;
my $datatype='E';
&GetOptions ( 'rm|delete_original' => \$delete_original,
              'o=s' => \$newbowtiefile,
              'mmcol=i' => \$mmcol,   ## column for mismatch info (0-based)
              'd|datatype=s' => \$datatype  # refseq or ensembl ('refseq' if names must be parsed)
);

if(@ARGV<2) { &help; exit; }

my $bowtiefile = shift @ARGV;
my $PE = shift @ARGV;  # 1 for paired-end, 0 for single-end

if($datatype !~/^[RE]$/ ) { die "ERROR: -d (--datatype) must be specified to be either R (Refseq) or E (Ensembl) (default)\n"; }

if(!defined($newbowtiefile)){
  ($newbowtiefile) = $bowtiefile=~/(.+)\.bowtieout/;
  $newbowtiefile .= ".best.bowtieout";
}



my $prev_head1=''; my $prev_head2='';
my $totmm=100000;  # just some huge number
my @mapping=();
open $IN, $bowtiefile or die "Can't open bowtiefile $bowtiefile\n";
open $OUT, ">$newbowtiefile" or die "Can't write to output bowtiefile $newbowtiefile\n";
while(<$IN>){
  chomp;
  my ($head1,$strand1,$tr,$pos1,$mmstr1) = (split/\t/)[0,1,2,3,$mmcol];
  my $mm1 = &parse_mmstr($mmstr1);
  if($PE==1) { 
    chomp(my $secondline = <$IN>);
    ($head2,$strand2,$pos2,$mmstr2) = (split/\t/,$secondline)[0,1,3,$mmcol];
    my $mm2 = &parse_mmstr($mmstr2);
  }
  
  if($PE==1){
    if($prev_head1 eq $head1 || $prev_head2 eq $head1) {
      if($totmm>$mm1+$mm2) { @mapping=();  push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2];  $totmm=$mm1+$mm2; }
      elsif($totma==$mm1+$mm2) { push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; }
    }
    else {
      &print_mapping_PE(\@mapping,$OUT, $prev_head1,$prev_head2);
      @mapping=();
      push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2];
      $totmm=$mm1+$mm2;
    }
  }
  else {
    if($prev_head1 eq $head1) {
      if($totmm>$mm1) { @mapping=();  push @mapping,[$tr,$strand1,$pos1,$mmstr1];  $totmm=$mm1; }
      elsif($totmm==$mm1) { push @mapping,[$tr,$strand1,$pos1,$mmstr1]; }
    }
    else {
      &print_mapping_SE(\@mapping,$OUT, $prev_head1);
      @mapping=();
      push @mapping,[$tr,$strand1,$pos1,$mmstr1];
      $totmm=$mm1;
    }
  }

  $prev_head1= $head1;
  if($PE==1) { $prev_head2 = $head2; }
}


## last line
if($PE==1){
  &print_mapping_PE(\@mapping,$OUT, $prev_head1, $prev_head2);
}
else {
  &print_mapping_SE(\@mapping,$OUT, $prev_head1);
}

close $OUT;
close $IN;



## delete original bowtieout file
if(defined $delete_original) { `rm -f $bowtiefile`; }





## subroutines @@


sub parse_mmstr {
  if(!defined $_[0]){ return 0; }
  else {
    return(scalar(split/,/,$_[0]));
  }
}


sub print_mapping_PE {
  my $pMapping = shift @_;
  my $OUT = shift @_;
  my $prev_head1 = shift @_;
  my $prev_head2 = shift @_;

  if($prev_head1 ne ''){
    for my $i (0..$#$pMapping){
      my ($tr,$strand1,$pos1,$mm1,$strand2,$pos2,$mm2) = @{$pMapping->[$i]};
      print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
      print $OUT "$prev_head2\t$strand2\t$tr\t$pos2\t$mm2\n";
    }
  }
}



sub print_mapping_SE {
  my $pMapping = shift @_;
  my $OUT = shift @_;
  my $prev_head1 = shift @_;

  if($prev_head1 ne ''){
    for my $i (0..$#$pMapping){
      my ($tr,$strand1,$pos1,$mm1) = @{$pMapping->[$i]};
      print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
    }
  }
}



sub help {
  print STDERR<<EOF

usage: $0 <options> bowtieout_file PE(1/0)

Options
   --rm|--delete_original  : delete the original bowtieout file
   -o <outfilename> : output bowtieout file name (default : originalfiletitle.best.bowtieout / originalfiletitle.best.cdsonly.bowtieout (if --cdsonly is used) )
   --mmcol : the column number (0-based) for mismatch information. (default : 4)
   -d | datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if the transcript names are in Refseq format (gi|xxx|xxx|name). This option affects only when --cdsonly is used. (default : E)
EOF
}