view NEUMA-1.2.1/bowtieout2mappingstat.2.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use Getopt::Long;
# This version includes max_insert_length filtered mapping stat, optionally. (version 2)


my $max_insert_length;
my $readlength;
my $datatype = 'E';  # E or R.
&GetOptions ( 'm|max_insert_length=i' => \$max_insert_length,
              'l|readlength=i' => \$readlength,
              'd|datatype=s' => \$datatype
);

if(defined($max_insert_length) && !defined($readlength)) {  die "readlength (-l|--readlength) must be defined if -m (--max_insert_length) is defined.\n"; }
if(@ARGV<3){ &help; exit; }


my $bowtie = shift @ARGV;
my $sample_name = shift @ARGV;
my $PE = shift @ARGV;

my $prev_head='';
my $NM=0; my $NR=0; my $NM_lf = 0; my $NR_lf = 0;   # lf : insert-length-filtered version
my $countNM=0; my $countNM_lf =0;
my $countNR=0; my $countNR_lf = 0;
my $countAll=0; my $countAll_lf=0;


open IN, $bowtie or die "Can't open bowtiefile $bowtie\n";
while(<IN>){
  chomp;
  my ($head,$tr,$pos1)= (split/\t/)[0,2,3];
  if($datatype eq 'R') { ($tr) = (split/\|/,$tr)[3];  $tr=~s/\.\d+$//; }

  if($PE==1){
    chomp(my $secondline = <IN>);
    my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
    $head = &min($head,$head2);

  }

  if($prev_head eq $head) { 

     if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) { 
           if($tr=~/^NM_/) { $NM_lf = 1; }
           elsif($tr=~/^NR_/) { $NR_lf = 1; }
     }

     if($tr=~/^NM_/) { $NM=1; }
     elsif($tr=~/^NR_/) { $NR=1; }
  }
  else {

     if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) {
         if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
         if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
         $countAll_lf++;
     }
     if($NM==1) { $countNM++; $NM=0; }    ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
     if($NR==1) { $countNR++; $NR=0; }
     $countAll++;
  }
  $prev_head=$head;
}
# no need to do additional last line
close IN;


if($datatype eq 'E') { ($countNM,$countNR,$countNM_lf,$countNM_lf) = ('NA','NA','NA','NA'); }

$"="\t";
if(defined($max_insert_length)){
  print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\tmRNA_mapped(lenfiltered)\tncRNA_mapped(lenfiltered)\ttotalRNA_mapped(lenfiltered)\n";
  print "$sample_name\t$countNM\t$countNR\t$countAll\t$countNM_lf\t$countNR_lf\t$countAll_lf\n";
}
else {
  print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\n";
  print "$sample_name\t$countNM\t$countNR\t$countAll\n";
}



sub min {
   if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
}


sub help {
  print STDERR<<EOF;

usage: $0 <options> bowtieoutfile samplename PE(1/0) (> mappingstat) 

Options:
  -m|--max_insert_length <max_insert_length> : the output includes insert-length-filtered mapping stats as well. -l(--readlength) option must be accompanied. This option is valid only for paired-end data. (For single-end, it is ignored)
  -l|--readlength <readlength> : read length. This is required only when -m(--max_insert_length) option is used.
  -d|--datatype <datatype> : Either E (Ensembl) or R (Refseq). Use R if the data is Refseq and the transcript names are in format 'gi|xxx|xxx|name'. (Default E)

EOF
}