Mercurial > repos > chawhwa > neuma

#!/usr/bin/perl

# This version has an option of counting only the pairs with a single mate distance. (version 11).
# This version has fixed the problem in filtering max_insert_length. (version 11).
# This version does not print out the additional tab in the iNIR file. (version 11).
# This version has data type option for parsing refseq headers. (version 10)
# This version fixed the max_insert_lendis option name from max_insert_length. (version 10)
# This version can take multiple bowtieout files separated by comma. (version 10)
# This version can take option of max insert lendis (version 9)
# This version can take sample name (column title) as option, if -f is selected. (version 8)
# This version fixed the problem of double-counting cases that are strand-flipped. (version 7)
# This version has an option of counting all reads, gNIR or iNIR. (version 7)
# This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6)
# This version has an option of using a single bowtie file name instead of bowtie directory. (version 6)
# This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5)
# This version fixed the problem of going into infinite loop when run with one sample. (Version 3)
# This version is verbose. (version 2)

use Getopt::Long;
my $verbose=0;
my $LINEBIN=1000000;

my $bowtie = 'd';  # default is directory.
my $count_opt = 'gReadcount';
my $samplename;
my $max_insert_length;
my $readlength;
my $datatype='E';
my $uniq_d;  # reads with single mate distance only (applicable only for PE)
&GetOptions ( 'v' => sub { $verbose=1 },
              'vv' => sub { $verbose=2 },
              'f' => sub { $bowtie='f'},  # read bowtie file instead of bowtiu directory
              'gNIR' => sub { $count_opt = 'gNIR' },
              'iNIR' => sub { $count_opt = 'iNIR' },
              's|sample=s' => \$samplename,
              'm|max_insert_length=i' => \$max_insert_length,
              'l|readlength=i' => \$readlength,
              'd|datatype=s' => \$datatype,
              'u|uniq_d' => \$uniq_d
);

if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning:  -s|--sample option is ignored, when -f is not selected.\n"; }
if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; }


if(@ARGV<3) { &help;  exit; }


my $gene2NM = shift @ARGV;
my $bowtieoutdir = shift @ARGV;
my $PE = shift @ARGV;

if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; }


if($bowtie eq 'd'){
  @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`;
  if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); }
}
else {
  push @bowtiefiles, (split/,/,$bowtieoutdir);
}


open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n";
while(<GENE2NM>){
  chomp;
  my ($g,$t) = split/\t/;
  $t2g{$t}=$g;
  push @{$g2t{$g}}, $t;
}
close GENE2NM;
if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); }


for my $i (0..$#bowtiefiles) {

  my $bowtieout = $bowtiefiles[$i];
  my %mapped_genes =();
  my %mapped_tr =();
  my %mapped_insert_lengths =();

  if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; }


  my $prev_head=''; my $prev_tr='';
  open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n";
  while(<$BOWTIE>){
    chomp;
    my ($head,$tr,$pos1) = (split/\t/)[0,2,3];
    if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }}


    my $insert_length=0;   ## default, for SE.
    if($PE==1){
      chomp(my $secondline = <$BOWTIE>);
      my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
      $head = &min($head,$head2);
      $insert_length = $pos2 - $pos1 + $readlength;
      if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } }
    }


    if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; }  }   # print only at first line


    if($count_opt eq 'gNIR'){
       if($head eq $prev_head) {
          $mapped_tr { $tr } = 1;
          $mapped_tr { $prev_tr } = 1;
          $mapped_genes { $t2g{$tr} } = 1;
          $mapped_genes { $t2g{$prev_tr} } = 1;
       }
       else {
          if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
          ## count reads that are unique to the gene and common to all of its isoforms
          %mapped_genes =();
          %mapped_tr = ();
          $mapped_tr { $tr } = 1;
          $mapped_genes { $t2g{$tr} } = 1;
       }

    }
    elsif($count_opt eq 'iNIR'){
       if($head eq $prev_head) {
          $mapped_tr { $tr } = 1;
          $mapped_tr { $prev_tr } = 1;
       }
       else {
          if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; }  ## count reads that are unique to the transcript
          %mapped_tr = ();
          $mapped_tr { $tr } = 1;
       }
    }
    else {
       if($head eq $prev_head) {
          $mapped_genes { $t2g{$tr} } = 1;
          $mapped_genes { $t2g{$prev_tr} } = 1;
          $mapped_insert_lengths { "$insert_length" } =1;
          $mapped_insert_lengths { "$prev_insert_length" } =1;
       }
       else {
           if(scalar(keys(%mapped_genes)) == 1) {
              my ($gene) = keys %mapped_genes;
              if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
              else { $count{$gene}[$i]++;  }
           }  ## count reads that are unique to the gene
           %mapped_genes =();
           %mapped_insert_lengths =();
           $mapped_genes { $t2g{$tr} } = 1;
           $mapped_insert_lengths { "$insert_length" } =1;
       }
    }


    $prev_head = $head;
    $prev_tr = $tr;
    $prev_insert_length = $insert_length;

    if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } }

  }
  close $BOWTIE;


  # last line
  if($count_opt eq 'gNIR'){
    if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
    ## count reads that are unique to the gene and common to all of its isoforms
  }
  elsif($count_opt eq 'iNIR'){
    if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; }  ## count reads that are unique to the transcript
  }
  else{
    if(scalar(keys(%mapped_genes)) == 1) {
      my ($gene) = keys %mapped_genes;
      if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
      else { $count{$gene}[$i]++;  }
    }  ## count reads that are unique to the gene
  }

}


$"="\t";


if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; }
if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); }


$minlen=1000000;
for my $file (@bowtiefiles){
  $minlen = length($file) if length($file)<$minlen;
}


# find longest common suffix for bowtiefiles
my %suffs;
my $k=0;
do {
 %suffs=();
 $k++;
 for my $file (@bowtiefiles){
   $suff = substr($file,length($file)-$k);
   $suffs{$suff}=1;
 }
} while ( scalar(keys(%suffs))==1 &&  $k<$minlen);

my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1));
if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; }

# title is between "/" and common suffix.
my @title=();
for my $file (@bowtiefiles){
  if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; }
  else { push @title, $file; }
}
if(defined($samplename)) { @title=(); push @title, $samplename; }


if($verbose) { print STDERR "Printing to file...\n"; }

if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){
  print "gene\t@title\n";
  for my $gene (sort keys %count){
   print "$gene";
   for my $i (0..$#bowtiefiles){
     if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; }
     else { print "\t0"; }
   }
   print "\n";
  }
}
else {  ## iNIR
  print "gene\ttranscript\t@title\n";
  for my $tr (sort keys %count){
   print "$t2g{$tr}\t$tr";
   for my $i (0..$#bowtiefiles){
     if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; }
     else { print "\t0"; }
   }
   print "\n";
  }
}

if($verbose) { print STDERR "done.\n"; }


## subroutines ##


sub help {
 print STDERR<<EOF

usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount

Options:
  -v : verbose
  -vv : more verbose
  -f : use bowtie file name instead of bowtie directory name.
       ex) $0 -f gene2NM bowtieoutfile PE > output.readcount
       ex2) $0 gene2NM bowtieoutdir PE > output.readcount
  --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR)
  --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR)
  -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used.
  -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored)
  -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used.
  -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E).
  -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount)
EOF
}

sub min {
   if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
}
author	chawhwa
date	Thu, 08 Aug 2013 00:46:13 -0400
parents
children