diff NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NEUMA-1.2.1/bowtie2genecount.11.pl	Thu Aug 08 00:46:13 2013 -0400
@@ -0,0 +1,283 @@
+#!/usr/bin/perl
+
+# This version has an option of counting only the pairs with a single mate distance. (version 11).
+# This version has fixed the problem in filtering max_insert_length. (version 11).
+# This version does not print out the additional tab in the iNIR file. (version 11).
+# This version has data type option for parsing refseq headers. (version 10)
+# This version fixed the max_insert_lendis option name from max_insert_length. (version 10)
+# This version can take multiple bowtieout files separated by comma. (version 10)
+# This version can take option of max insert lendis (version 9)
+# This version can take sample name (column title) as option, if -f is selected. (version 8)
+# This version fixed the problem of double-counting cases that are strand-flipped. (version 7)
+# This version has an option of counting all reads, gNIR or iNIR. (version 7)
+# This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6)
+# This version has an option of using a single bowtie file name instead of bowtie directory. (version 6)
+# This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5)
+# This version fixed the problem of going into infinite loop when run with one sample. (Version 3)
+# This version is verbose. (version 2)
+
+use Getopt::Long;
+my $verbose=0;
+my $LINEBIN=1000000;
+
+my $bowtie = 'd';  # default is directory.
+my $count_opt = 'gReadcount';
+my $samplename;  
+my $max_insert_length;
+my $readlength;
+my $datatype='E';
+my $uniq_d;  # reads with single mate distance only (applicable only for PE)
+&GetOptions ( 'v' => sub { $verbose=1 },
+              'vv' => sub { $verbose=2 },
+              'f' => sub { $bowtie='f'},  # read bowtie file instead of bowtiu directory
+              'gNIR' => sub { $count_opt = 'gNIR' },
+              'iNIR' => sub { $count_opt = 'iNIR' },
+              's|sample=s' => \$samplename,
+              'm|max_insert_length=i' => \$max_insert_length,
+              'l|readlength=i' => \$readlength,
+              'd|datatype=s' => \$datatype,
+              'u|uniq_d' => \$uniq_d
+);
+
+if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning:  -s|--sample option is ignored, when -f is not selected.\n"; }
+if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; }
+
+
+if(@ARGV<3) { &help;  exit; }
+
+
+my $gene2NM = shift @ARGV;
+my $bowtieoutdir = shift @ARGV;
+my $PE = shift @ARGV;
+
+if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; }
+
+
+if($bowtie eq 'd'){
+  @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`;
+  if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); }
+}
+else {
+  push @bowtiefiles, (split/,/,$bowtieoutdir);
+}
+
+
+open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n";
+while(<GENE2NM>){
+  chomp;
+  my ($g,$t) = split/\t/;
+  $t2g{$t}=$g;
+  push @{$g2t{$g}}, $t;
+}
+close GENE2NM;
+if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); }
+
+
+
+
+
+for my $i (0..$#bowtiefiles) {
+
+  my $bowtieout = $bowtiefiles[$i];
+  my %mapped_genes =();
+  my %mapped_tr =();
+  my %mapped_insert_lengths =();
+
+  if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; }
+
+
+  my $prev_head=''; my $prev_tr='';
+  open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n";
+  while(<$BOWTIE>){
+    chomp;
+    my ($head,$tr,$pos1) = (split/\t/)[0,2,3];
+    if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }}
+
+
+    my $insert_length=0;   ## default, for SE.
+    if($PE==1){
+      chomp(my $secondline = <$BOWTIE>);
+      my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
+      $head = &min($head,$head2);
+      $insert_length = $pos2 - $pos1 + $readlength;
+      if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } }
+    }
+
+
+    if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; }  }   # print only at first line
+
+
+    if($count_opt eq 'gNIR'){
+       if($head eq $prev_head) {
+          $mapped_tr { $tr } = 1;
+          $mapped_tr { $prev_tr } = 1;
+          $mapped_genes { $t2g{$tr} } = 1;
+          $mapped_genes { $t2g{$prev_tr} } = 1;
+       }
+       else {
+          if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}  
+          ## count reads that are unique to the gene and common to all of its isoforms
+          %mapped_genes =();
+          %mapped_tr = ();
+          $mapped_tr { $tr } = 1;
+          $mapped_genes { $t2g{$tr} } = 1;
+       }
+
+    }
+    elsif($count_opt eq 'iNIR'){
+       if($head eq $prev_head) {
+          $mapped_tr { $tr } = 1;
+          $mapped_tr { $prev_tr } = 1;
+       }
+       else {
+          if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; }  ## count reads that are unique to the transcript
+          %mapped_tr = ();
+          $mapped_tr { $tr } = 1;
+       }
+    }
+    else {
+       if($head eq $prev_head) {
+          $mapped_genes { $t2g{$tr} } = 1;
+          $mapped_genes { $t2g{$prev_tr} } = 1;
+          $mapped_insert_lengths { "$insert_length" } =1;
+          $mapped_insert_lengths { "$prev_insert_length" } =1;
+       }
+       else {
+           if(scalar(keys(%mapped_genes)) == 1) { 
+              my ($gene) = keys %mapped_genes; 
+              if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
+              else { $count{$gene}[$i]++;  }
+           }  ## count reads that are unique to the gene
+           %mapped_genes =();
+           %mapped_insert_lengths =();
+           $mapped_genes { $t2g{$tr} } = 1;
+           $mapped_insert_lengths { "$insert_length" } =1;
+       }
+    }
+
+
+    $prev_head = $head;
+    $prev_tr = $tr;
+    $prev_insert_length = $insert_length;
+
+    if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } }
+
+  }
+  close $BOWTIE;
+
+
+  # last line
+  if($count_opt eq 'gNIR'){
+    if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}  
+    ## count reads that are unique to the gene and common to all of its isoforms
+  }
+  elsif($count_opt eq 'iNIR'){
+    if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; }  ## count reads that are unique to the transcript
+  }
+  else{
+    if(scalar(keys(%mapped_genes)) == 1) { 
+      my ($gene) = keys %mapped_genes; 
+      if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
+      else { $count{$gene}[$i]++;  }
+    }  ## count reads that are unique to the gene
+  }
+
+}
+
+
+$"="\t";
+
+
+if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; }
+if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); }
+
+
+$minlen=1000000;
+for my $file (@bowtiefiles){
+  $minlen = length($file) if length($file)<$minlen;
+}
+
+
+# find longest common suffix for bowtiefiles
+my %suffs;
+my $k=0;
+do {
+ %suffs=();
+ $k++;
+ for my $file (@bowtiefiles){
+   $suff = substr($file,length($file)-$k);
+   $suffs{$suff}=1;
+ }
+} while ( scalar(keys(%suffs))==1 &&  $k<$minlen); 
+
+my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1));
+if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; }
+
+# title is between "/" and common suffix.
+my @title=();
+for my $file (@bowtiefiles){
+  if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; }
+  else { push @title, $file; }
+}
+if(defined($samplename)) { @title=(); push @title, $samplename; }
+
+
+if($verbose) { print STDERR "Printing to file...\n"; }
+
+if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){
+  print "gene\t@title\n";
+  for my $gene (sort keys %count){
+   print "$gene";
+   for my $i (0..$#bowtiefiles){
+     if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; }
+     else { print "\t0"; }
+   }
+   print "\n";
+  }
+}
+else {  ## iNIR
+  print "gene\ttranscript\t@title\n";
+  for my $tr (sort keys %count){
+   print "$t2g{$tr}\t$tr";
+   for my $i (0..$#bowtiefiles){
+     if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; }
+     else { print "\t0"; }
+   }
+   print "\n";
+  }
+}
+
+if($verbose) { print STDERR "done.\n"; }
+
+
+
+## subroutines ##
+
+
+
+sub help {
+ print STDERR<<EOF
+
+usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount
+
+Options:
+  -v : verbose
+  -vv : more verbose
+  -f : use bowtie file name instead of bowtie directory name.
+       ex) $0 -f gene2NM bowtieoutfile PE > output.readcount
+       ex2) $0 gene2NM bowtieoutdir PE > output.readcount
+  --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR)
+  --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR)
+  -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used.
+  -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored)
+  -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used.
+  -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E).
+  -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount)
+EOF
+}
+
+sub min {
+   if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
+}
+
+