diff NEUMA-1.2.1/filter.best.from.bowtieout.3.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NEUMA-1.2.1/filter.best.from.bowtieout.3.pl	Thu Aug 08 00:46:13 2013 -0400
@@ -0,0 +1,153 @@
+#!/usr/bin/perl
+use Getopt::Long;
+
+# This version has an option of filtering only exclusively-CDS-mapped reads (useful for ribosome profiling analysis). The option name for delete_original is not cahnged. (version 3)
+# This version fixed the problem of not being able to merge when the order of the two mates were flipped. (version 2)
+
+
+my $delete_original;
+my $newbowtiefile;  ## output bowtiefile
+my $mmcol=4;
+my $datatype='E';
+&GetOptions ( 'rm|delete_original' => \$delete_original,
+              'o=s' => \$newbowtiefile,
+              'mmcol=i' => \$mmcol,   ## column for mismatch info (0-based)
+              'd|datatype=s' => \$datatype  # refseq or ensembl ('refseq' if names must be parsed)
+);
+
+if(@ARGV<2) { &help; exit; }
+
+my $bowtiefile = shift @ARGV;
+my $PE = shift @ARGV;  # 1 for paired-end, 0 for single-end
+
+if($datatype !~/^[RE]$/ ) { die "ERROR: -d (--datatype) must be specified to be either R (Refseq) or E (Ensembl) (default)\n"; }
+
+if(!defined($newbowtiefile)){
+  ($newbowtiefile) = $bowtiefile=~/(.+)\.bowtieout/;
+  $newbowtiefile .= ".best.bowtieout";
+}
+
+
+
+my $prev_head1=''; my $prev_head2='';
+my $totmm=100000;  # just some huge number
+my @mapping=();
+open $IN, $bowtiefile or die "Can't open bowtiefile $bowtiefile\n";
+open $OUT, ">$newbowtiefile" or die "Can't write to output bowtiefile $newbowtiefile\n";
+while(<$IN>){
+  chomp;
+  my ($head1,$strand1,$tr,$pos1,$mmstr1) = (split/\t/)[0,1,2,3,$mmcol];
+  my $mm1 = &parse_mmstr($mmstr1);
+  if($PE==1) { 
+    chomp(my $secondline = <$IN>);
+    ($head2,$strand2,$pos2,$mmstr2) = (split/\t/,$secondline)[0,1,3,$mmcol];
+    my $mm2 = &parse_mmstr($mmstr2);
+  }
+  
+  if($PE==1){
+    if($prev_head1 eq $head1 || $prev_head2 eq $head1) {
+      if($totmm>$mm1+$mm2) { @mapping=();  push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2];  $totmm=$mm1+$mm2; }
+      elsif($totma==$mm1+$mm2) { push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; }
+    }
+    else {
+      &print_mapping_PE(\@mapping,$OUT, $prev_head1,$prev_head2);
+      @mapping=();
+      push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2];
+      $totmm=$mm1+$mm2;
+    }
+  }
+  else {
+    if($prev_head1 eq $head1) {
+      if($totmm>$mm1) { @mapping=();  push @mapping,[$tr,$strand1,$pos1,$mmstr1];  $totmm=$mm1; }
+      elsif($totmm==$mm1) { push @mapping,[$tr,$strand1,$pos1,$mmstr1]; }
+    }
+    else {
+      &print_mapping_SE(\@mapping,$OUT, $prev_head1);
+      @mapping=();
+      push @mapping,[$tr,$strand1,$pos1,$mmstr1];
+      $totmm=$mm1;
+    }
+  }
+
+  $prev_head1= $head1;
+  if($PE==1) { $prev_head2 = $head2; }
+}
+
+
+## last line
+if($PE==1){
+  &print_mapping_PE(\@mapping,$OUT, $prev_head1, $prev_head2);
+}
+else {
+  &print_mapping_SE(\@mapping,$OUT, $prev_head1);
+}
+
+close $OUT;
+close $IN;
+
+
+
+## delete original bowtieout file
+if(defined $delete_original) { `rm -f $bowtiefile`; }
+
+
+
+
+
+## subroutines @@
+
+
+sub parse_mmstr {
+  if(!defined $_[0]){ return 0; }
+  else {
+    return(scalar(split/,/,$_[0]));
+  }
+}
+
+
+sub print_mapping_PE {
+  my $pMapping = shift @_;
+  my $OUT = shift @_;
+  my $prev_head1 = shift @_;
+  my $prev_head2 = shift @_;
+
+  if($prev_head1 ne ''){
+    for my $i (0..$#$pMapping){
+      my ($tr,$strand1,$pos1,$mm1,$strand2,$pos2,$mm2) = @{$pMapping->[$i]};
+      print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
+      print $OUT "$prev_head2\t$strand2\t$tr\t$pos2\t$mm2\n";
+    }
+  }
+}
+
+
+
+sub print_mapping_SE {
+  my $pMapping = shift @_;
+  my $OUT = shift @_;
+  my $prev_head1 = shift @_;
+
+  if($prev_head1 ne ''){
+    for my $i (0..$#$pMapping){
+      my ($tr,$strand1,$pos1,$mm1) = @{$pMapping->[$i]};
+      print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n";
+    }
+  }
+}
+
+
+
+sub help {
+  print STDERR<<EOF
+
+usage: $0 <options> bowtieout_file PE(1/0)
+
+Options
+   --rm|--delete_original  : delete the original bowtieout file
+   -o <outfilename> : output bowtieout file name (default : originalfiletitle.best.bowtieout / originalfiletitle.best.cdsonly.bowtieout (if --cdsonly is used) )
+   --mmcol : the column number (0-based) for mismatch information. (default : 4)
+   -d | datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if the transcript names are in Refseq format (gi|xxx|xxx|name). This option affects only when --cdsonly is used. (default : E)
+EOF
+}
+
+