Mercurial > repos > bgruening > bismark

#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;


### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification
### Paired-end alignments are considered a duplicate if both partner reasd start and end at the exact same position

### May 13, 2013
### Changed the single-end trimming behavior so that only the start coordinate will be used. This avoids duplicate reads that have been trimmed to a varying extent
### Changed the way of determining the end of reads in SAM format to using the CIGAR string if the read contains InDels

### 16 July 2013
### Adding a new deduplication mode for barcoded RRBS-Seq

### 27 Sept 2013
### Added close statement for all output filehandles (which should probably have been there from the start...)

### 8 Jan 2015
### to detect paired-end command from the @PG line we are no requiring spaces before and after the -1 or -2

### 09 Mar 2015
### Removing newline characters also from the read conversion flag in case the tags had been reordered and are now present in the very last column

### 19 08 2015
### Hiding the option --representative from view to discourage people from using it (it was nearly always not what they wanted to do anyway). It should still work
### for alignments that do not contain any indels
### Just for reference, here is the the text:
### print "--representative\twill browse through all sequences and print out the sequence with the most representative (as in most frequent) methylation call for any given position. Note that this is very likely the most highly amplified PCR product for a given sequence\n\n";


my $dedup_version = 'v0.16.3';

my $help;
my $representative;
my $single;
my $paired;
my $global_single;
my $global_paired;
my $vanilla;
my $samtools_path;
my $bam;
my $rrbs;
my $version;

my $command_line = GetOptions ('help' => \$help,
			       'representative' => \$representative,
			       's|single' => \$global_single,
			       'p|paired' => \$global_paired,
			       'vanilla' => \$vanilla,
			       'samtools_path=s' => \$samtools_path,
			       'bam' => \$bam,
			       'barcode' => \$rrbs,
			       'version' => \$version,
			      );

die "Please respecify command line options\n\n" unless ($command_line);

if ($help){
  print_helpfile();
  exit;
}

if ($version){
  print << "VERSION";

                           Bismark Deduplication Module

                          Deduplicator Version: $dedup_version
              Copyright 2010-16 Felix Krueger, Babraham Bioinformatics
                www.bioinformatics.babraham.ac.uk/projects/bismark/


VERSION
    exit;
  }


my @filenames = @ARGV;

unless (@filenames){
  print "Please provide one or more Bismark output files for deduplication\n\n";
  sleep (2);
  print_helpfile();
  exit;
}


### OPTIONS
unless ($global_single or $global_paired){
  if ($vanilla){
    die "Please specify either -s (single-end) or -p (paired-end) for deduplication. Reading this information from the \@PG header line only works for SAM/BAM files\n\n";
  }
  warn "\nNeither -s (single-end) nor -p (paired-end) selected for deduplication. Trying to extract this information for each file separately from the \@PG line of the SAM/BAM file\n";
}

if ($global_paired){
  if ($global_single){
    die "Please select either -s for single-end files or -p for paired-end files, but not both at the same time!\n\n";
  }
  if ($vanilla){

    if ($rrbs){
      die "Barcode deduplication only works with Bismark SAM (or BAM) output (in attempt to phase out the vanilla format)\n";
    }

    warn "Processing paired-end custom Bismark output file(s):\n";
    warn join ("\t",@filenames),"\n\n";
  }
  else{
    warn "Processing paired-end Bismark output file(s) (SAM format):\n";
    warn join ("\t",@filenames),"\n\n";
  }
}
else{
  if ($vanilla){
    warn "Processing single-end custom Bismark output file(s):\n";
    warn join ("\t",@filenames),"\n\n";
  }
  else{
    warn "Processing single-end Bismark output file(s) (SAM format):\n";
    warn join ("\t",@filenames),"\n\n";
  }
}

### PATH TO SAMTOOLS
if (defined $samtools_path){
  # if Samtools was specified as full command
  if ($samtools_path =~ /samtools$/){
    if (-e $samtools_path){
      # Samtools executable found
    }
    else{
      die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
    }
  }
  else{
    unless ($samtools_path =~ /\/$/){
      $samtools_path =~ s/$/\//;
    }
    $samtools_path .= 'samtools';
    if (-e $samtools_path){
      # Samtools executable found
    }
    else{
      die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
    }
  }
}
# Check whether Samtools is in the PATH if no path was supplied by the user
else{
  if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
    $samtools_path = `which samtools`;
    chomp $samtools_path;
  }
}

if ($bam){
  if (defined $samtools_path){
    $bam = 1;
  }
  else{
    warn "No Samtools found on your system, writing out a gzipped SAM file instead\n";
    $bam = 2;
  }
}
else{
  $bam = 0;
}


if ($representative){
  warn "\nIf there are several alignments to a single position in the genome the alignment with the most representative methylation call will be chosen (this might be the most highly amplified PCR product...)\n\n";
  sleep (1);
}
elsif($rrbs){
  warn "\nIf the input is a multiplexed sample with several alignments to a single position in the genome, only alignments with a unique barcode will be chosen)\n\n";
  sleep (1);
}
else{ # default; random (=first) alignment
  warn "\nIf there are several alignments to a single position in the genome the first alignment will be chosen. Since the input files are not in any way sorted this is a near-enough random selection of reads.\n\n";
  sleep (1);
}

foreach my $file (@filenames){

  if ($global_single){
    $paired = 0;
    $single = 1;
  }
  elsif($global_paired){
    $paired = 1;
    $single = 0;
  }

  unless($global_single or $global_paired){

    warn "Trying to determine the type of mapping from the SAM header line\n"; sleep(1);

    ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
    if ($file =~ /\.gz$/){
      open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
    }
    elsif ($file =~ /\.bam$/){
      open (DETERMINE,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
      open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
    }
    while (<DETERMINE>){
      last unless (/^\@/);
      if ($_ =~ /^\@PG/){
	# warn "found the \@PG line:\n";
	# warn "$_";

	if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){
	  warn "Treating file as paired-end data (extracted from \@PG line)\n"; sleep(1);
	  $paired = 1;
	  $single = 0;
	}
	else{
	  warn "Treating file as single-end data (extracted from \@PG line)\n"; sleep(1);
	  $paired = 0;
	  $single = 1;
	}
      }
    }
    close DETERMINE or warn "$!\n";
  }

  if ($file =~ /(\.bam$|\.sam$)/){
      bam_isEmpty($file);
  }


  ### OPTIONS
  unless ($single or $paired){
    die "Please specify either -s (single-end) or -p (paired-end) for deduplication, or provide a SAM/BAM file that contains the \@PG header line\n\n";
  }

  ###
  unless ($vanilla){
    if ($paired){
      test_positional_sorting($file);
    }
  }


  ### writing to a report file
  my $report = $file;

  $report =~ s/\.gz$//;
  $report =~ s/\.sam$//;
  $report =~ s/\.bam$//;
  $report =~ s/\.txt$//;
  $report =~ s/$/.deduplication_report.txt/;

  open (REPORT,'>',$report) or die "Failed to write to report file to $report: $!\n\n";


  ### for representative methylation calls we need to discriminate between single-end and paired-end files as the latter have 2 methylation call strings
  if($representative){
    deduplicate_representative($file);
  }

  elsif($rrbs){
    deduplicate_barcoded_rrbs($file);
  }

  ### as the default option we simply write out the first read for a position and discard all others. This is the fastest option
  else{

    my %unique_seqs;
    my %positions;

    if ($file =~ /\.gz$/){
      open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
    }
    elsif ($file =~ /\.bam$/){
      open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
      open (IN,$file) or die "Unable to read from $file: $!\n";
    }

    my $outfile = $file;
    $outfile =~ s/\.gz$//;
    $outfile =~ s/\.sam$//;
    $outfile =~ s/\.bam$//;
    $outfile =~ s/\.txt$//;

    if ($vanilla){
      $outfile =~ s/$/_deduplicated.txt/;
    }
    else{
      if ($bam == 1){
	$outfile =~ s/$/.deduplicated.bam/;
      }
      elsif ($bam == 2){
	$outfile =~ s/$/.deduplicated.sam.gz/;
      }
      else{
	$outfile =~ s/$/.deduplicated.sam/;
      }
    }
    if ($bam == 1){
      open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
    }
    elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
      open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
    }

    else{
      open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
    }

    ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
    if ($vanilla){
      $_ = <IN>; # Bismark version header
      print OUT; # Printing the Bismark version to the de-duplicated file again
    }
    my $count = 0;
    my $unique_seqs = 0;
    my $removed = 0;

    while (<IN>){

      if ($count == 0){
	if ($_ =~ /^Bismark version:/){
	  warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
	  sleep (2);
	  print_helpfile();
	  exit;
	}
      }

      ### if this was a SAM file we ignore header lines
      unless ($vanilla){
	if (/^\@\w{2}\t/){
	  print "skipping header line:\t$_";
	  print OUT "$_"; # Printing the header lines again into the de-duplicated file
	  next;
	}
      }

      ++$count;
      my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths

      my ($strand,$chr,$start,$end,$cigar);
      my $line1;

      if ($vanilla){
	($strand,$chr,$start,$end) = (split (/\t/))[1,2,3,4];
      }
      else{ # SAM format
	($strand,$chr,$start,$cigar) = (split (/\t/))[1,2,3,5]; # we are assigning the FLAG value to $strand

	### SAM single-end
	if ($single){

	  if ($strand == 0 ){
	    ### read aligned to the forward strand. No action needed
	  }
	  elsif ($strand == 16){
	    ### read is on reverse strand

	    $start -= 1; # only need to adjust this once

	    # for InDel free matches we can simply use the M number in the CIGAR string
	    if ($cigar =~ /^(\d+)M$/){ # linear match
	      $start += $1;
	    }

	    else{
	      # parsing CIGAR string
	      my @len = split (/\D+/,$cigar); # storing the length per operation
	      my @ops = split (/\d+/,$cigar); # storing the operation
	      shift @ops; # remove the empty first element
	      die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

	      # warn "CIGAR string; $cigar\n";
	      ### determining end position of a read
	      foreach my $index(0..$#len){
		if ($ops[$index] eq 'M'){  # standard matching bases
		  $start += $len[$index];
		  # warn "Operation is 'M', adding $len[$index] bp\n";
		}
		elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
		  # warn "Operation is 'I', next\n";
		}
		elsif($ops[$index] eq 'D'){ # deletions do affect the end position
		  #  warn "Operation is 'D',adding $len[$index] bp\n";
		  $start += $len[$index];
		}
		else{
		  die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
		}
	      }
	    }
	  }
	  $composite = join (":",$strand,$chr,$start);
	}
	elsif($paired){

	  ### storing the current line
	  $line1 = $_;

	  my $read_conversion;
	  my $genome_conversion;

	  while ( /(XR|XG):Z:([^\t]+)/g ) {
	    my $tag = $1;
	    my $value = $2;

	    if ($tag eq "XR") {
	      $read_conversion = $value;
	      $read_conversion =~ s/\r//;
	      chomp $read_conversion;
	    } elsif ($tag eq "XG") {
	      $genome_conversion = $value;
	      $genome_conversion =~ s/\r//;
	      chomp $genome_conversion;
	    }
	  }
	  die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion);


	  my $index;
	  if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
	    $index = 0;
	    $strand = '+';
	  } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
	    $index = 1;
	    $strand = '-';
	  } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
	    $index = 2;
	    $strand = '+';
	  } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
	    $index = 3;
	    $strand = '-';
	  } else {
	    die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
	  }

	  # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2
	  if ($index == 0 or $index == 2){

	    ### reading in the next line
	    $_ = <IN>;
	    # the only thing we need is the end position
	    ($end,my $cigar_2) = (split (/\t/))[3,5];

	    $end -= 1; # only need to adjust this once

	    # for InDel free matches we can simply use the M number in the CIGAR string
	    if ($cigar_2 =~ /^(\d+)M$/){ # linear match
	      $end += $1;
	    }
	    else{
	      # parsing CIGAR string
	      my @len = split (/\D+/,$cigar_2); # storing the length per operation
	      my @ops = split (/\d+/,$cigar_2); # storing the operation
	      shift @ops; # remove the empty first element
	      die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);

	      # warn "CIGAR string; $cigar_2\n";
	      ### determining end position of the read
	      foreach my $index(0..$#len){
		if ($ops[$index] eq 'M'){  # standard matching bases
		  $end += $len[$index];
		  # warn "Operation is 'M', adding $len[$index] bp\n";
		}
		elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
		  # warn "Operation is 'I', next\n";
		}
		elsif($ops[$index] eq 'D'){ # deletions do affect the end position
		  #  warn "Operation is 'D',adding $len[$index] bp\n";
		  $end += $len[$index];
		}
		else{
		  die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
		}
	      }
	    }
	  }
	  else{
	    # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line

	    $end = $start - 1; # need to adjust this only once

	    # for InDel free matches we can simply use the M number in the CIGAR string
	    if ($cigar =~ /^(\d+)M$/){ # linear match
	      $end += $1;
	    }
	    else{
	      # parsing CIGAR string
	      my @len = split (/\D+/,$cigar); # storing the length per operation
	      my @ops = split (/\d+/,$cigar); # storing the operation
	      shift @ops; # remove the empty first element
	      die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);

	      # warn "CIGAR string; $cigar\n";
	      ### determining end position of the read
	      foreach my $index(0..$#len){
		if ($ops[$index] eq 'M'){  # standard matching bases
		  $end += $len[$index];
		  # warn "Operation is 'M', adding $len[$index] bp\n";
		}
		elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
		  # warn "Operation is 'I', next\n";
		}
		elsif($ops[$index] eq 'D'){ # deletions do affect the end position
		  # warn "Operation is 'D',adding $len[$index] bp\n";
		  $end += $len[$index];
		}
		else{
		  die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
		}
	      }
	    }

	    ### reading in the next line
	    $_ = <IN>;
	    # the only thing we need is the start position
	    ($start) = (split (/\t/))[3];
	  }
	  $composite = join (":",$strand,$chr,$start,$end);
	}

	else{
	  die "Input must be single or paired-end\n";
	}
      }

      if (exists $unique_seqs{$composite}){
	++$removed;
	unless (exists $positions{$composite}){
	  $positions{$composite}++;
	}
      }
      else{
	if ($paired){
	  unless ($vanilla){
	    print OUT "$line1"; # printing first paired-end line for SAM output
	  }
	}
	print OUT "$_"; # printing single-end SAM alignment or second paired-end line
	$unique_seqs{$composite}++;
      }
    }

    my $percentage;
    my $percentage_leftover;
    my $leftover = $count - $removed;

    unless ($count == 0){
      $percentage = sprintf("%.2f",$removed/$count*100);
      $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
    }
    else{
      $percentage = 'N/A';
      $percentage_leftover = 'N/A';
    }

    warn "\nTotal number of alignments analysed in $file:\t$count\n";
    warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
    warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

    print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
    print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
    print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
  }

  close OUT or warn "Failed to close output filehandle: $!\n";
  close REPORT or warn "Failed to close report filehandle: $!\n";

}


sub deduplicate_barcoded_rrbs{

  my $file = shift;

  my %unique_seqs;
  my %positions;

  if ($file =~ /\.gz$/){
    open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
  }
  elsif ($file =~ /\.bam$/){
    open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
  }
  else{
    open (IN,$file) or die "Unable to read from $file: $!\n";
  }

  my $outfile = $file;
  $outfile =~ s/\.gz$//;
  $outfile =~ s/\.sam$//;
  $outfile =~ s/\.bam$//;
  $outfile =~ s/\.txt$//;

  if ($vanilla){
    $outfile =~ s/$/_dedup_RRBS.txt/;
  }
  else{
    if ($bam == 1){
      $outfile =~ s/$/.dedup_RRBS.bam/;
    }
    elsif ($bam == 2){
      $outfile =~ s/$/.dedupRRBS.sam.gz/;
    }
    else{
      $outfile =~ s/$/.dedup_RRBS.sam/;
    }
  }
  if ($bam == 1){
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{
    open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
  }

  ### This mode only supports Bismark SAM output
  my $count = 0;
  my $unique_seqs = 0;
  my $removed = 0;

  while (<IN>){

    if ($count == 0){
      if ($_ =~ /^Bismark version:/){
	warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
	sleep (2);
	print_helpfile();
	exit;
      }
    }

    ### if this was a SAM file we ignore header lines
    if (/^\@\w{2}\t/){
      warn "skipping SAM header line:\t$_";
      print OUT; # Printing the header lines again into the de-duplicated file
      next;
    }

    ++$count;
    my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
    ### in this barcoded mode we also store the read barcode as additional means of assisting the deduplication
    ### in effect the $composite string looks like this (separated by ':'):

    ### FLAG:chromosome:start:barcode

    my $end;
    my $line1;

    # SAM format
    my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand

    $id =~ /:(\w+)$/;
    my $barcode = $1;
    unless ($barcode){
      die "Failed to extract a barcode from the read ID (last element of each read ID needs to be the barcode sequence, e.g. ':CATG'\n\n";
    }

    ### SAM single-end
    if ($single){

      if ($strand == 0 ){
	### read aligned to the forward strand. No action needed
      }
      elsif ($strand == 16){
	### read is on reverse strand

	$start -= 1; # only need to adjust this once

	# for InDel free matches we can simply use the M number in the CIGAR string
	if ($cigar =~ /^(\d+)M$/){ # linear match
	  $start += $1;
	}
	else{
	  # parsing CIGAR string
	  my @len = split (/\D+/,$cigar); # storing the length per operation
	  my @ops = split (/\d+/,$cigar); # storing the operation
	  shift @ops; # remove the empty first element
	  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

	  # warn "CIGAR string; $cigar\n";
	  ### determining end position of a read
	  foreach my $index(0..$#len){
	    if ($ops[$index] eq 'M'){  # standard matching bases
	      $start += $len[$index];
	      # warn "Operation is 'M', adding $len[$index] bp\n";
	    }
	    elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
	      # warn "Operation is 'I', next\n";
	    }
	    elsif($ops[$index] eq 'D'){ # deletions do affect the end position
	      #  warn "Operation is 'D',adding $len[$index] bp\n";
	      $start += $len[$index];
	    }
	    else{
	      die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
	    }
	  }
	}
      }

      ### Here we take the barcode sequence into consideration
      $composite = join (":",$strand,$chr,$start,$barcode);
      # warn "$composite\n\n";
      # sleep(1);
    }
    elsif($paired){

      ### storing the current line
      $line1 = $_;

      my $read_conversion;
      my $genome_conversion;

      while ( /(XR|XG):Z:([^\t]+)/g ) {
	my $tag = $1;
	my $value = $2;

	if ($tag eq "XR") {
	  $read_conversion = $value;
	  $read_conversion =~ s/\r//;
	  chomp $read_conversion;
	}
	elsif ($tag eq "XG") {
	  $genome_conversion = $value;
	  $genome_conversion =~ s/\r//;
	  chomp $genome_conversion;
	}
      }
      die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion);


      my $index;
      if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
	$index = 0;
	$strand = '+';
      } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
	$index = 1;
	$strand = '-';
      } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
	$index = 2;
	$strand = '+';
      } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
	$index = 3;
	$strand = '-';
      } else {
	die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
      }

      # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2
      if ($index == 0 or $index == 2){

	### reading in the next line
	$_ = <IN>;
	# the only thing we need is the end position
	($end,my $cigar_2) = (split (/\t/))[3,5];

	$end -= 1; # only need to adjust this once

	# for InDel free matches we can simply use the M number in the CIGAR string
	if ($cigar_2 =~ /^(\d+)M$/){ # linear match
	  $end += $1;
	}
	else{
	  # parsing CIGAR string
	  my @len = split (/\D+/,$cigar_2); # storing the length per operation
	  my @ops = split (/\d+/,$cigar_2); # storing the operation
	  shift @ops; # remove the empty first element
	  die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);

	  # warn "CIGAR string; $cigar_2\n";
	  ### determining end position of the read
	  foreach my $index(0..$#len){
	    if ($ops[$index] eq 'M'){  # standard matching bases
	      $end += $len[$index];
	      # warn "Operation is 'M', adding $len[$index] bp\n";
	    }
	    elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
	      # warn "Operation is 'I', next\n";
	    }
	    elsif($ops[$index] eq 'D'){ # deletions do affect the end position
	      #  warn "Operation is 'D',adding $len[$index] bp\n";
	      $end += $len[$index];
	    }
	    else{
	      die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
	    }
	  }
	}
      }
      else{
	# else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line

	$end = $start - 1; # need to adjust this only once

	# for InDel free matches we can simply use the M number in the CIGAR string
	if ($cigar =~ /^(\d+)M$/){ # linear match
	  $end += $1;
	}
	else{
	  # parsing CIGAR string
	  my @len = split (/\D+/,$cigar); # storing the length per operation
	  my @ops = split (/\d+/,$cigar); # storing the operation
	  shift @ops; # remove the empty first element
	  die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);

	  # warn "CIGAR string; $cigar\n";
	  ### determining end position of the read
	  foreach my $index(0..$#len){
	    if ($ops[$index] eq 'M'){  # standard matching bases
	      $end += $len[$index];
	      # warn "Operation is 'M', adding $len[$index] bp\n";
	    }
	    elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
	      # warn "Operation is 'I', next\n";
	    }
	    elsif($ops[$index] eq 'D'){ # deletions do affect the end position
	      # warn "Operation is 'D',adding $len[$index] bp\n";
	      $end += $len[$index];
	    }
	    else{
	      die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
	    }
	  }
	}

	### reading in the next line
	$_ = <IN>;
	# the only thing we need is the start position
	($start) = (split (/\t/))[3];
      }

      ### Here we take the barcode sequence into consideration
      $composite = join (":",$strand,$chr,$start,$end,$barcode);
    }
    else{
      die "Input must be single or paired-end\n";
    }

    if (exists $unique_seqs{$composite}){
      ++$removed;
      unless (exists $positions{$composite}){
	$positions{$composite}++;
      }
    }
    else{
      if ($paired){
	print OUT $line1; # printing first paired-end line for SAM output
      }
      print OUT; # printing single-end SAM alignment or second paired-end line
      $unique_seqs{$composite}++;
    }
  }

  my $percentage;
  my $percentage_leftover;
  my $leftover = $count - $removed;

  unless ($count == 0){
    $percentage = sprintf("%.2f",$removed/$count*100);
    $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
  }
  else{
    $percentage = 'N/A';
    $percentage_leftover = 'N/A';
  }


  warn "\nTotal number of alignments analysed in $file:\t$count\n";
  warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
  warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
  warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

  print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
  print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
  print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
  print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

}

sub bam_isEmpty{

    my $file = shift;

    if ($file =~ /\.bam$/){
	open (EMPTY,"$samtools_path view $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
	open (EMPTY,$file) or die "Unable to read from $file: $!\n";
    }
    my $count = 0;
    while (<EMPTY>){
	if ($_){
	    $count++;  # file contains data, fine.
	}
	last; # one line is enough
    }

    if ($count == 0){
	die "\n### File appears to be empty, terminating deduplication process. Please make sure the input file has not been truncated. ###\n\n";
    }
    close EMPTY or warn "$!\n";
}


sub print_helpfile{
  print "\n",'='x111,"\n";
  print "\nThis script is supposed to remove alignments to the same position in the genome from the Bismark mapping output\n(both single and paired-end SAM files), which can arise by e.g. excessive PCR amplification. If sequences align\nto the same genomic position but on different strands they will be scored individually.\n\nNote that deduplication is not recommended for RRBS-type experiments!\n\nIn the default mode, the first alignment to a given position will be used irrespective of its methylation call\n(this is the fastest option, and as the alignments are not ordered in any way this is also near enough random).\n\n";
  print "For single-end alignments only use the start coordinate of a read will be used for deduplication.\n\n";
  print "For paired-end alignments the start-coordinate of the first read and the end coordinate of the second\nread will be used for deduplication. ";
  print "This script expects the Bismark output to be in SAM format\n(Bismark v0.6.x or higher). To deduplicate the old custom Bismark output please specify '--vanilla'.\n\n";
  print "*** Please note that for paired-end BAM files the deduplication script expects Read1 and Read2 to\nfollow each other in consecutive lines! If the file has been sorted by position make sure that you resort it\nby read name first (e.g. using samtools sort -n)  ***\n\n";

  print '='x111,"\n\n";
  print ">>> USAGE: ./deduplicate_bismark_alignment_output.pl [options] filename(s) <<<\n\n";

  print "-s/--single\t\tdeduplicate single-end Bismark files (default format: SAM)\n";
  print "-p/--paired\t\tdeduplicate paired-end Bismark files (default format: SAM)\n\n";
  print "--vanilla\t\tThe input file is in the old custom Bismark format and not in SAM format\n\n";
  print "--barcode\t\tIn addition to chromosome, start position and orientation this will also take a potential barcode into\n                        consideration while deduplicating. The barcode needs to be the last element of the read ID and separated\n                        by a ':', e.g.: MISEQ:14:000000000-A55D0:1:1101:18024:2858_1:N:0:CTCCT\n\n";
  print "--bam\t\t\tThe output will be written out in BAM format instead of the default SAM format. This script will\n\t\t\tattempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't\n\t\t\tbeen specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,\n\t\t\tthe SAM output will be compressed with GZIP instead (yielding a .sam.gz output file)\n\n";
  print "--samtools_path\t\tThe path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified\n\t\t\texplicitly if Samtools is in the PATH already\n\n";
  print "--version\t\tPrint version information and exit\n";

  print '='x111,"\n\n";

  print "This script was last modified on November 04, 2015\n\n";
}


sub test_positional_sorting{

  my $filename = shift;

  print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
  sleep(1);

  if ($filename =~ /\.gz$/) {
    open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n";
  }
  elsif ($filename =~ /bam$/ ||  isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
    if ($samtools_path){
      open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
    }
    else{
      die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
    }
  }
  else {
    open (TEST,$filename) or die "Can't open file $filename: $!\n";
  }

  my $count = 0;

  while (<TEST>) {
    if (/^\@/) {	     # testing header lines if they contain the @SO flag (for being sorted)
      if (/^\@SO/) {
	die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
      }
      next;
    }
    $count++;

    last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID

    my ($id_1) = (split (/\t/));

    ### reading the next line which should be read 2
    $_ = <TEST>;
    my ($id_2) = (split (/\t/));
    last unless ($id_2);
    ++$count;

    if ($id_1 eq $id_2){
      ### ids are the same
      next;
    }
    else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
      my $id_1_trunc = $id_1;
      $id_1_trunc =~ s/\/1$//;
      my $id_2_trunc = $id_2;
      $id_2_trunc =~ s/\/2$//;

      unless ($id_1_trunc eq $id_2_trunc){
	die "\nThe IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
      }
    }
  }
  #  close TEST or die $!; somehow fails on our cluster...
  ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
  warn "...passed!\n";
  sleep(1);

}

sub isBam{

  my $filename = shift;

  # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy)
  open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";

  ### when BAM files read through a gunzip -c stream they start with BAM...
  my $bam_in_disguise = <DISGUISE>;
  # warn "BAM in disguise: $bam_in_disguise\n\n";

  if ($bam_in_disguise){
    if ($bam_in_disguise =~ /^BAM/){
      close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
      return 1;
    }
    else{
      close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
     return 0;
    }
  }
  else{
    close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
    return 0;
  }
}


#####################################

### The following subroutine "deduplicate_representative" only works correctly for reads that do not contain any indels (as in only Bowtie1-based alignments).
### Please refrain from using it unless you want to test something out.

#####################################

sub deduplicate_representative {
  my $file = shift;

  my %positions;
  my %unique_seqs;

  my $count = 0;
  my $unique_seqs = 0;
  my $removed = 0;

  ### going through the file first and storing all positions as well as their methylation call strings in a hash
  if ($file =~ /\.gz$/){
    open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
  }
  elsif ($file =~ /\.bam$/){
    open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
  }
  else{
    open (IN,$file) or die "Unable to read from $file: $!\n";
  }

  if ($single){

    my $outfile = $file;
    $outfile =~ s/\.gz$//;
    $outfile =~ s/\.sam$//;
    $outfile =~ s/\.bam$//;
    $outfile =~ s/\.txt$//;

    if ($vanilla){
      $outfile =~ s/$/.deduplicated_to_representative_sequences.txt/;
    }
    else{
      if ($bam == 1){
	$outfile =~ s/$/.deduplicated_to_representative_sequences.bam/;
      }
      elsif ($bam == 2){
	$outfile =~ s/$/.deduplicated_to_representative_sequences.sam.gz/;
      }
      else{
	$outfile =~ s/$/.deduplicated_to_representative_sequences.sam/;
      }
    }

    if ($bam == 1){
      open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
    }
    elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
      open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
    }
    else{
      open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
    }

    warn "Reading and storing all alignment positions\n";

    ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
    if ($vanilla){
      $_ = <IN>; # Bismark version header
      print OUT; # Printing the Bismark version to the de-duplicated file again
    }

    while (<IN>){

      if ($count == 0){
	if ($_ =~ /^Bismark version:/){
	  warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
	  sleep (2);
	  print_helpfile();
	  exit;
	}
      }

      ### if this was a SAM file we ignore header lines
      unless ($vanilla){
	if (/^\@\w{2}\t/){
	  warn "skipping SAM header line:\t$_";
	  print OUT; # Printing the header lines again into the de-duplicated file
	  next;
	}
      }

      my ($strand,$chr,$start,$end,$meth_call);

      if ($vanilla){
	($strand,$chr,$start,$end,$meth_call) = (split (/\t/))[1,2,3,4,7];
      }
      else{ # SAM format

	($strand,$chr,$start,my $seq,$meth_call) = (split (/\t/))[1,2,3,9,13]; # we are assigning the FLAG value to $strand
	### SAM single-end
	$end = $start + length($seq) - 1;
      }

      my $composite = join (":",$strand,$chr,$start,$end);

      $count++;
      $positions{$composite}->{$meth_call}->{count}++;
      $positions{$composite}->{$meth_call}->{alignment} = $_;
    }
    warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n";
    close IN or warn $!;
  }

  elsif ($paired){

    ### we are going to concatenate both methylation call strings from the paired end file to form a joint methylation call string

    my $outfile = $file;
    if ($vanilla){
      $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.txt/;
    }
    else{
      $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.sam/;
    }

    open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
    warn "Reading and storing all alignment positions\n";

    ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
    if ($vanilla){
      $_ = <IN>; # Bismark version header
      print OUT; # Printing the Bismark version to the de-duplicated file again
    }

    while (<IN>){

      if ($count == 0){
	if ($_ =~ /^Bismark version:/){
	  warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
	  sleep (2);
	  print_helpfile();
	  exit;
	}
      }

      ### if this was a SAM file we ignore header lines
      unless ($vanilla){
	if (/^\@\w{2}\t/){
	  warn "skipping SAM header line:\t$_";
	  print OUT; # Printing the header lines again into the de-duplicated file
	  next;
	}
      }

      my ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2);
      my $line1;

      if ($vanilla){
	($strand,$chr,$start,$end,$meth_call_1,$meth_call_2) = (split (/\t/))[1,2,3,4,7,10];
      }
      else{ # SAM paired-end format

	($strand,$chr,$start,$meth_call_1) = (split (/\t/))[1,2,3,13]; # we are assigning the FLAG value to $strand

	### storing the first line (= read 1 alignment)
	$line1 = $_;

	### reading in the next line
	$_ = <IN>;
	# we only need the end position and the methylation call
	(my $pos,my $seq_2,$meth_call_2) = (split (/\t/))[3,9,13];
	$end = $pos + length($seq_2) - 1;
      }

      my $composite = join (":",$strand,$chr,$start,$end);
      $count++;
      my $meth_call = $meth_call_1.$meth_call_2;

      $positions{$composite}->{$meth_call}->{count}++;
      if ($vanilla){
	$positions{$composite}->{$meth_call}->{alignment} = $_;
      }
      else{ # SAM PAIRED-END
	$positions{$composite}->{$meth_call}->{alignment_1} = $line1;
	$positions{$composite}->{$meth_call}->{alignment_2} = $_;
      }
    }
    warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n";
    close IN or warn $!;
  }

  ### PRINTING RESULTS

  ### Now going through all stored positions and printing out the methylation call which is most representative, i.e. the one which occurred most often
  warn "Now printing out alignments with the most representative methylation call(s)\n";

  foreach my $pos (keys %positions){
    foreach my $meth_call (sort { $positions{$pos}->{$b}->{count} <=> $positions{$pos}->{$a}->{count} }keys %{$positions{$pos}}){
      if ($paired){
	if ($vanilla){
	  print OUT $positions{$pos}->{$meth_call}->{alignment};
	}
	else{
	  print OUT $positions{$pos}->{$meth_call}->{alignment_1}; # SAM read 1
	  print OUT $positions{$pos}->{$meth_call}->{alignment_2}; # SAM read 2
	}
      }
      else{ # single-end
	print OUT $positions{$pos}->{$meth_call}->{alignment};
      }
      $unique_seqs++;
      last; ### exiting once we printed a sequence with the most frequent methylation call for a position
    }
  }

  my $percentage;
  unless ($count == 0){
    $percentage = sprintf ("%.2f",$unique_seqs*100/$count);
  }
  else{
    $percentage = 'N/A';
  }

  warn "\nTotal number of alignments analysed in $file:\t$count\n";
  warn "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n";
  print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
  print REPORT "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n";

}
author	bgruening
date	Sat, 06 May 2017 13:18:09 -0400
parents
children