bismark: bismark_methyl_extractor/coverage2cytosine comparison

comparison bismark_methyl_extractor/coverage2cytosine @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'

author	bgruening
date	Sat, 06 May 2017 13:18:09 -0400
parents
children

comparison

equal deleted inserted replaced

-:0f8646f22b8d
+:fcadce4d9a06
+#!/usr/bin/env perl
+use warnings;
+use strict;
+$|++;
+use Getopt::Long;
+use Cwd;
+use Carp;
+## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see <http://www.gnu.org/licenses/>.
+my %chromosomes; # storing sequence information of all chromosomes/scaffolds
+my %processed;   # keeping a record of which chromosomes have been processed
+my $coverage2cytosine_version = 'v0.16.3';
+my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra) = process_commandline();
+warn "Summary of parameters for genome-wide cytosine report:\n";
+warn '='x78,"\n";
+warn "Coverage infile:\t\t\t$coverage_infile\n";
+warn "Output directory:\t\t\t>$output_dir<\n";
+warn "Parent directory:\t\t\t>$parent_dir<\n";
+warn "Genome directory:\t\t\t>$genome_folder<\n";
+if ($CX_context){
+warn "CX context:\t\t\t\tyes\n";
+}
+else{
+warn "CX context:\t\t\t\tno (CpG context only, default)\n";
+}
+if ($merge_CpGs){
+warn "Pooling CpG top/bottom strand evidence:\tyes\n";
+}
+if($gc_context){
+warn "Optional GC context track:\t\tyes\n";
+}
+if ($tetra){
+warn "Tetra/Penta nucleotide context:\t\tyes\n";
+}
+if ($zero){
+warn "Genome coordinates used:\t\t0-based (user specified)\n";
+}
+else{
+warn "Genome coordinates used:\t\t1-based (default)\n";
+}
+if ($gzip){
+warn "GZIP compression:\t\t\tyes\n";
+}
+else{
+warn "GZIP compression:\t\t\tno\n";
+}
+if ($split_by_chromosome){
+warn "Split by chromosome:\t\t\tyes\n\n\n";
+}
+else{
+warn "Split by chromosome:\t\t\tno\n\n\n";
+}
+sleep (3);
+read_genome_into_memory();
+warn "Stored sequence information of ",scalar keys %chromosomes," chromosomes/scaffolds in total\n\n";
+generate_genome_wide_cytosine_report($coverage_infile);
+### 11 December 2014
+# The following optional section re-reads the genome-wide report and merges methylation evidence of both top and bottom strand
+# into a single CpG dinucleotide entity. This significantly simplifies downstream processing, e.g. by the bsseq R-/Bioconductor package
+# which recommends this merging process to increase coverage per CpG and reduce the memory burden for its processing
+if ($merge_CpGs){
+# we only allow this operation if the report is limited to CpG context, and for a single report for the entire genome for the time being
+combine_CpGs_to_single_CG_entity($cytosine_out);
+}
+### 18 August 2015
+# The following section reprocessed the genome to generate cytosine methylation output in GC context (e.g. when a GpC methylase had been deployed
+if ($gc_context){
+generate_GC_context_report($coverage_infile);
+}
+sub combine_CpGs_to_single_CG_entity{
+my $CpG_report_file = shift;
+warn "Now merging top and bottom strand CpGs into a single CG dinucleotide entity\n";
+open (IN,$CpG_report_file) or die "Failed to open file $CpG_report_file: $!\n\n";
+my $pooled_CG = $CpG_report_file;
+$pooled_CG =~ s/$/.merged_CpG_evidence.cov/;
+open (OUT,'>',$pooled_CG) or die "Failed to write to file '$pooled_CG': $!\n\n";
+warn ">>> Writing a new coverage file with top and bottom strand CpG methylation evidence merged to $pooled_CG <<<\n\n";
+sleep(1);
+while (1){
+my $line1 = <IN>;
+my $line2 = <IN>;
+last unless ($line1 and $line2);
+chomp $line1;
+chomp $line2;
+my ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1);
+my ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2);
+# print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n";
+# print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; sleep(1);
+if ($pos1 < 2){
+$line1 = $line2;
+$line2 = <IN>;
+chomp $line2;
+($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1);
+($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2);
+}
+# a few sanity checks
+die "The context of the first line was not CG:\t$line1"  unless ($context1 eq 'CG');
+die "The context of the second line was not CG:\t$line2" unless ($context2 eq 'CG');
+unless ($strand1 eq '+' and $strand2 eq'-'){
+die "The strand of line 1 and line 2 were not + and -:\nline1:\t$line1\nline2:\t$line2\n";
+}
+unless ($pos2 eq ($pos1 + 1)){
+die "The reported position were not spaced 1bp apart:\nline1:\t$line1\nline2:\t$line2\n";
+}
+unless($chr1 eq $chr2){
+die "The chromsome information for line 1 and 2 did not match:\nline1:\t$line1\nline2:\t$line2\n";
+}
+# This looks alright from what I can tell, lets pool the 2 strands
+my $pooled_m = $m1 + $m2;
+my $pooled_u = $u1 + $u2;
+# since this is a new coverage file we only write it out if the coverage is at least 1
+next unless ($pooled_m + $pooled_u) > 0;
+my $pooled_percentage = sprintf ("%.6f",$pooled_m/ ($pooled_m + $pooled_u) *100);
+# print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n";
+# print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n";
+# print join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n";
+# sleep(1);
+print OUT join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n";
+}
+warn "CpG merging complete. Good luck finding DMRs with bsseq!\n\n";
+}
+sub process_commandline{
+my $help;
+my $output_dir;
+my $genome_folder;
+my $zero;
+my $CpG_only;
+my $CX_context;
+my $gzip;
+my $split_by_chromosome;
+my $cytosine_out;
+my $parent_dir;
+my $version;
+my $merge_CpGs;
+my $gc_context;
+my $tetra;
+my $command_line = GetOptions ('help|man' => \$help,
+				 'dir=s' => \$output_dir,
+				 'g|genome_folder=s' => \$genome_folder,
+				 "zero_based" => \$zero,
+				 "CX|CX_context" => \$CX_context,
+				 "split_by_chromosome" => \$split_by_chromosome,
+				 'o|output=s' => \$cytosine_out,
+				 'parent_dir=s' => \$parent_dir,
+				 'version' => \$version,
+				 'merge_CpGs' => \$merge_CpGs,
+				 'GC|GC_context' => \$gc_context,
+				 'ff|qq' => \$tetra,
+				 'gzip' => \$gzip,
+);
+### EXIT ON ERROR if there were errors with any of the supplied options
+unless ($command_line){
+die "Please respecify command line options\n";
+}
+### HELPFILE
+if ($help){
+print_helpfile();
+exit;
+}
+if ($version){
+print << "VERSION";
+Bismark Methylation Extractor Module -
+coverage2cytosine
+Bismark Extractor Version: $coverage2cytosine_version
+Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
+www.bioinformatics.babraham.ac.uk/projects/bismark/
+VERSION
+exit;
+}
+### no files provided
+unless (@ARGV){
+warn "You need to provide a Bismark coverage file (with counts methylated/unmethylated cytosines) to create an individual C methylation output. Please respecify!\n";
+sleep(2);
+print_helpfile();
+exit;
+}
+my $coverage_infile = shift @ARGV;
+unless ($parent_dir){
+$parent_dir = getcwd();
+}
+unless ($parent_dir =~ /\/$/){
+$parent_dir =~ s/$/\//;
+}
+unless (defined $cytosine_out){
+die "Please provide the name of the output file using the option -o/--output filename\n";
+}
+### OUTPUT DIR PATH
+if (defined $output_dir){
+unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it
+unless ($output_dir =~ /\/$/){
+	$output_dir =~ s/$/\//;
+}
+}
+}
+else{
+$output_dir = '';
+}
+unless ($CX_context){
+$CX_context = 0;
+$CpG_only = 1;
+}
+### GENOME folder
+if ($genome_folder){
+unless ($genome_folder =~/\/$/){
+$genome_folder =~ s/$/\//;
+}
+}
+else{
+die "Please specify a genome folder to proceed (full path only)\n";
+}
+if ($merge_CpGs){
+if ($CX_context){
+die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if CpG-context is selected only (lose the option --CX)\n";
+}
+if ($split_by_chromosome){
+die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if a single CpG report is written out (lose the option --split_by_chromosome)\n";
+}
+}
+return ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra);
+}
+sub generate_genome_wide_cytosine_report {
+warn  "="x78,"\n";
+warn "Methylation information will now be written into a genome-wide cytosine report\n";
+warn  "="x78,"\n\n";
+sleep (2);
+my $number_processed = 0;
+### changing to the output directory again
+unless ($output_dir eq ''){ # default
+chdir $output_dir or die "Failed to change directory to $output_dir\n";
+# warn "Changed directory to $output_dir\n";
+}
+my $in = shift;
+# infiles handed over by the methylation extractor will be just the filename on their own. The directory should have been handed over with --dir
+if ($in =~ /gz$/){
+open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; # changed from gunzip -c to gunzip -c 08 04 16
+}
+else{
+open (IN,"$in") or die "Failed to read from file $in: $!\n";
+}
+### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
+unless ($split_by_chromosome){ ### writing all output to a single file (default)
+if ($gzip){
+	  unless ($cytosine_out =~ /\.gz$/){
+	      $cytosine_out .= '.gz';
+	  }
+	  open (CYT,"| gzip -c - > $cytosine_out") or die "Failed to write to file $cytosine_out: $!\n";
+}
+else{
+	  open (CYT,'>',$cytosine_out) or die "Failed to write to file $cytosine_out: $!\n";
+}
+warn ">>> Writing genome-wide cytosine report to: $cytosine_out <<<\n\n";
+sleep (1);
+}
+my $last_chr;
+my %chr; # storing reads for one chromosome at a time
+my $count = 0;
+while (<IN>){
+chomp;
+++$count;
+my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
+# defining the first chromosome
+unless (defined $last_chr){
+	  $last_chr = $chr;
+	  ++$number_processed;
+	  # warn "Storing all covered cytosine positions for chromosome: $chr\n";
+}
+### As of version 0.9.1 the start positions are 1-based!
+if ($chr eq $last_chr){
+	  $chr{$chr}->{$start}->{meth} = $meth;
+	  $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
+}
+else{
+	  warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
+	  ++$number_processed;
+	  if ($split_by_chromosome){ ## writing output to 1 file per chromosome
+	      my $chromosome_out = $cytosine_out;
+	      if ($chromosome_out =~ /\.txt$/){
+		  $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/;
+	      }
+	      else{
+		  $chromosome_out =~ s/$/.chr${last_chr}.txt/;
+	      }
+	      open (CYT,'>',$chromosome_out) or die $!;
+	      # warn "Writing output for $last_chr to $chromosome_out\n";
+	  }
+	  my $tri_nt;
+	  my $tetra_nt;
+	  my $penta_nt;
+	  my $context;
+	  $processed{$last_chr} = 1;
+	  while ( $chromosomes{$last_chr} =~ /([CG])/g){
+	      $tri_nt = '';
+	      $context = '';
+	      if ($tetra){
+		  $tetra_nt = ''; # clearing
+		  $penta_nt = '';
+	      }
+	      my $pos = pos$chromosomes{$last_chr};
+	      my $strand;
+	      my $meth = 0;
+	      my $nonmeth = 0;
+	      if ($1 eq 'C'){    # C on forward strand
+		  $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
+		  if ($tetra){
+		      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){
+			  $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4);
+		      }
+		      else{
+			  $tetra_nt = '';
+		      }
+		      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){
+			  $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5);
+		      }
+		      else{
+			  $penta_nt = '';
+		      }
+		  }
+		  $strand = '+';
+	      }
+	      elsif ($1 eq 'G'){ # C on reverse strand
+		  $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3);   # positions are 0-based!
+		  $tri_nt = reverse $tri_nt;
+		  $tri_nt =~ tr/ACTG/TGAC/;
+		  if ($tetra){
+		      if ( $pos - 4 >= 0 ){
+			  $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4);
+			  $tetra_nt = reverse $tetra_nt;
+			  $tetra_nt =~ tr/ACTG/TGAC/;
+		      }
+		      else{
+			  $tetra_nt = '';
+		      }
+		      if ( $pos - 5 >= 0 ){
+			  $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5);
+			  $penta_nt = reverse $penta_nt;
+			  $penta_nt =~ tr/ACTG/TGAC/;
+		      }
+		      else{
+			  $penta_nt = '';
+		      }
+		  }
+		  $strand = '-';
+	      }
+	      next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
+	      # if (length$penta_nt < 5){
+	      # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1);
+	      # }
+	      if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! (as of v0.9.1)
+		  $meth =  $chr{$last_chr}->{$pos}->{meth};
+		  $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth};
+	      }
+	      ### determining cytosine context
+	      if ($tri_nt =~ /^CG/){
+		  $context = 'CG';
+	      }
+	      elsif ($tri_nt =~ /^C.{1}G$/){
+		  $context = 'CHG';
+	      }
+	      elsif ($tri_nt =~ /^C.{2}$/){
+		  $context = 'CHH';
+	      }
+	      else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
+		  warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
+		  next;
+	      }
+	      if ($CpG_only){
+		  if ($tri_nt =~ /^CG/){ # CpG context is the default
+		      if ($zero){ # zero based coordinates
+			  $pos -= 1;
+			  if ($tetra){
+			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+			  }
+			  else{
+			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+			  }
+		      }
+		      else{ # default
+			  if ($tetra){
+			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+			  }
+			  else{
+			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+			  }
+		      }
+		  }
+	      }
+	      else{ ## all cytosines, specified with --CX
+		  if ($zero){ # zero based coordinates
+		      $pos -= 1;
+		      if ($tetra){
+			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+		      }
+		      else{
+			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"
+		      }
+		  }
+		  else{ # default
+		      if ($tetra){
+			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+		      }
+		      else{
+			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+		      }
+		  }
+	      }
+	  }
+	  %chr = (); # resetting the hash
+	  # new first entry
+	  $last_chr = $chr;
+	  $chr{$chr}->{$start}->{meth} = $meth;
+	  $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
+}
+}
+# Last found chromosome
+# If there never was a last chromosome then something must have gone wrong with reading the data in
+unless (defined $last_chr){
+die "No last chromosome was defined, something must have gone wrong while reading the data in (e.g. specified wrong file path for a gzipped coverage file?). Please check your command!\n\n";
+}
+warn "Writing cytosine report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
+$processed{$last_chr} = 1;
+if ($split_by_chromosome){ ## writing output to 1 file per chromosome
+my $chromosome_out = $cytosine_out;
+if ($chromosome_out =~ /\.txt$/){ # files passed on by the methylation extractor end in _report.txt
+	  $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/;
+}
+else{ # user specified output file name
+	  $chromosome_out =~ s/$/.chr${last_chr}.txt/;
+}
+open (CYT,'>',$chromosome_out) or die $!;
+# warn "Writing output for $last_chr to $chromosome_out\n";
+}
+my $tri_nt;
+my $tetra_nt;
+my $penta_nt;
+my $context;
+while ( $chromosomes{$last_chr} =~ /([CG])/g){
+$tri_nt = '';
+$context = '';
+if ($tetra){
+	  $tetra_nt = '';
+	  $penta_nt = '';
+}
+my $pos = pos$chromosomes{$last_chr};
+my $strand;
+my $meth = 0;
+my $nonmeth = 0;
+if ($1 eq 'C'){    # C on forward strand
+	  $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
+	  $strand = '+';
+	  if ($tetra){
+	      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){
+		  $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4);
+	      }
+	      else{
+		  $tetra_nt = '';
+	      }
+	      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){
+		  $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5);
+	      }
+	      else{
+		  $penta_nt = '';
+	      }
+	  }
+}
+elsif ($1 eq 'G'){ # C on reverse strand
+	  $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3);   # positions are 0-based!
+	  $tri_nt = reverse $tri_nt;
+	  $tri_nt =~ tr/ACTG/TGAC/;
+	  $strand = '-';
+	  if ($tetra){
+	      if ( $pos - 4 >= 0 ){
+		  $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4);
+		  $tetra_nt = reverse $tetra_nt;
+		  $tetra_nt =~ tr/ACTG/TGAC/;
+	      }
+	      else{
+		  $tetra_nt = '';
+	      }
+	      if ( $pos - 5 >= 0 ){
+		  $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5);
+		  $penta_nt = reverse $penta_nt;
+		  $penta_nt =~ tr/ACTG/TGAC/;
+	      }
+	      else{
+		  $penta_nt = '';
+	      }
+	  }
+}
+if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! as of v0.9.1
+	  $meth =  $chr{$last_chr}->{$pos}->{meth};
+	  $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth};
+}
+next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
+# if (length$penta_nt < 5){
+# warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1);
+# }
+### determining cytosine context
+if ($tri_nt =~ /^CG/){
+	  $context = 'CG';
+}
+elsif ($tri_nt =~ /^C.{1}G$/){
+	  $context = 'CHG';
+}
+elsif ($tri_nt =~ /^C.{2}$/){
+	  $context = 'CHH';
+}
+else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
+	  warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
+	  next;
+}
+if ($CpG_only){
+	  if ($tri_nt =~ /^CG/){ # CpG context is the default
+	      if ($zero){ # zero-based coordinates
+		  $pos -= 1;
+		  if ($tetra){
+		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+		  }
+		  else{
+		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+		  }
+	      }
+	      else{ # default
+		  if ($tetra){
+		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+		  }
+		  else{
+		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+		  }
+	      }
+	  }
+}
+else{ ## all cytosines, specified with --CX
+	  if ($zero){ # zero based coordinates
+	      $pos -= 1;
+	      if ($tetra){
+		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+	      }
+	      else{
+		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+	      }
+	  }
+	  else{ # default
+	      if ($tetra){
+		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+	      }
+	      else{
+		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+	      }
+	  }
+}
+}
+warn "Finished writing out cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n";
+### Now processing chromosomes that were not covered in the coverage file
+warn "Now processing chromosomes that were not covered by any methylation calls in the coverage file...\n";
+my $unprocessed = 0;
+foreach my $chr (sort keys %processed) {
+unless ( $processed{$chr} ) {
+	  ++$unprocessed;
+	  ++$number_processed;
+	  process_unprocessed_chromosomes($chr);
+}
+}
+if ($unprocessed == 0) {
+warn "All chromosomes in the genome were covered by at least some reads. coverage2cytosine processing complete.\n\n";
+}
+else{
+warn "Finished writing out cytosine report (processed $number_processed chromosomes/scaffolds in total). coverage2cytosine processing complete.\n\n";
+}
+close CYT or warn $!;
+}
+#### GC CONTEXT - optional
+####
+sub generate_GC_context_report {
+warn  "="x82,"\n";
+warn "Methylation information for GC context will now be written to a GpC-context report\n";
+warn  "="x82,"\n\n";
+sleep (2);
+my $number_processed = 0;
+### changing to the output directory again
+unless ($output_dir eq ''){ # default
+chdir $output_dir or die "Failed to change directory to $output_dir\n";
+# warn "Changed directory to $output_dir\n";
+}
+my $in = shift;
+if ($in =~ /gz$/){
+open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n";
+}
+else{
+open (IN,"$in") or die "Failed to read from file $in: $!\n";
+}
+my $gc_out = $cytosine_out.'.GpC_report.txt';
+my $gc_cov = $cytosine_out.'.GpC.cov';
+### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
+open (GC,'>',$gc_out) or die "Failed to write to GpC file $gc_out: !\n\n";
+warn ">>> Writing genome-wide GpC cytosine report to: $gc_out <<<\n";
+open (GCCOV,'>',$gc_cov) or die "Failed to write to GpC coverage file $gc_cov: !\n\n";
+warn ">>> Writing genome-wide GpC coverage file to: $gc_cov <<<\n\n";
+my $last_chr;
+my %chr; # storing reads for one chromosome at a time
+my $count = 0;
+while (<IN>){
+chomp;
+++$count;
+my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
+# defining the first chromosome
+unless (defined $last_chr){
+$last_chr = $chr;
+++$number_processed;
+# warn "Storing all covered cytosine positions for chromosome: $chr\n";
+}
+### start positions are 1-based
+if ($chr eq $last_chr){
+$chr{$chr}->{$start}->{meth} = $meth;
+$chr{$chr}->{$start}->{nonmeth} = $nonmeth;
+}
+else{
+warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
+++$number_processed;
+$processed{$last_chr} = 1;
+while ( $chromosomes{$last_chr} =~ /(GC)/g){
+	# a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time
+	my $context_top = '';
+	my $context_bottom = '';
+	my $pos = pos$chromosomes{$last_chr};
+	my $meth_top = 0;
+	my $meth_bottom = 0;
+	my $nonmeth_top = 0;
+	my $nonmeth_bottom = 0;
+	#warn "$1\n"; sleep(1);
+	# C on forward strand
+	my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
+	my $strand_top = '+';
+	# C on reverse strand
+	my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3);   # positions are 0-based!
+	$tri_nt_bottom = reverse $tri_nt_bottom;
+	$tri_nt_bottom =~ tr/ACTG/TGAC/;
+	my $strand_bottom = '-';
+	next if (length $tri_nt_top < 3);     # trinucleotide sequence could not be extracted for the top strand
+	next if (length $tri_nt_bottom < 3);  # trinucleotide sequence could not be extracted for the reverse strand
+	# top strand
+	if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based!
+	  $meth_top    = $chr{$last_chr}->{$pos}->{meth};
+	  $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth};
+	}
+	# bottom strand
+	if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs
+	  $meth_bottom    = $chr{$last_chr}->{$pos-1}->{meth};
+	  $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth};
+	}
+	### determining cytosine context top strand
+	if ($tri_nt_top =~ /^CG/){
+	  $context_top = 'CG';
+	}
+	elsif ($tri_nt_top =~ /^C.{1}G$/){
+	  $context_top = 'CHG';
+	}
+	elsif ($tri_nt_top =~ /^C.{2}$/){
+	  $context_top = 'CHH';
+	}
+	else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
+	  warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n";
+	  next;
+	}
+	### determining cytosine context bottom strand
+	if ($tri_nt_bottom =~ /^CG/){
+	  $context_bottom = 'CG';
+	}
+	elsif ($tri_nt_bottom =~ /^C.{1}G$/){
+	  $context_bottom = 'CHG';
+	}
+	elsif ($tri_nt_bottom =~ /^C.{2}$/){
+	  $context_bottom = 'CHH';
+	}
+	else{ # if the context can't be determined the positions will not be printed
+	  warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n";
+	  next;
+	}
+	# if Cs were not covered at all they are not written out
+	unless ($meth_bottom + $nonmeth_bottom == 0){
+	  my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100);
+	  print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n";
+	  print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n";
+	}
+	unless ($meth_top + $nonmeth_top == 0){
+	  my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100);
+	  print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n";
+	  print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n";
+	}
+}
+%chr = (); # resetting the hash
+# new first entry
+$last_chr = $chr;
+$chr{$chr}->{$start}->{meth} = $meth;
+$chr{$chr}->{$start}->{nonmeth} = $nonmeth;
+}
+}
+# Last found chromosome
+warn "Writing cytosine GpC report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
+$processed{$last_chr} = 1;
+while ( $chromosomes{$last_chr} =~ /(GC)/g){
+# a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time
+my $context_top = '';
+my $context_bottom = '';
+my $pos = pos$chromosomes{$last_chr};
+my $meth_top = 0;
+my $meth_bottom = 0;
+my $nonmeth_top = 0;
+my $nonmeth_bottom = 0;
+# C on forward strand
+my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
+my $strand_top = '+';
+# C on reverse strand
+my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3);   # positions are 0-based!
+$tri_nt_bottom = reverse $tri_nt_bottom;
+$tri_nt_bottom =~ tr/ACTG/TGAC/;
+my $strand_bottom = '-';
+next if (length $tri_nt_top < 3);     # trinucleotide sequence could not be extracted for the top strand
+next if (length $tri_nt_bottom < 3);  # trinucleotide sequence could not be extracted for the bottom strand
+### determining cytosine context top strand
+if ($tri_nt_top =~ /^CG/){
+$context_top = 'CG';
+}
+elsif ($tri_nt_top =~ /^C.{1}G$/){
+$context_top = 'CHG';
+}
+elsif ($tri_nt_top =~ /^C.{2}$/){
+$context_top = 'CHH';
+}
+else{ # if the context can't be determined the positions will not be printed
+warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n";
+next;
+}
+### determining cytosine context bottom strand
+if ($tri_nt_bottom =~ /^CG/){
+$context_bottom = 'CG';
+}
+elsif ($tri_nt_bottom =~ /^C.{1}G$/){
+$context_bottom = 'CHG';
+}
+elsif ($tri_nt_bottom =~ /^C.{2}$/){
+$context_bottom = 'CHH';
+}
+else{ # if the context can't be determined the positions will not be printed
+warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n";
+next;
+}
+# top strand
+if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based!
+$meth_top    = $chr{$last_chr}->{$pos}->{meth};
+$nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth};
+}
+# bottom strand
+if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs
+$meth_bottom    = $chr{$last_chr}->{$pos-1}->{meth};
+$nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth};
+}
+# if Cs were not covered at all they are not written out
+unless ($meth_bottom + $nonmeth_bottom == 0){
+my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100);
+print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n";
+print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n";
+}
+unless ($meth_top + $nonmeth_top == 0){
+my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100);
+print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n";
+print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n";
+}
+}
+warn "Finished writing out GpC cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n";
+close GC or warn $!;
+}
+####
+####
+sub process_unprocessed_chromosomes{
+my $chr = shift;
+warn "Writing cytosine report for not covered chromosome $chr\n";
+$processed{$chr} = 1;
+if ($split_by_chromosome){ ## writing output to 1 file per chromosome
+my $chromosome_out = $cytosine_out;
+if ($chromosome_out =~ /txt$/){ # files passed on by the methylation extractor end in _report.txt
+	$chromosome_out =~ s/txt$/chr${chr}.txt/;
+}
+else{ # user specified output file name
+	$chromosome_out =~ s/$/.chr${chr}.txt/;
+}
+open (CYT,'>',$chromosome_out) or die $!;
+# warn "Writing output for $last_chr to $chromosome_out\n";
+}
+my $tri_nt;
+my $tetra_nt;
+my $penta_nt;
+my $context;
+while ( $chromosomes{$chr} =~ /([CG])/g){
+$tri_nt = '';
+$context = '';
+if ($tetra){
+	  $tetra_nt = ''; # clearing
+	  $penta_nt = '';
+}
+my $pos = pos$chromosomes{$chr};
+my $strand;
+my $meth = 0;
+my $nonmeth = 0;
+if ($1 eq 'C'){    # C on forward strand
+	  $tri_nt = substr ($chromosomes{$chr},($pos-1),3);   # positions are 0-based!
+	  $strand = '+';
+	  if ($tetra){
+	      if ( length($chromosomes{$chr}) >= ($pos - 1 + 4) ){
+		  $tetra_nt = substr ($chromosomes{$chr},($pos-1),4);
+	      }
+	      else{
+		  $tetra_nt = '';
+	      }
+	      if ( length($chromosomes{$chr}) >= ($pos - 1 + 5) ){
+		  $penta_nt = substr ($chromosomes{$chr},($pos-1),5);
+	      }
+	      else{
+		  $penta_nt = '';
+	      }
+	  }
+}
+elsif ($1 eq 'G'){ # C on reverse strand
+	  $tri_nt = substr ($chromosomes{$chr},($pos-3),3);   # positions are 0-based!
+	  $tri_nt = reverse $tri_nt;
+	  $tri_nt =~ tr/ACTG/TGAC/;
+	  $strand = '-';
+	  if ($tetra){
+	      if ( $pos - 4 >= 0 ){
+		  $tetra_nt = substr ($chromosomes{$chr},($pos-4),4);
+		  $tetra_nt = reverse $tetra_nt;
+		  $tetra_nt =~ tr/ACTG/TGAC/;
+	      }
+	      else{
+		  $tetra_nt = '';
+	      }
+	      if ( $pos - 5 >= 0 ){
+		  $penta_nt = substr ($chromosomes{$chr},($pos-5),5);
+		  $penta_nt = reverse $penta_nt;
+		  $penta_nt =~ tr/ACTG/TGAC/;
+	      }
+	      else{
+		  $penta_nt = '';
+	      }
+	  }
+	  $strand = '-';
+}
+next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
+### determining cytosine context
+if ($tri_nt =~ /^CG/){
+	  $context = 'CG';
+}
+elsif ($tri_nt =~ /^C.{1}G$/){
+	  $context = 'CHG';
+}
+elsif ($tri_nt =~ /^C.{2}$/){
+	  $context = 'CHH';
+}
+else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
+	  warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
+	  next;
+}
+if ($CpG_only){
+	  if ($tri_nt =~ /^CG/){ # CpG context is the default
+	      if ($zero){ # zero-based coordinates
+		  $pos -= 1;
+		  if ($tetra){
+		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+		  }
+		  else{
+		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+		  }
+	      }
+	      else{ # default
+		  if ($tetra){
+		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+		  }
+		  else{
+		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+		  }
+	      }
+	  }
+}
+else{ ## all cytosines, specified with --CX
+	  if ($zero){ # zero based coordinates
+	      $pos -= 1;
+	      if ($tetra){
+		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+	      }
+	      else{
+		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+	      }
+	  }
+	  else{ # default
+	      if ($tetra){
+		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
+	      }
+	      else{
+		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+	      }
+	  }
+}
+}
+#  close CYT or warn $!;
+}
+sub read_genome_into_memory{
+## reading in and storing the specified genome in the %chromosomes hash
+chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
+warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
+my @chromosome_filenames =  <*.fa>;
+### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+unless (@chromosome_filenames){
+@chromosome_filenames =  <*.fasta>;
+}
+unless (@chromosome_filenames){
+die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
+}
+foreach my $chromosome_filename (@chromosome_filenames){
+# skipping the tophat entire mouse genome fasta file
+next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
+open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
+### first line needs to be a fastA header
+my $first_line = <CHR_IN>;
+chomp $first_line;
+$first_line =~ s/\r//; # removing /r carriage returns
+### Extracting chromosome name from the FastA header
+my $chromosome_name = extract_chromosome_name($first_line);
+my $sequence;
+while (<CHR_IN>){
+chomp;
+$_ =~ s/\r//; # removing /r carriage returns
+if ($_ =~ /^>/){
+	### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
+	if (exists $chromosomes{$chromosome_name}){
+	  warn "chr $chromosome_name (",length $sequence ," bp)\n";
+	  die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
+	}
+	else {
+	  if (length($sequence) == 0){
+	    warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
+	  }
+	  warn "chr $chromosome_name (",length $sequence ," bp)\n";
+	  $chromosomes{$chromosome_name} = $sequence;
+	  $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed
+	}
+	### resetting the sequence variable
+	$sequence = '';
+	### setting new chromosome name
+	$chromosome_name = extract_chromosome_name($_);
+}
+else{
+	$sequence .= uc$_;
+}
+}
+if (exists $chromosomes{$chromosome_name}){
+warn "chr $chromosome_name (",length $sequence ," bp)\t";
+die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
+}
+else{
+if (length($sequence) == 0){
+	warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
+}
+warn "chr $chromosome_name (",length $sequence ," bp)\n";
+$chromosomes{$chromosome_name} = $sequence;
+$processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed
+}
+}
+warn "\n";
+chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
+}
+sub extract_chromosome_name {
+## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
+my $fasta_header = shift;
+if ($fasta_header =~ s/^>//){
+my ($chromosome_name) = split (/\s+/,$fasta_header);
+return $chromosome_name;
+}
+else{
+die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
+}
+}
+sub print_helpfile{
+warn <<EOF
+SYNOPSIS:
+This script generates a cytosine methylation report for a genome of interest and a sorted methylation input file produced
+by the script "bismark2bedGraph". By default, the output uses 1-based chromosome coordinates and reports CpG positions only
+(for both strands individually and not merged in any way). Coordinates may be changed to 0-based using the option '--zero_based'.
+The input file needs to have been generated with the script bismark2bedGraph (the file is called *.cov) or otherwise be
+sorted by position and exactly in the following format:
+<chromosome>  <start position>  <end position>  <methylation percentage>  <count methylated>  <count unmethylated>
+The coordinates of the input file are expected to be 1-based throughout (do not use files ending in .zero.cov!).
+USAGE: coverage2cytosine [options] --genome_folder <path> -o <output> [input]
+-o/--output <filename>   Name of the output file, mandatory.
+--dir                    Output directory. Output is written to the current directory if not specified explicitly.
+--genome_folder <path>   Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
+formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
+-CX/--CX_context         The output file contains information on every single cytosine in the genome irrespective of
+its context. This applies to both forward and reverse strands. Please be aware that this will
+generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
+Default: OFF (i.e. Default = CpG context only).
+--merge_CpG              Using this option will post-process the genome-wide report to write out an additional coverage
+file (see above for the coverage file format) which has the top and bottom strand methylation
+evidence pooled into a single CpG dinucleotide entity. This may be the desirable input format
+for some downstream processing tools such as the R-package bsseq (by K.D. Hansen). An example would be:
+			   genome-wide CpG report (old)
+			   gi|9626372|ref|NC_001422.1|     157     +       313     156     CG
+			   gi|9626372|ref|NC_001422.1|     158     -       335     156     CG
+			   merged CpG evidence coverage file (new)
+			   gi|9626372|ref|NC_001422.1|     157     158     67.500000       648     312
+			 This option is currently experimental, and only works if CpG context only and a single genome-wide report
+were specified (i.e. it doesn't work with the options --CX or --split_by_chromosome).
+--gc/--gc_context        In addition to normal processing this option will reprocess the genome to find methylation in
+GpC context. This might be useful for specialist applications where GpC methylases had been
+deployed. The output format is exactly the same as for the normal cytosine report, and only
+positions covered by at least one read are reported (output file ends in .GpC_report.txt). In addition
+this will write out a Bismark coverage file (ending in GpC.cov).
+--ff                     In addition to the standard output selecting --ff will also extract a four and five (tetra/penta)
+nucleotide context for the cytosines in question. Too short sequences (e.g. at the edges of the
+chromosome) will be left blank; sequences containing Ns are ignored.
+--zero_based             Uses 0-based coordinates instead of 1-based coordinates throughout. Default: OFF.
+--split_by_chromosome    Writes the output into individual files for each chromosome instead of a single output file. Files
+will be named to include the input filename and the chromosome number.
+--gzip                   Output file will be GZIP compressed (ending in .gz). Only works for standard CpG- and CX-output.
+--help                   Displays this help message and exits
+OUTPUT FORMAT:
+The genome-wide cytosine methylation output file is tab-delimited in the following format (1-based coords):
+===========================================================================================================
+<chromosome>  <position>  <strand>  <count methylated>  <count non-methylated>  <C-context>  <trinucleotide context>
+Script last modified: 04 April 2016
+EOF
+;
+exit 1;
+}

Mercurial > repos > bgruening > bismark

comparison bismark_methyl_extractor/coverage2cytosine @ 7:fcadce4d9a06 draft