Mercurial > repos > bgruening > bismark
diff new/bismark_methylation_extractor @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/new/bismark_methylation_extractor Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,5875 @@ +#!/usr/bin/perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see <http://www.gnu.org/licenses/>. + +my @filenames; # input files +my %counting; +my $parent_dir = getcwd(); + +my %fhs; + +my $version = 'v0.14.3'; +my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore) = process_commandline(); + + +### only needed for bedGraph output +my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files +my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total +my @bedfiles; + +### only needed for genome-wide cytosine methylation report +my %chromosomes; + +my %mbias_1; +my %mbias_2; + + +############################################################################################## +### Summarising Run Parameters +############################################################################################## + +### METHYLATION EXTRACTOR + +warn "Summarising Bismark methylation extractor parameters:\n"; +warn '='x63,"\n"; + +if ($single){ + if ($vanilla){ + warn "Bismark single-end vanilla format specified\n"; + } + else{ + warn "Bismark single-end SAM format specified (default)\n"; # default + } +} +elsif ($paired){ + if ($vanilla){ + warn "Bismark paired-end vanilla format specified\n"; + } + else{ + warn "Bismark paired-end SAM format specified (default)\n"; # default + } +} + +warn "Number of cores to be used: $multicore\n"; + +if ($single){ + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string\n"; + } + if ($ignore_3prime){ + warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string\n"; + } + +} +else{ ## paired-end + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; + } + if ($ignore_r2){ + warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; + } + + if ($ignore_3prime){ + warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string of Read 1\n"; + } + if ($ignore_3prime_r2){ + warn "Last $ignore_3prime_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; + } + + +} + + +if ($full){ + warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; +} +if ($merge_non_CpG){ + warn "Merge CHG and CHH context to non-CpG context specified\n"; +} +### output directory +if ($output_dir eq ''){ + warn "Output will be written to the current directory ('$parent_dir')\n"; +} +else{ + warn "Output path specified as: $output_dir\n"; +} + + +sleep (1); + +### BEDGRAPH + +if ($bedGraph){ + warn "\n\nSummarising bedGraph parameters:\n"; + warn '='x63,"\n"; + + if ($counts){ + warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n"; + } + else{ + warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Writing out an additional coverage file (ending in zero.cov) with 0-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; + } + + warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; + + if ($CX_context){ + warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; + } + + if ($remove){ + warn "White spaces in read ID names will be removed prior to sorting\n"; + } + + if ($ample_mem){ + warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; + } + elsif (defined $sort_size){ + warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; + } + else{ + warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; + } + + + + sleep (1); + + if ($cytosine_report){ + warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; + warn '='x63,"\n"; + warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n"; + + + if ($CX_context){ + warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting cytosine methylation in CpG context only (default)\n"; + } + + if ($split_by_chromosome){ + warn "Splitting the cytosine report output up into individual files for each chromosome\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Using zero-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; + } + else{ # default, 1-based coords + warn "Using 1-based genomic coordinates (default)\n"; + } + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + warn "Genome folder was specified as $genome_folder\n"; + } + else{ + $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; + warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; + } + sleep (1); + } +} + +warn "\n"; +sleep (1); + +###################################################### +### PROCESSING FILES +###################################################### + +foreach my $filename (@filenames){ + # resetting counters and filehandles + %fhs = (); + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + methylation_call_strings => 0, + ); + + @sorting_files = (); + @bedfiles = (); + + %mbias_1 = (); + %mbias_2 = (); + + + ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor + unless ($vanilla){ + if ($paired){ + test_positional_sorting($filename); + } + } + + my ($pid,$pids,$report_basename) = process_Bismark_results_file($filename); + + if ($pid == 0){ + warn "Finished processing child process. Exiting..\n"; + + # ### Closing all filehandles of the child process so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + # foreach my $fh (keys %fhs) { + # if ($fh =~ /^[1230]$/) { + # foreach my $context (keys %{$fhs{$fh}}) { + # $fhs{$fh}->{$context}->flush; + # } + # } + # else{ + # $fhs{$fh}->flush; + # } + # } + exit 0; + } + + ### + if ($pid and $multicore > 1){ + warn "Now waiting for all child processes to complete\n"; + sleep(1); + + ### we need to ensure that we wait for all child processes to be finished before continuing + # warn "here are the child IDs: @$pids\n"; + # warn "Looping through the child process IDs:\n"; + + foreach my $id (@$pids){ + # print "$id\t"; + my $kid = waitpid ($id,0); + # print "Returned: $kid\nExit status: $?\n"; + unless ($? == 0){ + warn "\nChild process terminated with exit signal: '$?'\n\n"; + } + } + } + + ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + foreach my $fh (keys %fhs) { + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + close $fhs{$fh}->{$context} or die $!; + } + } + else{ + close $fhs{$fh} or die $!; + } + } + + ### We need to stitch together a main splitting report from all individual parent/child processes + if ($multicore > 1){ + merge_individual_splitting_reports($report_basename); + print_splitting_report(); + + merge_individual_mbias_reports($report_basename); # this updates the main %mbias_1 and %mbias_2 data structures so we can proceed normally + + } + + unless ($mbias_off){ + ### printing out all M-Bias data + produce_mbias_plots ($filename); produce_mbias_plots ($filename); + + } + + unless ($mbias_only){ + delete_unused_files(); + } + + if ($bedGraph){ + + my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $out =~ s/gz$//; + $out =~ s/sam$//; + $out =~ s/bam$//; + $out =~ s/txt$//; + $out =~ s/$/bedGraph/; + + my $bedGraph_output = $out; + my @args; + + if ($remove){ + push @args, '--remove'; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($no_header){ + push @args, '--no_header'; + } + if ($gazillion){ + push @args, '--gazillion'; + } + if ($ample_mem){ + push @args, '--ample_memory'; + } + if ($zero){ + push @args, "--zero"; + } + + # if ($counts){ + # push @args, "--counts"; + # } + + push @args, "--buffer_size $sort_size"; + push @args, "--cutoff $coverage_threshold"; + push @args, "--output $bedGraph_output"; + push @args, "--dir '$output_dir'"; + + ### adding all files to be sorted to @args + foreach my $f (@sorting_files){ + push @args, $f; + } + + # print join "\t",@args,"\n"; + + system ("$Bin/bismark2bedGraph @args"); + + warn "Finished BedGraph conversion ...\n\n"; + sleep(1); + + # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; + # warn "Writing bedGraph to file: $bedGraph_output\n"; + # process_bedGraph_output(); + # close OUT or die $!; + + ### genome-wide cytosine methylation report requires bedGraph processing anyway + if ($cytosine_report){ + + @args = (); # resetting @args + my $cytosine_out = $out; + $cytosine_out =~ s/bedGraph$//; + + if ($CX_context){ + $cytosine_out =~ s/$/CX_report.txt/; + } + else{ + $cytosine_out =~ s/$/CpG_report.txt/; + } + + push @args, "--output $cytosine_out"; + push @args, "--dir '$output_dir'"; + push @args, "--genome '$genome_folder'"; + push @args, "--parent_dir '$parent_dir'"; + + if ($zero){ + push @args, "--zero"; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($split_by_chromosome){ + push @args, '--split_by_chromosome'; + } + + my $coverage_output = $bedGraph_output; + $coverage_output =~ s/bedGraph$/bismark.cov/; + + push @args, $output_dir . $coverage_output; # this will be the infile + + system ("$Bin/coverage2cytosine @args"); + # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); + warn "\n\nFinished generating genome-wide cytosine report\n\n"; + } + } +} + +sub merge_individual_splitting_reports{ + + my $report_basename = shift; + + my @splitting_reports; # only needed in multi-core mode to generate an overall report + foreach my $ext (1..$multicore){ + push @splitting_reports, "$report_basename.$ext"; + } + warn "\nMerging individual splitting reports into overall report: '$report_basename'\n"; + warn "Merging from these individual files:\n"; + print join ("\n",@splitting_reports),"\n\n"; + sleep(1); + + ########## + # resetting the counter first + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + methylation_call_strings => 0, + ); + + # repopulating the merged counter + foreach my $file (@splitting_reports){ + open (IR,$file) or die $!; + while (<IR>){ + chomp; + my ($context,$count) = (split /\t/); + if ($context){ + if ($context =~ /^Total C to T conversions in CpG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CpG_count} += $count; + } + elsif ($context =~ /^Total C to T conversions in CHG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CHG_count} += $count; + } + elsif ($context =~ /^Total C to T conversions in CHH context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CHH_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CpG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCpG_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CHG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCHG_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CHH context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCHH_count} += $count; + } + elsif ($context =~ /^line count/){ + # warn "Line count\ncount: $count\n"; + $counting{sequences_count} = $count; # always the same + } + elsif ($context =~ /^meth call strings/){ + # warn "Meth call strings\ncount: $count\n"; + $counting{methylation_call_strings} += $count; + } + } + } + } + + # deleting the individual reports afterwards + foreach my $file (@splitting_reports){ + unlink $file; + } +} + + +sub merge_individual_mbias_reports{ + + my $report_basename = shift; + + my @mbias_reports; # only needed in multi-core mode to generate an overall report + foreach my $ext (1..$multicore){ + push @mbias_reports, "$report_basename.${ext}.mbias"; + } + warn "\nMerging individual M-bias reports into overall M-bias statistics from these $multicore individual files:\n"; + print join ("\n",@mbias_reports),"\n\n"; + + + ########## + # resetting the counters first, then repopulating them + %mbias_1 = (); + %mbias_2 = (); + + # repopulating the merged counter + foreach my $file (@mbias_reports){ + open (IR,$file) or die $!; + + my $context; + my $read; + + while (<IR>){ + chomp; + # warn "$_\n"; sleep(1); + if ($_ =~ /context/){ + $context = $1 if ($_ =~ /(\D{3}) context/); + # warn "Context set as $context\n"; + + if ($_ =~ /R2/){ + $read = 'R2'; + } + else{ + $read = 'R1'; + } + # warn "Setting read identity to '$read'\n"; + + # reading in 2 additional lines (===========, and header line) + $_ = <IR>; + #warn "discarding line $_\n"; + $_ = <IR>; + #warn "discarding line $_\n"; + next; + } + else{ + + if ($_ eq ''){ + # empty line, only occurs after a context has finished and before a new context starts + next; + } + + my ($pos,$meth,$unmeth) = (split /\t/); + # warn "$pos\t$meth\t$unmeth\n"; sleep(1); + if ($read eq 'R1'){ + $mbias_1{$context}->{$pos}->{meth} += $meth; + $mbias_1{$context}->{$pos}->{un} += $unmeth; + } + elsif ($read eq 'R2'){ + $mbias_2{$context}->{$pos}->{meth} += $meth; + $mbias_2{$context}->{$pos}->{un} += $unmeth; + } + } + } + close IR or warn "Had trouble closing filehandle for $file: $!\n"; + } + + # deleting the individual reports afterwards + foreach my $file (@mbias_reports){ + unlink $file; + } +} + + +sub delete_unused_files{ + + warn "Deleting unused files ...\n\n"; sleep(1); + + my $index = 0; + + while ($index <= $#sorting_files){ + if ($sorting_files[$index] =~ /gz$/){ + open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + else{ + open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + + my $used = 0; + + while (<USED>){ + next if (/^Bismark/); + if ($_){ + $used = 1; + last; + } + } + + if ($used){ + warn "$sorting_files[$index] contains data ->\tkept\n"; + ++$index; + } + else{ + + my $delete = unlink $sorting_files[$index]; + + if ($delete){ + warn "$sorting_files[$index] was empty ->\tdeleted\n"; + } + else{ + warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" + } + + ### we also need to remove the element from @sorting_files + splice @sorting_files, $index, 1; + } + } + warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) +} + +sub produce_mbias_plots{ + + my $filename = shift; + + my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $mbias =~ s/gz$//; + $mbias =~ s/sam$//; + $mbias =~ s/bam$//; + $mbias =~ s/txt$//; + my $mbias_graph_1 = my $mbias_graph_2 = $mbias; + $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; + $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; + + $mbias =~ s/$/M-bias.txt/; + + open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; + + # determining maximum read length + my $max_length_1 = 0; + my $max_length_2 = 0; + + foreach my $context (keys %mbias_1){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ + $max_length_1 = $pos unless ($max_length_1 >= $pos); + } + } + if ($paired){ + foreach my $context (keys %mbias_2){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ + $max_length_2 = $pos unless ($max_length_2 >= $pos); + } + } + } + + if ($single){ + warn "Determining maximum read length for M-Bias plot\n"; + warn "Maximum read length of Read 1: $max_length_1\n\n"; + } + else{ + warn "Determining maximum read lengths for M-Bias plots\n"; + warn "Maximum read length of Read 1: $max_length_1\n"; + warn "Maximum read length of Read 2: $max_length_2\n\n"; + } + # sleep(3); + + my @mbias_read1; + my @mbias_read2; + + #Check whether the module GD::Graph:lines is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::lines; + GD::Graph::lines->import(); + }; + + unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...} + $gd_graph_installed = 1; + + #Check whether the module GD::Graph::colour is installed + eval{ + require GD::Graph::colour; + GD::Graph::colour->import(qw(:colours :lists :files :convert)); + }; + + if ($@) { + warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + $gd_graph_installed = 0; + } + + + } + else{ + warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + } + + + my $graph_title; + my $graph1; + my $graph2; + + if ( $gd_graph_installed){ + $graph1 = GD::Graph::lines->new(800,600); + if ($paired){ + $graph2 = GD::Graph::lines->new(800,600); + } + } + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read1[0]} = (); + + if ($paired){ + print MBIAS "$context context (R1)\n================\n"; + $graph_title = 'M-bias (Read 1)'; + } + else{ + print MBIAS "$context context\n===========\n"; + $graph_title = 'M-bias'; + } + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_1){ + + unless (defined $mbias_1{$context}->{$pos}->{meth}){ + $mbias_1{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_1{$context}->{$pos}->{un}){ + $mbias_1{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + push @{$mbias_read1[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read1[1]},$percent; + push @{$mbias_read1[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read1[2]},$percent; + push @{$mbias_read1[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read1[3]},$percent; + push @{$mbias_read1[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph1->set( + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# methylation calls', + title => $graph_title, + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + ) or die $graph1->error; + + $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + + ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) + if (my $gd1 = $graph1->plot(\@mbias_read1)) { + open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; + binmode MBIAS_G1; + print MBIAS_G1 $gd1->png; + } + else { + warn "WARNING: Cannot generate read 1 M-bias plot: " , $graph1->error , "\n\n"; + } + + # my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; + # open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; + # binmode MBIAS_G1; + # print MBIAS_G1 $gd1->png; + } + + if ($paired){ + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read2[0]} = (); + + print MBIAS "$context context (R2)\n================\n"; + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + foreach my $pos (1..$max_length_2){ + + unless (defined $mbias_2{$context}->{$pos}->{meth}){ + $mbias_2{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_2{$context}->{$pos}->{un}){ + $mbias_2{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + + push @{$mbias_read2[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read2[1]},$percent; + push @{$mbias_read2[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read2[2]},$percent; + push @{$mbias_read2[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read2[3]},$percent; + push @{$mbias_read2[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph2->set( + x_label => 'position (bp)', + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# calls', + title => 'M-bias (Read 2)', + ) or die $graph2->error; + + $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + + ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) + if (my $gd2 = $graph2->plot(\@mbias_read2)) { + open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; + binmode MBIAS_G2; + print MBIAS_G2 $gd2->png; + } + else { + warn "WARNING: Cannot generate Read 2 M-bias plot: " , $graph2->error , "\n\n"; + } + + # my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; + # open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; + # binmode MBIAS_G2; + # print MBIAS_G2 $gd2->png; + + } + } +} + +sub process_commandline{ + my $help; + my $single_end; + my $paired_end; + my $ignore; + my $ignore_r2; + my $genomic_fasta; + my $full; + my $report; + my $extractor_version; + my $no_overlap; + my $merge_non_CpG; + my $vanilla; + my $output_dir; + my $no_header; + my $bedGraph; + my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status + my $remove; + my $counts; + my $cytosine_report; + my $genome_folder; + my $zero; + my $CpG_only; + my $CX_context; + my $split_by_chromosome; + my $sort_size; + my $samtools_path; + my $gzip; + my $mbias_only; + my $mbias_off; + my $gazillion; + my $ample_mem; + my $include_overlap; + my $ignore_3prime; + my $ignore_3prime_r2; + my $multicore; + + my $command_line = GetOptions ('help|man' => \$help, + 'p|paired-end' => \$paired_end, + 's|single-end' => \$single_end, + 'fasta' => \$genomic_fasta, + 'ignore=i' => \$ignore, + 'ignore_r2=i' => \$ignore_r2, + 'comprehensive' => \$full, + 'report' => \$report, + 'version' => \$extractor_version, + 'no_overlap' => \$no_overlap, + 'merge_non_CpG' => \$merge_non_CpG, + 'vanilla' => \$vanilla, + 'o|output=s' => \$output_dir, + 'no_header' => \$no_header, + 'bedGraph' => \$bedGraph, + "cutoff=i" => \$coverage_threshold, + "remove_spaces" => \$remove, + "counts" => \$counts, + "cytosine_report" => \$cytosine_report, + 'g|genome_folder=s' => \$genome_folder, + "zero_based" => \$zero, + "CX|CX_context" => \$CX_context, + "split_by_chromosome" => \$split_by_chromosome, + "buffer_size=s" => \$sort_size, + 'samtools_path=s' => \$samtools_path, + 'gzip' => \$gzip, + 'mbias_only' => \$mbias_only, + 'mbias_off' => \$mbias_off, + 'gazillion|scaffolds' => \$gazillion, + 'ample_memory' => \$ample_mem, + 'include_overlap' => \$include_overlap, + 'ignore_3prime=i' => \$ignore_3prime, + 'ignore_3prime_r2=i' => \$ignore_3prime_r2, + 'multicore=i' => \$multicore, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($extractor_version){ + print << "VERSION"; + + + Bismark Methylation Extractor + + Bismark Extractor Version: $version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + + ### no files provided + unless (@ARGV){ + die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; + } + @filenames = @ARGV; + + warn "\n *** Bismark methylation extractor version $version ***\n\n"; + + ### M-BIAS ONLY + if ($mbias_only){ + + if ($mbias_off){ + die "Options '--mbias_only' and '--mbias_off' are not compatible. Just pick one, mkay?\n"; + } + if ($bedGraph){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; + } + if ($cytosine_report){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; + } + if ($merge_non_CpG){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; + } + if ($full){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; + } + sleep(3); + } + + ### PRINT A REPORT + unless ($report){ + $report = 1; # making this the default + } + + ### OUTPUT DIR PATH + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + $output_dir = ''; + } + + ### NO HEADER + unless ($no_header){ + $no_header = 0; + } + + ### OLD (VANILLA) OUTPUT FORMAT + unless ($vanilla){ + $vanilla = 0; + } + + if ($single_end){ + $paired_end = 0; ### SINGLE END ALIGNMENTS + } + elsif ($paired_end){ + $single_end = 0; ### PAIRED-END ALIGNMENTS + } + else{ + + ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header + + if ($vanilla){ + die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; + } + else{ # SAM/BAM format + + my $file = $filenames[0]; + warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); + + ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file + if ($file =~ /\.gz$/){ + open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.bam$/ || isBam($file) ){ ### this would allow to read BAM files that do not end in *.bam + open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; + } + + while (<DETERMINE>){ + last unless (/^\@/); + if ($_ =~ /^\@PG/){ + # warn "found the \@PG line:\n"; + # warn "$_"; + + if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){ + warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 1; + $single_end = 0; + } + else{ + warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 0; + $single_end = 1; + } + } + } + + # close DETERMINE or warn $!; # this always throws an error anyway... + + } + } + + ### IGNORING <INT> 5' END + # bases at the start of the read when processing the methylation call string + unless ($ignore){ + $ignore = 0; + } + + if (defined $ignore_r2){ + die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); + } + else{ + $ignore_r2 = 0; + } + + ### IGNORING <INT> 3' END + # bases at the end of the read when processing the methylation call string + unless ($ignore_3prime){ + $ignore_3prime = 0; + } + + if (defined $ignore_3prime_r2){ + die "You can only specify --ignore_3prime_r2 for paired-end result files\n" unless ($paired_end); + } + else{ + $ignore_3prime_r2 = 0; + } + + + ### NO OVERLAP + ### --no_overlap is the default (as of version 0.12.6), unless someone explicitly asks to include overlaps + if ($include_overlap){ + die "The option '--include_overlap' can only be specified for paired-end input!\n" unless ($paired_end); + warn "Setting option '--inlcude_overlap' for paired-end data (user-defined)\n\n"; + $no_overlap = 0; + } + else{ # default + if ($paired_end){ + warn "Setting option '--no_overlap' since this is (normally) the right thing to do for paired-end data\n\n"; + $no_overlap = 1; + } + } + + ### COMPREHENSIVE OUTPUT + unless ($full){ + $full = 0; + } + + ### MERGE NON-CpG context + unless ($merge_non_CpG){ + $merge_non_CpG = 0; + } + + ### remove white spaces in read ID (needed for sorting using the sort command + unless ($remove){ + $remove = 0; + } + + ### COVERAGE THRESHOLD FOR bedGraph OUTPUT + if (defined $coverage_threshold){ + unless ($coverage_threshold > 0){ + die "Please select a coverage greater than 0 (positive integers only)\n"; + } + } + else{ + $coverage_threshold = 1; + } + + ### SORT buffer size + if (defined $sort_size){ + unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ + die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; + } + } + else{ + $sort_size = '2G'; + } + + if ($zero){ + die "Option '--zero' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + + if ($CX_context){ + die "Option '--CX_context' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + else{ + $CX_context = 0; + } + + unless ($counts){ + $counts = 1; # counts will always be set + } + + if ($cytosine_report){ + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + } + else{ + die "Please specify a genome folder to proceed (full path only)\n"; + } + + unless ($bedGraph){ + warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; + $bedGraph = 1; + } + unless ($counts){ + # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; + $counts = 1; + } + warn "\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + } + } + + unless (defined $samtools_path){ + $samtools_path = ''; + } + + + if ($gazillion){ + if ($ample_mem){ + die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; + } + } + + if (defined $multicore){ + unless ($multicore > 0){ + die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; + } + if ($multicore > 20){ + warn "Core usage currently set to more than 20 threads. Let's see how this goes... (set value: $multicore)\n\n"; + } + } + else{ + $multicore = 1; # default. Single-thread mode + warn "Setting core usage to single-threaded (default). Consider using --multicore <int> to speed up the extraction process.\n\n"; + } + + return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore); +} + + +sub test_positional_sorting{ + + my $filename = shift; + + print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; + sleep(1); + + if ($filename =~ /\.gz$/) { + open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam + if ($samtools_path){ + open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (TEST,$filename) or die "Can't open file $filename: $!\n"; + } + + my $count = 0; + + while (<TEST>) { + if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) + if (/^\@SO/) { + die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + next; + } + $count++; + + last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID + + my ($id_1) = (split (/\t/)); + + ### reading the next line which should be read 2 + $_ = <TEST>; + my ($id_2) = (split (/\t/)); + last unless ($id_2); + ++$count; + + if ($id_1 eq $id_2){ + ### ids are the same + next; + } + else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first + my $id_1_trunc = $id_1; + $id_1_trunc =~ s/\/1$//; + my $id_2_trunc = $id_2; + $id_2_trunc =~ s/\/2$//; + + unless ($id_1_trunc eq $id_2_trunc){ + die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + } + } + # close TEST or die $!; somehow fails on our cluster... + ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) + warn "...passed!\n"; + sleep(1); + +} + +sub process_Bismark_results_file{ + + my $filename = shift; + my $report_filename = open_output_filehandles($filename); + + ### disabling buffering so we don't run into problems with half written out lines... + foreach my $fh (keys %fhs){ + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + select($fhs{$fh}->{$context}); + $|++; + } + } + else{ + select($fhs{$fh}); + $|++; + } + } + select(STDOUT); + + ################################################ + ################################################ + ### multi-process handling + ### + + my $offset = 1; + my $process_id; + my @pids; + if ($multicore > 1){ + + until ($offset == $multicore){ + # warn "multicore: $multicore\noffset: $offset\n"; + my $fork = fork; + + if (defined $fork){ + if ($fork != 0){ + $process_id = $fork; + push @pids, $process_id; + if ($offset < $multicore){ + ++$offset; + # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n"; + } + else{ + # warn "Reached the number of maximum multicores. Proceeeding to processing...\n"; + } + } + elsif ($fork == 0){ + # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n"; + $process_id = $fork; + last; + } + } + else{ + die "Forking unsuccessful. Proceeding using a single thread only\n"; + } + } + + # warn "\nThe thread identity\n===================\n"; + if ($process_id){ + # print "I am the parent process. My children are called:\n"; + # print join ("\t",@pids),"\n"; + # print "I am going to process the following line count: $offset\n\n"; + } + elsif($process_id == 0){ + # warn "I am a child process: Process ID: $process_id\n"; + # warn "I am going to process the following line count: $offset\n\n"; + } + else{ + die "Process ID was: $process_id\n"; + } + } + else{ + # warn "Single-core mode: setting pid to 1\n"; + $process_id = 1; + } + + ################################################ + ################################################ + if ($process_id){ + warn "Now reading in Bismark result file $filename\n"; + } + else{ + warn "\nNow reading in Bismark result file $filename\n\n"; + } + + if ($filename =~ /\.gz$/) { + open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam + if ($samtools_path){ + open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (IN,$filename) or die "Can't open file $filename: $!\n"; + } + + ### Vanilla and SAM output need to read different numbers of header lines + if ($vanilla) { + my $bismark_version = <IN>; ## discarding the Bismark version info + chomp $bismark_version; + $bismark_version =~ s/\r//; # replaces \r line feed + $bismark_version =~ s/Bismark version: //; + if ($bismark_version =~ /^\@/) { + warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; + sleep (1); + } + + unless ($version eq $bismark_version){ + die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; + } + } else { + # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. + # We are reading from it further down + } + + my $methylation_call_strings_processed = 0; + my $line_count = 0; + + ### proceeding differently now for single-end or paired-end Bismark files + + ### PROCESSING SINGLE-END RESULT FILES + if ($single) { + + ### also proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark output format + while (<IN>) { + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or + ### last position + chomp $genome_conversion; + + my $index; + if ($meth_call) { + + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> + if ($ignore) { + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + $start += $ignore; + } + elsif ($strand eq '-') { + $start += length($meth_call)-1; ## $meth_call is already shortened! + } + else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + + ### Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int> + if ($ignore_3prime) { + + $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); + + ### If we are clipping off some bases at the end we need to adjust the end position of the alignments accordingly + if ($strand eq '+') { + # clipping the 3' end does not affect the starting position # ignore 5' has already been taken care of, if relevant at all + } + elsif ($strand eq '-') { + # here we need to discriminate if the start has been adjusted because of --ignore or not + if ($ignore){ + # position adjusted already, and because of this 3' trimming is irrelevant for the start position + } + else{ + # Here we need to add the length ignore_3prime to the read starting position, adjustment of the start position will take place later in the methylation extraction step + $start += $ignore_3prime; + } + } + else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + ### just as a comment, if --ignore has not been specified the starting position of reverse reads is adjusted later at the methylation extraction stage + + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } else { # processing single-end SAM format (default) + while (<IN>) { + chomp; + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); + next; + } + + ++$line_count; + # warn "$line_count\n"; + warn "Processed lines: $line_count\n" if ($line_count%500000 == 0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + # example read in SAM format + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + ### + + # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + # < 0.7.6 $meth_call =~ s/^XM:Z://; + # < 0.7.6 $read_conversion =~ s/^XR:Z://; + # < 0.7.6 $genome_conversion =~ s/^XG:Z://; + + my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; + + ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call; ### Thanks to Zachary Zeno for this solution + my $read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call = $value; + $meth_call =~ s/\r//; + } elsif ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + my $strand; + # warn "$meth_call\n$read_conversion\n$genome_conversion\n"; + + my $index; + if ($meth_call) { + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + $strand = '+'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + $strand = '-'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output + if ($strand eq '-') { + $meth_call = reverse $meth_call; + } + # warn "\n$meth_call\n"; + + ### IGNORE 5 PRIME: Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> + if ($ignore) { + # warn "\n\n$meth_call\n"; + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + # warn "$meth_call\n";sleep(1); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # print "original CIGAR: $cigar\n"; + # print "original CIGAR: @comp_cigar\n"; + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + } + $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; + } + elsif ($strand eq '-') { + + for (1..$ignore) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + } + } + + ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR + ### string to be able to work out the starting position of the read which is on the 3' end of the sequence + my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position + foreach (@comp_cigar) { + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); + } + $start += $MD_count - 1; + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore adjusted: @comp_cigar\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore adjusted scalar: $cigar\n"; + } + + ####################### + ### INGORE 3' END ### + ####################### + + # Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int> + if ($ignore_3prime) { + # warn "$meth_call\n"; + $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); + # warn "$meth_call\n";sleep(1); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + + # print "original CIGAR: $cigar\n"; + # print join ("",@comp_cigar),"\n"; + + ### If we are clipping off some bases at the end we might have to adjust the start position of the alignments accordingly + if ($strand eq '+') { + + ### clipping the 3' end does not affect the starting position of forward strand alignments + # ignore 5' has already been taken care of at this stage, if relevant at all + + for (1..$ignore_3prime) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations from the end + # print join ("",@comp_cigar),"\n"; + # print "$_ deleted $op from 3' end\n"; + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + # print join ("",@comp_cigar),"\n"; + # print "$_ deleted $op from 3' end\n"; + } + } + # print "Final truncated CIGAR string:\n"; + # print join ("",@comp_cigar),"\n"; + # $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore_3prime: $ignore_3prime\t D count: $D_count I_count: $I_count\n"; + } + elsif ($strand eq '-') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore_3prime) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + + } + + # here we need to discriminate if the start has been adjusted because of --ignore or not + if ($ignore){ + # the start position has already been modified for --ignore already, so we don't have to adjust the position again now + } + else{ + # Here we need to add the length ignore_3prime to the read starting position + # adjustment of the true starting position of this reverse read will take place later in the methylation extraction step + $start += $ignore_3prime + $D_count - $I_count; + } + + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore_3prime adjusted:\n"; print join ("",@comp_cigar),"\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore_3prime adjusted scalar: $cigar\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } + + ### PROCESSING PAIRED-END RESULT FILES + elsif ($paired) { + + ### proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark paired-end output format + while (<IN>) { + ++$line_count; + warn "processed line: $line_count\n" if ($line_count%500000 == 0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; + + my $index; + chomp $genome_conversion; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB!!! + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT!!! + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + if ($meth_call_1 and $meth_call_2) { + ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' + + ### IGNORE FROM 5' END + if ($ignore) { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified + $start_read_1 += $ignore; + } + if ($ignore_r2) { + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified + $end_read_2 -= $ignore_r2; + } + + ### IGNORE 3' END + + ### Clipping off the last <int> number of bases from the methylation call string of Read 1 as specified with --ignore_3prime <int> + if ($ignore_3prime) { + $meth_call_1 = substr($meth_call_1,0, length($meth_call_1) - $ignore_3prime); + # we don't have to adjust the position now since the shortened methylation call will be fine, see below + } + ### Clipping off the last <int> number of bases from the methylation call string of Read 2 as specified with --ignore_3prime_r2 <int> + if ($ignore_3prime_r2) { + $meth_call_2 = substr($meth_call_2,0, length($meth_call_2) - $ignore_3prime_r2); + # we don't have to adjust the position now since the shortened methylation call will be fine, see below + } + + my $end_read_1; + my $start_read_2; + + if ($strand eq '+') { + + $end_read_1 = $start_read_1 + length($meth_call_1) - 1; + $start_read_2 = $end_read_2 - length($meth_call_2) + 1; + + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); + } + else { + + $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! + $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! + + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + else { # Bismark paired-end BAM/SAM output format (default) + while (<IN>) { + chomp; + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); # no additional warnings for child procesess + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + $_ = <IN>; # reading and skipping another line since this is the paired-end read + next; + } + + # example paired-end reads in SAM format (2 consecutive lines) + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT + + my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call_1; + my $first_read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_1 = $value; + $meth_call_1 =~ s/\r//; + } elsif ($tag eq "XR") { + $first_read_conversion = $value; + $first_read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + $_ = <IN>; # reading in the paired read + chomp; + + my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + + my $meth_call_2; + my $second_read_conversion; + + while ( /(XM|XR):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_2 = $value; + $meth_call_2 =~ s/\r//; + } elsif ($tag eq "XR") { + $second_read_conversion = $value; + $second_read_conversion = s/\r//; + } + } + + # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; + # chomp $genome_conversion; # in case it captured a new line character # already removed + + # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; + + my $index; + my $strand; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + $strand = '+'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT + $strand = '-'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB + $strand = '+'; + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + ### reversing the methylation call of the read that was reverse-complemented + if ($strand eq '+') { + $meth_call_2 = reverse $meth_call_2; + } else { + $meth_call_1 = reverse $meth_call_1; + } + # warn "\n$meth_call_1\n$meth_call_2\n"; + + if ($meth_call_1 and $meth_call_2) { + + my $end_read_1; + + ### READ 1 + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); + + my @comp_cigar_1; # building an array with all CIGAR operations + foreach my $index (0..$#len_1) { + foreach (1..$len_1[$index]) { + # print "$ops_1[$index]"; + push @comp_cigar_1, $ops_1[$index]; + } + } + # print "original CIGAR read 1: $cigar_1\n"; + # print "original CIGAR read 1: @comp_cigar_1\n"; + + ### READ 2 + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + my @comp_cigar_2; # building an array with all CIGAR operations for read 2 + foreach my $index (0..$#len_2) { + foreach (1..$len_2[$index]) { + # print "$ops_2[$index]"; + push @comp_cigar_2, $ops_2[$index]; + } + } + # print "original CIGAR read 2: $cigar_2\n"; + # print "original CIGAR read 2: @comp_cigar_2\n"; + + ################################## + ### IGNORE BASES FROM 5' END ### + ################################## + + if ($ignore) { + ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore <int>' for read 1 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_1) - $ignore) <= 0){ + # next; # skipping this read entirely if the read is shorter than the portion to be ignored + $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + $start_read_1 += $ignore + $D_count_1 - $I_count_1; + # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; + + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back + for (1..$ignore) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + + } + } + } + + if ($ignore_r2) { + ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_2) - $ignore_r2) <= 0){ + # next; # skipping this read entirely if the read is shorter than the portion to be ignored # this would skip the entire read pair! + $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back + + for (1..$ignore_r2) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore_r2) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; + # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + } + } + } + + if ($ignore and $meth_call_1){ # if the methylation call string is undefined at this position we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; + } + + if ($ignore_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } + else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; + } + + ########################### + ### END IGNORE 5' END ### + ########################### + + ################################## + ### IGNORE BASES FROM 3' END ### + ################################## + + # print "CIGAR string before truncating 3' end (Read 1)\n"; + # print join ("",@comp_cigar_1),"\n"; + + if ($ignore_3prime and $meth_call_1) { # if the methylation call string is undefined at this point we don't need to process the read any further + + ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime <int>' for read 1 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_1) - $ignore_3prime) <= 0){ + $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_1 = substr($meth_call_1,0,length($meth_call_1) - $ignore_3prime); + # warn "truncated meth_call 1:\n$meth_call_1\n"; + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', clipping the 3' end does not affect the starting position of forward strand alignments + # ignore 5' has already been taken care of at this stage, if relevant at all + + for (1..$ignore_3prime) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore_3prime operations from the end + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_1),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + # print join ("",@comp_cigar_1),"\n"; + # print "$_ deleted $op from 3' end\n"; + } + } + + # print "Final truncated CIGAR string (Read 1):\n"; + # print join ("",@comp_cigar_1),"\n"; + + } + elsif ($strand eq '-') { + + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + ### if the (read 1) strand information is '-', the read 1 CIGAR string needs to be trimmed from the start + for (1..$ignore_3prime) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array + # print join ("",@comp_cigar_1),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print join ("",@comp_cigar_1),"\n"; + } + + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + # print "Final truncated CIGAR string reverse_read:\n"; + # print join ("",@comp_cigar_1),"\n"; + + # Here we need to add the length ignore_3prime to the read starting position + # adjustment of the true start position of this reverse read will take place later in the methylation extraction step + $start_read_1 += $ignore_3prime + $D_count_1 - $I_count_1; + + } + } + } + + if ($ignore_3prime_r2 and $meth_call_2) { # if the methylation call string is undefined at this point we don't need to process the read any further + + ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime_r2 <int>' for read 2 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_2) - $ignore_3prime_r2) <= 0){ + $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_2 = substr($meth_call_2,0,length($meth_call_2) - $ignore_3prime_r2); + # warn "truncated meth_call 2:\n$meth_call_2\n"; + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string and the positions accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', clipping the 3' end of read 2 does potentially affect the starting position of read 2 (reverse strand alignment) + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the start + + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore_3prime_r2) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the first value of the array + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_2),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_2),"\n"; + } + + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + # print "Final truncated CIGAR string 2 (+ alignment):\n"; + # print join ("",@comp_cigar_2),"\n"; + + # Here we need to add the length ignore_3prime_r2 to the read starting position + # adjustment of the true start position of this reverse read will take place later in the methylation extraction step + $start_read_2 += $ignore_3prime_r2 + $D_count_2 - $I_count_2; + + # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + } + elsif ($strand eq '-') { + ### if the (read 1) strand information is '-', clipping the 3' end of read 2 does not affect its starting position (forward strand alignment) + ### ignore_r2 5' has already been taken care of at this stage, if relevant at all + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the end + + for (1..$ignore_3prime_r2) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + # print join ("",@comp_cigar_2),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + # print "$_ deleted $op\n"; + # print join ("",@comp_cigar_2),"\n"; + } + } + + # print "Final truncated CIGAR string 2 (- alignment):\n"; + # print join ("",@comp_cigar_2),"\n"; + + } + } + } + + if ($ignore_3prime and $meth_call_1){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore_3prime adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } + else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # warn "ignore_3prime adjusted CIGAR 1 scalar: $cigar_1\n"; + } + + if ($ignore_3prime_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore_3prime_r2 adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } + else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # warn "ignore_3prime_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; + } + + ########################### + ### END IGNORE 3' END ### + ########################### + + + ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on + + if ($strand eq '+') { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 + @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_2\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + my $MD_count_2 = 0; + foreach (@comp_cigar_2) { + ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1 + $MD_count_1 - 1; + $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand + } + else { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 + + @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_1\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1; + $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand + } + + if ($strand eq '+') { + ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); + } + else { + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation call strings + } + } + } + } else { + die "Single-end or paired-end reads not specified properly\n"; + } + + $counting{sequences_count} = $line_count; + $counting{methylation_call_strings} = $methylation_call_strings_processed; + + if ($multicore == 1){ + print_splitting_report (); + } + elsif ($multicore > 1){ + print_splitting_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); + print_mbias_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); + } + + return ($process_id,\@pids,$report_filename); + +} + + + +sub print_splitting_report{ + + ### Calculating methylation percentages if applicable + warn "\nProcessed $counting{sequences_count} lines in total\n"; + warn "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; + if ($report) { + print REPORT "\nProcessed $counting{sequences_count} lines in total\n"; + print REPORT "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + ### detailed information about Cs analysed + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + + ### detailed information about Cs analysed for on-screen report + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn"Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn"Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + warn "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3-Context Output + else{ + ### printing methylated CHG percentage if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated CHH percentage if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } +} + +### + +sub print_splitting_report_multicore{ + + my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; + + # warn "\$report_filename is $report_filename\n"; + my $special_report = $report_filename.".$offset"; + + open (SPECIAL_REPORT,'>',$special_report) or die $!; + # warn "line count\t$line_count\n"; + # warn "meth call strings\t$meth_call_strings\n"; + + print SPECIAL_REPORT "line count\t$line_count\n"; + print SPECIAL_REPORT "meth call strings\t$meth_call_strings\n"; + + ### Calculating methylation percentages if applicable + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + + ### detailed information about Cs analysed + print SPECIAL_REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print SPECIAL_REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print SPECIAL_REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print SPECIAL_REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print SPECIAL_REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print SPECIAL_REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print SPECIAL_REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print SPECIAL_REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print SPECIAL_REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print SPECIAL_REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print SPECIAL_REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print SPECIAL_REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + close SPECIAL_REPORT or warn "Failed to close filehandle for individual report $special_report\n"; +} + + +### + +### INDIVIDUAL M-BIAS REPORTS + +sub print_mbias_report_multicore{ + + my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; + + # warn "\$report_filename is $report_filename\n"; + my $special_mbias_report = $report_filename.".${offset}.mbias"; + + open (SPECIAL_MBIAS,'>',$special_mbias_report) or die $!; + + # determining maximum read length + my $max_length_1 = 0; + my $max_length_2 = 0; + + foreach my $context (keys %mbias_1){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ + $max_length_1 = $pos unless ($max_length_1 >= $pos); + } + } + if ($paired){ + foreach my $context (keys %mbias_2){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ + $max_length_2 = $pos unless ($max_length_2 >= $pos); + } + } + } + + if ($single){ + # warn "Determining maximum read length for M-Bias plot\n"; + # warn "Maximum read length of Read 1: $max_length_1\n\n"; + } + else{ + # warn "Determining maximum read lengths for M-Bias plots\n"; + # warn "Maximum read length of Read 1: $max_length_1\n"; + # warn "Maximum read length of Read 2: $max_length_2\n\n"; + } + + foreach my $context (qw(CpG CHG CHH)){ + + if ($paired){ + print SPECIAL_MBIAS "$context context (R1)\n================\n"; + } + else{ + print SPECIAL_MBIAS "$context context\n===========\n"; + } + print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_1){ + + unless (defined $mbias_1{$context}->{$pos}->{meth}){ + $mbias_1{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_1{$context}->{$pos}->{un}){ + $mbias_1{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; + + print SPECIAL_MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + } + print SPECIAL_MBIAS "\n"; + } + + if ($paired){ + + foreach my $context (qw(CpG CHG CHH)){ + + print SPECIAL_MBIAS "$context context (R2)\n================\n"; + print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_2){ + + unless (defined $mbias_2{$context}->{$pos}->{meth}){ + $mbias_2{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_2{$context}->{$pos}->{un}){ + $mbias_2{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; + + print SPECIAL_MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + } + } + } + + close SPECIAL_MBIAS or warn "Failed to close filehandle for individual M-bias report $special_mbias_report\n"; +} + + +### + + +sub print_individual_C_methylation_states_paired_end_files{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; + + unless (defined $meth_call) { + return; # skip this read + } + + ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 + die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); + + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in Unknown context (was protected) ### + ### u for not methylated C in Unknown context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + my @comp_cigar; + + ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing + if ($cigar =~ /^\d+M$/){ + # this check speeds up the extraction process by up to 60%!!! + } + else{ # parsing CIGAR string + my @len; + my @ops; + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + if ($strand eq '-') { + + ### the CIGAR string needs to be reversed, the methylation call has already been reversed above + if (@comp_cigar){ + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + } + # print "reverse CIGAR string: @comp_cigar\n"; + + ### the start position of paired-end files has already been corrected, see above + } + + ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified + + if ($merge_non_CpG) { + if ($no_overlap) { # this has to be read 2... + + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The read orientation was neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation was neither + nor -: '$strand'/n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + ### Still within the 2-CONTEXT ONLY optional section + else { + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + ### still within the 2-CONTEXT optional section + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + } + + ############################################ + ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### + ############################################ + + elsif ($no_overlap) { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + + ### this is the paired-end procedure allowing overlaps and using every single C position + else { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } +} + +sub check_cigar_string { + my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; + # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; + my ($new_cigar_offset,$new_pos_offset) = (0,0); + + if ($strand eq '+') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by -1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by another -1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another +1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + + elsif ($strand eq '-') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position + # warn "adjusted genomic position by +1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "Found new 'M' operation; position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position + # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another -1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; + return ($new_cigar_offset,$new_pos_offset); +} + +sub print_individual_C_methylation_states_single_end{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + + my @comp_cigar; + + if ($cigar){ # parsing CIGAR string + + ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing + if ($cigar =~ /^\d+M$/){ + # warn "See!? I told you so! $cigar\n"; + # sleep(1); + } + else{ + + my @len; + my @ops; + + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + ### adjusting the start position for all reads mapping to the reverse strand + if ($strand eq '-') { + + if (@comp_cigar){ # only needed for SAM reads with InDels + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print @comp_cigar,"\n"; + } + + unless ($ignore){ ### if --ignore was specified the start position has already been corrected + + if ($cigar){ ### SAM format + if ($cigar =~ /^(\d+)M$/){ # linear match + $start += $1 - 1; + } + else{ # InDel read + my $MD_count = 0; + foreach (@comp_cigar){ + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + $start += $MD_count - 1; + } + } + else{ ### vanilla format + $start += length($meth_call)-1; + } + } + } + + ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) + + ### single-file CpG and other-context output + if ($full and $merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + # print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + # print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + #print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + #print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + elsif ($merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION + + elsif ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The read had a strand orientation which was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } +} + +sub open_output_filehandles{ + + my $filename = shift; + + my $output_filename = (split (/\//,$filename))[-1]; + my $report_filename = $output_filename; + + ### OPENING OUTPUT-FILEHANDLES + if ($report) { + $report_filename =~ s/\.sam$//; + $report_filename =~ s/\.txt$//; + $report_filename =~ s/$/_splitting_report.txt/; + $report_filename = $output_dir . $report_filename; + open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; + } + + if ($report) { + + print REPORT "$output_filename\n\n"; + print REPORT "Parameters used to extract methylation information:\n"; + print REPORT "Bismark Extractor Version: $version\n"; + + if ($paired) { + if ($vanilla) { + print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: paired-end (SAM format)\n"; # default + } + } + + if ($single) { + if ($vanilla) { + print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: single-end (SAM format)\n"; # default + } + } + if ($single){ + if ($ignore) { + print REPORT "Ignoring first $ignore bp\n"; + } + if ($ignore_3prime) { + print REPORT "Ignoring last $ignore_3prime bp\n"; + } + } + else{ # paired-end + if ($ignore) { + print REPORT "Ignoring first $ignore bp of Read 1\n"; + } + if ($ignore_r2){ + print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; + } + + if ($ignore_3prime) { + print REPORT "Ignoring last $ignore_3prime bp of Read 1\n"; + } + if ($ignore_3prime_r2){ + print REPORT "Ignoring last $ignore_3prime_r2 bp of Read 2\n"; + } + + } + + if ($full) { + print REPORT "Output specified: comprehensive\n"; + } else { + print REPORT "Output specified: strand-specific (default)\n"; + } + + if ($no_overlap) { + print REPORT "No overlapping methylation calls specified\n"; + } + if ($genomic_fasta) { + print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; + } + if ($merge_non_CpG) { + print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; + } + + print REPORT "\n"; + } + + ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + + ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL + ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file + if ($full and $merge_non_CpG) { + my $cpg_output = my $other_c_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ ### disclaimer: I am aware of "The Useless Use of Cat Awards", but I saw no other option... + open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless ($no_header) { + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in any other context than CpG + $other_c_output =~ s/^/Non_CpG_context_/; + $other_c_output =~ s/sam$/txt/; + $other_c_output =~ s/bam$/txt/; + $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); + $other_c_output = $output_dir . $other_c_output; + + if ($gzip){ + $other_c_output .= '.gz'; + open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{other_context},"| cat > $other_c_output") or die "Failed to write to $other_c_output $!\n" unless($mbias_only); + # open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); + push @sorting_files,$other_c_output; + + + unless ($no_header) { + print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + + ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found + elsif ($merge_non_CpG) { + + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in Non-CpG (CC, CT or CA) context + my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; + + $other_c_ot =~ s/^/Non_CpG_OT_/; + $other_c_ot =~ s/sam$/txt/; + $other_c_ot =~ s/bam$/txt/; + $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); + $other_c_ot = $output_dir . $other_c_ot; + + if ($gzip){ + $other_c_ot .= '.gz'; + open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{other_c},"| cat > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); + push @sorting_files,$other_c_ot; + + unless($no_header){ + print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctot =~ s/^/Non_CpG_CTOT_/; + $other_c_ctot =~ s/sam$/txt/; + $other_c_ctot =~ s/bam$/txt/; + $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); + $other_c_ctot = $output_dir . $other_c_ctot; + + if ($gzip){ + $other_c_ctot .= '.gz'; + open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{other_c},"| cat > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); + push @sorting_files,$other_c_ctot; + + unless($no_header){ + print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctob =~ s/^/Non_CpG_CTOB_/; + $other_c_ctob =~ s/sam$/txt/; + $other_c_ctob =~ s/bam$/txt/; + $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); + $other_c_ctob = $output_dir . $other_c_ctob; + + if ($gzip){ + $other_c_ctob .= '.gz'; + open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{other_c},"| cat > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); + push @sorting_files,$other_c_ctob; + + unless($no_header){ + print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ob =~ s/^/Non_CpG_OB_/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); + $other_c_ob = $output_dir . $other_c_ob; + + if ($gzip){ + $other_c_ob .= '.gz'; + open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{other_c},"| cat > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); + push @sorting_files,$other_c_ob; + + unless($no_header){ + print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) + + ### if --comprehensive was specified we are only writing one file per context + elsif ($full) { + my $cpg_output = my $chg_output = my $chh_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless($no_header){ + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHG context + $chg_output =~ s/^/CHG_context_/; + $chg_output =~ s/sam$/txt/; + $chg_output =~ s/bam$/txt/; + $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); + $chg_output = $output_dir . $chg_output; + + if ($gzip){ + $chg_output .= '.gz'; + open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHG_context},"| cat > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); + # open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); + push @sorting_files,$chg_output; + + unless($no_header){ + print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHH context + $chh_output =~ s/^/CHH_context_/; + $chh_output =~ s/sam$/txt/; + $chh_output =~ s/bam$/txt/; + $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); + $chh_output = $output_dir . $chh_output; + + if ($gzip){ + $chh_output .= '.gz'; + open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHH_context},"| cat > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); + # open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); + push @sorting_files, $chh_output; + + unless($no_header){ + print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found + else { + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHG context + my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; + + $chg_ot =~ s/^/CHG_OT_/; + $chg_ot =~ s/sam$/txt/; + $chg_ot =~ s/bam$/txt/; + $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); + $chg_ot = $output_dir . $chg_ot; + + if ($gzip){ + $chg_ot .= '.gz'; + open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHG},"| cat > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); + push @sorting_files,$chg_ot; + + unless($no_header){ + print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctot =~ s/^/CHG_CTOT_/; + $chg_ctot =~ s/sam$/txt/; + $chg_ctot =~ s/bam$/txt/; + $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); + $chg_ctot = $output_dir . $chg_ctot; + + if ($gzip){ + $chg_ctot .= '.gz'; + open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHG},"| cat > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); + push @sorting_files,$chg_ctot; + + unless($no_header){ + print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctob =~ s/^/CHG_CTOB_/; + $chg_ctob =~ s/sam$/txt/; + $chg_ctob =~ s/bam$/txt/; + $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); + $chg_ctob = $output_dir . $chg_ctob; + + if ($gzip){ + $chg_ctob .= '.gz'; + open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHG},"| cat > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); + push @sorting_files,$chg_ctob; + + unless($no_header){ + print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ob =~ s/^/CHG_OB_/; + $chg_ob =~ s/sam$/txt/; + $chg_ob =~ s/bam$/txt/; + $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); + $chg_ob = $output_dir . $chg_ob; + + if ($gzip){ + $chg_ob .= '.gz'; + open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHG},"| cat > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); + push @sorting_files,$chg_ob; + + unless($no_header){ + print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHH context + my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; + + $chh_ot =~ s/^/CHH_OT_/; + $chh_ot =~ s/sam$/txt/; + $chh_ot =~ s/bam$/txt/; + $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); + $chh_ot = $output_dir . $chh_ot; + + if ($gzip){ + $chh_ot .= '.gz'; + open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHH},"| cat > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); + push @sorting_files,$chh_ot; + + unless($no_header){ + print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctot =~ s/^/CHH_CTOT_/; + $chh_ctot =~ s/sam$/txt/; + $chh_ctot =~ s/bam$/txt/; + $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); + $chh_ctot = $output_dir . $chh_ctot; + + if ($gzip){ + $chh_ctot .= '.gz'; + open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHH},"| cat > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); + push @sorting_files,$chh_ctot; + + unless($no_header){ + print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctob =~ s/^/CHH_CTOB_/; + $chh_ctob =~ s/sam$/txt/; + $chh_ctob =~ s/bam$/txt/; + $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); + $chh_ctob = $output_dir . $chh_ctob; + + if ($gzip){ + $chh_ctob .= '.gz'; + open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHH},"| cat > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); + push @sorting_files,$chh_ctob; + + unless($no_header){ + print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ob =~ s/^/CHH_OB_/; + $chh_ob =~ s/sam$/txt/; + $chh_ob =~ s/bam$/txt/; + $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); + $chh_ob = $output_dir . $chh_ob; + + if ($gzip){ + $chh_ob .= '.gz'; + open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHH},"| cat > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); + push @sorting_files,$chh_ob; + + unless($no_header){ + print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + return $report_filename; +} + +sub isBam{ + + my $filename = shift; + + # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy) + open (DISGUISE,"zcat $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n"; + + ### when BAM files read through a zcat stream they start with BAM... + my $bam_in_disguise = <DISGUISE>; + # warn "BAM in disguise: $bam_in_disguise\n\n"; + + if ($bam_in_disguise){ + if ($bam_in_disguise =~ /^BAM/){ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 1; + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } +} + + +sub print_helpfile{ + + print << 'HOW_TO'; + + +DESCRIPTION + +The following is a brief description of all options to control the Bismark +methylation extractor. The script reads in a bisulfite read alignment results file +produced by the Bismark bisulfite mapper and extracts the methylation information +for individual cytosines. This information is found in the methylation call field +which can contain the following characters: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ X for methylated C in CHG context ~~~ + ~~~ x for not methylated C CHG ~~~ + ~~~ H for methylated C in CHH context ~~~ + ~~~ h for not methylated C in CHH context ~~~ + ~~~ Z for methylated C in CpG context ~~~ + ~~~ z for not methylated C in CpG context ~~~ + ~~~ U for methylated C in Unknown context (CN or CHN ~~~ + ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ + ~~~ . for any bases not involving cytosines ~~~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The methylation extractor outputs result files for cytosines in CpG, CHG and CHH +context (this distinction is actually already made in Bismark itself). As the methylation +information for every C analysed can produce files which easily have tens or even hundreds of +millions of lines, file sizes can become very large and more difficult to handle. The C +methylation info additionally splits cytosine methylation calls up into one of the four possible +strands a given bisulfite read aligned against: + + OT original top strand + CTOT complementary to original top strand + + OB original bottom strand + CTOB complementary to original bottom strand + +Thus, by default twelve individual output files are being generated per input file (unless +--comprehensive is specified, see below). The output files can be imported into a genome +viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact +unless the bisulfite reads were generated preserving directionality it doesn't make any +sense to look at the data in a strand-specific manner). Strand-specific output files can +optionally be skipped, in which case only three output files for CpG, CHG or CHH context +will be generated. For both the strand-specific and comprehensive outputs there is also +the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. + + +The output files are in the following format (tab delimited): + +<sequence_id> <strand> <chromosome> <position> <methylation call> + + +USAGE: methylation_extractor [options] <filenames> + + +ARGUMENTS: +========== + +<filenames> A space-separated list of Bismark result files in SAM format from + which methylation information is extracted for every cytosine in + the reads. For alignment files in the older custom Bismark output + see option '--vanilla'. + +OPTIONS: + +-s/--single-end Input file(s) are Bismark result file(s) generated from single-end + read data. Specifying either --single-end or --paired-end is + mandatory. + +-p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end + read data. Specifying either --paired-end or --single-end is + mandatory. + +--vanilla The Bismark result input file(s) are in the old custom Bismark format + (up to version 0.5.x) and not in SAM format which is the default as + of Bismark version 0.6.x or higher. Default: OFF. + +--no_overlap For paired-end reads it is theoretically possible that read_1 and + read_2 overlap. This option avoids scoring overlapping methylation + calls twice (only methylation calls of read 1 are used for in the process + since read 1 has historically higher quality basecalls than read 2). + Whilst this option removes a bias towards more methylation calls + in the center of sequenced fragments it may de facto remove a sizable + proportion of the data. This option is on by default for paired-end data + but can be disabled using --include_overlap. Default: ON. + +--include_overlap For paired-end data all methylation calls will be extracted irrespective of + of whether they overlap or not. Default: OFF. + +--ignore <int> Ignore the first <int> bp from the 5' end of Read 1 (or single-end alignment + files) when processing the methylation call string. This can remove e.g. a + restriction enzyme site at the start of each read or any other source of + bias (such as PBAT-Seq data). + +--ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing + results only. Since the first couple of bases in Read 2 of BS-Seq experiments + show a severe bias towards non-methylation as a result of end-repairing + sonicated fragments with unmethylated cytosines (see M-bias plot), it is + recommended that the first couple of bp of Read 2 are removed before + starting downstream analysis. Please see the section on M-bias plots in the + Bismark User Guide for more details. + +--ignore_3prime <int> Ignore the last <int> bp from the 3' end of Read 1 (or single-end alignment + files) when processing the methylation call string. This can remove unwanted + biases from the end of reads. + +--ignore_3prime_r2 <int> Ignore the last <int> bp from the 3' end of Read 2 of paired-end sequencing + results only. This can remove unwanted biases from the end of reads. + +--comprehensive Specifying this option will merge all four possible strand-specific + methylation info into context-dependent output files. The default + + contexts are: + - CpG context + - CHG context + - CHH context + +--merge_non_CpG This will produce two output files (in --comprehensive mode) or eight + strand-specific output files (default) for Cs in + - CpG context + - non-CpG context + +--report Prints out a short methylation summary as well as the paramaters used to run + this script. Default: ON. + +--no_header Suppresses the Bismark version header line in all output files for more convenient + batch processing. + +-o/--output DIR Allows specification of a different output directory (absolute or relative + path). If not specified explicitely, the output will be written to the current directory. + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in + a GZIP compressed form to save disk space. This option does not work on bedGraph and + genome-wide cytosine reports as they are 'tiny' anyway. + +--version Displays version information. + +-h/--help Displays this help file and exits. + +--mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as + well as a report (optional) and then quit. Default: OFF. + +--mbias_off The methylation extractor will process the entire file as usual but doesn't write out any M-bias report. + Only recommended for users who deliberately want to keep an earlier version of the M-bias report. + Default: OFF. + +--multicore <int> Sets the number of cores to be used for the methylation extraction process. If system resources + are plentiful this is a viable option to speed up the extraction process (we observed a near linear + speed increase for up to 10 cores used). Please note that a typical process of extracting a BAM file + and writing out '.gz' output streams will in fact use ~3 cores per value of --multicore <int> + specified (1 for the methylation extractor itself, 1 for a Samtools stream, 1 for GZIP stream), so + --multicore 10 is likely to use around 30 cores of system resources. This option has no bearing + on the bismark2bedGraph or genome-wide cytosine report processes. + + + + +bedGraph specific options: +========================== + +--bedGraph After finishing the methylation extraction, the methylation output is written into a + sorted bedGraph file that reports the position of a given cytosine and its methylation + state (in %, see details below). The methylation extractor output is temporarily split up into + temporary files, one per chromosome (written into the current directory or folder + specified with -o/--output); these temp files are then used for sorting and deleted + afterwards. By default, only cytosines in CpG context will be sorted. The option + '--CX_context' may be used to report all cytosines irrespective of sequence context + (this will take MUCH longer!). The default folder for temporary files during the sorting + process is the output directory. The bedGraph conversion step is performed by the external + module 'bismark2bedGraph'; this script needs to reside in the same folder as the + bismark_methylation_extractor itself. + +--zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start + and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file, + instead of using 1-based coordinates throughout. Default: OFF. + + +--cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide + before its methylation percentage is reported. Default: 1. + +--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. + +--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered + in the experiment irrespective of its sequence context. This applies to both forward and + reverse strands. Please be aware that this option may generate large temporary and output files + and may take a long time to sort (up to many hours). Default: OFF. + (i.e. Default = CpG context only). + +--buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information. + Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or + a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. + (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. + Defaults to 2G. + +--scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of + scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to + individual chromosome files. These errors were caused by the operating system's limit + of the number of filehandle that can be written to at any one time (typically 1024; to + find out this limit on Linux, type: ulimit -a). + To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort + methylation calls into individual chromosome files. Instead, all input files are + temporarily merged into a single file (unless there is only a single file), and this + file will then be sorted by both chromosome AND position using the Unix sort command. + Please be aware that this option might take a looooong time to complete, depending on + the size of the input files, and the memory you allocate to this process (see --buffer_size). + Nevertheless, it seems to be working. + +--ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will + instead use two arrays to sort methylated and unmethylated calls. This may result in a faster + sorting process of very large files, but this comes at the cost of a larger memory footprint + (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB + of RAM). Due to overheads in creating and looping through these arrays it seems that it will + actually be *slower* for small files (few million alignments), and we are currently testing at + which point it is advisable to use this option. Note that --ample_memory is not compatible + with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). + + + +Genome-wide cytosine methylation report specific options: +========================================================= + +--cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a + genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based + chromosome coordinates (zero-based start coords are optional) and reports CpG context only (all + cytosine context is optional). The output considers all Cs on both forward and reverse strands and + reports their position, strand, trinucleotide content and methylation state (counts are 0 if not + covered). The cytosine report conversion step is performed by the external module + 'coverage2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor + itself. + +--CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of + its context. This applies to both forward and reverse strands. Please be aware that this will + generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. + Default: OFF (i.e. Default = CpG context only). + +--zero_based Uses 0-based genomic coordinates instead of 1-based coordinates. Default: OFF. + +--genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted + formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. + +--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files + will be named to include the input filename and the chromosome number. + + + +OUTPUT: + +The bismark_methylation_extractor output is in the form: +======================================================== +<seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call> + +* Methylated cytosines receive a '+' orientation, +* Unmethylated cytosines receive a '-' orientation. + + + +The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): +========================================================================================================= + +track type=bedGraph (header line) + +<chromosome> <start position> <end position> <methylation percentage> + + + +The coverage output looks like this (tab-delimited, 1-based genomic coords; zero-based half-open coordinates available with '--zero_based'): +============================================================================================================================================ + +<chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> + + + +The genome-wide cytosine methylation output file is tab-delimited in the following format: +========================================================================================== +<chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> + + + +This script was last modified on 22 April 2015. + +HOW_TO +}