Mercurial > repos > bgruening > bismark
diff bismark_methylation_extractor @ 0:62c6da72dd4a draft
Uploaded
author | bgruening |
---|---|
date | Sat, 06 Jul 2013 09:57:36 -0400 |
parents | |
children | 91f07ff056ca |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_methylation_extractor Sat Jul 06 09:57:36 2013 -0400 @@ -0,0 +1,4005 @@ +#!/usr/bin/perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see <http://www.gnu.org/licenses/>. + +my @filenames; # input files +my %counting; +my $parent_dir = getcwd(); + +my %fhs; + +my $version = 'v0.7.11'; +my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip) = process_commandline(); + + +### only needed for bedGraph output +my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files +my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total +my @bedfiles; + +### only needed for genome-wide cytosine methylation report +my %chromosomes; + +############################################################################################## +### Summarising Run Parameters +############################################################################################## + +### METHYLATION EXTRACTOR + +warn "Summarising Bismark methylation extractor parameters:\n"; +warn '='x63,"\n"; + +if ($single){ + if ($vanilla){ + warn "Bismark single-end vanilla format specified\n"; + } + else{ + warn "Bismark single-end SAM format specified (default)\n"; # default + } +} +elsif ($paired){ + if ($vanilla){ + warn "Bismark paired-end vanilla format specified\n"; + } + else{ + warn "Bismark paired-end SAM format specified (default)\n"; # default + } +} + +if ($ignore){ + warn "First $ignore bases will be disregarded when processing the methylation call string\n"; +} + +if ($full){ + warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; +} +if ($merge_non_CpG){ + warn "Merge CHG and CHH context to non-CpG context specified\n"; +} +### output directory +if ($output_dir eq ''){ + warn "Output will be written to the current directory ('$parent_dir')\n"; +} +else{ + warn "Output path specified as: $output_dir\n"; +} + + +sleep (1); + +### BEDGRAPH + +if ($bedGraph){ + warn "\n\nSummarising bedGraph parameters:\n"; + warn '='x63,"\n"; + + if ($counts){ + warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n"; + } + else{ + warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; + } + + warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; + + if ($CX_context){ + warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; + } + + if ($remove){ + warn "White spaces in read ID names will be removed prior to sorting\n"; + } + + if (defined $sort_size){ + warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; + } + else{ + warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; + } + + + + sleep (1); + + if ($cytosine_report){ + warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; + warn '='x63,"\n"; + warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n"; + + + if ($CX_context){ + warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting cytosine methylation in CpG context only (default)\n"; + } + + if ($split_by_chromosome){ + warn "Splitting the cytosine report output up into individual files for each chromosome\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Using zero-based genomic coordinates (user-defined)\n"; + } + else{ # default, 1-based coords + warn "Using 1-based genomic coordinates (default)\n"; + } + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + warn "Genome folder was specified as $genome_folder\n"; + } + else{ + $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; + warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; + } + sleep (1); + } +} + +warn "\n"; +sleep (5); + +###################################################### +### PROCESSING FILES +###################################################### + +foreach my $filename (@filenames){ + # resetting counters and filehandles + %fhs = (); + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + ); + @sorting_files = (); + @bedfiles = (); + + process_Bismark_results_file($filename); + + ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + foreach my $fh (keys %fhs) { + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + close $fhs{$fh}->{$context} or die $!; + + } + } else { + close $fhs{$fh} or die $!; + } + } + + if ($bedGraph){ + + my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $out =~ s/gz$//; + $out =~ s/sam$//; + $out =~ s/bam$//; + $out =~ s/txt$//; + $out =~ s/$/bedGraph/; + + + + my $bedGraph_output = $out; + my @args; + + if ($remove){ + push @args, '--remove'; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($no_header){ + push @args, '--no_header'; + } + if ($counts){ + push @args, "--counts"; + } + + push @args, "--buffer_size $sort_size"; + push @args, "--cutoff $coverage_threshold"; + push @args, "--output $bedGraph_output"; + push @args, "--dir '$output_dir'"; + + ### adding all files to be sorted to @args + foreach my $f (@sorting_files){ + push @args, $f; + } + + # print join "\t",@args,"\n"; + + system ("$Bin/bismark2bedGraph @args"); + + warn "Finished BedGraph conversion ...\n\n"; + sleep(3); + + # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; + # warn "Writing bedGraph to file: $bedGraph_output\n"; + # process_bedGraph_output(); + # close OUT or die $!; + + ### genome-wide cytosine methylation report requires bedGraph processing anyway + if ($cytosine_report){ + @args = (); # resetting @args + my $cytosine_out = $out; + $cytosine_out =~ s/bedGraph$//; + + if ($CX_context){ + $cytosine_out =~ s/$/CX_report.txt/; + } + else{ + $cytosine_out =~ s/$/CpG_report.txt/; + } + + push @args, "--output $cytosine_out"; + push @args, "--dir '$output_dir'"; + push @args, "--genome '$genome_folder'"; + push @args, "--parent_dir '$parent_dir'"; + + if ($zero){ + push @args, "--zero"; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($split_by_chromosome){ + push @args, '--split_by_chromosome'; + } + + push @args, $bedGraph_output; # this will be the infile + + system ("$Bin/bedGraph2cytosine @args"); + # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); + warn "\n\nFinished generating genome-wide cytosine report\n\n"; + } + } +} + + +sub process_commandline{ + my $help; + my $single_end; + my $paired_end; + my $ignore; + my $genomic_fasta; + my $full; + my $report; + my $extractor_version; + my $no_overlap; + my $merge_non_CpG; + my $vanilla; + my $output_dir; + my $no_header; + my $bedGraph; + my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status + my $remove; + my $counts; + my $cytosine_report; + my $genome_folder; + my $zero; + my $CpG_only; + my $CX_context; + my $split_by_chromosome; + my $sort_size; + my $samtools_path; + my $gzip; + + my $command_line = GetOptions ('help|man' => \$help, + 'p|paired-end' => \$paired_end, + 's|single-end' => \$single_end, + 'fasta' => \$genomic_fasta, + 'ignore=i' => \$ignore, + 'comprehensive' => \$full, + 'report' => \$report, + 'version' => \$extractor_version, + 'no_overlap' => \$no_overlap, + 'merge_non_CpG' => \$merge_non_CpG, + 'vanilla' => \$vanilla, + 'o|output=s' => \$output_dir, + 'no_header' => \$no_header, + 'bedGraph' => \$bedGraph, + "cutoff=i" => \$coverage_threshold, + "remove_spaces" => \$remove, + "counts" => \$counts, + "cytosine_report" => \$cytosine_report, + 'g|genome_folder=s' => \$genome_folder, + "zero_based" => \$zero, + "CX|CX_context" => \$CX_context, + "split_by_chromosome" => \$split_by_chromosome, + "buffer_size=s" => \$sort_size, + 'samtools_path=s' => \$samtools_path, + "gzip" => \$gzip, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($extractor_version){ + print << "VERSION"; + + + Bismark Methylation Extractor + + Bismark Extractor Version: $version + Copyright 2010-13 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + + ### no files provided + unless (@ARGV){ + die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; + } + @filenames = @ARGV; + + warn "\n *** Bismark methylation extractor version $version ***\n\n"; + + ### IGNORING <INT> bases at the start of the read when processing the methylation call string + unless ($ignore){ + $ignore = 0; + } + ### PRINT A REPORT + unless ($report){ + $report = 0; + } + + ### OUTPUT DIR PATH + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + $output_dir = ''; + } + + ### NO HEADER + unless ($no_header){ + $no_header = 0; + } + + ### OLD (VANILLA) OUTPUT FORMAT + unless ($vanilla){ + $vanilla = 0; + } + + if ($single_end){ + $paired_end = 0; ### SINGLE END ALIGNMENTS + } + elsif ($paired_end){ + $single_end = 0; ### PAIRED-END ALIGNMENTS + } + else{ + die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; + } + + ### NO OVERLAP + if ($no_overlap){ + die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); + } + else{ + $no_overlap = 0; + } + + ### COMPREHENSIVE OUTPUT + unless ($full){ + $full = 0; + } + + ### MERGE NON-CpG context + unless ($merge_non_CpG){ + $merge_non_CpG = 0; + } + + ### remove white spaces in read ID (needed for sorting using the sort command + unless ($remove){ + $remove = 0; + } + + ### COVERAGE THRESHOLD FOR bedGraph OUTPUT + if (defined $coverage_threshold){ + unless ($coverage_threshold > 0){ + die "Please select a coverage greater than 0 (positive integers only)\n"; + } + } + else{ + $coverage_threshold = 1; + } + + ### SORT buffer size + if (defined $sort_size){ + unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ + die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; + } + } + else{ + $sort_size = '2G'; + } + + if ($zero){ + die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report); + } + + if ($CX_context){ + die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + else{ + $CX_context = 0; + } + + unless ($counts){ + $counts = 0; + } + + if ($cytosine_report){ + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + } + else{ + die "Please specify a genome folder to proceed (full path only)\n"; + } + + unless ($bedGraph){ + warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; + $bedGraph = 1; + } + unless ($counts){ + warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; + $counts = 1; + } + warn "\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + } + } + + unless (defined $samtools_path){ + $samtools_path = ''; + } + + return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip); +} + + +sub process_Bismark_results_file{ + my $filename = shift; + + warn "\nNow reading in Bismark result file $filename\n\n"; + + if ($filename =~ /\.gz$/) { + open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/) { + if ($samtools_path){ + open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (IN,$filename) or die "Can't open file $filename: $!\n"; + } + + ### Vanilla and SAM output need to read different numbers of header lines + if ($vanilla) { + my $bismark_version = <IN>; ## discarding the Bismark version info + chomp $bismark_version; + $bismark_version =~ s/\r//; # replaces \r line feed + $bismark_version =~ s/Bismark version: //; + if ($bismark_version =~ /^\@/) { + warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; + sleep (2); + } + + unless ($version eq $bismark_version){ + die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; + } + } else { + # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. + # We are reading from it further down + } + + my $output_filename = (split (/\//,$filename))[-1]; + + ### OPENING OUTPUT-FILEHANDLES + if ($report) { + my $report_filename = $output_filename; + $report_filename =~ s/\.sam$//; + $report_filename =~ s/\.txt$//; + $report_filename =~ s/$/_splitting_report.txt/; + $report_filename = $output_dir . $report_filename; + open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; + } + + if ($report) { + print REPORT "$output_filename\n\n"; + print REPORT "Parameters used to extract methylation information:\n"; + if ($paired) { + if ($vanilla) { + print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: paired-end (SAM format)\n"; # default + } + } + + if ($single) { + if ($vanilla) { + print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: single-end (SAM format)\n"; # default + } + } + + if ($ignore) { + print REPORT "Ignoring first $ignore bases\n"; + } + + if ($full) { + print REPORT "Output specified: comprehensive\n"; + } else { + print REPORT "Output specified: strand-specific (default)\n"; + } + + if ($no_overlap) { + print REPORT "No overlapping methylation calls specified\n"; + } + if ($genomic_fasta) { + print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; + } + if ($merge_non_CpG) { + print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; + } + + print REPORT "\n"; + } + +##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + + ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL + ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file + if ($full and $merge_non_CpG) { + my $cpg_output = my $other_c_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; + } + else{ + open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; + push @sorting_files,$cpg_output; + + unless ($no_header) { + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; + } + + ### C in any other context than CpG + $other_c_output =~ s/^/Non_CpG_context_/; + $other_c_output =~ s/sam$/txt/; + $other_c_output =~ s/bam$/txt/; + $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); + $other_c_output = $output_dir . $other_c_output; + + if ($gzip){ + $other_c_output .= '.gz'; + open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n"; + } + else{ + open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n"; + } + + warn "Writing result file containing methylation information for C in any other context to $other_c_output\n"; + push @sorting_files,$other_c_output; + + + unless ($no_header) { + print {$fhs{other_context}} "Bismark methylation extractor version $version\n"; + } + } + + ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found + elsif ($merge_non_CpG) { + + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; + } + else{ + open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; + } + else{ + open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; + } + else{ + open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; + } + else{ + open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + ### For cytosines in Non-CpG (CC, CT or CA) context + my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; + + $other_c_ot =~ s/^/Non_CpG_OT_/; + $other_c_ot =~ s/sam$/txt/; + $other_c_ot =~ s/bam$/txt/; + $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); + $other_c_ot = $output_dir . $other_c_ot; + + if ($gzip){ + $other_c_ot .= '.gz'; + open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n"; + } + else{ + open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n"; + } + + warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n"; + push @sorting_files,$other_c_ot; + + unless($no_header){ + print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n"; + } + + $other_c_ctot =~ s/^/Non_CpG_CTOT_/; + $other_c_ctot =~ s/sam$/txt/; + $other_c_ctot =~ s/bam$/txt/; + $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); + $other_c_ctot = $output_dir . $other_c_ctot; + + if ($gzip){ + $other_c_ctot .= '.gz'; + open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n"; + } + else{ + open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n"; + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n"; + push @sorting_files,$other_c_ctot; + + unless($no_header){ + print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n"; + } + + $other_c_ctob =~ s/^/Non_CpG_CTOB_/; + $other_c_ctob =~ s/sam$/txt/; + $other_c_ctob =~ s/bam$/txt/; + $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); + $other_c_ctob = $output_dir . $other_c_ctob; + + if ($gzip){ + $other_c_ctob .= '.gz'; + open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n"; + } + else{ + open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n"; + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n"; + push @sorting_files,$other_c_ctob; + + unless($no_header){ + print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n"; + } + + $other_c_ob =~ s/^/Non_CpG_OB_/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); + $other_c_ob = $output_dir . $other_c_ob; + + if ($gzip){ + $other_c_ob .= '.gz'; + open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n"; + } + else{ + open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n"; + } + + warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n"; + push @sorting_files,$other_c_ob; + + unless($no_header){ + print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n"; + } + } + ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) + + ### if --comprehensive was specified we are only writing one file per context + elsif ($full) { + my $cpg_output = my $chg_output = my $chh_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; + } + else{ + open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; + push @sorting_files,$cpg_output; + + unless($no_header){ + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; + } + + ### C in CHG context + $chg_output =~ s/^/CHG_context_/; + $chg_output =~ s/sam$/txt/; + $chg_output =~ s/bam$/txt/; + $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); + $chg_output = $output_dir . $chg_output; + + if ($gzip){ + $chg_output .= '.gz'; + open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n"; + } + else{ + open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n"; + } + + warn "Writing result file containing methylation information for C in CHG context to $chg_output\n"; + push @sorting_files,$chg_output; + + unless($no_header){ + print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n"; + } + + ### C in CHH context + $chh_output =~ s/^/CHH_context_/; + $chh_output =~ s/sam$/txt/; + $chh_output =~ s/bam$/txt/; + $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); + $chh_output = $output_dir . $chh_output; + + if ($gzip){ + $chh_output .= '.gz'; + open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n"; + } + else{ + open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n"; + } + + warn "Writing result file containing methylation information for C in CHH context to $chh_output\n"; + push @sorting_files, $chh_output; + + unless($no_header){ + print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; + } + } + ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found + else { + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; + } + else{ + open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; + } + else{ + open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; + } + else{ + open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; + } + else{ + open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; + } + + ### For cytosines in CHG context + my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; + + $chg_ot =~ s/^/CHG_OT_/; + $chg_ot =~ s/sam$/txt/; + $chg_ot =~ s/bam$/txt/; + $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); + $chg_ot = $output_dir . $chg_ot; + + if ($gzip){ + $chg_ot .= '.gz'; + open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n"; + } + else{ + open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n"; + } + + warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n"; + push @sorting_files,$chg_ot; + + unless($no_header){ + print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n"; + } + + $chg_ctot =~ s/^/CHG_CTOT_/; + $chg_ctot =~ s/sam$/txt/; + $chg_ctot =~ s/bam$/txt/; + $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); + $chg_ctot = $output_dir . $chg_ctot; + + if ($gzip){ + $chg_ctot .= '.gz'; + open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n"; + } + else{ + open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n"; + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n"; + push @sorting_files,$chg_ctot; + + unless($no_header){ + print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n"; + } + + $chg_ctob =~ s/^/CHG_CTOB_/; + $chg_ctob =~ s/sam$/txt/; + $chg_ctob =~ s/bam$/txt/; + $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); + $chg_ctob = $output_dir . $chg_ctob; + + if ($gzip){ + $chg_ctob .= '.gz'; + open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n"; + } + else{ + open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n"; + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n"; + push @sorting_files,$chg_ctob; + + unless($no_header){ + print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n"; + } + + $chg_ob =~ s/^/CHG_OB_/; + $chg_ob =~ s/sam$/txt/; + $chg_ob =~ s/bam$/txt/; + $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); + $chg_ob = $output_dir . $chg_ob; + + if ($gzip){ + $chg_ob .= '.gz'; + open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n"; + } + else{ + open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n"; + } + + warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n"; + push @sorting_files,$chg_ob; + + unless($no_header){ + print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n"; + } + + ### For cytosines in CHH context + my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; + + $chh_ot =~ s/^/CHH_OT_/; + $chh_ot =~ s/sam$/txt/; + $chh_ot =~ s/bam$/txt/; + $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); + $chh_ot = $output_dir . $chh_ot; + + if ($gzip){ + $chh_ot .= '.gz'; + open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n"; + } + else{ + open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n"; + } + + warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n"; + push @sorting_files,$chh_ot; + + unless($no_header){ + print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n"; + } + + $chh_ctot =~ s/^/CHH_CTOT_/; + $chh_ctot =~ s/sam$/txt/; + $chh_ctot =~ s/bam$/txt/; + $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); + $chh_ctot = $output_dir . $chh_ctot; + + if ($gzip){ + $chh_ctot .= '.gz'; + open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n"; + } + else{ + open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n"; + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n"; + push @sorting_files,$chh_ctot; + + unless($no_header){ + print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n"; + } + + $chh_ctob =~ s/^/CHH_CTOB_/; + $chh_ctob =~ s/sam$/txt/; + $chh_ctob =~ s/bam$/txt/; + $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); + $chh_ctob = $output_dir . $chh_ctob; + + if ($gzip){ + $chh_ctob .= '.gz'; + open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n"; + } + else{ + open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n"; + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n"; + push @sorting_files,$chh_ctob; + + unless($no_header){ + print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n"; + } + + $chh_ob =~ s/^/CHH_OB_/; + $chh_ob =~ s/sam$/txt/; + $chh_ob =~ s/bam$/txt/; + $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); + $chh_ob = $output_dir . $chh_ob; + + if ($gzip){ + $chh_ob .= '.gz'; + open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n"; + } + else{ + open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n"; + } + + warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n"; + push @sorting_files,$chh_ob; + + unless($no_header){ + print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n"; + } + } + + my $methylation_call_strings_processed = 0; + my $line_count = 0; + + ### proceeding differently now for single-end or paired-end Bismark files + + ### PROCESSING SINGLE-END RESULT FILES + if ($single) { + + ### also proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark output format + while (<IN>) { + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or + ### last position + chomp $genome_conversion; + + my $index; + if ($meth_call) { + + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> + if ($ignore) { + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + $start += $ignore; + } elsif ($strand eq '-') { + $start += length($meth_call)-1; ## $meth_call is already shortened! + } else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } else { # processing single-end SAM format (default) + while (<IN>) { + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + # example read in SAM format + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + ### + + # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + # < 0.7.6 $meth_call =~ s/^XM:Z://; + # < 0.7.6 $read_conversion =~ s/^XR:Z://; + # < 0.7.6 $genome_conversion =~ s/^XG:Z://; + + my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; + + ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call; ### Thanks to Zachary Zeno for this solution + my $read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call = $value; + $meth_call =~ s/\r//; + } elsif ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + my $strand; + chomp $genome_conversion; + # print "$meth_call\n$read_conversion\n$genome_conversion\n"; + + my $index; + if ($meth_call) { + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + $strand = '+'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + $strand = '-'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output + if ($strand eq '-') { + $meth_call = reverse $meth_call; + } + + ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> + if ($ignore) { + # print "\n\n$meth_call\n"; + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + # print "$meth_call\n"; + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # print "original CIGAR: $cigar\n"; + # print "original CIGAR: @comp_cigar\n"; + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + } + $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; + } elsif ($strand eq '-') { + + for (1..$ignore) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + } + } + + ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR + ### string to be able to work out the starting position of the read which is on the 3' end of the sequence + my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position + foreach (@comp_cigar) { + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); + } + $start += $MD_count - 1; + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore adjusted: @comp_cigar\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore adjusted scalar: $cigar\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } + + ### PROCESSING PAIRED-END RESULT FILES + elsif ($paired) { + + ### proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark paired-end output format + while (<IN>) { + ++$line_count; + warn "processed line: $line_count\n" if ($line_count%500000==0); + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; + + my $index; + chomp $genome_conversion; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB!!! + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT!!! + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + if ($meth_call_1 and $meth_call_2) { + ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' + if ($ignore) { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified + $start_read_1 += $ignore; + $end_read_2 -= $ignore; + } + my $end_read_1; + my $start_read_2; + + if ($strand eq '+') { + + $end_read_1 = $start_read_1+length($meth_call_1)-1; + $start_read_2 = $end_read_2-length($meth_call_2)+1; + + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1); + } else { + + $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! + $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! + + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } else { # Bismark paired-end SAM output format (default) + while (<IN>) { + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + # example paired-end reads in SAM format (2 consecutive lines) + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT + + # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + + my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call_1; + my $first_read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_1 = $value; + $meth_call_1 =~ s/\r//; + } elsif ($tag eq "XR") { + $first_read_conversion = $value; + $first_read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + $_ = <IN>; # reading in the paired read + + # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; + # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; + # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; + # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; + # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; + + my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + + my $meth_call_2; + my $second_read_conversion; + + while ( /(XM|XR):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_2 = $value; + $meth_call_2 =~ s/\r//; + } elsif ($tag eq "XR") { + $second_read_conversion = $value; + $second_read_conversion = s/\r//; + } + } + + # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; + chomp $genome_conversion; # in case it captured a new line character + + # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; + + my $index; + my $strand; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + $strand = '+'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT + $strand = '-'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB + $strand = '+'; + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + ### reversing the methylation call of the read that was reverse-complemented + if ($strand eq '+') { + $meth_call_2 = reverse $meth_call_2; + } else { + $meth_call_1 = reverse $meth_call_1; + } + + if ($meth_call_1 and $meth_call_2) { + + my $end_read_1; + + ### READ 1 + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); + + my @comp_cigar_1; # building an array with all CIGAR operations + foreach my $index (0..$#len_1) { + foreach (1..$len_1[$index]) { + # print "$ops_1[$index]"; + push @comp_cigar_1, $ops_1[$index]; + } + } + # print "original CIGAR read 1: $cigar_1\n"; + # print "original CIGAR read 1: @comp_cigar_1\n"; + + ### READ 2 + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + my @comp_cigar_2; # building an array with all CIGAR operations for read 2 + foreach my $index (0..$#len_2) { + foreach (1..$len_2[$index]) { + # print "$ops_2[$index]"; + push @comp_cigar_2, $ops_2[$index]; + } + } + # print "original CIGAR read 2: $cigar_2\n"; + # print "original CIGAR read 2: @comp_cigar_2\n"; + + if ($ignore) { + ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' + ### the methylation calls have already been reversed where necessary + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + $start_read_1 += $ignore + $D_count_1 - $I_count_1; + # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; + + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back + + for (1..$ignore) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + } elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back + for (1..$ignore) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + $start_read_2 += $ignore + $D_count_2 - $I_count_2; + # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + + } + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n"; + + } + + if ($strand eq '+') { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 + @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_2\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + my $MD_count_2 = 0; + foreach (@comp_cigar_2) { + ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1 + $MD_count_1 - 1; + $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand + } else { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 + + @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_1\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1; + $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand + + } + + if ($strand eq '+') { + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2); + } else { + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + } else { + die "Single-end or paired-end reads not specified properly\n"; + } + + print "\n\nProcessed $line_count lines from $filename in total\n"; + print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; + if ($report) { + print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; + } + print_splitting_report (); +} + + + +sub print_splitting_report{ + + ### Calculating methylation percentages if applicable + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + ### detailed information about Cs analysed + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + + ### detailed information about Cs analysed for on-screen report + print "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + print "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3-Context Output + else{ + ### printing methylated CHG percentage if applicable + if ($percent_meCHG){ + print "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated CHH percentage if applicable + if ($percent_meCHH){ + print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } +} + + + + + +sub print_individual_C_methylation_states_paired_end_files{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + my @comp_cigar; + + ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing + if ($cigar =~ /^\d+M$/){ + } + else{ # parsing CIGAR string + my @len; + my @ops; + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + if ($strand eq '-') { + + ### the CIGAR string needs to be reversed, the methylation call has already been reversed above + if (@comp_cigar){ + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + } + # print "reverse CIGAR string: @comp_cigar\n"; + + ### the start position of paired-end files has already been corrected, see above + } + + ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified + + if ($merge_non_CpG) { + + if ($no_overlap) { + + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The read orientation was neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation was neither + nor -: '$strand'/n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + ### Still within the 2-CONTEXT ONLY optional section + else { + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + ### still within the 2-CONTEXT optional section + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + } + + ############################################ + ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### + ############################################ + + elsif ($no_overlap) { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + else { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } +} + +sub check_cigar_string { + my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; + # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; + my ($new_cigar_offset,$new_pos_offset) = (0,0); + + if ($strand eq '+') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by -1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by another -1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another +1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + + elsif ($strand eq '-') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position + # warn "adjusted genomic position by +1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "Found new 'M' operation; position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position + # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another -1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; + return ($new_cigar_offset,$new_pos_offset); +} + +sub print_individual_C_methylation_states_single_end{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + + my @comp_cigar; + + if ($cigar){ # parsing CIGAR string + + ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing + if ($cigar =~ /^\d+M$/){ + # warn "See!? I told you so! $cigar\n"; + # sleep(1); + } + else{ + + my @len; + my @ops; + + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + ### adjusting the start position for all reads mapping to the reverse strand + if ($strand eq '-') { + + if (@comp_cigar){ # only needed for SAM reads with InDels + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print @comp_cigar,"\n"; + } + + unless ($ignore){ ### if --ignore was specified the start position has already been corrected + + if ($cigar){ ### SAM format + if ($cigar =~ /^(\d+)M$/){ # linear match + $start += $1 - 1; + } + else{ # InDel read + my $MD_count = 0; + foreach (@comp_cigar){ + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + $start += $MD_count - 1; + } + } + else{ ### vanilla format + $start += length($meth_call)-1; + } + } + } + + ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) + + ### single-file CpG and other-context output + if ($full and $merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') { + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.'){ + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + elsif ($merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') { + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') { + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION + + elsif ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The read had a strand orientation which was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } +} + + + +####################################################################################################################################### +### bismark2bedGaph section - START +####################################################################################################################################### + +### has now been moved to the external script bismark2bedGraph + +# sub process_bedGraph_output{ +# warn "="x64,"\n"; +# warn "Methylation information will now be written into a bedGraph file\n"; +# warn "="x64,"\n\n"; +# sleep (2); + +# ### Closing all filehandles so that the Bismark methtylation extractor output doesn't get truncated due to buffering issues +# foreach my $fh (keys %fhs) { +# if ($fh =~ /^[1230]$/) { +# foreach my $context (keys %{$fhs{$fh}}) { +# close $fhs{$fh}->{$context} or die $!; +# } +# } else { +# close $fhs{$fh} or die $!; +# } +# } + +# ### deciding which files to use for bedGraph conversion +# foreach my $filename (@sorting_files){ +# # warn "$filename\n"; +# if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again +# $filename =~ s/.*\///; # replacing everything up to the last slash in the filename +# # warn "$filename\n"; +# } + +# if ($CX_context){ +# push @bedfiles,$filename; +# } +# else{ ## CpG context only (default) +# if ($filename =~ /^CpG_/){ +# push @bedfiles,$filename; +# } +# else{ +# # skipping CHH or CHG files +# } +# } +# } + +# warn "Using the following files as Input:\n"; +# print join ("\t",@bedfiles),"\n\n"; +# sleep (2); + +# my %temp_fhs; +# my @temp_files; # writing all context files (default CpG only) to these files prior to sorting + +# ### changing to the output directory +# unless ($output_dir eq ''){ # default +# chdir $output_dir or die "Failed to change directory to $output_dir\n"; +# warn "Changed directory to $output_dir\n"; +# } + +# foreach my $infile (@bedfiles) { + +# if ($remove) { +# warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n"; + +# if ($infile =~ /gz$/){ +# open (READ,"zcat $infile |") or die $!; +# } +# else{ +# open (READ,$infile) or die $!; +# } + +# my $removed_spaces_outfile = $infile; +# $removed_spaces_outfile =~ s/$/.spaces_removed.txt/; + +# open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n"; + +# unless ($no_header){ +# $_ = <READ>; ### Bismark version header +# print REM $_; ### Bismark version header +# } + +# while (<READ>) { +# chomp; +# my ($id,$strand,$chr,$pos,$context) = (split (/\t/)); +# $id =~ s/\s+/_/g; +# print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n"; +# } + +# close READ or die $!; +# close REM or die $!; + +# ### changing the infile name to the new file without spaces +# $infile = $removed_spaces_outfile; +# } + +# warn "Now writing methylation information for file $infile to individual files for each chromosome\n"; +# if ($infile =~ /gz$/){ +# open (IN,"zcat $infile |") or die $!; +# } +# else{ +# open (IN,$infile) or die $!; +# } + +# ## always ignoring the version header +# unless ($no_header){ +# $_ = <IN>; ### Bismark version header +# } + +# while (<IN>) { +# chomp; +# my ($chr) = (split (/\t/))[2]; +# # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n"; +# $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names +# # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n"; + +# unless (exists $temp_fhs{$chr}) { +# open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!"; +# } +# print {$temp_fhs{$chr}} "$_\n"; +# } + +# warn "Finished writing out individual chromosome files for $infile\n"; +# } +# warn "\n"; + +# @temp_files = <*.meth_extractor.temp>; + +# warn "Collecting temporary chromosome file information...\n"; +# sleep (1); +# warn "processing the following input file(s):\n"; +# warn join ("\n",@temp_files),"\n\n"; +# sleep (1); + +# foreach my $in (@temp_files) { +# if ($sort_size){ +# warn "Sorting input file $in by positions (using -S of '$sort_size')\n"; +# } +# else{ +# warn "Sorting input file $in by positions (using default memory settings)\n"; +# } +# my $sort_dir = $output_dir; +# if ($sort_dir eq ''){ +# $sort_dir = './'; +# } +# open my $ifh, "sort -S $sort_size -T $sort_dir -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!"; +# # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; + +# ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data { + +# my $name; +# my $meth_state; +# my $chr = ""; +# my $pos = 0; +# my $meth_state2; + +# my $last_pos; +# my $last_chr; + +# ############################################# } + +# while (my $line = <$ifh>) { +# next if $line =~ /^Bismark/; +# chomp $line; + +# ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement { + +# $last_chr = $chr; +# $last_pos = $pos; +# ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; + +# if (($last_pos ne $pos) || ($last_chr ne $chr)) { +# generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; +# @methylcalls = qw (0 0 0); +# } + +# ############################################# } + +# my $validated = validate_methylation_call($meth_state, $meth_state2); +# unless($validated){ +# warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; +# next; +# } +# if ($meth_state eq "+") { +# $methylcalls[0]++; +# $methylcalls[2]++; +# } else { +# $methylcalls[1]++; +# $methylcalls[2]++; +# } +# } + +# ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) { + +# $last_chr = $chr; +# $last_pos = $pos; +# if ($methylcalls[2] > 0) { +# generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; +# } +# ############################################# } + +# close $ifh or die $!; + +# @methylcalls = qw (0 0 0); # resetting @methylcalls + +# ### deleting temporary files +# my $delete = unlink $in; +# if ($delete) { +# warn "Successfully deleted the temporary input file $in\n\n"; +# } +# else { +# warn "The temporary inputfile $in could not be deleted $!\n\n"; +# } +# } +# } + +# sub generate_output{ +# my $methcount = $methylcalls[0]; +# my $nonmethcount = $methylcalls[1]; +# my $totalcount = $methylcalls[2]; +# my $last_chr = shift; +# my $last_pos = shift; +# croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0; +# croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount); + +# ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation { + +# my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based. +# my $meth_percentage; +# ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef); +# # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/; +# if (defined $meth_percentage){ +# if ($counts){ +# print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; +# } +# else{ +# print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n"; +# } +# } +# ############################################# } +# } + +# sub validate_methylation_call{ +# my $meth_state = shift; +# croak "Missing (+/-) methylation call" unless defined $meth_state; +# my $meth_state2 = shift; +# croak "Missing alphabetical methylation call" unless defined $meth_state2; +# my $is_consistent; +# ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) +# : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2)); +# return 1 if $is_consistent; +# return 0; +# } + +# sub check_CpG_methylation_call{ +# my $meth1 = shift; +# my $meth2 = shift; +# return 1 if($meth1 eq "+" && $meth2 eq "Z"); +# return 1 if($meth1 eq "-" && $meth2 eq "z"); +# return 0; +# } + +# sub check_nonCpG_methylation_call{ +# my $meth1 = shift; +# my $meth2 = shift; +# return 1 if($meth1 eq "+" && $meth2 eq "C"); +# return 1 if($meth1 eq "+" && $meth2 eq "X"); +# return 1 if($meth1 eq "+" && $meth2 eq "H"); +# return 1 if($meth1 eq "-" && $meth2 eq "c"); +# return 1 if($meth1 eq "-" && $meth2 eq "x"); +# return 1 if($meth1 eq "-" && $meth2 eq "h"); +# return 0; +# } + +####################################################################################################################################### +### bismark2bedGaph section - END +####################################################################################################################################### + + + + + + +# ####################################################################################################################################### +# ### genome-wide cytosine methylation report - START +# ####################################################################################################################################### + +### has now been moved to the external script bedGraph2cytosine + +# sub generate_genome_wide_cytosine_report { + +# warn "="x78,"\n"; +# warn "Methylation information will now be written into a genome-wide cytosine report\n"; +# warn "="x78,"\n\n"; +# sleep (2); + +# ### changing to the output directory again +# unless ($output_dir eq ''){ # default +# chdir $output_dir or die "Failed to change directory to $output_dir\n"; +# # warn "Changed directory to $output_dir\n"; +# } + +# my $in = shift; +# open (IN,$in) or die $!; + +# my $cytosine_out = shift; + +# if ($CX_context){ +# $cytosine_out =~ s/$/genome-wide_CX_report.txt/; +# } +# else{ +# $cytosine_out =~ s/$/genome_wide_CpG_report.txt/; +# } + +# ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands +# unless ($split_by_chromosome){ ### writing all output to a single file (default) +# open (CYT,'>',$cytosine_out) or die $!; +# warn "Writing genome-wide cytosine report to: $cytosine_out\n"; +# sleep (3); +# } + +# my $last_chr; +# my %chr; # storing reads for one chromosome at a time + +# my $count = 0; +# while (<IN>){ +# chomp; +# ++$count; +# my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); + +# # defining the first chromosome +# unless (defined $last_chr){ +# $last_chr = $chr; +# # warn "Storing all covered cytosine positions for chromosome: $chr\n"; +# } + +# if ($chr eq $last_chr){ +# $chr{$chr}->{$start}->{meth} = $meth; +# $chr{$chr}->{$start}->{nonmeth} = $nonmeth; +# } +# else{ +# warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + +# if ($split_by_chromosome){ ## writing output to 1 file per chromosome +# my $chromosome_out = $cytosine_out; +# $chromosome_out =~ s/txt$/chr${last_chr}.txt/; +# open (CYT,'>',$chromosome_out) or die $!; +# } + +# while ( $chromosomes{$last_chr} =~ /([CG])/g){ + +# my $tri_nt = ''; +# my $context = ''; +# my $pos = pos$chromosomes{$last_chr}; + +# my $strand; +# my $meth = 0; +# my $nonmeth = 0; + +# if ($1 eq 'C'){ # C on forward strand +# $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! +# $strand = '+'; +# } +# elsif ($1 eq 'G'){ # C on reverse strand +# $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! +# $tri_nt = reverse $tri_nt; +# $tri_nt =~ tr/ACTG/TGAC/; +# $strand = '-'; +# } +# next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + +# if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! +# $meth = $chr{$last_chr}->{$pos-1}->{meth}; +# $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; +# } + +# ### determining cytosine context +# if ($tri_nt =~ /^CG/){ +# $context = 'CG'; +# } +# elsif ($tri_nt =~ /^C.{1}G$/){ +# $context = 'CHG'; +# } +# elsif ($tri_nt =~ /^C.{2}$/){ +# $context = 'CHH'; +# } +# else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) +# warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; +# next; +# } + +# if ($CpG_only){ +# if ($tri_nt =~ /^CG/){ # CpG context is the default +# if ($zero){ # zero based coordinates +# $pos -= 1; +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# else{ # default +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# } +# } +# else{ ## all cytosines, specified with --CX +# if ($zero){ # zero based coordinates +# $pos -= 1; +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# else{ # default +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# } +# } + +# %chr = (); # resetting the hash + +# # new first entry +# $last_chr = $chr; +# $chr{$chr}->{$start}->{meth} = $meth; +# $chr{$chr}->{$start}->{nonmeth} = $nonmeth; +# } +# } + +# # Last found chromosome +# warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + +# if ($split_by_chromosome){ ## writing output to 1 file per chromosome +# my $chromosome_out = $cytosine_out; +# $chromosome_out =~ s/txt$/chr${last_chr}.txt/; +# open (CYT,'>',$chromosome_out) or die $!; +# } + +# while ( $chromosomes{$last_chr} =~ /([CG])/g){ + +# my $tri_nt; +# my $context; +# my $pos = pos$chromosomes{$last_chr}; + +# my $strand; +# my $meth = 0; +# my $nonmeth = 0; + +# if ($1 eq 'C'){ # C on forward strand +# $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! +# $strand = '+'; +# } +# elsif ($1 eq 'G'){ # C on reverse strand +# $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! +# $tri_nt = reverse $tri_nt; +# $tri_nt =~ tr/ACTG/TGAC/; +# $strand = '-'; +# } + +# if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! +# $meth = $chr{$last_chr}->{$pos-1}->{meth}; +# $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; +# } + +# next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + +# ### determining cytosine context +# if ($tri_nt =~ /^CG/){ +# $context = 'CG'; +# } +# elsif ($tri_nt =~ /^C.{1}G$/){ +# $context = 'CHG'; +# } +# elsif ($tri_nt =~ /^C.{2}$/){ +# $context = 'CHH'; +# } +# else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) +# warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; +# next; +# } + +# if ($CpG_only){ +# if ($tri_nt =~ /^CG/){ # CpG context is the default +# if ($zero){ # zero-based coordinates +# $pos -= 1; +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# else{ # default +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# } +# } +# else{ ## all cytosines, specified with --CX +# if ($zero){ # zero based coordinates +# $pos -= 1; +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# else{ # default +# print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; +# } +# } +# } +# close CYT or die $!; +# } + + +# sub read_genome_into_memory{ + +# ## reading in and storing the specified genome in the %chromosomes hash +# chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; +# warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + +# my @chromosome_filenames = <*.fa>; + +# ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta +# unless (@chromosome_filenames){ +# @chromosome_filenames = <*.fasta>; +# } +# unless (@chromosome_filenames){ +# die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; +# } + +# foreach my $chromosome_filename (@chromosome_filenames){ + +# # skipping the tophat entire mouse genome fasta file +# next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); + +# open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; +# ### first line needs to be a fastA header +# my $first_line = <CHR_IN>; +# chomp $first_line; +# $first_line =~ s/\r//; # removing /r carriage returns + +# ### Extracting chromosome name from the FastA header +# my $chromosome_name = extract_chromosome_name($first_line); + +# my $sequence; +# while (<CHR_IN>){ +# chomp; +# $_ =~ s/\r//; # removing /r carriage returns + +# if ($_ =~ /^>/){ +# ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) +# if (exists $chromosomes{$chromosome_name}){ +# warn "chr $chromosome_name (",length $sequence ," bp)\n"; +# die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; +# } +# else { +# if (length($sequence) == 0){ +# warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; +# } +# warn "chr $chromosome_name (",length $sequence ," bp)\n"; +# $chromosomes{$chromosome_name} = $sequence; +# } +# ### resetting the sequence variable +# $sequence = ''; +# ### setting new chromosome name +# $chromosome_name = extract_chromosome_name($_); +# } +# else{ +# $sequence .= uc$_; +# } +# } + +# if (exists $chromosomes{$chromosome_name}){ +# warn "chr $chromosome_name (",length $sequence ," bp)\t"; +# die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; +# } +# else{ +# if (length($sequence) == 0){ +# warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; +# } +# warn "chr $chromosome_name (",length $sequence ," bp)\n"; +# $chromosomes{$chromosome_name} = $sequence; +# } +# } +# warn "\n"; +# chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; +# } + +# sub extract_chromosome_name { +# ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well +# my $fasta_header = shift; +# if ($fasta_header =~ s/^>//){ +# my ($chromosome_name) = split (/\s+/,$fasta_header); +# return $chromosome_name; +# } +# else{ +# die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; +# } +# } + +# ####################################################################################################################################### +# ### genome-wide cytosine methylation report - END +# ####################################################################################################################################### + + + + +sub print_helpfile{ + + print << 'HOW_TO'; + + +DESCRIPTION + +The following is a brief description of all options to control the Bismark +methylation extractor. The script reads in a bisulfite read alignment results file +produced by the Bismark bisulfite mapper and extracts the methylation information +for individual cytosines. This information is found in the methylation call field +which can contain the following characters: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ X for methylated C in CHG context (was protected) ~~~ + ~~~ x for not methylated C CHG (was converted) ~~~ + ~~~ H for methylated C in CHH context (was protected) ~~~ + ~~~ h for not methylated C in CHH context (was converted) ~~~ + ~~~ Z for methylated C in CpG context (was protected) ~~~ + ~~~ z for not methylated C in CpG context (was converted) ~~~ + ~~~ . for any bases not involving cytosines ~~~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The methylation extractor outputs result files for cytosines in CpG, CHG and CHH +context (this distinction is actually already made in Bismark itself). As the methylation +information for every C analysed can produce files which easily have tens or even hundreds of +millions of lines, file sizes can become very large and more difficult to handle. The C +methylation info additionally splits cytosine methylation calls up into one of the four possible +strands a given bisulfite read aligned against: + + OT original top strand + CTOT complementary to original top strand + + OB original bottom strand + CTOB complementary to original bottom strand + +Thus, by default twelve individual output files are being generated per input file (unless +--comprehensive is specified, see below). The output files can be imported into a genome +viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact +unless the bisulfite reads were generated preserving directionality it doesn't make any +sense to look at the data in a strand-specific manner). Strand-specific output files can +optionally be skipped, in which case only three output files for CpG, CHG or CHH context +will be generated. For both the strand-specific and comprehensive outputs there is also +the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. + + +The output files are in the following format (tab delimited): + +<sequence_id> <strand> <chromosome> <position> <methylation call> + + +USAGE: methylation_extractor [options] <filenames> + + +ARGUMENTS: + +<filenames> A space-separated list of Bismark result files in SAM format from + which methylation information is extracted for every cytosine in + the reads. For alignment files in the older custom Bismark output + see option '--vanilla'. + +OPTIONS: + +-s/--single-end Input file(s) are Bismark result file(s) generated from single-end + read data. Specifying either --single-end or --paired-end is + mandatory. + +-p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end + read data. Specifying either --paired-end or --single-end is + mandatory. + +--vanilla The Bismark result input file(s) are in the old custom Bismark format + (up to version 0.5.x) and not in SAM format which is the default as + of Bismark version 0.6.x or higher. Default: OFF. + +--no_overlap For paired-end reads it is theoretically possible that read_1 and + read_2 overlap. This option avoids scoring overlapping methylation + calls twice (only methylation calls of read 1 are used for in the process + since read 1 has historically higher quality basecalls than read 2). + Whilst this option removes a bias towards more methylation calls + in the center of sequenced fragments it may de facto remove a sizable + proportion of the data. This option is highly recommended for paired-end + data. + +--ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the + methylation call string. This can remove e.g. a restriction enzyme site + at the start of each read. + +--comprehensive Specifying this option will merge all four possible strand-specific + methylation info into context-dependent output files. The default + contexts are: + - CpG context + - CHG context + - CHH context + +--merge_non_CpG This will produce two output files (in --comprehensive mode) or eight + strand-specific output files (default) for Cs in + - CpG context + - non-CpG context + +--report Prints out a short methylation summary as well as the paramaters used to run + this script. + +--no_header Suppresses the Bismark version header line in all output files for more convenient + batch processing. + +-o/--output DIR Allows specification of a different output directory (absolute or relative + path). If not specified explicitely, the output will be written to the current directory. + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in + a GZIP compressed form to save disk space. This option does not work on bedGraph and + genome-wide cytosine reports as they are 'tiny' anyway. + +--version Displays version information. + +-h/--help Displays this help file and exits. + + + +bedGraph specific options: + +--bedGraph After finishing the methylation extraction, the methylation output is written into a + sorted bedGraph file that reports the position of a given cytosine and its methylation + state (in %, see details below). The methylation extractor output is temporarily split up into + temporary files, one per chromosome (written into the current directory or folder + specified with -o/--output); these temp files are then used for sorting and deleted + afterwards. By default, only cytosines in CpG context will be sorted. The option + '--CX_context' may be used to report all cytosines irrespective of sequence context + (this will take MUCH longer!). The default folder for temporary files during the sorting + process is the output directory. The bedGraph conversion step is performed by the external + module 'bismark2bedGraph'; this script needs to reside in the same folder as the + bismark_methylation_extractor itself. + + +--cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide + before its methylation percentage is reported. Default: 1. + +--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. + + +--counts Adds two additional columns to the output file to enable further calculations: + col 5: number of methylated calls + col 6: number of unmethylated calls + This option is required if '--cytosine_report' is specified (and will be set automatically if + necessary). + +--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered + in the experiment irrespective of its sequence context. This applies to both forward and + reverse strands. Please be aware that this option may generate large temporary and output files + and may take a long time to sort (up to many hours). Default: OFF. + (i.e. Default = CpG context only). + +--buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information. + Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or + a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. + (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. + Defaults to 2G. + + +Genome-wide cytosine methylation report specific options: + +--cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a + genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based + chromosome coordinates (zero-based cords are optional) and reports CpG context only (all + cytosine context is optional). The output considers all Cs on both forward and reverse strands and + reports their position, strand, trinucleotide content and methylation state (counts are 0 if not + covered). The cytsoine report conversion step is performed by the external module + 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor + itself. + +--CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of + its context. This applies to both forward and reverse strands. Please be aware that this will + generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. + Default: OFF (i.e. Default = CpG context only). + +--zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF. + +--genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted + formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. + +--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files + will be named to include the input filename and the chromosome number. + + + +OUTPUT: + +The bismark_methylation_extractor output is in the form: +======================================================== +<seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call> + +* Methylated cytosines receive a '+' orientation, +* Unmethylated cytosines receive a '-' orientation. + + + +The bedGraph output (optional) looks like this (tab-delimited): +=============================================================== +<chromosome> <start position> <end position> <methylation percentage> + +The bedGraph output with '--counts' specified looks like this (tab-delimited): + +<chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> + + + +The genome-wide cytosine methylation output file is tab-delimited in the following format: +========================================================================================== +<chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> + + + +This script was last modified on 21 April 2013. + +HOW_TO +}