# HG changeset patch # User fcaramia # Date 1355359566 18000 # Node ID 6479112a673b5dd7a20b059681b40b551fcadfac # Parent 2432df265dad75acfffd394d155056fdb3f98f55 Uploaded diff -r 2432df265dad -r 6479112a673b methylation_analysis_bismark/methylation_analysis/methylation_extractor --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/methylation_analysis_bismark/methylation_analysis/methylation_extractor Wed Dec 12 19:46:06 2012 -0500 @@ -0,0 +1,2570 @@ +#!/usr/bin/perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; + +my @filenames; +my %counting; +my $parent_dir = getcwd(); + +my %fhs; + +my $version = 'v0.7.6'; +my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla) = process_commandline(); + +process_Bismark_results_file(); + +sub process_commandline{ + my $help; + my $single_end; + my $paired_end; + my $ignore; + my $genomic_fasta; + my $full; + my $report; + my $extractor_version; + my $no_overlap; + my $merge_non_CpG; + my $vanilla; + + my $command_line = GetOptions ('help|man' => \$help, + 'p|paired-end' => \$paired_end, + 's|single-end' => \$single_end, + 'fasta' => \$genomic_fasta, + 'ignore=i' => \$ignore, + 'comprehensive' => \$full, + 'report' => \$report, + 'version' => \$extractor_version, + 'no_overlap' => \$no_overlap, + 'merge_non_CpG' => \$merge_non_CpG, + 'vanilla' => \$vanilla, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($extractor_version){ + print << "VERSION"; + + + Bismark Methylation Extractor + + Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + + ### no files provided + unless (@ARGV){ + die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; + } + @filenames = @ARGV; + + warn "\n *** Bismark methylation extractor version $version ***\n\n"; + + ### IGNORING bases at the start of the read when processing the methylation call string + unless ($ignore){ + $ignore = 0; + } + ### PRINT A REPORT + unless ($report){ + $report = 0; + } + + ### OLD (VANILLA) OUTPUT FORMAT + unless ($vanilla){ + $vanilla = 0; + } + + if ($single_end){ + $paired_end = 0; ### SINGLE END ALIGNMENTS + } + elsif ($paired_end){ + $single_end = 0; ### PAIRED-END ALIGNMENTS + } + else{ + die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; + } + + ### NO OVERLAP + if ($no_overlap){ + die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); + } + else{ + $no_overlap = 0; + } + + ### COMPREHENSIVE OUTPUT + unless ($full){ + $full = 0; + } + + ### MERGE NON-CpG context + unless ($merge_non_CpG){ + $merge_non_CpG = 0; + } + + return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla); +} + + +sub process_Bismark_results_file{ + + if ($single){ + if ($vanilla){ + warn "Bismark Single-end vanilla format specified\n"; + } + else{ + warn "Bismark Single-end SAM format specified (default)\n"; # default + } + } + elsif ($paired){ + if ($vanilla){ + warn "Bismark Paired-end vanilla format specified\n"; + } + else{ + warn "Bismark Paired-end SAM format specified (default)\n"; # default + } + } + + if ($ignore){ + warn "First $ignore bases will be disregarded when processing the methylation call string\n"; + } + + if ($full){ + warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; + } + if ($merge_non_CpG){ + warn "Merge CHG and CHH context to non-CpG context specified\n"; + } + warn "\n"; + + sleep (3); + + foreach my $filename (@filenames){ + %fhs = (); + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + ); + warn "\nNow reading in Bismark result file $filename\n\n"; + + if ($filename =~ /\.gz$/){ + open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + else{ + open (IN,$filename) or die "Can't open file $filename: $!\n"; + } + + ### Vanilla and SAM output need to read different numbers of header lines + if ($vanilla){ + my $bismark_version = ; ## discarding the Bismark version info + chomp $bismark_version; + $bismark_version =~ s/\r//; # replaces \r line feed + $bismark_version =~ s/Bismark version: //; + if ($bismark_version =~ /^\@/){ + warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; + sleep (2); + } + + unless ($version eq $bismark_version){ + die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; + } + } + else{ + # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. + # We are reading from it further down + } + + my $output_filename = (split (/\//,$filename))[-1]; + + ### OPENING OUTPUT-FILEHANDLES + if ($report){ + my $report_filename = $output_filename; + $report_filename =~ s/$/_splitting_report.txt/; + open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; + } + + if ($report){ + print REPORT "$output_filename\n\n"; + print REPORT "Parameters used to extract methylation information:\n"; + if ($paired){ + if ($vanilla){ + print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; + } + else{ + print REPORT "Bismark result file: paired-end (SAM format)\n"; # default + } + } + + if ($single){ + if ($vanilla){ + print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; + } + else{ + print REPORT "Bismark result file: single-end (SAM format)\n"; # default + } + } + + if ($ignore){ + print REPORT "Ignoring first $ignore bases\n"; + } + + if ($full){ + print REPORT "Output specified: comprehensive\n"; + } + else{ + print REPORT "Output specified: strand-specific (default)\n"; + } + + if ($no_overlap){ + print REPORT "No overlapping methylation calls specified\n"; + } + if ($genomic_fasta){ + print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; + } + if ($merge_non_CpG){ + print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; + } + + print REPORT "\n"; + } + + ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL + + ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file + if ($full and $merge_non_CpG){ + my $cpg_output = my $other_c_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; + print "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; + + ### C in any other context than CpG + $other_c_output =~ s/^/Non_CpG_context_/; + $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); + open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n"; + print "Writing result file containing methylation information for C in any other context to $other_c_output\n"; + print {$fhs{other_context}} "Bismark methylation extractor version $version\n"; + } + + ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found + elsif ($merge_non_CpG){ + + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; + print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; + print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; + print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; + print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; + + ### For cytosines in Non-CpG (CC, CT or CA) context + my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; + + $other_c_ot =~ s/^/Non_CpG_OT_/; + $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); + open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n"; + print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n"; + print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n"; + + $other_c_ctot =~ s/^/Non_CpG_CTOT_/; + $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); + open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n"; + print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n"; + print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n"; + + $other_c_ctob =~ s/^/Non_CpG_CTOB_/; + $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); + open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n"; + print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n"; + print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n"; + + $other_c_ob =~ s/^/Non_CpG_OB_/; + $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); + open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n"; + print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n"; + print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n"; + } + + ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) + + ### if --comprehensive was specified we are only writing one file per context + elsif ($full){ + my $cpg_output = my $chg_output = my $chh_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; + print "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; + + ### C in CHG context + $chg_output =~ s/^/CHG_context_/; + $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); + open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n"; + print "Writing result file containing methylation information for C in CHG context to $chg_output\n"; + print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n"; + + ### C in CHH context + $chh_output =~ s/^/CHH_context_/; + $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); + open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n"; + print "Writing result file containing methylation information for C in CHH context to $chh_output\n"; + print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; } + + ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found + else{ + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; + print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; + print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; + print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; + print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; + + ### For cytosines in CHG context + my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; + + $chg_ot =~ s/^/CHG_OT_/; + $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); + open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n"; + print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n"; + print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n"; + + $chg_ctot =~ s/^/CHG_CTOT_/; + $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); + open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n"; + print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n"; + print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n"; + + $chg_ctob =~ s/^/CHG_CTOB_/; + $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); + open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n"; + print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n"; + print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n"; + + $chg_ob =~ s/^/CHG_OB_/; + $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); + open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n"; + print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n"; + print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n"; + + ### For cytosines in CHH context + my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; + + $chh_ot =~ s/^/CHH_OT_/; + $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); + open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n"; + print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n"; + print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n"; + + $chh_ctot =~ s/^/CHH_CTOT_/; + $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); + open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n"; + print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n"; + print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n"; + + $chh_ctob =~ s/^/CHH_CTOB_/; + $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); + open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n"; + print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n"; + print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n"; + + $chh_ob =~ s/^/CHH_OB_/; + $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); + open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n"; + print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n"; + print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n"; + } + + my $methylation_call_strings_processed = 0; + my $line_count = 0; + + ### proceeding differently now for single-end or paired-end Bismark files + + ### PROCESSING SINGLE-END RESULT FILES + if ($single){ + + ### also proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla){ # old vanilla Bismark output format + while (){ + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or + ### last position + chomp $genome_conversion; + + my $index; + if ($meth_call){ + + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand + $index = 0; + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand + $index = 1; + } + elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand + $index = 3; + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand + $index = 2; + } + else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore){ + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+'){ + $start += $ignore; + } + elsif ($strand eq '-'){ + $start += length($meth_call)-1; ## $meth_call is already shortened! + } + else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } + else{ # processing single-end SAM format (default) + while (){ + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/){ # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + # example read in SAM format + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + ### + + # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + # < 0.7.6 $meth_call =~ s/^XM:Z://; + # < 0.7.6 $read_conversion =~ s/^XR:Z://; + # < 0.7.6 $genome_conversion =~ s/^XG:Z://; + + my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; + + ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call; ### Thanks to Zachary Zeno for this solution + my $read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM"){ + $meth_call = $value; + $meth_call =~ s/\r//; + } + elsif ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + } + elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + my $strand; + chomp $genome_conversion; + # print "$meth_call\n$read_conversion\n$genome_conversion\n"; + + my $index; + if ($meth_call){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand + $index = 0; + $strand = '+'; + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand + $index = 1; + $strand = '-'; + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } + elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand + $index = 3; + $strand = '-'; + } + else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output + if ($strand eq '-'){ + $meth_call = reverse $meth_call; + } + + ### Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore){ + # print "\n\n$meth_call\n"; + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + # print "$meth_call\n"; + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # print "original CIGAR: $cigar\n"; + # print "original CIGAR: @comp_cigar\n"; + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+'){ + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore){ + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D'){ # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + # print "$_ deleted $op\n"; + } + if ($op eq 'I'){ # adjusting the genomic position for insertions (I) + $I_count++; + } + } + $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; + } + elsif ($strand eq '-'){ + + for (1..$ignore){ + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D'){ # repeating this for deletions (D) + $op = pop @comp_cigar; + } + } + + ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR + ### string to be able to work out the starting position of the read which is on the 3' end of the sequence + my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position + foreach (@comp_cigar){ + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); + } + $start += $MD_count - 1; + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore adjusted: @comp_cigar\n"; + foreach my $op (@comp_cigar){ + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op){ + ++$count; + } + else{ + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore adjusted scalar: $cigar\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } + + ### PROCESSING PAIRED-END RESULT FILES + elsif ($paired){ + + ### proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla){ # old vanilla Bismark paired-end output format + while (){ + ++$line_count; + warn "processed line: $line_count\n" if ($line_count%500000==0); + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; + + my $index; + chomp $genome_conversion; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ + $index = 0; ## this is OT + } + elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ + $index = 2; ## this is CTOB!!! + } + elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ + $index = 1; ## this is CTOT!!! + } + elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ + $index = 3; ## this is OB + } + else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + if ($meth_call_1 and $meth_call_2){ + ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' + if ($ignore){ + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified + $start_read_1 += $ignore; + $end_read_2 -= $ignore; + } + my $end_read_1; + my $start_read_2; + + if ($strand eq '+'){ + + $end_read_1 = $start_read_1+length($meth_call_1)-1; + $start_read_2 = $end_read_2-length($meth_call_2)+1; + + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1); + } + + else{ + + $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! + $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! + + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + else{ # Bismark paired-end SAM output format (default) + while (){ + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/){ # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + # example paired-end reads in SAM format (2 consecutive lines) + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT + + # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + + my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call_1; + my $first_read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM"){ + $meth_call_1 = $value; + $meth_call_1 =~ s/\r//; + } + elsif ($tag eq "XR") { + $first_read_conversion = $value; + $first_read_conversion =~ s/\r//; + } + elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + $_ = ; # reading in the paired read + + # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; + # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; + # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; + # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; + # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; + + my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + + my $meth_call_2; + my $second_read_conversion; + + while ( /(XM|XR):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM"){ + $meth_call_2 = $value; + $meth_call_2 =~ s/\r//; + } + elsif ($tag eq "XR") { + $second_read_conversion = $value; + $second_read_conversion = s/\r//; + } + } + + # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; + chomp $genome_conversion; # in case it captured a new line character + + # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; + + my $index; + my $strand; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ + $index = 0; ## this is OT + $strand = '+'; + } + elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ + $index = 1; ## this is CTOT + $strand = '-'; + } + elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ + $index = 2; ## this is CTOB + $strand = '+'; + } + elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ + $index = 3; ## this is OB + $strand = '-'; + } + else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + ### reversing the methylation call of the read that was reverse-complemented + if ($strand eq '+'){ + $meth_call_2 = reverse $meth_call_2; + } + else{ + $meth_call_1 = reverse $meth_call_1; + } + + if ($meth_call_1 and $meth_call_2){ + + my $end_read_1; + + ### READ 1 + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); + + my @comp_cigar_1; # building an array with all CIGAR operations + foreach my $index (0..$#len_1){ + foreach (1..$len_1[$index]){ + # print "$ops_1[$index]"; + push @comp_cigar_1, $ops_1[$index]; + } + } + # print "original CIGAR read 1: $cigar_1\n"; + # print "original CIGAR read 1: @comp_cigar_1\n"; + + ### READ 2 + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + my @comp_cigar_2; # building an array with all CIGAR operations for read 2 + foreach my $index (0..$#len_2){ + foreach (1..$len_2[$index]){ + # print "$ops_2[$index]"; + push @comp_cigar_2, $ops_2[$index]; + } + } + # print "original CIGAR read 2: $cigar_2\n"; + # print "original CIGAR read 2: @comp_cigar_2\n"; + + if ($ignore){ + ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' + ### the methylation calls have already been reversed where necessary + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + if ($strand eq '+'){ + + ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + for (1..$ignore){ + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D'){ # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print "$_ deleted $op\n"; + } + if ($op eq 'I'){ # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + $start_read_1 += $ignore + $D_count_1 - $I_count_1; + # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; + + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back + + for (1..$ignore){ + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D'){ # repeating this for deletions (D) + $op = pop @comp_cigar_2; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + } + + elsif ($strand eq '-'){ + + ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back + for (1..$ignore){ + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D'){ # repeating this for deletions (D) + $op = pop @comp_cigar_1; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore){ + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D'){ # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op\n"; + } + if ($op eq 'I'){ # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + $start_read_2 += $ignore + $D_count_2 - $I_count_2; + # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + + } + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1){ + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op){ + ++$count_1; + } + else{ + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2){ + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op){ + ++$count_2; + } + else{ + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n"; + + } + + if ($strand eq '+'){ + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 + @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_2\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1){ + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + my $MD_count_2 = 0; + foreach (@comp_cigar_2){ + ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1 + $MD_count_1 - 1; + $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand + } + else{ + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 + + @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_1\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1){ + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1; + $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand + + } + + if ($strand eq '+'){ + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2); + } + + else{ + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + } + else{ + die "Single-end or paired-end reads not specified properly\n"; + } + + print "\n\nProcessed $line_count lines from $filename in total\n"; + print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; + if ($report){ + print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; + } + print_splitting_report (); + } +} + + +sub print_splitting_report{ + + ### Calculating methylation percentages if applicable + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + ### detailed information about Cs analysed + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + + ### detailed information about Cs analysed for on-screen report + print "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + print "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3-Context Output + else{ + ### printing methylated CHG percentage if applicable + if ($percent_meCHG){ + print "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated CHH percentage if applicable + if ($percent_meCHH){ + print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } +} + + + + + +sub print_individual_C_methylation_states_paired_end_files{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my @len; + my @ops; + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + my @comp_cigar; + + if ($cigar){ # parsing CIGAR string + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + + if ($strand eq '-') { + + ### the CIGAR string needs to be reversed, the methylation call has already been reversed above + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse CIGAR string: @comp_cigar\n"; + + ### the start position of paired-end files has already been corrected, see above + } + + ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified + + if ($merge_non_CpG) { + + if ($no_overlap) { + + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The read orientation was neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation was neither + nor -: '$strand'/n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + ### Still within the 2-CONTEXT ONLY optional section + else { + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + ### still within the 2-CONTEXT optional section + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + } + + ############################################ + ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### + ############################################ + + elsif ($no_overlap) { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + else { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } +} + +sub check_cigar_string { + my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; + # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; + my ($new_cigar_offset,$new_pos_offset) = (0,0); + + if ($strand eq '+') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by -1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by another -1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another +1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + + elsif ($strand eq '-') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position + # warn "adjusted genomic position by +1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "Found new 'M' operation; position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position + # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another -1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; + return ($new_cigar_offset,$new_pos_offset); +} + +sub print_individual_C_methylation_states_single_end{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my @len; + my @ops; + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + + my @comp_cigar; + + if ($cigar){ # parsing CIGAR string + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + ### adjusting the start position for all reads mapping to the reverse strand + if ($strand eq '-') { + + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print @comp_cigar,"\n"; + + unless ($ignore){ ### if --ignore was specified the start position has already been corrected + + if ($cigar){ ### SAM format + my $MD_count = 0; + foreach (@comp_cigar){ + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + $start += $MD_count - 1; + } + else{ ### vanilla format + $start += length($meth_call)-1; + } + } + } + + ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) + + ### single-file CpG and other-context output + if ($full and $merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') { + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar){ # only needed for SAM files + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.'){ + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + elsif ($merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') { + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') { + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION + + elsif ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The read had a strand orientation which was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar){ # only needed for SAM files + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } +} + +sub print_helpfile{ + + print << 'HOW_TO'; + + +DESCRIPTION + +The following is a brief description of all options to control the Bismark +methylation extractor. The script reads in a bisulfite read alignment results file +produced by the Bismark bisulfite mapper and extracts the methylation information +for individual cytosines. This information is found in the methylation call field +which can contain the following characters: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ X for methylated C in CHG context (was protected) ~~~ + ~~~ x for not methylated C CHG (was converted) ~~~ + ~~~ H for methylated C in CHH context (was protected) ~~~ + ~~~ h for not methylated C in CHH context (was converted) ~~~ + ~~~ Z for methylated C in CpG context (was protected) ~~~ + ~~~ z for not methylated C in CpG context (was converted) ~~~ + ~~~ . for any bases not involving cytosines ~~~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The methylation extractor outputs result files for cytosines in CpG, CHG and CHH +context (this distinction is actually already made in Bismark itself). As the methylation +information for every C analysed can produce files which easily have tens or even hundreds of +millions of lines, file sizes can become very large and more difficult to handle. The C +methylation info additionally splits cytosine methylation calls up into one of the four possible +strands a given bisulfite read aligned against: + + OT original top strand + CTOT complementary to original top strand + + OB original bottom strand + CTOB complementary to original bottom strand + +Thus, by default twelve individual output files are being generated per input file (unless +--comprehensive is specified, see below). The output files can be imported into a genome +viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact +unless the bisulfite reads were generated preserving directionality it doesn't make any +sense to look at the data in a strand-specific manner). Strand-specific output files can +optionally be skipped, in which case only three output files for CpG, CHG or CHH context +will be generated. For both the strand-specific and comprehensive outputs there is also +the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. + + +The output files are in the following format (tab delimited): + + + + +USAGE: methylation_extractor [options] + + +ARGUMENTS: + + A space-separated list of Bismark result files in SAM format from + which methylation information is extracted for every cytosine in + the reads. For alignment files in the older custom Bismark output + see option '--vanilla'. + +OPTIONS: + +-s/--single-end Input file(s) are Bismark result file(s) generated from single-end + read data. Specifying either --single-end or --paired-end is + mandatory. + +-p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end + read data. Specifying either --paired-end or --single-end is + mandatory. + +--vanilla The Bismark result input file(s) are in the old custom Bismark format + (up to version 0.5.x) and not in SAM format which is the default as + of Bismark version 0.6.x or higher. Default: OFF. + +--no_overlap For paired-end reads it is theoretically possible that read_1 and + read_2 overlap. This option avoids scoring overlapping methylation + calls twice. Whilst this removes a bias towards more methylation calls + towards the center of sequenced fragments it can de facto remove + a good proportion of the data. + +--ignore Ignore the first bp at the 5' end of each read when processing the + methylation call string. This can remove e.g. a restriction enzyme site + at the start of each read. + +--comprehensive Specifying this option will merge all four possible strand-specific + methylation info into context-dependent output files. The default + contexts are: + - CpG context + - CHG context + - CHH context + +--merge_non_CpG This will produce two output files (in --comprehensive mode) or eight + strand-specific output files (default) for Cs in + - CpG context + - non-CpG context + +--report Prints out a short methylation summary as well as the paramaters used to run + this script. + +--version Displays version information. + +-h/--help Displays this help file and exits. + + +OUTPUT: + +The output is in the form: + + + +* Methylated cytosines receive a '+' orientation, +* Unmethylated cytosines receive a '-' orientation. + + +This script was last modified on 31 July 2012. + +HOW_TO +}