# HG changeset patch
# User fcaramia
# Date 1355359504 18000
# Node ID 2432df265dad75acfffd394d155056fdb3f98f55
# Parent 5b208d4d89e5b2feb16476db7e801a46f1508592
Uploaded
diff -r 5b208d4d89e5 -r 2432df265dad methylation_analysis_bismark/methylation_analysis/bismark
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/methylation_analysis_bismark/methylation_analysis/bismark Wed Dec 12 19:45:04 2012 -0500
@@ -0,0 +1,6571 @@
+#!/usr/bin/perl --
+use strict;
+use warnings;
+use IO::Handle;
+use Cwd;
+$|++;
+use Getopt::Long;
+
+
+## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk)
+
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see .
+
+
+my $parent_dir = getcwd;
+my $bismark_version = 'v0.7.6';
+my $command_line = join (" ",@ARGV);
+
+### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail
+foreach my $arg (@ARGV){
+ if ($arg eq '--solexa1.3-quals'){
+ $arg = '--phred64-quals';
+ }
+}
+my @filenames; # will be populated by processing the command line
+
+my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir) = process_command_line();
+
+my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
+my %chromosomes; # stores the chromosome sequences of the mouse genome
+my %counting; # counting various events
+
+my $seqID_contains_tabs;
+
+foreach my $filename (@filenames){
+
+ chdir $parent_dir or die "Unable to move to initial working directory $!\n";
+ ### resetting the counting hash and fhs
+ reset_counters_and_fhs($filename);
+ $seqID_contains_tabs = 0;
+
+ ### PAIRED-END ALIGNMENTS
+ if ($filename =~ ','){
+ my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
+
+ $fhs[0]->{name} = 'CTread1GAread2CTgenome';
+ $fhs[1]->{name} = 'GAread1CTread2GAgenome';
+ $fhs[2]->{name} = 'GAread1CTread2CTgenome';
+ $fhs[3]->{name} = 'CTread1GAread2GAgenome';
+
+ print "\nPaired-end alignments will be performed\n",'='x39,"\n\n";
+
+ my ($filename_1,$filename_2) = (split (/,/,$filename));
+ print "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";
+
+ ### additional variables only for paired-end alignments
+ my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file
+
+ ### FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ print "Input files are in FastA format\n";
+
+ if ($directional){
+ ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
+ ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = undef;
+ $fhs[1]->{inputfile_2} = undef;
+ $fhs[2]->{inputfile_1} = undef;
+ $fhs[2]->{inputfile_2} = undef;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+ else{
+ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
+ ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+
+ if ($bowtie2){
+ paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ else{
+ paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ }
+
+ ### FastQ format
+ else{
+ print "Input files are in FastQ format\n";
+ if ($directional){
+ ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
+ ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = undef;
+ $fhs[1]->{inputfile_2} = undef;
+ $fhs[2]->{inputfile_1} = undef;
+ $fhs[2]->{inputfile_2} = undef;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+ else{
+ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
+ ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+
+ if ($bowtie2){
+ paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ else{
+ paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ }
+ start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+
+ ### Else we are performing SINGLE-END ALIGNMENTS
+ else{
+ print "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
+ ### Initialising bisulfite conversion filenames
+ my ($C_to_T_infile,$G_to_A_infile);
+
+
+ ### FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ print "Inut file is in FastA format\n";
+ if ($directional){
+ ($C_to_T_infile) = biTransformFastAFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ }
+ else{
+ ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
+ }
+
+ ### Creating 4 different bowtie filehandles and storing the first entry
+ if ($bowtie2){
+ single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
+ }
+ else{
+ single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
+ }
+ }
+
+ ## FastQ format
+ else{
+ print "Input file is in FastQ format\n";
+ if ($directional){
+ ($C_to_T_infile) = biTransformFastQFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ }
+ else{
+ ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
+ }
+
+ ### Creating 4 different bowtie filehandles and storing the first entry
+ if ($bowtie2){
+ single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
+ }
+ else{
+ single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
+ }
+ }
+
+ start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
+
+ }
+}
+
+sub start_methylation_call_procedure_single_ends {
+ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
+ my ($dir,$filename);
+
+ if ($sequence_file =~ /\//){
+ ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $sequence_file;
+ }
+
+ ### printing all alignments to a results file
+ my $outfile = $filename;
+
+ if ($bowtie2){ # SAM format is the default for Bowtie 2
+ $outfile =~ s/$/_bt2_bismark.sam/;
+ }
+ elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
+ $outfile =~ s/$/_bismark.txt/;
+ }
+ else{ # SAM is the default output
+ $outfile =~ s/$/_bismark.sam/;
+ }
+ print "Writing bisulfite mapping results to $output_dir$outfile\n\n";
+ open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ if ($vanilla){
+ print OUT "Bismark version: $bismark_version\n";
+ }
+
+ ### printing alignment and methylation call summary to a report file
+ my $reportfile = $filename;
+ if ($bowtie2){
+ $reportfile =~ s/$/_bt2_Bismark_mapping_report.txt/;
+ }
+ else{
+ $reportfile =~ s/$/_Bismark_mapping_report.txt/;
+ }
+
+ open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
+ print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";
+
+ if ($unmapped){
+ my $unmapped_file = $filename;
+ $unmapped_file =~ s/$/_unmapped_reads.txt/;
+ open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
+ print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
+ }
+ if ($ambiguous){
+ my $ambiguous_file = $filename;
+ $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
+ open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
+ print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
+ }
+
+ if ($directional){
+ print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
+ }
+ print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+
+
+ ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
+ unless (%chromosomes){
+ my $cwd = getcwd; # storing the path of the current working directory
+ print "Current working directory is: $cwd\n\n";
+ read_genome_into_memory($cwd);
+ }
+
+ unless ($vanilla or $sam_no_hd){
+ generate_SAM_header();
+ }
+
+ ### Input file is in FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
+ }
+ ### Input file is in FastQ format
+ else{
+ process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
+ }
+}
+
+sub start_methylation_call_procedure_paired_ends {
+ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+
+ my ($dir_1,$filename_1);
+
+ if ($sequence_file_1 =~ /\//){
+ ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename_1 = $sequence_file_1;
+ }
+
+ my ($dir_2,$filename_2);
+
+ if ($sequence_file_2 =~ /\//){
+ ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename_2 = $sequence_file_2;
+ }
+
+ ### printing all alignments to a results file
+ my $outfile = $filename_1;
+ if ($bowtie2){ # SAM format is the default Bowtie 2 output
+ $outfile =~ s/$/_bismark_bt2_pe.sam/;
+ }
+ elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
+ $outfile =~ s/$/_bismark_pe.txt/;
+ }
+ else{ # SAM format is the default Bowtie 1 output
+ $outfile =~ s/$/_bismark_pe.sam/;
+ }
+
+ print "Writing bisulfite mapping results to $outfile\n\n";
+ open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!";
+ if ($vanilla){
+ print OUT "Bismark version: $bismark_version\n";
+ }
+
+ ### printing alignment and methylation call summary to a report file
+ my $reportfile = $filename_1;
+ if ($bowtie2){
+ $reportfile =~ s/$/_Bismark_bt2_paired-end_mapping_report.txt/;
+ }
+ else{
+ $reportfile =~ s/$/_Bismark_paired-end_mapping_report.txt/;
+ }
+
+ open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
+ print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
+ print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+
+
+ ### Unmapped read output
+ if ($unmapped){
+ my $unmapped_1 = $filename_1;
+ my $unmapped_2 = $filename_2;
+ $unmapped_1 =~ s/$/_unmapped_reads_1.txt/;
+ $unmapped_2 =~ s/$/_unmapped_reads_2.txt/;
+ open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
+ open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
+ print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
+ }
+
+ if ($ambiguous){
+ my $amb_1 = $filename_1;
+ my $amb_2 = $filename_2;
+ $amb_1 =~ s/$/_ambiguous_reads_1.txt/;
+ $amb_2 =~ s/$/_ambiguous_reads_2.txt/;
+ open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
+ open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
+ print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
+ }
+
+ if ($directional){
+ print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
+ }
+
+ ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
+ unless (%chromosomes){
+ my $cwd = getcwd; # storing the path of the current working directory
+ print "Current working directory is: $cwd\n\n";
+ read_genome_into_memory($cwd);
+ }
+
+ unless ($vanilla or $sam_no_hd){
+ generate_SAM_header();
+ }
+
+ ### Input files are in FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ ### Input files are in FastQ format
+ else{
+ process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+}
+
+sub print_final_analysis_report_single_end{
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ ### All sequences from the original sequence file have been analysed now
+ ### deleting temporary C->T or G->A infiles
+
+ if ($directional){
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
+ if ($deletion_successful == 1){
+ warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
+ }
+ else{
+ warn "Could not delete temporary file $C_to_T_infile properly $!\n";
+ }
+ }
+
+ else{
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
+ if ($deletion_successful == 2){
+ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
+ }
+ else{
+ warn "Could not delete temporary files properly $!\n";
+ }
+ }
+
+ ### printing a final report for the alignment procedure
+ print REPORT "Final Alignment report\n",'='x22,"\n";
+ print "Final Alignment report\n",'='x22,"\n";
+ # foreach my $index (0..$#fhs){
+ # print "$fhs[$index]->{name}\n";
+ # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
+ # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
+ # }
+
+ ### printing a final report for the methylation call procedure
+ warn "Sequences analysed in total:\t$counting{sequences_count}\n";
+ print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
+
+ my $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
+
+ warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
+ print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
+
+ ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
+ ### only calculating the percentage if there were any overruled alignments
+ if ($counting{low_complexity_alignments_overruled_count}){
+ my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
+ # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
+ }
+
+ print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
+ print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
+
+ print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
+ print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
+
+ if ($directional){
+ print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ }
+
+ ### detailed information about Cs analysed
+ warn "Final Cytosine Methylation Report\n",'='x33,"\n";
+ my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
+ warn "Total number of C's analysed:\t$total_number_of_C\n\n";
+ warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+
+ print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+ print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+ print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n";
+ print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+
+ my $percent_meCHG;
+ if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ }
+
+ my $percent_meCHH;
+ if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ }
+
+ my $percent_meCpG;
+ if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ }
+
+ ### printing methylated CpG percentage if applicable
+ if ($percent_meCpG){
+ warn "C methylated in CpG context:\t${percent_meCpG}%\n";
+ print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage (CHG context) if applicable
+ if ($percent_meCHG){
+ warn "C methylated in CHG context:\t${percent_meCHG}%\n";
+ print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage (CHH context) if applicable
+ if ($percent_meCHH){
+ warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ }
+
+ if ($seqID_contains_tabs){
+ warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
+ print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
+ }
+}
+
+sub print_final_analysis_report_paired_ends{
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
+ if ($directional){
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
+ if ($deletion_successful == 2){
+ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
+ }
+ else{
+ warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
+ }
+ }
+ else{
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
+ if ($deletion_successful == 4){
+ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
+ }
+ else{
+ warn "Could not delete temporary files properly: $!\n";
+ }
+ }
+
+ ### printing a final report for the alignment procedure
+ warn "Final Alignment report\n",'='x22,"\n";
+ print REPORT "Final Alignment report\n",'='x22,"\n";
+ # foreach my $index (0..$#fhs){
+ # print "$fhs[$index]->{name}\n";
+ # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
+ # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
+ # }
+
+ ### printing a final report for the methylation call procedure
+ warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
+ print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
+
+ my $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
+ print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
+ print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
+
+ print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
+ print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
+
+
+ print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
+ print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
+ ### detailed information about Cs analysed
+
+ if ($directional){
+ print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ }
+
+ warn "Final Cytosine Methylation Report\n",'='x33,"\n";
+ print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+
+ my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ warn "Total number of C's analysed:\t$total_number_of_C\n\n";
+ warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+
+ print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+ print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+
+ my $percent_meCHG;
+ if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ }
+
+ my $percent_meCHH;
+ if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ }
+
+ my $percent_meCpG;
+ if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ }
+
+ ### printing methylated CpG percentage if applicable
+ if ($percent_meCpG){
+ warn "C methylated in CpG context:\t${percent_meCpG}%\n";
+ print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage in CHG context if applicable
+ if ($percent_meCHG){
+ warn "C methylated in CHG context:\t${percent_meCHG}%\n";
+ print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage in CHH context if applicable
+ if ($percent_meCHH){
+ warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ }
+
+}
+
+sub process_single_end_fastA_file_for_methylation_call{
+ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
+ ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
+ ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
+ ### the C->T or G->A version
+
+ ### gzipped version of the infile
+ if ($sequence_file =~ /\.gz$/){
+ open (IN,"zcat $sequence_file |") or die $!;
+ }
+ else{
+ open (IN,$sequence_file) or die $!;
+ }
+
+ my $count = 0;
+
+ warn "\nReading in the sequence file $sequence_file\n";
+ while (1) {
+ # last if ($counting{sequences_count} > 100);
+ my $identifier = ;
+ my $sequence = ;
+ last unless ($identifier and $sequence);
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+ if ($counting{sequences_count}%100000==0) {
+ warn "Processed $counting{sequences_count} sequences so far\n";
+ }
+ chomp $sequence;
+ chomp $identifier;
+
+ $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
+ }
+ else{
+ $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequence to ambiguous.out if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ print AMBIG ">$identifier\n";
+ print AMBIG "$sequence\n";
+ }
+
+ # print the sequence to file if --un was specified
+ elsif ($unmapped and $return == 1){
+ print UNMAPPED ">$identifier\n";
+ print UNMAPPED "$sequence\n";
+ }
+ }
+ print "Processed $counting{sequences_count} sequences in total\n\n";
+
+ print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
+
+}
+
+sub process_single_end_fastQ_file_for_methylation_call{
+ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
+ ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
+ ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
+ ### the C->T or G->A version
+
+ ### gzipped version of the infile
+ if ($sequence_file =~ /\.gz$/){
+ open (IN,"zcat $sequence_file |") or die $!;
+ }
+ else{
+ open (IN,$sequence_file) or die $!;
+ }
+
+ my $count = 0;
+
+ warn "\nReading in the sequence file $sequence_file\n";
+ while (1) {
+ my $identifier = ;
+ my $sequence = ;
+ my $identifier_2 = ;
+ my $quality_value = ;
+ last unless ($identifier and $sequence and $identifier_2 and $quality_value);
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+
+ if ($counting{sequences_count}%1000000==0) {
+ warn "Processed $counting{sequences_count} sequences so far\n";
+ }
+ chomp $sequence;
+ chomp $identifier;
+ chomp $quality_value;
+
+ $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
+ }
+ else{
+ $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequence to ambiguous.out if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ print AMBIG "\@$identifier\n";
+ print AMBIG "$sequence\n";
+ print AMBIG $identifier_2;
+ print AMBIG "$quality_value\n";
+ }
+
+ # print the sequence to file if --un was specified
+ elsif ($unmapped and $return == 1){
+ print UNMAPPED "\@$identifier\n";
+ print UNMAPPED "$sequence\n";
+ print UNMAPPED $identifier_2;
+ print UNMAPPED "$quality_value\n";
+ }
+ }
+ print "Processed $counting{sequences_count} sequences in total\n\n";
+
+ print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
+
+}
+
+sub process_fastA_files_for_paired_end_methylation_calls{
+ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
+ ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping.
+ ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
+ ### converted genomes (either the C->T or G->A version)
+
+ ### gzipped version of the infiles
+ if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
+ open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
+ open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
+ }
+ else{
+ open (IN1,$sequence_file_1) or die $!;
+ open (IN2,$sequence_file_2) or die $!;
+ }
+
+ warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
+ ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
+
+ my $count = 0;
+
+ while (1) {
+ # reading from the first input file
+ my $identifier_1 = ;
+ my $sequence_1 = ;
+ # reading from the second input file
+ my $identifier_2 = ;
+ my $sequence_2 = ;
+ last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);
+
+ $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
+ $identifier_2 = fix_IDs($identifier_2);
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+ if ($counting{sequences_count}%100000==0) {
+ warn "Processed $counting{sequences_count} sequences so far\n";
+ }
+ my $orig_identifier_1 = $identifier_1;
+ my $orig_identifier_2 = $identifier_2;
+
+ chomp $sequence_1;
+ chomp $identifier_1;
+ chomp $sequence_2;
+ chomp $identifier_2;
+
+ $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
+ }
+ else{
+ $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ print AMBIG_1 $orig_identifier_1;
+ print AMBIG_1 "$sequence_1\n";
+ print AMBIG_2 $orig_identifier_2;
+ print AMBIG_2 "$sequence_2\n";
+ }
+
+ # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
+ elsif ($unmapped and $return == 1){
+ print UNMAPPED_1 $orig_identifier_1;
+ print UNMAPPED_1 "$sequence_1\n";
+ print UNMAPPED_2 $orig_identifier_2;
+ print UNMAPPED_2 "$sequence_2\n";
+ }
+ }
+
+ print "Processed $counting{sequences_count} sequences in total\n\n";
+
+ print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+
+}
+
+sub process_fastQ_files_for_paired_end_methylation_calls{
+ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
+ ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
+ ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
+ ### of the converted genomes (either C->T or G->A version)
+
+ ### gzipped version of the infiles
+ if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
+ open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
+ open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
+ }
+ else{
+ open (IN1,$sequence_file_1) or die $!;
+ open (IN2,$sequence_file_2) or die $!;
+ }
+
+ my $count = 0;
+
+ warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
+ ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
+ while (1) {
+ # reading from the first input file
+ my $identifier_1 = ;
+ my $sequence_1 = ;
+ my $ident_1 = ; # not needed
+ my $quality_value_1 = ; # not needed
+ # reading from the second input file
+ my $identifier_2 = ;
+ my $sequence_2 = ;
+ my $ident_2 = ; # not needed
+ my $quality_value_2 = ; # not needed
+ last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);
+
+ $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
+ $identifier_2 = fix_IDs($identifier_2);
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+ if ($counting{sequences_count}%100000==0) {
+ warn "Processed $counting{sequences_count} sequences so far\n";
+ }
+
+ my $orig_identifier_1 = $identifier_1;
+ my $orig_identifier_2 = $identifier_2;
+
+ chomp $sequence_1;
+ chomp $identifier_1;
+ chomp $sequence_2;
+ chomp $identifier_2;
+ chomp $quality_value_1;
+ chomp $quality_value_2;
+
+ $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
+ }
+ else{
+ $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ # seq_1
+ print AMBIG_1 $orig_identifier_1;
+ print AMBIG_1 "$sequence_1\n";
+ print AMBIG_1 $ident_1;
+ print AMBIG_1 "$quality_value_1\n";
+ # seq_2
+ print AMBIG_2 $orig_identifier_2;
+ print AMBIG_2 "$sequence_2\n";
+ print AMBIG_2 $ident_2;
+ print AMBIG_2 "$quality_value_2\n";
+ }
+
+ # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
+ elsif ($unmapped and $return == 1){
+ # seq_1
+ print UNMAPPED_1 $orig_identifier_1;
+ print UNMAPPED_1 "$sequence_1\n";
+ print UNMAPPED_1 $ident_1;
+ print UNMAPPED_1 "$quality_value_1\n";
+ # seq_2
+ print UNMAPPED_2 $orig_identifier_2;
+ print UNMAPPED_2 "$sequence_2\n";
+ print UNMAPPED_2 $ident_2;
+ print UNMAPPED_2 "$quality_value_2\n";
+ }
+ }
+
+ print "Processed $counting{sequences_count} sequences in total\n\n";
+
+ print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+
+}
+
+sub check_bowtie_results_single_end{
+ my ($sequence,$identifier,$quality_value) = @_;
+
+ unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
+ $quality_value = 'I'x(length$sequence);
+ }
+
+ my %mismatches = ();
+ ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
+ foreach my $index (0..$#fhs){
+
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
+ ### if the sequence we are currently looking at produced an alignment we are doing various things with it
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+ ###############################################################
+ ### STEP I Now processing the alignment stored in last_line ###
+ ###############################################################
+ my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
+ ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
+ ### we only continue to extract useful information about this alignment if 1 was returned
+ if ($valid_alignment_found_1 == 1){
+ ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
+ ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
+ my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
+
+ unless($mismatch_info){
+ $mismatch_info = '';
+ }
+
+ chomp $mismatch_info;
+ my $chromosome;
+ if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
+ $chromosome = $mapped_chromosome;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome\n";
+ }
+ ### Now extracting the number of mismatches to the converted genome
+ my $number_of_mismatches;
+ if ($mismatch_info eq ''){
+ $number_of_mismatches = 0;
+ }
+ elsif ($mismatch_info =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info);
+ $number_of_mismatches = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
+ }
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ my $alignment_location = join (":",$chromosome,$position);
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
+ ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
+ ### number for the found alignment)
+ unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
+ }
+ $number_of_mismatches = undef;
+ ##################################################################################################################################################
+ ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
+ ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
+ ### be returned as $valid_alignment_found and it will then be processed in the next round only.
+ ##################################################################################################################################################
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ }
+ else {
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ next;
+ }
+ my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
+ ### we only continue to extract useful information about this second alignment if 1 was returned
+ if ($valid_alignment_found_2 == 1){
+ ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
+ ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
+ my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
+ unless($mismatch_info){
+ $mismatch_info = '';
+ }
+ chomp $mismatch_info;
+
+ my $chromosome;
+ if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
+ $chromosome = $mapped_chromosome;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome\n";
+ }
+
+ ### Now extracting the number of mismatches to the converted genome
+ my $number_of_mismatches;
+ if ($mismatch_info eq ''){
+ $number_of_mismatches = 0;
+ }
+ elsif ($mismatch_info =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info);
+ $number_of_mismatches = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ ### extracting the chromosome number from the bowtie output (see above)
+ my $alignment_location = join (":",$chromosome,$position);
+ ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
+ ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
+ ### case we are not writing the same entry out a second time.
+ unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
+ }
+ ####################################################################################################################################
+ #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
+ ####################################################################################################################################
+ $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ my ($seq_id) = split (/\t/,$newline);
+ die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ next;
+ }
+ else {
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ next;
+ }
+ ### still within the 2nd sequence in correct orientation found
+ }
+ ### still withing the 1st sequence in correct orientation found
+ }
+ ### still within the if (last_seq_id eq identifier) condition
+ }
+ ### still within foreach index loop
+ }
+ ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
+ unless(%mismatches){
+ $counting{no_single_alignment_found}++;
+ if ($unmapped){
+ return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
+ }
+ else{
+ return;
+ }
+ }
+ #######################################################################################################################################################
+ #######################################################################################################################################################
+ ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
+ ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
+ ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
+ #######################################################################################################################################################
+ #######################################################################################################################################################
+ ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+ my $sequence_fails = 0;
+ ### Declaring an empty hash reference which will store all information we need for the methylation call
+ my $methylation_call_params; # hash reference!
+ ### sorting in ascending order
+ foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
+
+ ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
+ if (scalar keys %{$mismatches{$mismatch_number}} == 1){
+ for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
+ $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
+ }
+ }
+ elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
+ ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
+ ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
+ ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
+ ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
+ ### reaction. E.g.
+ ### CAGTCACGCGCGCGCG will become
+ ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
+ ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
+ ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
+ ### G->A conversion:
+ ### highly methylated: CAATCACACACACACA
+ ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
+ ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the
+ ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
+ ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
+ ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
+ ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
+ ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
+ ### In the above example the number of transliterations required to transform the actual sequence
+ ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
+ ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
+ ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
+ ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
+ ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be
+ ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded.
+ my @three_candidate_seqs;
+ foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
+ my $transliterations_performed;
+ if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
+ $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
+ }
+ elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
+ $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
+ }
+ else{
+ die "unexpected index number range $!\n";
+ }
+ push @three_candidate_seqs,{
+ index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
+ bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
+ mismatch_number => $mismatch_number,
+ chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
+ position => $mismatches{$mismatch_number}->{$composite_location}->{position},
+ seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
+ transliterations_performed => $transliterations_performed,
+ };
+ }
+ ### sorting in ascending order for the lowest number of transliterations performed
+ @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
+ my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
+ my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
+ my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
+ # print "$first_array_element\t$second_array_element\t$third_array_element\n";
+ if (($first_array_element*2) < $second_array_element){
+ $counting{low_complexity_alignments_overruled_count}++;
+ ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
+ $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
+ $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
+ # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
+ }
+ else{
+ $sequence_fails = 1;
+ }
+ }
+ else{
+ $sequence_fails = 1;
+ }
+ ### after processing the alignment with the lowest number of mismatches we exit
+ last;
+ }
+ ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
+ if ($sequence_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+ if ($ambiguous){
+ return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
+ }
+ if ($unmapped){
+ return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
+ }
+ else{
+ return 0; # => exits to next sequence (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence has not been rejected so far it will have a unique best alignment
+ $counting{unique_best_alignment_count}++;
+ extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
+ ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+ ### otherwise we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
+
+ print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
+ return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
+}
+
+sub check_bowtie_results_single_end_bowtie2{
+ my ($sequence,$identifier,$quality_value) = @_;
+
+ unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
+ $quality_value = 'I'x(length$sequence);
+ }
+
+ # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
+ # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
+
+ my $alignment_ambiguous = 0;
+
+ my %alignments = ();
+
+ ### reading from the Bowtie 2 output filehandles
+ foreach my $index (0..$#fhs){
+ # print "Index: $index\n";
+ # print "$fhs[$index]->{last_line}\n";
+ # print "$fhs[$index]->{last_seq_id}\n\n";
+
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
+
+ ### if the sequence we are currently looking at produced an alignment we are doing various things with it
+ # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
+
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+
+ # SAM format specifications for Bowtie 2
+ # (1) Name of read that aligned
+ # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
+ # 1 The read is one of a pair
+ # 2 The alignment is one end of a proper paired-end alignment
+ # 4 The read has no reported alignments
+ # 8 The read is one of a pair and has no reported alignments
+ # 16 The alignment is to the reverse reference strand
+ # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
+ # 64 The read is mate 1 in a pair
+ # 128 The read is mate 2 in a pair
+ # 256 The read has multiple mapping states
+ # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
+ # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
+ # (5) Mapping quality (255 means MAPQ is not available)
+ # (6) CIGAR string representation of alignment (* if unavailable)
+ # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
+ # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
+ # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
+ # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
+ # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
+ # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
+ # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
+ # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
+ # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
+ # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
+ # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
+ # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
+ # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
+ # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
+
+ my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
+
+ ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
+ if ($flag == 4){
+ ## reading in the next alignment, which must be the next sequence
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ if ($seq_id eq $identifier){
+ die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
+ }
+ next; # next instance
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ next;
+ }
+ }
+
+ # if there are one or more proper alignments we can extract the chromosome number
+ my $chromosome;
+ if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
+ $chromosome = $mapped_chromosome;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome\n";
+ }
+
+ ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
+ my ($alignment_score,$second_best,$MD_tag);
+ my @fields = split (/\t/,$fhs[$index]->{last_line});
+
+ foreach (11..$#fields){
+ if ($fields[$_] =~ /AS:i:(.*)/){
+ $alignment_score = $1;
+ }
+ elsif ($fields[$_] =~ /XS:i:(.*)/){
+ $second_best = $1;
+ }
+ elsif ($fields[$_] =~ /MD:Z:(.*)/){
+ $MD_tag = $1;
+ }
+ }
+
+ # warn "First best alignment_score is: '$alignment_score'\n";
+ # warn "MD tag is: '$MD_tag'\n";
+ die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
+
+ if (defined $second_best){
+ # warn "second best alignment_score is: '$second_best'\n";
+
+ # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
+ if ($alignment_score == $second_best){
+ $alignment_ambiguous = 1;
+ ## need to read and discard all additional ambiguous reads until we reach the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ last; # break free in case we have reached the end of the alignment output
+ }
+ }
+ # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
+ }
+ else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
+
+ my $alignment_location = join (":",$chromosome,$position);
+
+ ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id;
+ $alignments{$alignment_location}->{alignment_score} = $alignment_score;
+ $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome;
+ $alignments{$alignment_location}->{position} = $position;
+ $alignments{$alignment_location}->{CIGAR} = $cigar;
+ $alignments{$alignment_location}->{MD_tag} = $MD_tag;
+ }
+
+ ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ last; # break free in case we have reached the end of the alignment output
+ }
+ }
+ # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
+ }
+ }
+ else{ # there is no second best hit, so we can just store this one and read in the next sequence
+
+ my $alignment_location = join (":",$chromosome,$position);
+
+ ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id;
+ $alignments{$alignment_location}->{alignment_score} = $alignment_score;
+ $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome;
+ $alignments{$alignment_location}->{position} = $position;
+ $alignments{$alignment_location}->{MD_tag} = $MD_tag;
+ $alignments{$alignment_location}->{CIGAR} = $cigar;
+ }
+
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ if ($seq_id eq $identifier){
+ die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
+ }
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ }
+ }
+ }
+ }
+
+ ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out.
+ if ($alignment_ambiguous == 1){
+ $counting{unsuitable_sequence_count}++;
+ ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
+ # print "$ambiguous_read_output\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0;
+ }
+ }
+
+ ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
+ unless(%alignments){
+ $counting{no_single_alignment_found}++;
+ # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
+ # print "$unmapped_read_output\n";
+ if ($unmapped){
+ return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
+ }
+ else{
+ return 0; # default
+ }
+ }
+
+ #######################################################################################################################################################
+
+ ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
+ ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
+ ### alignment score we are discarding the sequence altogether.
+ ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
+ ### opening (5) and extending (3 per bp) the gap.
+
+ #######################################################################################################################################################
+
+ my $methylation_call_params; # hash reference which will store all information we need for the methylation call
+ my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+
+ ### print contents of %alignments for debugging
+ # if (scalar keys %alignments > 1){
+ # print "\n******\n";
+ # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
+ # print "Loc: $alignment_location\n";
+ # print "ID: $alignments{$alignment_location}->{seq_id}\n";
+ # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
+ # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
+ # print "Index $alignments{$alignment_location}->{index}\n";
+ # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
+ # print "pos: $alignments{$alignment_location}->{position}\n";
+ # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
+ # }
+ # print "\n******\n";
+ # }
+
+ ### if there is only 1 entry in the hash with we accept it as the best alignment
+ if (scalar keys %alignments == 1){
+ for my $unique_best_alignment (keys %alignments){
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
+ $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
+ $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
+ }
+ }
+
+ ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
+ ### we boot the sequence altogether
+ elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
+ my $best_alignment_score;
+ my $best_alignment_location;
+ foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
+ # print "$alignments{$alignment_location}->{alignment_score}\n";
+ unless (defined $best_alignment_score){
+ $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
+ $best_alignment_location = $alignment_location;
+ # print "setting best alignment score: $best_alignment_score\n";
+ }
+ else{
+ ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
+ if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
+ # warn "Same alignment score, the sequence will get booted!\n";
+ $sequence_fails = 1;
+ last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
+ }
+ ### else we are going to store the best alignment for further processing
+ else{
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
+ $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
+ $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
+ last; # exiting after processing the second alignment since the sequence produced a unique best alignment
+ }
+ }
+ }
+ }
+ else{
+ die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
+ }
+
+ ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
+ if ($sequence_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+
+ ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
+ # print OUT "$ambiguous_read_output\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0; # => exits to next sequence (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence has not been rejected so far it has a unique best alignment
+ $counting{unique_best_alignment_count}++;
+
+ ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
+ extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
+
+ ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+
+ ### otherwise we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
+ print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
+ return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
+}
+
+
+sub determine_number_of_transliterations_performed{
+ my ($sequence,$read_conversion) = @_;
+ my $number_of_transliterations;
+ if ($read_conversion eq 'CT'){
+ $number_of_transliterations = $sequence =~ tr/C/T/;
+ }
+ elsif ($read_conversion eq 'GA'){
+ $number_of_transliterations = $sequence =~ tr/G/A/;
+ }
+ else{
+ die "Read conversion mode of the read was not specified $!\n";
+ }
+ return $number_of_transliterations;
+}
+
+sub decide_whether_single_end_alignment_is_valid{
+ my ($index,$identifier) = @_;
+
+ # extracting from Bowtie 1 format
+ my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];
+
+ ### ensuring that the entry is the correct sequence
+ if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){
+ ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
+ ### sensible alignments
+ my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ return 1; ### 1st possibility for a sequence to pass
+ }
+ ### If the alignment was in the wrong orientation we need to read in a new line
+ elsif($orientation == 0){
+ my $newline = $fhs[$index]->{fh}->getline();
+ if ($newline){
+ ($id,$strand) = (split (/\t/,$newline))[0,1];
+
+ ### ensuring that the next entry is still the correct sequence
+ if ($id eq $identifier){
+ ### checking orientation again
+ $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ $fhs[$index]->{last_seq_id} = $id;
+ $fhs[$index]->{last_line} = $newline;
+ return 1; ### 2nd possibility for a sequence to pass
+ }
+ ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs
+ elsif ($orientation == 0){
+ $newline = $fhs[$index]->{fh}->getline();
+ if ($newline){
+ my ($seq_id) = split (/\t/,$newline);
+ ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with
+ ### the same fields of the just read next entry
+ die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation
+ }
+ else{
+ # assigning undef to last_seq_id and last_line (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs
+ else{
+ $fhs[$index]->{last_seq_id} = $id;
+ $fhs[$index]->{last_line} = $newline;
+ return 0; # processing the new alignment result only in the next round
+ }
+ }
+ else {
+ # assigning undef to last_seq_id and last_line (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
+ else{
+ return 0;
+ }
+}
+#########################
+### BOWTIE 1 | PAIRED-END
+#########################
+
+sub check_bowtie_results_paired_ends{
+ my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
+
+ ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
+ unless ($quality_value_1){
+ $quality_value_1 = 'I'x(length$sequence_1);
+ }
+ unless ($quality_value_2){
+ $quality_value_2 = 'I'x(length$sequence_2);
+ }
+
+ # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
+
+ my %mismatches = ();
+ ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
+
+
+ ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
+ ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
+ ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
+ ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
+ ### strands are not being reported by specifying --directional
+
+ foreach my $index (0,3,1,2){
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
+ ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+ # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
+
+ ##################################################################################
+ ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
+ ##################################################################################
+ my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
+ ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong
+ ### orientation. We only continue to extract useful information about this alignment if 1 was returned
+ if ($valid_alignment_found == 1){
+ ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
+ ### we store the useful information in %mismatches
+ my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
+ my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
+ chomp $mismatch_info_1;
+ chomp $mismatch_info_2;
+
+ ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
+ my ($chromosome_1,$chromosome_2);
+ if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
+ $chromosome_1 = $mapped_chromosome_1;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_1\n";
+ }
+ if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
+ $chromosome_2 = $mapped_chromosome_2;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_2\n";
+ }
+
+ ### Now extracting the number of mismatches to the converted genome
+ my $number_of_mismatches_1;
+ my $number_of_mismatches_2;
+ if ($mismatch_info_1 eq ''){
+ $number_of_mismatches_1 = 0;
+ }
+ elsif ($mismatch_info_1 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_1);
+ $number_of_mismatches_1 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ if ($mismatch_info_2 eq ''){
+ $number_of_mismatches_2 = 0;
+ }
+ elsif ($mismatch_info_2 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_2);
+ $number_of_mismatches_2 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
+ my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ die "Position 1 is higher than position 2" if ($position_1 > $position_2);
+ die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
+ my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
+ ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
+ ### number for the found alignment)
+ unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
+ }
+ ###################################################################################################################################################
+ ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
+ ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ###
+ ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
+ ### this round ###
+ ###################################################################################################################################################
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+
+ if ($newline_1 and $newline_2){
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ }
+ elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_2;
+ }
+ else{
+ die "Either read 1 or read 2 needs to end on '/1'\n";
+ }
+
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else {
+ # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ next; # jumping to the next index
+ }
+ ### Now processing the entry we just stored in last_line_1 and last_line_2
+ $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
+ ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
+ ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
+ if ($valid_alignment_found == 1){
+ ### we store the useful information in %mismatches
+ ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
+ ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
+ chomp $mismatch_info_1;
+ chomp $mismatch_info_2;
+ ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
+ if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
+ $chromosome_1 = $mapped_chromosome_1;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_1\n";
+ }
+ if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
+ $chromosome_2 = $mapped_chromosome_2;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_2\n";
+ }
+
+ $number_of_mismatches_1='';
+ $number_of_mismatches_2='';
+ ### Now extracting the number of mismatches to the converted genome
+ if ($mismatch_info_1 eq ''){
+ $number_of_mismatches_1 = 0;
+ }
+ elsif ($mismatch_info_1 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_1);
+ $number_of_mismatches_1 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ if ($mismatch_info_2 eq ''){
+ $number_of_mismatches_2 = 0;
+ }
+ elsif ($mismatch_info_2 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_2);
+ $number_of_mismatches_2 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
+ $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ die "position 1 is greater than position 2" if ($position_1 > $position_2);
+ die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
+ $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
+ ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
+ ### number for the found alignment)
+ unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
+ }
+ ###############################################################################################################################################
+ ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ###
+ ###############################################################################################################################################
+ $newline_1 = $fhs[$index]->{fh}-> getline();
+ $newline_2 = $fhs[$index]->{fh}-> getline();
+
+ if ($newline_1 and $newline_2){
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ }
+ if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_2;
+ }
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else {
+ # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ next; # jumping to the next index
+ }
+ ### within the 2nd sequence pair alignment in correct orientation found
+ }
+ ### within the 1st sequence pair alignment in correct orientation found
+ }
+ ### still within the (last_seq_id eq identifier) condition
+ }
+ ### still within foreach index loop
+ }
+ ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
+ unless(%mismatches){
+ $counting{no_single_alignment_found}++;
+ return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
+ }
+ ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+ my $sequence_pair_fails = 0;
+ ### Declaring an empty hash reference which will store all information we need for the methylation call
+ my $methylation_call_params; # hash reference!
+ ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
+ ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
+ ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
+ foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
+ #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
+ foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
+ #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
+ # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
+ }
+ if (scalar keys %{$mismatches{$mismatch_number}} == 1){
+ # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
+ for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
+ $methylation_call_params->{$identifier}->{seq_id} = $identifier;
+ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
+ $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
+ $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
+ $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
+ $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
+ $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
+ $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
+ }
+ }
+ else{
+ $sequence_pair_fails = 1;
+ }
+ ### after processing the alignment with the lowest number of mismatches we exit
+ last;
+ }
+ ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
+ if ($sequence_pair_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+ if ($ambiguous){
+ return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
+ }
+ if ($unmapped){
+ return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
+ }
+ else{
+ return 0; # => exits to next sequence (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence has not been rejected so far it does have a unique best alignment
+ $counting{unique_best_alignment_count}++;
+ extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
+
+ ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+ ### otherwise we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
+ $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
+
+ print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
+ return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
+}
+
+#########################
+### BOWTIE 2 | PAIRED-END
+#########################
+
+sub check_bowtie_results_paired_ends_bowtie2{
+ my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
+
+ ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
+ unless ($quality_value_1){
+ $quality_value_1 = 'I'x(length$sequence_1);
+ }
+
+ unless ($quality_value_2){
+ $quality_value_2 = 'I'x(length$sequence_2);
+ }
+
+
+ # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
+
+
+ my %alignments;
+ my $alignment_ambiguous = 0;
+
+ ### reading from the Bowtie 2 output filehandles
+
+ ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
+ ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
+ ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
+ ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
+ ### strands are not being reported when '--directional' is specified
+
+ foreach my $index (0,3,1,2){
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
+
+ ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+
+ my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
+ my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
+ # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
+ # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
+ # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
+ # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
+ $id_1 =~ s/\/1$//;
+ $id_2 =~ s/\/2$//;
+
+ # SAM format specifications for Bowtie 2
+ # (1) Name of read that aligned
+ # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
+ # 1 The read is one of a pair
+ # 2 The alignment is one end of a proper paired-end alignment
+ # 4 The read has no reported alignments
+ # 8 The read is one of a pair and has no reported alignments
+ # 16 The alignment is to the reverse reference strand
+ # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
+ # 64 The read is mate 1 in a pair
+ # 128 The read is mate 2 in a pair
+ # 256 The read has multiple mapping states
+ # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
+ # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
+ # (5) Mapping quality (255 means MAPQ is not available)
+ # (6) CIGAR string representation of alignment (* if unavailable)
+ # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
+ # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
+ # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
+ # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
+ # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
+ # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
+ # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
+ # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
+ # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
+ # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
+ # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
+ # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
+ # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
+ # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
+
+ ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
+ ### We can store the next alignment and move on to the next Bowtie 2 instance
+ if ($flag_1 == 77 and $flag_2 == 141){
+ ## reading in the next alignment, which must be the next sequence
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+
+ # print "current sequence ($identifier) did not map, reading in next sequence\n";
+ # print "$index\t$fhs[$index]->{last_seq_id}\n";
+ # print "$index\t$fhs[$index]->{last_line_1}\n";
+ # print "$index\t$fhs[$index]->{last_line_2}\n";
+ next; # next instance
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ ### If there are one or more proper alignments we can extract the chromosome number
+ my ($chromosome_1,$chromosome_2);
+ if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
+ $chromosome_1 = $mapped_chromosome_1;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_1\n";
+ }
+ if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
+ $chromosome_2 = $mapped_chromosome_2;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_2\n";
+ }
+
+ die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
+
+ ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
+ my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
+
+ my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
+ my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
+
+ foreach (11..$#fields_1){
+ if ($fields_1[$_] =~ /AS:i:(.*)/){
+ $alignment_score_1 = $1;
+ }
+ elsif ($fields_1[$_] =~ /XS:i:(.*)/){
+ $second_best_1 = $1;
+ }
+ elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
+ $MD_tag_1 = $1;
+ }
+ }
+
+ foreach (11..$#fields_2){
+ if ($fields_2[$_] =~ /AS:i:(.*)/){
+ $alignment_score_2 = $1;
+ }
+ elsif ($fields_2[$_] =~ /XS:i:(.*)/){
+ $second_best_2 = $1;
+ }
+ elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
+ $MD_tag_2 = $1;
+ }
+ }
+
+ die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
+ die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
+
+ # warn "First read 1 alignment score is: '$alignment_score_1'\n";
+ # warn "First read 2 alignment score is: '$alignment_score_2'\n";
+ # warn "MD tag 1 is: '$MD_tag_1'\n";
+ # warn "MD tag 2 is: '$MD_tag_2'\n";
+
+ ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
+ my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
+ # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
+
+ if (defined $second_best_1 and defined $second_best_2){
+ my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
+ # warn "Second best alignment_score_1 is: '$second_best_1'\n";
+ # warn "Second best alignment_score_2 is: '$second_best_2'\n";
+ # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
+
+ # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
+ if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
+ $alignment_ambiguous = 1;
+ # print "This read will be chucked (AS==XS detected)!\n";
+
+ ## need to read and discard all additional ambiguous reads until we reach the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
+
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ last; # break free if the end of the alignment output was reached
+ }
+ }
+ # if ($fhs[$index]->{last_seq_id}){
+ # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
+ # }
+ }
+ else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
+
+ my $alignment_location;
+ if ($position_1 <= $position_2){
+ $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ }
+ elsif($position_2 < $position_1){
+ $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
+ }
+
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id_1;
+ $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
+ $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
+ $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
+ $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
+ $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
+ $alignments{$alignment_location}->{position_1} = $position_1;
+ $alignments{$alignment_location}->{position_2} = $position_2;
+ $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
+ $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
+ $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
+ $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
+ $alignments{$alignment_location}->{flag_1} = $flag_1;
+ $alignments{$alignment_location}->{flag_2} = $flag_2;
+ }
+ # warn "added best of several alignments to \%alignments hash\n";
+
+ ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
+
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ last; # break free if the end of the alignment output was reached
+ }
+ }
+ # if($fhs[$index]->{last_seq_id}){
+ # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
+ # }
+ }
+ }
+ else{ # there is no second best hit, so we can just store this one and read in the next sequence
+
+ my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ # print "$alignment_location\n";
+ ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id_1;
+ $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
+ $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
+ $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
+ $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
+ $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
+ $alignments{$alignment_location}->{position_1} = $position_1;
+ $alignments{$alignment_location}->{position_2} = $position_2;
+ $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
+ $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
+ $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
+ $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
+ $alignments{$alignment_location}->{flag_1} = $flag_1;
+ $alignments{$alignment_location}->{flag_2} = $flag_2;
+ }
+
+ # warn "added unique alignment to \%alignments hash\n";
+
+ # Now reading and storing the next read pair
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ # print "$newline_1\n";
+ # print "$newline_2\n";
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
+
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+
+ if ($seq_id_1 eq $identifier){
+ die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
+ }
+ }
+ else{
+ # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ }
+ }
+ }
+ }
+
+ ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
+ if ($alignment_ambiguous == 1){
+ $counting{unsuitable_sequence_count}++;
+ ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
+ # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
+ # print "$ambiguous_read_1\n";
+ # print "$ambiguous_read_2\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0;
+ }
+ }
+
+ ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
+ unless (%alignments){
+ $counting{no_single_alignment_found}++;
+
+ # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
+ # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
+ # print "$unmapped_read_1\n";
+ # print "$unmapped_read_2\n";
+ if ($unmapped){
+ return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
+ }
+ else{
+ return 0;
+ }
+ }
+
+ #######################################################################################################################################################
+
+ ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
+ ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
+ ### alignment score we are discarding the sequence pair altogether.
+ ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
+ ### and extending (3 per bp) the gap.
+
+ #######################################################################################################################################################
+
+ ### Declaring an empty hash reference which will store all information we need for the methylation call
+ my $methylation_call_params; # hash reference
+ my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+
+ ### print contents of %alignments for debugging
+ ## if (scalar keys %alignments >= 1){
+ # print "\n******\n";
+ # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
+ # print "Loc: $alignment_location\n";
+ # print "ID: $alignments{$alignment_location}->{seq_id}\n";
+ # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
+ # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
+ # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
+ # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
+ # print "Index $alignments{$alignment_location}->{index}\n";
+ # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
+ # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
+ # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
+ # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
+ # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
+ # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
+ # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
+ # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
+ # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
+ # }
+ # print "\n******\n";
+ # }
+
+ ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
+ if (scalar keys %alignments == 1){
+ for my $unique_best_alignment (keys %alignments){
+ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
+ $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
+ $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
+ $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
+ $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
+ $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
+ $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
+ $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
+ $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
+ $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
+ $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
+ }
+ }
+
+ ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
+ ### we boot the sequence pair altogether)
+ elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
+ my $best_sum_of_alignment_scores;
+ my $best_alignment_location;
+ foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
+ # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
+ unless (defined $best_sum_of_alignment_scores){
+ $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
+ $best_alignment_location = $alignment_location;
+ # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
+ }
+ else{
+ ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
+ if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
+ # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
+ $sequence_pair_fails = 1;
+ last; # exiting since we know that the sequence has ambiguous alignments
+ }
+ ### else we are going to store the best alignment for further processing
+ else{
+ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
+ $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
+ $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
+ $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
+ $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
+ $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
+ $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
+ $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
+ $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
+ $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
+ $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
+ $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
+ last; # exiting since the sequence produced a unique best alignment
+ }
+ }
+ }
+ }
+ else{
+ die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
+ }
+
+ ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
+ if ($sequence_pair_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+
+ ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
+ # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
+ # print "$ambiguous_read_1\n";
+ # print "$ambiguous_read_2\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0; # => exits to next sequence pair (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence pair has not been rejected so far it does have a unique best alignment
+ $counting{unique_best_alignment_count}++;
+ extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
+
+ ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+ ### now we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
+ $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
+ # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
+ # print " $sequence_2\n";
+ # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
+ # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
+
+ print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
+ return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
+}
+
+###
+
+sub decide_whether_paired_end_alignment_is_valid{
+ my ($index,$identifier) = @_;
+ my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
+ my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
+ chomp $mismatch_info_1;
+ chomp $mismatch_info_2;
+ my $seq_id_1 = $id_1;
+ my $seq_id_2 = $id_2;
+ $seq_id_1 =~ s/\/1$//; # removing the read /1
+ $seq_id_2 =~ s/\/1$//; # removing the read /1
+
+ ### ensuring that the current entry is the correct sequence
+ if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
+ ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
+ ### sensible alignments
+ my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
+ }
+ ### If the alignment was in the wrong orientation we need to read in two new lines
+ elsif($orientation == 0){
+ my $newline_1 = $fhs[$index]->{fh}->getline();
+ my $newline_2 = $fhs[$index]->{fh}->getline();
+ if ($newline_1 and $newline_2){
+ ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time)
+ ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
+ ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
+
+ my $seqid;
+ $seq_id_1 = $id_1;
+ $seq_id_2 = $id_2;
+ # we need to capture the first read (ending on /1)
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_1;
+ }
+ elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_2;
+ }
+ else{
+ die "One of the two reads needs to end on /1!!";
+ }
+
+ ### ensuring that the next entry is still the correct sequence
+ if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
+ ### checking orientation again
+ $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ ### Writing the current sequence to last_line_1 and last_line_2
+ $fhs[$index]->{last_seq_id} = $seqid;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
+ }
+ ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be
+ ### the next entry)
+ elsif ($orientation == 0){
+ $newline_1 = $fhs[$index]->{fh}->getline();
+ $newline_2 = $fhs[$index]->{fh}->getline();
+ if ($newline_1 and $newline_2){
+ ($seq_id_1) = split (/\t/,$newline_1);
+ ($seq_id_2) = split (/\t/,$newline_2);
+
+ $seqid = '';
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_1;
+ }
+ elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_2;
+ }
+ else{
+ die "One of the two reads needs to end on /1!!";
+ }
+
+ ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with
+ ### the same fields of the just read next entry
+ die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
+ $fhs[$index]->{last_seq_id} = $seqid;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
+ }
+ else {
+ ### assigning undef to last_seq_id and last_line (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
+ else{
+ $fhs[$index]->{last_seq_id} = $seqid;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ return 0; # processing the new alignment result only in the next round
+ }
+ }
+ else {
+ # assigning undef to last_seq_id and both last_lines (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round
+ else{
+ return 0;
+ }
+}
+
+### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
+
+sub extract_corresponding_genomic_sequence_paired_ends {
+ my ($sequence_identifier,$methylation_call_params) = @_;
+ ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+ my $alignment_read_1;
+ my $alignment_read_2;
+ my $read_conversion_info_1;
+ my $read_conversion_info_2;
+ my $genome_conversion;
+
+ ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call
+ ### if the C happens to be at the first or last position of the actually observed sequence
+ my $non_bisulfite_sequence_1;
+ my $non_bisulfite_sequence_2;
+
+ ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
+ ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
+ ### sequences around!
+ ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_GA_CT_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'CT';
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
+ ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
+
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
+ ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1
+
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2);
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+ else{
+ $non_bisulfite_sequence_2 = '';
+ }
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from complementary to (converted) reverse strand]
+ $counting{GA_CT_GA_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'GA';
+
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
+ ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
+ if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2
+ }
+ else{
+ $non_bisulfite_sequence_1 = '';
+ }
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
+ ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from the complementary to (converted) forward strand]
+ $counting{GA_CT_CT_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'CT';
+
+ ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
+ ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
+ ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
+ ### Read 2 is CT converted so we need to capture 2 extra 3' bases
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2
+ }
+ else{
+ $non_bisulfite_sequence_2 = '';
+ }
+ }
+
+ ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from the (converted) reverse strand]
+ $counting{CT_GA_GA_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'GA';
+
+ ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
+ ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation
+ if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+ }
+ else{
+ $non_bisulfite_sequence_1 = '';
+ }
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
+ ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
+ ### Read 2 is GA converted so we need to capture 2 extra 5' bases
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2
+ }
+ else{
+ die "Too many bowtie result filehandles\n";
+ }
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+}
+
+### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
+
+sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
+ my ($sequence_identifier,$methylation_call_params) = @_;
+ ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+
+ my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
+ my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
+ my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
+ my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
+# print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
+ ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
+ ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
+
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+ my $alignment_read_1;
+ my $alignment_read_2;
+ my $read_conversion_info_1;
+ my $read_conversion_info_2;
+ my $genome_conversion;
+
+ ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
+ ### if the C happens to be at the last position of the actually observed sequence
+ my $non_bisulfite_sequence_1 = '';
+ my $non_bisulfite_sequence_2 = '';
+
+ ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
+ my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1;
+ my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1;
+
+ # parsing CIGAR 1 string
+ my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
+ my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
+ shift @ops_1; # remove the empty first element
+ die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
+ # parsing CIGAR 2 string
+ my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
+ my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
+ shift @ops_2; # remove the empty first element
+ die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
+
+ my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
+ my $indels_2 = 0;
+
+ ### Extracting read 1 genomic sequence ###
+
+ # extracting 2 additional bp at the 5' end (read 1)
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
+ # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ return;
+ }
+ $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2);
+ }
+
+ foreach (0..$#len_1){
+ if ($ops_1[$_] eq 'M'){
+ # extracting genomic sequence
+ $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
+ # warn "$non_bisulfite_sequence_1\n";
+ # adjusting position
+ $pos_1 += $len_1[$_];
+ }
+ elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence
+ # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
+ $non_bisulfite_sequence_1 .= 'N' x $len_1[$_];
+ # warn "$non_bisulfite_sequence_1\n";
+ # position doesn't need adjusting
+ $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence
+ # we do not add any genomic sequence but only adjust the position
+ # warn "Just adjusting the position by: ",$len_1[$_],"bp\n";
+ $pos_1 += $len_1[$_];
+ $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
+ }
+ else{
+ die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
+ }
+ }
+
+ ### 3' end of read 1
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ return;
+ }
+ $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
+ }
+
+
+ ### Extracting read 2 genomic sequence ###
+
+ ### 5' end of read 2
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+ return;
+ }
+ $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2);
+ }
+
+ foreach (0..$#len_2){
+ if ($ops_2[$_] eq 'M'){
+ # extracting genomic sequence
+ $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
+ # warn "$non_bisulfite_sequence_2\n";
+ # adjusting position
+ $pos_2 += $len_2[$_];
+ }
+ elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence
+ # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
+ $non_bisulfite_sequence_2 .= 'N' x $len_2[$_];
+ # warn "$non_bisulfite_sequence_2\n";
+ # position doesn't need adjusting
+ $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence
+ # we do not add any genomic sequence but only adjust the position
+ # warn "Just adjusting the position by: ",$len_2[$_],"bp\n";
+ $pos_2 += $len_2[$_];
+ $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
+ }
+ else{
+ die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
+ }
+ }
+
+ ### 3' end of read 2
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+ return;
+ }
+ $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
+ }
+
+ ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
+ ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly
+
+ ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_GA_CT_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'CT';
+ ### Read 1 is always the forward hit
+ ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from complementary to (converted) bottom strand]
+ $counting{GA_CT_GA_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'GA';
+ ### Read 1 is always the forward hit
+ ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from the complementary to (converted) top strand]
+ $counting{GA_CT_CT_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'CT';
+
+ ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+ }
+
+ ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from the (converted) reverse strand]
+ $counting{CT_GA_GA_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'GA';
+ ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+ }
+ else{
+ die "Too many bowtie result filehandles\n";
+ }
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+ ## the end position of a read is stored in $pos
+ $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1;
+ $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2;
+ $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1;
+ $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2;
+}
+
+##########################################
+### PRINT SINGLE END RESULTS: Bowtie 1 ###
+##########################################
+
+sub print_bisulfite_mapping_result_single_end{
+ my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value = convert_phred64_quals_to_phred33($quality_value);
+ }
+ elsif ($solexa){
+ $quality_value = convert_solexa_quals_to_phred33($quality_value);
+ }
+
+ ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position.
+ $methylation_call_params->{$identifier}->{position} += 1;
+
+ ### writing every uniquely mapped read and its methylation call to the output file
+ if ($vanilla){
+ my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value);
+ print OUT "$bowtie1_output\n";
+ }
+ else{ # SAM output, default since Bismark v1.0.0
+ single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
+ }
+}
+
+##########################################
+### PRINT SINGLE END RESULTS: Bowtie 2 ###
+##########################################
+
+sub print_bisulfite_mapping_result_single_end_bowtie2{
+ my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value = convert_phred64_quals_to_phred33($quality_value);
+ }
+ elsif ($solexa){
+ $quality_value = convert_solexa_quals_to_phred33($quality_value);
+ }
+
+ ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed)
+ single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
+}
+
+##########################################
+### PRINT PAIRED END ESULTS: Bowtie 1 ###
+##########################################
+
+sub print_bisulfite_mapping_results_paired_ends{
+ my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2);
+ }
+ elsif ($solexa){
+ $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2);
+ }
+
+ ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based)
+ $methylation_call_params->{$identifier}->{start_seq_1} += 1;
+
+ ### writing every single aligned read and its methylation call to the output file
+ if ($vanilla){
+ my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2);
+ print OUT "$bowtie1_output_paired_end\n";
+ }
+ else{ # SAM output, default since Bismark v1.0.0
+ paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
+ }
+
+}
+
+##########################################
+### PRINT PAIRED END ESULTS: Bowtie 2 ###
+##########################################
+
+sub print_bisulfite_mapping_results_paired_ends_bowtie2{
+ my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2);
+ }
+ elsif ($solexa){
+ $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2);
+ }
+
+ ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed)
+ paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
+
+}
+
+
+sub convert_phred64_quals_to_phred33{
+
+ my $qual = shift;
+ my @quals = split (//,$qual);
+ my @new_quals;
+
+ foreach my $index (0..$#quals){
+ my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]);
+ my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score);
+ $new_quals[$index] = $phred33_quality_string;
+ }
+
+ my $phred33_quality = join ("",@new_quals);
+ return $phred33_quality;
+}
+
+sub convert_solexa_quals_to_phred33{
+
+ my $qual = shift;
+ my @quals = split (//,$qual);
+ my @new_quals;
+
+ foreach my $index (0..$#quals){
+ my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]);
+ my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score);
+ $new_quals[$index] = $phred33_quality_string;
+ }
+
+ my $phred33_quality = join ("",@new_quals);
+ return $phred33_quality;
+}
+
+sub convert_phred_score_into_phred33_quality_string{
+ my $qual = shift;
+ $qual = chr($qual+33);
+ return $qual;
+}
+
+sub convert_phred64_quality_string_into_phred_score{
+ my $string = shift;
+ my $qual = ord($string)-64;
+ return $qual;
+}
+
+sub convert_solexa_pre1_3_quality_string_into_phred_score{
+ ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10
+ my $string = shift;
+ my $qual = ord($string)-59;
+ return $qual;
+}
+
+
+sub extract_corresponding_genomic_sequence_single_end {
+ my ($sequence_identifier,$methylation_call_params) = @_;
+ ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+ my $alignment_strand;
+ my $read_conversion_info;
+ my $genome_conversion;
+ ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and
+ ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation,
+ ### if the C happens to be at the last position of the actually observed sequence
+ my $non_bisulfite_sequence;
+ ### depending on the conversion we want to make need to capture 1 extra base at the 3' end
+
+ ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_CT_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'CT';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1
+ ### + 2 extra base at the 3' end
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from (converted) reverse strand]
+ $counting{CT_GA_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'GA';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from >
+ ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
+ ## reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from complementary to (converted) forward strand]
+ $counting{GA_CT_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'CT';
+
+ ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
+ ## reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from complementary to (converted) reverse strand]
+ $counting{GA_GA_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'GA';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from >
+ ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+ else{
+ die "Too many bowtie result filehandles\n";
+ }
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+
+ ### at this point we can also determine the end position of a read
+ $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence});
+}
+
+
+sub extract_corresponding_genomic_sequence_single_end_bowtie2{
+ my ($sequence_identifier,$methylation_call_params) = @_;
+
+ my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info};
+ my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
+
+ ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+ my $alignment_strand;
+ my $read_conversion_info;
+ my $genome_conversion;
+ ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
+ ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
+ my $non_bisulfite_sequence = '';
+
+ ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
+ my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
+
+ # parsing CIGAR string
+ my @len = split (/\D+/,$cigar); # storing the length per operation
+ my @ops = split (/\d+/,$cigar); # storing the operation
+ shift @ops; # remove the empty first element
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+ return;
+ }
+ $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
+ }
+ my $indels = 0;
+
+ foreach (0..$#len){
+ if ($ops[$_] eq 'M'){
+ #extracting genomic sequence
+ $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
+ # adjusting position
+ $pos += $len[$_];
+ }
+ elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
+ # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
+ $non_bisulfite_sequence .= 'N' x $len[$_];
+ # warn "$non_bisulfite_sequence\n";
+ # position doesn't need to be adjusting
+ $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
+ }
+ elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
+ # we do not add any genomic sequence but only adjust the position
+ $pos += $len[$_];
+ $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
+ }
+ elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
+ }
+ else{
+ die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
+ }
+ }
+
+ ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+ return;
+ }
+ $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
+ # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
+ }
+
+
+
+ ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_CT_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'CT';
+ }
+
+ ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from (converted) reverse strand]
+ $counting{CT_GA_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'GA';
+
+ ### reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+
+ ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from complementary to (converted) forward strand]
+ $counting{GA_CT_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'CT';
+
+ ### reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+
+ ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from complementary to (converted) reverse strand]
+ $counting{GA_GA_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'GA';
+
+ }
+ else{
+ die "Too many Bowtie 2 result filehandles\n";
+ }
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+
+ ### the end position of a read is stored in $pos
+ $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
+ $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
+}
+
+### METHYLATION CALL
+
+sub methylation_call{
+ my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;
+ ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
+ my @seq = split(//,$sequence_actually_observed);
+ my @genomic = split(//,$genomic_sequence);
+ # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";
+ ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
+ ### CpG, CHH or CHG context
+
+ #################################################################
+ ### . for bases not involving cytosines ###
+ ### X for methylated C in CHG context (was protected) ###
+ ### x for not methylated C in CHG context (was converted) ###
+ ### H for methylated C in CHH context (was protected) ###
+ ### h for not methylated C in CHH context (was converted) ###
+ ### Z for methylated C in CpG context (was protected) ###
+ ### z for not methylated C in CpG context (was converted) ###
+ #################################################################
+
+ my @match =();
+ warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2
+ my $methyl_CHH_count = 0;
+ my $methyl_CHG_count = 0;
+ my $methyl_CpG_count = 0;
+ my $unmethylated_CHH_count = 0;
+ my $unmethylated_CHG_count = 0;
+ my $unmethylated_CpG_count = 0;
+
+ if ($read_conversion eq 'CT'){
+ for my $index (0..$#seq) {
+ if ($seq[$index] eq $genomic[$index]) {
+ ### The residue can only be a C if it was not converted to T, i.e. protected my methylation
+ if ($genomic[$index] eq 'C') {
+ ### If the residue is a C we want to know if it was in CpG context or in any other context
+ my $downstream_base = $genomic[$index+1];
+
+ if ($downstream_base eq 'G'){
+ ++$methyl_CpG_count;
+ push @match,'Z'; # protected C, methylated, in CpG context
+ }
+
+ else {
+ ### C in not in CpG-context, determining the second downstream base context
+ my $second_downstream_base = $genomic[$index+2];
+
+ if ($second_downstream_base eq 'G'){
+ ++$methyl_CHG_count;
+ push @match,'X'; # protected C, methylated, in CHG context
+ }
+ else{
+ ++$methyl_CHH_count;
+ push @match,'H'; # protected C, methylated, in CHH context
+ }
+ }
+ }
+ else {
+ push @match, '.';
+ }
+ }
+ elsif ($seq[$index] ne $genomic[$index]) {
+ ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts
+ ### in the actually observed sequence
+ if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') {
+ ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context
+ my $downstream_base = $genomic[$index+1];
+
+ if ($downstream_base eq 'G'){
+ ++$unmethylated_CpG_count;
+ push @match,'z'; # converted C, not methylated, in CpG context
+ }
+
+ else{
+ ### C in not in CpG-context, determining the second downstream base context
+ my $second_downstream_base = $genomic[$index+2];
+
+ if ($second_downstream_base eq 'G'){
+ ++$unmethylated_CHG_count;
+ push @match,'x'; # converted C, not methylated, in CHG context
+ }
+ else{
+ ++$unmethylated_CHH_count;
+ push @match,'h'; # converted C, not methylated, in CHH context
+ }
+ }
+ }
+ ### all other mismatches are not of interest for a methylation call
+ else {
+ push @match,'.';
+ }
+ }
+ else{
+ die "There can be only 2 possibilities\n";
+ }
+ }
+ }
+ elsif ($read_conversion eq 'GA'){
+ # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n";
+
+ for my $index (0..$#seq) {
+ if ($seq[$index] eq $genomic[$index+2]) {
+ ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation
+ if ($genomic[$index+2] eq 'G') {
+ ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need
+ ### to look if the base upstream is a C
+
+ my $upstream_base = $genomic[$index+1];
+
+ if ($upstream_base eq 'C'){
+ ++$methyl_CpG_count;
+ push @match,'Z'; # protected C on opposing strand, methylated, in CpG context
+ }
+
+ else{
+ ### C in not in CpG-context, determining the second upstream base context
+ my $second_upstream_base = $genomic[$index];
+
+ if ($second_upstream_base eq 'C'){
+ ++$methyl_CHG_count;
+ push @match,'X'; # protected C on opposing strand, methylated, in CHG context
+ }
+ else{
+ ++$methyl_CHH_count;
+ push @match,'H'; # protected C on opposing strand, methylated, in CHH context
+ }
+ }
+ }
+ else{
+ push @match, '.';
+ }
+ }
+ elsif ($seq[$index] ne $genomic[$index+2]) {
+ ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts
+ ### on the opposing strand, so G to A conversions in the actually observed sequence
+ if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') {
+ ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if
+ ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream!
+
+ my $upstream_base = $genomic[$index+1];
+
+ if ($upstream_base eq 'C'){
+ ++$unmethylated_CpG_count;
+ push @match,'z'; # converted C on opposing strand, not methylated, in CpG context
+ }
+
+ else{
+ ### C in not in CpG-context, determining the second upstream base context
+ my $second_upstream_base = $genomic[$index];
+
+ if ($second_upstream_base eq 'C'){
+ ++$unmethylated_CHG_count;
+ push @match,'x'; # converted C on opposing strand, not methylated, in CHG context
+ }
+ else{
+ ++$unmethylated_CHH_count;
+ push @match,'h'; # converted C on opposing strand, not methylated, in CHH context
+ }
+ }
+ }
+ ### all other mismatches are not of interest for a methylation call
+ else {
+ push @match,'.';
+ }
+ }
+ else{
+ die "There can be only 2 possibilities\n";
+ }
+ }
+ }
+ else{
+ die "Strand conversion info is required to perform a methylation call\n";
+ }
+
+ my $methylation_call = join ("",@match);
+
+ $counting{total_meCHH_count} += $methyl_CHH_count;
+ $counting{total_meCHG_count} += $methyl_CHG_count;
+ $counting{total_meCpG_count} += $methyl_CpG_count;
+ $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
+ $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
+ $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
+
+ # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
+ return $methylation_call;
+}
+
+sub read_genome_into_memory{
+ ## working directoy
+ my $cwd = shift;
+ ## reading in and storing the specified genome in the %chromosomes hash
+ chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
+ print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
+
+ my @chromosome_filenames = <*.fa>;
+
+ ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+ unless (@chromosome_filenames){
+ @chromosome_filenames = <*.fasta>;
+ }
+
+ unless (@chromosome_filenames){
+ die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
+ }
+
+ foreach my $chromosome_filename (@chromosome_filenames){
+
+ open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
+ ### first line needs to be a fastA header
+ my $first_line = ;
+ chomp $first_line;
+
+ ### Extracting chromosome name from the FastA header
+ my $chromosome_name = extract_chromosome_name($first_line);
+
+ my $sequence;
+ while (){
+ chomp;
+ if ($_ =~ /^>/){
+ ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
+ if (exists $chromosomes{$chromosome_name}){
+ print "chr $chromosome_name (",length $sequence ," bp)\n";
+ die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
+ }
+ else {
+ if (length($sequence) == 0){
+ warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
+ }
+ print "chr $chromosome_name (",length $sequence ," bp)\n";
+ $chromosomes{$chromosome_name} = $sequence;
+ }
+ ### resetting the sequence variable
+ $sequence = '';
+ ### setting new chromosome name
+ $chromosome_name = extract_chromosome_name($_);
+ }
+ else{
+ $sequence .= uc$_;
+ }
+ }
+
+ if (exists $chromosomes{$chromosome_name}){
+ print "chr $chromosome_name (",length $sequence ," bp)\t";
+ die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
+ }
+ else{
+ if (length($sequence) == 0){
+ warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
+ }
+ print "chr $chromosome_name (",length $sequence ," bp)\n";
+ $chromosomes{$chromosome_name} = $sequence;
+ }
+ }
+ print "\n";
+ chdir $cwd or die "Failed to move to directory $cwd\n";
+}
+
+sub extract_chromosome_name {
+ ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well
+ my $fasta_header = shift;
+ if ($fasta_header =~ s/^>//){
+ my ($chromosome_name) = split (/\s+/,$fasta_header);
+ return $chromosome_name;
+ }
+ else{
+ die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
+ }
+}
+
+sub reverse_complement{
+ my $sequence = shift;
+ $sequence =~ tr/CATG/GTAC/;
+ $sequence = reverse($sequence);
+ return $sequence;
+}
+
+sub biTransformFastAFiles {
+ my $file = shift;
+ my ($dir,$filename);
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+ $C_to_T_infile =~ s/$/_C_to_T.fa/;
+ $G_to_A_infile =~ s/$/_G_to_A.fa/;
+ print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+
+ unless ($directional){
+ print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+
+ my $count = 0;
+ while (1){
+ my $header = ;
+ my $sequence= ;
+ last unless ($header and $sequence);
+
+ $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence = uc$sequence; # make input file case insensitive
+
+ # detecting if the input file contains tab stops, as this is likely to result in no alignments
+ if (index($header,"\t") != -1){
+ $seqID_contains_tabs++;
+ }
+
+ ### small check if the sequence seems to be in FastA format
+ die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);
+
+ my $sequence_C_to_T = $sequence;
+ $sequence_C_to_T =~ tr/C/T/;
+ print CTOT "$header$sequence_C_to_T";
+
+ unless ($directional){
+ my $sequence_G_to_A = $sequence;
+ $sequence_G_to_A =~ tr/G/A/;
+ print GTOA "$header$sequence_G_to_A";
+ }
+ }
+ if ($directional){
+ print "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ else{
+ print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ return ($C_to_T_infile,$G_to_A_infile);
+}
+
+sub biTransformFastAFiles_paired_end {
+ my ($file,$read_number) = @_;
+
+ my ($dir,$filename);
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+ $C_to_T_infile =~ s/$/_C_to_T.fa/;
+ $G_to_A_infile =~ s/$/_G_to_A.fa/;
+
+ if ($directional){
+ if ($read_number == 1){
+ print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ }
+ elsif ($read_number == 2){
+ print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ else{
+ die "Read number needs to be 1 or 2, but was: $read_number\n\n";
+ }
+ }
+ else{ # all four strand output
+ print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+
+ my $count = 0;
+
+ while (1){
+ my $header = ;
+ my $sequence= ;
+ last unless ($header and $sequence);
+
+ $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence = uc$sequence; # make input file case insensitive
+
+ # detecting if the input file contains tab stops, as this is likely to result in no alignments
+ if (index($header,"\t") != -1){
+ $seqID_contains_tabs++;
+ }
+
+ ## small check if the sequence seems to be in FastA format
+ die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);
+
+ if ($read_number == 1){
+ if ($bowtie2){
+ $header =~ s/$/\/1\/1/;
+ }
+ else{
+ $header =~ s/$/\/1/;
+ }
+ }
+ elsif ($read_number == 2){
+ if ($bowtie2){
+ $header =~ s/$/\/2\/2/;
+ }
+ else{
+ $header =~ s/$/\/2/;
+ }
+ }
+ else{
+ die "Read number needs to be 1 or 2, but was: $read_number\n\n";
+ }
+ my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
+
+ $sequence_C_to_T =~ tr/C/T/;
+ $sequence_G_to_A =~ tr/G/A/;
+
+ if ($directional){
+
+ if ($read_number == 1){
+ print CTOT "$header$sequence_C_to_T";
+ }
+ elsif ($read_number == 2){
+ print GTOA "$header$sequence_G_to_A";
+ }
+ }
+ else{
+ print CTOT "$header$sequence_C_to_T";
+ print GTOA "$header$sequence_G_to_A";
+ }
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ print "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ else{
+ print "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ }
+ else{
+ print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ return ($C_to_T_infile);
+ }
+ else{
+ return ($G_to_A_infile);
+ }
+ }
+ else{
+ return ($C_to_T_infile,$G_to_A_infile);
+ }
+}
+
+
+sub biTransformFastQFiles {
+ my $file = shift;
+ my ($dir,$filename);
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+
+ $C_to_T_infile =~ s/$/_C_to_T.fastq/;
+ print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+
+ unless ($directional){
+ $G_to_A_infile =~ s/$/_G_to_A.fastq/;
+ print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+
+ my $count = 0;
+ while (1){
+ my $identifier = ;
+ my $sequence = ;
+ my $identifier2 = ;
+ my $quality_score = ;
+ last unless ($identifier and $sequence and $identifier2 and $quality_score);
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence = uc$sequence; # make input file case insensitive
+
+ # detecting if the input file contains tab stops, as this is likely to result in no alignments
+ if (index($identifier,"\t") != -1){
+ $seqID_contains_tabs++;
+ }
+
+ ## small check if the sequence file appears to be a FastQ file
+ if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
+ die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
+ }
+
+ my $sequence_C_to_T = $sequence;
+ $sequence_C_to_T =~ tr/C/T/;
+ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
+
+ unless ($directional){
+ my $sequence_G_to_A = $sequence;
+ $sequence_G_to_A =~ tr/G/A/;
+ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
+ }
+ }
+
+ if ($directional){
+ print "\nCreated C -> T converted versions of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ else{
+ print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+
+ return ($C_to_T_infile,$G_to_A_infile);
+}
+
+sub biTransformFastQFiles_paired_end {
+ my ($file,$read_number) = @_;
+ my ($dir,$filename);
+
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+ $C_to_T_infile =~ s/$/_C_to_T.fastq/;
+ $G_to_A_infile =~ s/$/_G_to_A.fastq/;
+
+ if ($directional){
+ if ($read_number == 1){
+ print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ }
+ elsif ($read_number == 2){
+ print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ else{
+ die "Read number needs to be 1 or 2, but was $read_number!\n\n";
+ }
+ }
+ else{
+ print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+
+ my $count = 0;
+
+ while (1){
+ my $identifier = ;
+ my $sequence = ;
+ my $identifier2 = ;
+ my $quality_score = ;
+ last unless ($identifier and $sequence and $identifier2 and $quality_score);
+ ++$count;
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence= uc$sequence; # make input file case insensitive
+
+ ## small check if the sequence file appears to be a FastQ file
+ if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
+ die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
+ }
+ my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
+
+ if ($read_number == 1){
+ if ($bowtie2){
+ $identifier =~ s/$/\/1\/1/;
+ }
+ else{
+ $identifier =~ s/$/\/1/;
+ }
+ }
+ elsif ($read_number == 2){
+ if ($bowtie2){
+ $identifier =~ s/$/\/2\/2/;
+ }
+ else{
+ $identifier =~ s/$/\/2/;
+ }
+ }
+ else{
+ die "Read number needs to be 1 or 2\n";
+ }
+
+ $sequence_C_to_T =~ tr/C/T/;
+ $sequence_G_to_A =~ tr/G/A/;
+
+ if ($directional){
+ if ($read_number == 1){
+ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
+ }
+ else{
+ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
+ }
+ }
+ else{
+ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
+ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
+ }
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ print "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ else{
+ print "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ }
+ else{
+ print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ if ($directional){
+ if ($read_number == 1){
+ return ($C_to_T_infile);
+ }
+ else{
+ return ($G_to_A_infile);
+ }
+ }
+ else{
+ return ($C_to_T_infile,$G_to_A_infile);
+ }
+}
+
+sub fix_IDs{
+ my $id = shift;
+ $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores
+ return $id;
+}
+
+sub ensure_sensical_alignment_orientation_single_end{
+ my $index = shift; # index number if the sequence produced an alignment
+ my $strand = shift;
+ ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one
+ my $orientation = 0;
+ ##############################################################################################################
+ ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T)
+ ## here we only want reads in the forward (+) orientation
+ if ($fhs[$index]->{name} eq 'CTreadCTgenome') {
+ ### if the alignment is (+) we count it, and return 1 for a correct orientation
+ if ($strand eq '+') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (-) the alignment is nonsensical
+ elsif ($strand eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ }
+ ###############################################################################################################
+ ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A)
+ ## here we only want reads in the forward (-) orientation
+ elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') {
+ ### if the alignment is (-) we count it and return 1 for a correct orientation
+ if ($strand eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (+) the alignment is nonsensical
+ elsif ($strand eq '+') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ }
+ ###############################################################################################################
+ ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T)
+ ## here we only want reads in the forward (-) orientation
+ elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') {
+ ### if the alignment is (-) we count it and return 1 for a correct orientation
+ if ($strand eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (+) the alignment is nonsensical
+ elsif ($strand eq '+') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ }
+ ###############################################################################################################
+ ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A)
+ ## here we only want reads in the forward (+) orientation
+ elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') {
+ ### if the alignment is (+) we count it and return 1 for a correct orientation
+ if ($strand eq '+') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (-) the alignment is nonsensical
+ elsif ($strand eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ } else{
+ die "One of the above conditions must be true\n";
+ }
+}
+
+sub ensure_sensical_alignment_orientation_paired_ends{
+ my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment
+ ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one
+ my $orientation = 0;
+ ##############################################################################################################
+ ## [Index 0, sequence originated from (converted) forward strand]
+ ## CT converted read 1
+ ## GA converted read 2
+ ## CT converted genome
+ ## here we only want read 1 in (+) orientation and read 2 in (-) orientation
+ if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') {
+ ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ ###############################################################################################################
+ ## [Index 1, sequence originated from (converted) reverse strand]
+ ## GA converted read 1
+ ## CT converted read 2
+ ## GA converted genome
+ ## here we only want read 1 in (+) orientation and read 2 in (-) orientation
+ elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') {
+ ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ ###############################################################################################################
+ ## [Index 2, sequence originated from complementary to (converted) forward strand]
+ ## GA converted read 1
+ ## CT converted read 2
+ ## CT converted genome
+ ## here we only want read 1 in (-) orientation and read 2 in (+) orientation
+ elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') {
+ ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ ###############################################################################################################
+ ## [Index 3, sequence originated from complementary to (converted) reverse strand]
+ ## CT converted read 1
+ ## GA converted read 2
+ ## GA converted genome
+ ## here we only want read 1 in (+) orientation and read 2 in (-) orientation
+ elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') {
+ ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ else{
+ die "One of the above conditions must be true\n";
+ }
+}
+
+#####################################################################################################################################################
+
+### Bowtie 1 (default) | PAIRED-END | FASTA
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastA {
+
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+
+ if ($directional){
+ print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
+ open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+
+ my $line_1 = $fh->{fh}->getline();
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | PAIRED-END | FASTA
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ if ($directional){
+ print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+
+ warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else{
+ last; # no alignment output
+ }
+ }
+
+ my $line_1 = $_;
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+### Bowtie 1 (default) | PAIRED-END | FASTQ
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ if ($directional){
+ print "Input files are $C_to_T_infile_1 $G_to_A_infile_2 (FastQ)\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options))\n";
+ open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+
+ my $line_1 = $fh->{fh}->getline();
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | PAIRED-END | FASTQ
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ if ($directional){
+ print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
+ }
+
+ ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+
+ warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else{
+ last; # no alignment output
+ }
+ }
+
+ my $line_1 = $_;
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+#####################################################################################################################################################
+
+### Bowtie 1 (default) | SINGLE-END | FASTA
+sub single_end_align_fragments_to_bisulfite_genome_fastA {
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ print "Input file is $C_to_T_infile (FastA)\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
+ open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
+
+ # if Bowtie produces an alignment we store the first line of the output
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefined
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | SINGLE-END | FASTA
+sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ print "Input file is $C_to_T_infile (FastA)\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+
+ warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
+
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else{
+ last; # no alignment output
+ }
+ }
+
+ # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+
+### Bowtie 1 (default) | SINGLE-END | FASTQ
+sub single_end_align_fragments_to_bisulfite_genome_fastQ {
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ print "Input file is $C_to_T_infile (FastQ)\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## the data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
+ open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
+
+ # if Bowtie produces an alignment we store the first line of the output
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefined
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | SINGLE-END | FASTQ
+sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ print "Input file is $C_to_T_infile (FastQ)\n\n";
+ }
+ else{
+ print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## the data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+ warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
+ warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";
+
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else {
+ last;
+ }
+ }
+
+ # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
+ else {
+ print "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+###########################################################################################################################################
+
+sub reset_counters_and_fhs{
+ my $filename = shift;
+ %counting=(
+ total_meCHH_count => 0,
+ total_meCHG_count => 0,
+ total_meCpG_count => 0,
+ total_unmethylated_CHH_count => 0,
+ total_unmethylated_CHG_count => 0,
+ total_unmethylated_CpG_count => 0,
+ sequences_count => 0,
+ no_single_alignment_found => 0,
+ unsuitable_sequence_count => 0,
+ genomic_sequence_could_not_be_extracted_count => 0,
+ unique_best_alignment_count => 0,
+ low_complexity_alignments_overruled_count => 0,
+ CT_CT_count => 0, #(CT read/CT genome, original top strand)
+ CT_GA_count => 0, #(CT read/GA genome, original bottom strand)
+ GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand)
+ GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand)
+ CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand)
+ GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand)
+ GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand)
+ CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand)
+ alignments_rejected_count => 0, # only relevant if --directional was specified
+ );
+
+ if ($directional){
+ if ($filename =~ ','){ # paired-end files
+ @fhs=(
+ { name => 'CTreadCTgenome',
+ strand_identity => 'con ori forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'CTreadGAgenome',
+ strand_identity => 'con ori reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadCTgenome',
+ strand_identity => 'compl ori con forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadGAgenome',
+ strand_identity => 'compl ori con reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+ else{ # single-end files
+ @fhs=(
+ { name => 'CTreadCTgenome',
+ strand_identity => 'con ori forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'CTreadGAgenome',
+ strand_identity => 'con ori reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+ }
+ else{
+ @fhs=(
+ { name => 'CTreadCTgenome',
+ strand_identity => 'con ori forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'CTreadGAgenome',
+ strand_identity => 'con ori reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadCTgenome',
+ strand_identity => 'compl ori con forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadGAgenome',
+ strand_identity => 'compl ori con reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+}
+
+
+sub process_command_line{
+ my @bowtie_options;
+ my $help;
+ my $mates1;
+ my $mates2;
+ my $path_to_bowtie;
+ my $fastq;
+ my $fasta;
+ my $skip;
+ my $qupto;
+ my $phred64;
+ my $phred33;
+ my $solexa;
+ my $mismatches;
+ my $seed_length;
+ my $best;
+ my $sequence_format;
+ my $version;
+ my $quiet;
+ my $chunk;
+ my $non_directional;
+ my $ceiling;
+ my $maxins;
+ my $minins;
+ my $unmapped;
+ my $multi_map;
+ my $output_dir;
+ my $bowtie2;
+ my $vanilla;
+ my $sam_no_hd;
+ my $seed_extension_fails;
+ my $reseed_repetitive_seeds;
+ my $most_valid_alignments;
+ my $score_min;
+ my $parallel;
+ my $temp_dir;
+
+ my $command_line = GetOptions ('help|man' => \$help,
+ '1=s' => \$mates1,
+ '2=s' => \$mates2,
+ 'path_to_bowtie=s' => \$path_to_bowtie,
+ 'f|fasta' => \$fasta,
+ 'q|fastq' => \$fastq,
+ 's|skip=i' => \$skip,
+ 'u|upto=i' => \$qupto,
+ 'phred33-quals' => \$phred33,
+ 'phred64-quals|solexa1' => \$phred64,
+ 'solexa-quals' => \$solexa,
+ 'n|seedmms=i' => \$mismatches,
+ 'l|seedlen=i' => \$seed_length,
+ 'no_best' => \$best,
+ 'version' => \$version,
+ 'quiet' => \$quiet,
+ 'chunkmbs=i' => \$chunk,
+ 'non_directional' => \$non_directional,
+ 'I|minins=i' => \$minins,
+ 'X|maxins=i' => \$maxins,
+ 'e|maqerr=i' => \$ceiling,
+ 'un|unmapped' => \$unmapped,
+ 'ambiguous' => \$multi_map,
+ 'o|output_dir=s' => \$output_dir,
+ 'bowtie2' => \$bowtie2,
+ 'vanilla' => \$vanilla,
+ 'sam-no-hd' => \$sam_no_hd,
+ 'D=i' => \$seed_extension_fails,
+ 'R=i' => \$reseed_repetitive_seeds,
+ 'score_min=s' => \$score_min,
+ 'most_valid_alignments=i' => \$most_valid_alignments,
+ 'p=i' => \$parallel,
+ 'temp_dir=s' => \$temp_dir,
+ );
+
+
+ ### EXIT ON ERROR if there were errors with any of the supplied options
+ unless ($command_line){
+ die "Please respecify command line options\n";
+ }
+ ### HELPFILE
+ if ($help){
+ print_helpfile();
+ exit;
+ }
+ if ($version){
+ print << "VERSION";
+
+
+ Bismark - Bisulfite Mapper and Methylation Caller.
+
+ Bismark Version: $bismark_version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
+ www.bioinformatics.babraham.ac.uk/projects/
+
+
+VERSION
+ exit;
+ }
+
+
+ ##########################
+ ### PROCESSING OPTIONS ###
+ ##########################
+
+ unless ($bowtie2){
+ $bowtie2 = 0;
+ }
+ unless ($sam_no_hd){
+ $sam_no_hd =0;
+ }
+
+ ### PATH TO BOWTIE
+ ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
+ if ($path_to_bowtie){
+ unless ($path_to_bowtie =~ /\/$/){
+ $path_to_bowtie =~ s/$/\//;
+ }
+ if (-d $path_to_bowtie){
+ if ($bowtie2){
+ $path_to_bowtie = "${path_to_bowtie}bowtie2";
+ }
+ else{
+ $path_to_bowtie = "${path_to_bowtie}bowtie";
+ }
+ }
+ else{
+ die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
+ }
+ }
+ else{
+ if ($bowtie2){
+ $path_to_bowtie = 'bowtie2';
+ warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
+ else{
+ $path_to_bowtie = 'bowtie';
+ warn "Path to Bowtie specified as: $path_to_bowtie\n";
+ }
+ }
+
+ ####################################
+ ### PROCESSING ARGUMENTS
+
+ ### GENOME FOLDER
+ my $genome_folder = shift @ARGV; # mandatory
+ unless ($genome_folder){
+ warn "Genome folder was not specified!\n";
+ print_helpfile();
+ exit;
+ }
+
+ ### checking that the genome folder, all subfolders and the required bowtie index files exist
+ unless ($genome_folder =~/\/$/){
+ $genome_folder =~ s/$/\//;
+ }
+
+ if (chdir $genome_folder){
+ my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
+ unless ($absolute_genome_folder =~/\/$/){
+ $absolute_genome_folder =~ s/$/\//;
+ }
+ warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
+ $genome_folder = $absolute_genome_folder;
+ }
+ else{
+ die "Failed to move to $genome_folder: $!\nUSAGE: Bismark.pl [options] {-1 -2 | } [] (--help for more details)\n";
+ }
+
+ my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
+ my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
+
+ if ($bowtie2){ ### Bowtie 2 (new)
+ ### checking the integrity of $CT_dir
+ chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
+ my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
+ foreach my $file(@CT_bowtie_index){
+ unless (-f $file){
+ die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n";
+ }
+ }
+ ### checking the integrity of $GA_dir
+ chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
+ my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
+ foreach my $file(@GA_bowtie_index){
+ unless (-f $file){
+ die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
+ }
+ }
+ }
+
+ else{ ### Bowtie 1 (default)
+ ### checking the integrity of $CT_dir
+ chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
+ my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
+ foreach my $file(@CT_bowtie_index){
+ unless (-f $file){
+ die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
+ }
+ }
+ ### checking the integrity of $GA_dir
+ chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
+ my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
+ foreach my $file(@GA_bowtie_index){
+ unless (-f $file){
+ die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
+ }
+ }
+ }
+
+ my $CT_index_basename = "${CT_dir}BS_CT";
+ my $GA_index_basename = "${GA_dir}BS_GA";
+
+ ### INPUT OPTIONS
+
+ ### SEQUENCE FILE FORMAT
+ ### exits if both fastA and FastQ were specified
+ if ($fasta and $fastq){
+ die "Only one sequence filetype can be specified (fastA or fastQ)\n";
+ }
+
+ ### unless fastA is specified explicitely, fastQ sequence format is expected by default
+ if ($fasta){
+ print "FastA format specified\n";
+ $sequence_format = 'FASTA';
+ push @bowtie_options, '-f';
+ }
+ elsif ($fastq){
+ print "FastQ format specified\n";
+ $sequence_format = 'FASTQ';
+ push @bowtie_options, '-q';
+ }
+ else{
+ $fastq = 1;
+ print "FastQ format assumed (by default)\n";
+ $sequence_format = 'FASTQ';
+ push @bowtie_options, '-q';
+ }
+
+ ### SKIP
+ if ($skip){
+ warn "Skipping the first $skip reads from the input file\n";
+ # push @bowtie_options,"-s $skip";
+ }
+
+ ### UPTO
+ if ($qupto){
+ warn "Processing sequences up to read no. $qupto from the input file\n";
+ if ($bowtie2){
+ # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
+ }
+ else{
+ # push @bowtie_options,"--qupto $qupto";
+ }
+ }
+
+ ### QUALITY VALUES
+ if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
+ die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
+ }
+ if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
+ # Phred quality values work only when -q is specified
+ unless ($fastq){
+ die "Phred quality values works only when -q (FASTQ) is specified\n";
+ }
+ if ($bowtie2){
+ push @bowtie_options,"--phred33";
+ }
+ else{
+ push @bowtie_options,"--phred33-quals";
+ }
+ }
+ if ($phred64){
+ # Phred quality values work only when -q is specified
+ unless ($fastq){
+ die "Phred quality values work only when -q (FASTQ) is specified\n";
+ }
+ if ($bowtie2){
+ push @bowtie_options,"--phred64";
+ }
+ else{
+ push @bowtie_options,"--phred64-quals";
+ }
+ }
+ else{
+ $phred64 = 0;
+ }
+
+ if ($solexa){
+ if ($bowtie2){
+ die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
+ }
+ # Solexa to Phred value conversion works only when -q is specified
+ unless ($fastq){
+ die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
+ }
+ push @bowtie_options,"--solexa-quals";
+ }
+ else{
+ $solexa = 0;
+ }
+
+ ### ALIGNMENT OPTIONS
+
+ ### MISMATCHES
+ if (defined $mismatches){
+ if ($bowtie2){
+ if ($mismatches == 0 or $mismatches == 1){
+ push @bowtie_options,"-N $mismatches";
+ }
+ else{
+ die "Please set the number of multiseed mismatches for Bowtie 2 with '-N ' (where can be 0 or 1)\n";
+ }
+ }
+ else{
+ if ($mismatches >= 0 and $mismatches <= 3){
+ push @bowtie_options,"-n $mismatches";
+ }
+ else{
+ die "Please set the number of seed mismatches for Bowtie 1 with '-n ' (where can be 0,1,2 or 3)\n";
+ }
+ }
+ }
+ else{
+ unless ($bowtie2){
+ push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
+ }
+ }
+
+ ### SEED LENGTH
+ if (defined $seed_length){
+ if ($bowtie2){
+ push @bowtie_options,"-L $seed_length";
+ }
+ else{
+ push @bowtie_options,"-l $seed_length";
+ }
+ }
+
+ ### MISMATCH CEILING
+ if (defined $ceiling){
+ die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
+ push @bowtie_options,"-e $ceiling";
+ }
+
+
+ ### BOWTIE 2 EFFORT OPTIONS
+
+ ### CONSECUTIVE SEED EXTENSION FAILS
+ if (defined $seed_extension_fails){
+ die "The option '-D ' is only available when using Bowtie 2\n\n" unless ($bowtie2);
+ push @bowtie_options,"-D $seed_extension_fails";
+ }
+
+ ### RE-SEEDING REPETITIVE SEEDS
+ if (defined $reseed_repetitive_seeds){
+ die "The option '-R ' is only available when using Bowtie 2\n\n" unless ($bowtie2);
+ push @bowtie_options,"-R $reseed_repetitive_seeds";
+ }
+
+
+ ### BOWTIE 2 SCORING OPTIONS
+ if ($score_min){
+ die "The option '--score_min ' is only available when using Bowtie 2\n\n" unless ($bowtie2);
+ unless ($score_min =~ /^L,.+,.+$/){
+ die "The option '--score_min ' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
+ }
+ push @bowtie_options,"--score-min $score_min";
+ }
+ else{
+ if ($bowtie2){
+ push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
+ }
+ }
+
+ ### BOWTIE 2 PARALLELIZATION OPTIONS
+ if (defined $parallel){
+ die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
+ }
+ if ($bowtie2){
+ if ($parallel){
+ die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
+ push @bowtie_options,"-p $parallel";
+ push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
+ print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
+ sleep (2);
+ }
+ }
+
+ ### REPORTING OPTIONS
+
+ if ($bowtie2){
+ push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
+
+ ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
+ if(defined $most_valid_alignments){
+
+ warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
+ # push @bowtie_options,"-M $most_valid_alignments";sleep (5);
+ }
+ # else{
+ # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence
+ # }
+ }
+ else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
+ push @bowtie_options,'-k 2';
+ }
+
+ ### --BEST
+ if ($bowtie2){
+ if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
+ die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
+ }
+ }
+ else{
+ # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
+ unless ($best){
+ push @bowtie_options,'--best';
+ }
+ }
+
+ ### VANILLA BISMARK (BOWTIE 1) OUTPUT
+ if ($vanilla){
+ if ($bowtie2){
+ die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
+ }
+ }
+ else{
+ $vanilla = 0;
+ }
+
+ ### PAIRED-END MAPPING
+ if ($mates1){
+ my @mates1 = (split (/,/,$mates1));
+ die "Paired-end mapping requires the format: -1 -2 , please respecify!\n" unless ($mates2);
+ my @mates2 = (split(/,/,$mates2));
+ unless (scalar @mates1 == scalar @mates2){
+ die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 -2 )\n";
+ }
+ while (1){
+ my $mate1 = shift @mates1;
+ my $mate2 = shift @mates2;
+ last unless ($mate1 and $mate2);
+ push @filenames,"$mate1,$mate2";
+ }
+ if ($bowtie2){
+ push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
+ push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
+ }
+ }
+ elsif ($mates2){
+ die "Paired-end mapping requires the format: -1 -2 , please respecify!\n";
+ }
+
+ ### SINGLE-END MAPPING
+ # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
+ my $singles;
+ unless ($mates1 and $mates2){
+ $singles = join (',',@ARGV);
+ unless ($singles){
+ die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
+ }
+ $singles =~ s/\s/,/g;
+ @filenames = (split(/,/,$singles));
+ warn "\nFiles to be analysed:\n";
+ warn "@filenames\n\n";
+ sleep (3);
+ }
+
+ ### MININUM INSERT SIZE (PAIRED-END ONLY)
+ if (defined $minins){
+ die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
+ push @bowtie_options,"--minins $minins";
+ }
+
+ ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
+ if (defined $maxins){
+ die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
+ push @bowtie_options,"--maxins $maxins";
+ }
+ else{
+ unless ($singles){
+ push @bowtie_options,'--maxins 500';
+ }
+ }
+
+ ### QUIET prints nothing besides alignments (suppresses warnings)
+ if ($quiet){
+ push @bowtie_options,'--quiet';
+ }
+
+ ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
+ unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
+ if (defined $chunk){
+ push @bowtie_options,"--chunkmbs $chunk";
+ }
+ else{
+ push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
+ }
+ }
+
+
+ ### SUMMARY OF ALL BOWTIE OPTIONS
+ my $bowtie_options = join (' ',@bowtie_options);
+
+
+ ### STRAND-SPECIFIC LIBRARIES
+ my $directional;
+ if ($non_directional){
+ print "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported.\n";
+ sleep (3);
+ $directional = 0;
+ }
+ else{
+ print "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!).\n";
+ sleep (3);
+ $directional = 1; # Changed this to being the default behaviour
+ }
+
+ ### UNMAPPED SEQUENCE OUTPUT
+ $unmapped = 0 unless ($unmapped);
+
+ ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
+ $multi_map = 0 unless ($multi_map);
+
+
+ ### OUTPUT DIRECTORY
+
+ chdir $parent_dir or die "Failed to move back to current working directory\n";
+ if ($output_dir){
+ unless ($output_dir =~ /\/$/){
+ $output_dir =~ s/$/\//;
+ }
+
+ if (chdir $output_dir){
+ $output_dir = getcwd; # making the path absolute
+ unless ($output_dir =~ /\/$/){
+ $output_dir =~ s/$/\//;
+ }
+ }
+ else{
+ mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
+ warn "Created output directory $output_dir!\n\n";
+ chdir $output_dir or die "Failed to move to $output_dir\n";
+ $output_dir = getcwd; # making the path absolute
+ unless ($output_dir =~ /\/$/){
+ $output_dir =~ s/$/\//;
+ }
+ }
+ warn "Output will be written into the directory: $output_dir\n";
+ }
+ else{
+ $output_dir = '';
+ }
+
+ ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
+
+ chdir $parent_dir or die "Failed to move back to current working directory\n";
+ if ($temp_dir){
+ warn "\nUsing temp directory: $temp_dir\n";
+ unless ($temp_dir =~ /\/$/){
+ $temp_dir =~ s/$/\//;
+ }
+
+ if (chdir $temp_dir){
+ $temp_dir = getcwd; # making the path absolute
+ unless ($temp_dir =~ /\/$/){
+ $temp_dir =~ s/$/\//;
+ }
+ }
+ else{
+ mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
+ warn "Created temporary directory $temp_dir!\n\n";
+ chdir $temp_dir or die "Failed to move to $temp_dir\n";
+ $temp_dir = getcwd; # making the path absolute
+ unless ($temp_dir =~ /\/$/){
+ $temp_dir =~ s/$/\//;
+ }
+ }
+ warn "Temporary files will be written into the directory: $temp_dir\n";
+ }
+ else{
+ $temp_dir = '';
+ }
+
+
+ return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir);
+}
+
+
+
+sub generate_SAM_header{
+ print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order
+ foreach my $chr (keys %chromosomes){
+ my $length = length ($chromosomes{$chr});
+ print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length
+ }
+ print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, PN = program name name, VN = program version
+}
+
+### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
+### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
+
+sub single_end_SAM_output{
+ my ($id,$actual_seq,$methylation_call_params,$qual) = @_;
+ my $strand = $methylation_call_params->{$id}->{alignment_strand};
+ my $chr = $methylation_call_params->{$id}->{chromosome};
+ my $start = $methylation_call_params->{$id}->{position};
+ my $stop = $methylation_call_params->{$id}->{end_position};
+ my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
+ my $methcall = $methylation_call_params->{$id}->{methylation_call};
+ my $read_conversion = $methylation_call_params->{$id}->{read_conversion};
+ my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
+ my $number_of_mismatches = $methylation_call_params->{$id}->{number_of_mismatches};
+ ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
+ ## FLAG: bitwise FLAG. Each bit is explained in the following table:
+ ## Bit Description Comment Value
+ ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1)
+ ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2)
+ ## 0x4 segment unmapped --- ---
+ ## 0x8 next segment in the template unmapped --- ---
+ ## 0x10 SEQ being reverse complemented value: 2**4 ( 16)
+ ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32)
+ ## 0x40 the first segment in the template read 1 value: 2**6 ( 64)
+ ## 0x80 the last segment in the template read 2 value: 2**7 (128)
+ ## 0x100 secondary alignment --- ---
+ ## 0x200 not passing quality controls --- ---
+ ## 0x400 PCR or optical duplicate --- ---
+
+ #####
+
+ my $flag; # FLAG variable used for SAM format.
+ if ($strand eq "+"){
+ if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){
+ $flag = 0; # 0 for "+" strand (OT)
+ }
+ elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){
+ $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand)
+ }
+ else{
+ die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
+ }
+ }
+ elsif ($strand eq "-"){
+ if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){
+ $flag = 16; # 16 for "-" strand (OB)
+ }
+ elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){
+ $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand)
+ }
+ else{
+ die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
+ }
+ }
+ else{
+ die "Unexpected strand information: $strand\n\n";
+ }
+
+ #####
+
+ my $mapq = 255; # Assume mapping quality is unavailable
+
+ #####
+
+ my $cigar;
+ if ($bowtie2){
+ $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2
+ }
+ else{
+ $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches)
+ }
+
+ #####
+
+ my $rnext = "*"; # Paired-end variable
+
+ #####
+
+ my $pnext = 0; # Paired-end variable
+
+ #####
+
+ my $tlen = 0; # Paired-end variable
+
+ #####
+
+ if ($read_conversion eq 'CT'){
+ $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
+ }
+ else{
+ $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
+ }
+
+ if ($strand eq '-'){
+ $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand
+ $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence
+ $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well
+ }
+
+ #####
+
+ my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string
+ # into the reference string. hemming_dist()
+ if ($bowtie2){
+ $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
+ }
+
+ my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences
+
+ #####
+
+ my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!)
+
+ #####
+
+ my $XM_tag; # Optional tag XM: Methylation Call String
+ if ($strand eq '+'){
+ $XM_tag = "XM:Z:$methcall";
+ }
+ elsif ($strand eq '-'){
+ $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
+ }
+
+ #####
+
+ my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion
+
+ #####
+
+ my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion
+
+ #####
+
+ # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
+ print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n";
+}
+
+
+sub paired_end_SAM_output{
+ my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
+ my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
+ my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
+ my $chr = $methylation_call_params->{$id}->{chromosome};
+ my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
+ my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
+ my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
+ my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
+ my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
+ my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
+ my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
+ my $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
+ my $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
+
+ my $id_1 = $id.'/1';
+ my $id_2 = $id.'/2';
+
+ # Allows all degenerate nucleotide sequences in reference genome
+ die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
+ die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
+
+ my $index; # used to store the srand origin of the alignment in a less convoluted way
+
+ if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
+ $index = 0; ## this is OT (original top strand)
+ }
+ elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
+ $index = 1; ## this is CTOB (complementary to OB)
+ }
+ elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
+ $index = 2; ## this is CTOT (complementary to OT)
+ }
+ elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
+ $index = 3; ## this is OB (original bottom)
+ }
+ else {
+ die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
+ }
+
+ ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
+ ### first or last position.
+
+ if ($index == 0 or $index == 3){ # OT or OB
+ $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
+ $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
+ }
+ else{ # CTOT or CTOB
+ $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
+ $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
+ }
+
+ #####
+
+ my $start_read_1;
+ my $start_read_2;
+ # adjusting end positions
+
+ if ($bowtie2){
+ $start_read_1 = $methylation_call_params->{$id}->{position_1};
+ $start_read_2 = $methylation_call_params->{$id}->{position_2};
+ }
+ else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
+ if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
+ $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
+ $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
+ }
+ else{ # read 1 is on the - strand
+ $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
+ $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
+ }
+ }
+
+ #####
+
+ my $end_read_1;
+ my $end_read_2;
+ # adjusting end positions
+
+ if ($bowtie2){
+ $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
+ $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
+ }
+ else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
+ if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
+ $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
+ $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
+ }
+ else{
+ $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
+ $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
+ }
+ }
+
+ #####
+
+ ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
+ ## FLAG: bitwise FLAG. Each bit is explained in the following table:
+ ## Bit Description Comment Value
+ ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
+ ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
+ ## 0x4 segment unmapped --- ---
+ ## 0x8 next segment in the template unmapped --- ---
+ ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
+ ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
+ ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
+ ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
+ ## 0x100 secondary alignment --- ---
+ ## 0x200 not passing quality controls --- ---
+ ## 0x400 PCR or optical duplicate --- ---
+
+ ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
+
+ # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
+ # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
+
+ my $flag_1; # FLAG variable used for SAM format
+ my $flag_2;
+
+ if ($index == 0){ # OT
+ $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
+ $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
+ }
+ elsif ($index == 1){ # CTOB
+ $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
+ $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
+ }
+ elsif ($index == 2){ # CTOT
+ $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
+ $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
+ }
+ elsif ($index == 3){ # OB
+ $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
+ $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
+ }
+
+ #####
+
+ my $mapq = 255; # Mapping quality is unavailable
+
+ #####
+
+ my $cigar_1;
+ my $cigar_2;
+
+ if ($bowtie2){
+ $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
+ $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
+ }
+ else{
+ $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
+ $cigar_2 = length($actual_seq_2) . "M";
+ }
+
+ #####
+
+ my $rnext = '='; # Chromosome of mate; applies to both reads
+
+ #####
+
+ my $pnext_1 = $start_read_2; # Leftmost position of mate
+ my $pnext_2 = $start_read_1;
+
+ #####
+
+ my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
+ my $tlen_2;
+
+ if ($bowtie2){
+
+ if ($start_read_1 <= $start_read_2){
+
+ # Read 1 alignment is leftmost
+
+ if ($end_read_2 >= $end_read_1){
+
+ # -------------------------> read 1 reads overlapping
+ # <------------------------- read 2
+ #
+ # or
+ #
+ # -------------------------> read 1
+ # <----------------------- read 2 read 2 contained within read 1
+ #
+ # or
+ #
+ # -------------------------> read 1 reads 1 and 2 exactly overlapping
+ # <------------------------- read 2
+ #
+
+ # dovetailing of reads is not enabled for Bowtie 2 alignments
+
+ $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
+ $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
+ }
+ elsif ($end_read_2 < $end_read_1){
+
+ # -------------------------> read 1
+ # <----------- read 2 read 2 contained within read 1
+ #
+ # or
+ #
+ # -------------------------> read 1
+ # <----------- read 2 read 2 contained within read 1
+
+ # start and end of read 2 are fully contained within read 1
+ $tlen_1 = 0; # Set as 0 when the information is unavailable
+ $tlen_2 = 0; # Set as 0 when the information is unavailable
+ }
+
+ }
+
+ elsif ($start_read_2 < $start_read_1){
+
+ if ($end_read_1 >= $end_read_2){
+
+ # Read 2 alignment is leftmost
+
+ # -------------------------> read 2 reads overlapping
+ # <------------------------- read 1
+ #
+ # or
+ #
+ # -------------------------> read 2
+ # <----------------------- read 1 read 1 contained within read 2
+ #
+ #
+
+ $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
+ $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
+ }
+ elsif ($end_read_1 < $end_read_2){
+
+ # -------------------------> read 2
+ # <----------- read 1 read 1 contained within read 2
+ #
+ # or
+ #
+ # -------------------------> read 2
+ # <----------- read 1 read 1 contained within read 2
+
+ # start and end of read 1 are fully contained within read 2
+ $tlen_1 = 0; # Set as 0 when the information is unavailable
+ $tlen_2 = 0; # Set as 0 when the information is unavailable
+ }
+ }
+ }
+
+ else{ # Bowtie 1
+
+ if ($end_read_2 >= $end_read_1){
+ # Read 1 alignment is leftmost
+ # -------------------------> read 1
+ # <------------------------- read 2
+ # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
+
+ $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
+ $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
+ }
+ else{
+ # Read 2 alignment is leftmost
+ # -------------------------> read 2
+ # <------------------------- read 1
+ # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
+
+ $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
+ $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
+ }
+ }
+
+ #####
+
+ # adjusting the strand of the sequence before we use them to generate mismatch strings
+ if ($strand_1 eq '-'){
+ $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
+ $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
+ $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
+ }
+ if ($strand_2 eq '-'){
+ $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
+ $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
+ $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
+ }
+
+ # print "$actual_seq_1\n$ref_seq_1\n\n";
+ # print "$actual_seq_2\n$ref_seq_2\n\n";
+
+ #####
+
+ my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
+ my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
+ if ($bowtie2){
+ $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
+ $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
+ }
+ my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
+ my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
+
+ #####
+
+ my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
+ my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
+
+ #####
+
+ my $XM_tag_1; # Optional tag XM: Methylation call string
+ my $XM_tag_2;
+
+ if ($strand_1 eq '-'){
+ $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
+ }
+ else{
+ $XM_tag_1 = "XM:Z:$methcall_1";
+ }
+
+ if ($strand_2 eq '-'){
+ $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
+ }
+ else{
+ $XM_tag_2 = "XM:Z:$methcall_2";
+ }
+
+ #####
+
+ my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
+ my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
+
+ #####
+
+ my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
+
+ #####
+
+ # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
+ print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
+ print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
+}
+
+sub revcomp{
+ my $seq = shift or die "Missing seq to reverse complement\n";
+ $seq = reverse $seq;
+ $seq =~ tr/ACTGactg/TGACTGAC/;
+ return $seq;
+}
+
+sub hemming_dist{
+ my $matches = 0;
+ my @actual_seq = split //,(shift @_);
+ my @ref_seq = split //,(shift @_);
+ foreach (0..$#actual_seq){
+ ++$matches if ($actual_seq[$_] eq $ref_seq[$_]);
+ }
+ return my $hd = scalar @actual_seq - $matches;
+}
+
+sub make_mismatch_string{
+ my $actual_seq = shift or die "Missing actual sequence";
+ my $ref_seq = shift or die "Missing reference sequence";
+ my $XX_tag = "XX:Z:";
+ my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison
+ my $prev_mm_pos = 0;
+ while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference
+ my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch
+ my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read
+ $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other
+ $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation)
+ $prev_mm_pos = pos($tmp); # Position of last mismatch
+ }
+ my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence
+ $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence
+ return $XX_tag;
+}
+
+
+
+sub print_helpfile{
+ print << "HOW_TO";
+
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+
+
+
+DESCRIPTION
+
+
+The following is a brief description of command line options and arguments to control the Bismark
+bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
+reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
+version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
+Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
+(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
+forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
+are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
+sequence from the genome and determine if there were any protected C's present or not.
+
+As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
+re-enabled by using --non_directional.
+
+The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old
+'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
+alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
+
+
+USAGE: bismark [options] {-1 -2 | }
+
+
+ARGUMENTS:
+
+ The path to the folder containing the unmodified reference genome
+ as well as the subfolders created by the Bismark_Genome_Preparation
+ script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
+ Bismark expects one or more fastA files in this folder (file extension: .fa
+ or .fasta). The path can be relative or absolute.
+
+-1 Comma-separated list of files containing the #1 mates (filename usually includes
+ "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
+ correspond file-for-file and read-for-read with those specified in .
+ Reads may be a mix of different lengths. Bismark will produce one mapping result
+ and one report file per paired-end input file pair.
+
+-2 Comma-separated list of files containing the #2 mates (filename usually includes
+ "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
+ correspond file-for-file and read-for-read with those specified in .
+ Reads may be a mix of different lengths.
+
+ A comma- or space-separated list of files containing the reads to be aligned (e.g.
+ lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
+ produce one mapping result and one report file per input file.
+
+
+OPTIONS:
+
+
+Input:
+
+-q/--fastq The query input files (specified as , or are FASTQ
+ files (usually having extension .fg or .fastq). This is the default. See also
+ --solexa-quals.
+
+-f/--fasta The query input files (specified as , or are FASTA
+ files (usually havin extension .fa, .mfa, .fna or similar). All quality values
+ are assumed to be 40 on the Phred scale.
+
+-s/--skip Skip (i.e. do not align) the first reads or read pairs from the input.
+
+-u/--upto Only aligns the first reads or read pairs from the input. Default: no limit.
+
+--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
+
+--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
+
+--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
+ (which can't). The formula for conversion is:
+ phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
+ is usually the right option for use with (unconverted) reads emitted by the GA
+ Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
+
+--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
+ reads emitted by GA Pipeline version 1.3 or later. Default: off.
+
+--path_to_bowtie The full path to the Bowtie (1 or 2) installation on your system. If not
+ specified it is assumed that Bowtie (1 or 2) is in the PATH.
+
+
+Alignment:
+
+-n/--seedmms The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
+ of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
+ default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
+
+-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
+ which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
+ larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
+
+-e/--maqerr Maximum permitted total of quality values at all mismatched read positions throughout
+ the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
+ quality values to the nearest 10 and saturates at 30. This value is not relevant for
+ Bowtie 2.
+
+--chunkmbs The number of megabytes of memory a given thread is given to store path descriptors in
+ --best mode. Best-first search must keep track of many paths at once to ensure it is
+ always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
+ memory impact of the descriptors, but they can still grow very large in some cases. If
+ you receive an error message saying that chunk memory has been exhausted in --best mode,
+ try adjusting this parameter up to dedicate more memory to the descriptors. This value
+ is not relevant for Bowtie 2. Default: 512.
+
+-I/--minins The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
+ a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
+ with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
+ satisfied). A 19-bp gap would not be valid in that case. Default: 0.
+
+-X/--maxins The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
+ a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
+ 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
+ A 61-bp gap would not be valid in that case. Default: 500.
+
+
+Bowtie 1 Reporting:
+
+-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
+ will be used by default.
+
+--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
+ (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in
+ terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
+ quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
+ have Phred quality 10. When --best is not specified, Bowtie may report alignments that
+ are sub-optimal in terms of stratum and/or quality (though an effort is made to report
+ the best alignment). --best mode also removes all strand bias. Note that --best does not
+ affect which alignments are considered "valid" by Bowtie, only which valid alignments
+ are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
+ Default: on.
+
+--no_best Disables the --best option which is on by default. This can speed up the alignment process,
+ e.g. for testing purposes, but for credible results it is not recommended to disable --best.
+
+
+Output:
+
+--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
+ bisulfite strands will be reported. Default: OFF.
+
+ (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
+ to the original strands are merely theoretical and should not exist in reality. Specifying directional
+ alignments (which is the default) will only run 2 alignment threads to the original top (OT)
+ or bottom (OB) strands in parallel and report these alignments. This is the recommended option
+ for sprand-specific libraries).
+
+--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
+ split up into several smaller files to run concurrently and the output files are to be merged.
+
+--quiet Print nothing besides alignments.
+
+--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
+ of SAM format output.
+
+-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
+ appear as they did in the input, without any translation of quality values that may have
+ taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
+ and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads
+ with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
+ are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
+
+--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
+ mismatches or other reads that fail to align uniquely to a file in the output directory.
+ Written reads will appear as they did in the input, without any of the translation of quality
+ values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
+ parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and
+ _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
+
+-o/--output_dir Write all output files into this directory. By default the output files will be written into
+ the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
+ to create it first. The path to the output folder can be either relative or absolute.
+
+--temp_dir Write temporary files to this directory instead of into the same directory as the input files. If
+ the specified folder does not exist, Bismark will attempt to create it first. The path to the
+ temporary folder can be either relative or absolute.
+
+
+
+Other:
+
+-h/--help Displays this help file.
+
+-v/--version Displays version information.
+
+
+BOWTIE 2 SPECIFIC OPTIONS
+
+--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
+ alignments, i.e. searches for alignments involving all read characters (also called
+ untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
+ and/or quality trimmed where appropriate. Default: off.
+
+Bowtie 2 alignment options:
+
+-N Sets the number of mismatches to allowed in a seed alignment during multiseed alignment.
+ Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
+ but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
+ Bowtie 1 see -n).
+
+-L Sets the length of the seed substrings to align during multiseed alignment. Smaller values
+ make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is
+ used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
+ Bowtie 1 see -l).
+
+--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
+ position to be the highest possible, regardless of the actual value. I.e. input is treated
+ as though all quality values are high. This is also the default behavior when the input
+ doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
+
+
+Bowtie 2 paired-end options:
+
+--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
+ it cannot find a concordant or discordant alignment for a pair. This option is invariable and
+ and on by default.
+
+--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
+ A discordant alignment is an alignment where both mates align uniquely, but that does not
+ satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
+ and it is on by default.
+
+
+Bowtie 2 effort options:
+
+-D Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
+ the alignments found so far. A seed extension "fails" if it does not yield a new best or a
+ new second-best alignment. Default: 15.
+
+-R is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
+ When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
+ mismatches allowed) at different offsets and searches for more alignments. A read is considered
+ to have repetitive seeds if the total number of seed hits divided by the number of seeds
+ that aligned at least once is greater than 300. Default: 2.
+
+Bowtie 2 parallelization options:
+
+
+-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
+ and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
+ parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
+ E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
+ by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
+ library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
+ automatically use the option '--reorder', which guarantees that output SAM records are printed in
+ an order corresponding to the order of the reads in the original input file, even when -p is set
+ greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
+ setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then
+ if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
+ correspond to input order in that case.
+
+Bowtie 2 Scoring options:
+
+--score_min Sets a function governing the minimum alignment score needed for an alignment to be considered
+ "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
+ L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
+ See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
+ L,0,-0.2.
+
+
+Bowtie 2 Reporting options:
+
+-most_valid_alignments This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
+ deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
+ default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
+ effort expended to find valid alignments.
+
+ For reference, this used to be the old (now deprecated) description of -M:
+ Bowtie 2 searches for at most +1 distinct, valid alignments for each read. The search terminates when it
+ can't find more distinct valid alignments, or when it finds +1 distinct alignments, whichever
+ happens first. Only the best alignment is reported. Information from the other alignments is used to
+ estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
+ Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
+ aligns many places. For reads that have more than +1 distinct, valid alignments, Bowtie 2 does not
+ guarantee that the alignment reported is the best possible in terms of alignment score. -M is
+ always used and its default value is set to 10.
+
+
+'VANILLA' Bismark OUTPUT:
+
+Single-end output format (tab-separated):
+
+ (1)
+ (2)
+ (3)
+ (4)
+ (5)
+ (6)
+ (7)
+ (8)
+ (9)
+(11)
+
+
+Paired-end output format (tab-separated):
+ (1)
+ (2)
+ (3)
+ (4)
+ (5)
+ (6)
+ (7)
+ (8)
+ (9)
+(10)
+(11)
+(12)
+(14)
+(15)
+
+
+Bismark SAM OUTPUT (default):
+
+ (1) QNAME (seq-ID)
+ (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
+ (3) RNAME (chromosome)
+ (4) POS (start position)
+ (5) MAPQ (always 255)
+ (6) CIGAR
+ (7) RNEXT
+ (8) PNEXT
+ (9) TLEN
+(10) SEQ
+(11) QUAL (Phred33 scale)
+(12) NM-tag (edit distance to the reference)
+(13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
+(14) XM-tag (methylation call string)
+(15) XR-tag (read conversion state for the alignment)
+(16) XG-tag (genome conversion state for the alignment)
+
+Each read of paired-end alignments is written out in a separate line in the above format.
+
+
+This script was last edited on 31 July 2012.
+
+HOW_TO
+}