# HG changeset patch # User fcaramia # Date 1355359504 18000 # Node ID 2432df265dad75acfffd394d155056fdb3f98f55 # Parent 5b208d4d89e5b2feb16476db7e801a46f1508592 Uploaded diff -r 5b208d4d89e5 -r 2432df265dad methylation_analysis_bismark/methylation_analysis/bismark --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/methylation_analysis_bismark/methylation_analysis/bismark Wed Dec 12 19:45:04 2012 -0500 @@ -0,0 +1,6571 @@ +#!/usr/bin/perl -- +use strict; +use warnings; +use IO::Handle; +use Cwd; +$|++; +use Getopt::Long; + + +## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + + +my $parent_dir = getcwd; +my $bismark_version = 'v0.7.6'; +my $command_line = join (" ",@ARGV); + +### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail +foreach my $arg (@ARGV){ + if ($arg eq '--solexa1.3-quals'){ + $arg = '--phred64-quals'; + } +} +my @filenames; # will be populated by processing the command line + +my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir) = process_command_line(); + +my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment +my %chromosomes; # stores the chromosome sequences of the mouse genome +my %counting; # counting various events + +my $seqID_contains_tabs; + +foreach my $filename (@filenames){ + + chdir $parent_dir or die "Unable to move to initial working directory $!\n"; + ### resetting the counting hash and fhs + reset_counters_and_fhs($filename); + $seqID_contains_tabs = 0; + + ### PAIRED-END ALIGNMENTS + if ($filename =~ ','){ + my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file + + $fhs[0]->{name} = 'CTread1GAread2CTgenome'; + $fhs[1]->{name} = 'GAread1CTread2GAgenome'; + $fhs[2]->{name} = 'GAread1CTread2CTgenome'; + $fhs[3]->{name} = 'CTread1GAread2GAgenome'; + + print "\nPaired-end alignments will be performed\n",'='x39,"\n\n"; + + my ($filename_1,$filename_2) = (split (/,/,$filename)); + print "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n"; + + ### additional variables only for paired-end alignments + my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + print "Input files are in FastA format\n"; + + if ($directional){ + ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + + ### FastQ format + else{ + print "Input files are in FastQ format\n"; + if ($directional){ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + + ### Else we are performing SINGLE-END ALIGNMENTS + else{ + print "\nSingle-end alignments will be performed\n",'='x39,"\n\n"; + ### Initialising bisulfite conversion filenames + my ($C_to_T_infile,$G_to_A_infile); + + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + print "Inut file is in FastA format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating 4 different bowtie filehandles and storing the first entry + if ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile); + } + } + + ## FastQ format + else{ + print "Input file is in FastQ format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating 4 different bowtie filehandles and storing the first entry + if ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile); + } + } + + start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile); + + } +} + +sub start_methylation_call_procedure_single_ends { + my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; + my ($dir,$filename); + + if ($sequence_file =~ /\//){ + ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $sequence_file; + } + + ### printing all alignments to a results file + my $outfile = $filename; + + if ($bowtie2){ # SAM format is the default for Bowtie 2 + $outfile =~ s/$/_bt2_bismark.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark.txt/; + } + else{ # SAM is the default output + $outfile =~ s/$/_bismark.sam/; + } + print "Writing bisulfite mapping results to $output_dir$outfile\n\n"; + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename; + if ($bowtie2){ + $reportfile =~ s/$/_bt2_Bismark_mapping_report.txt/; + } + else{ + $reportfile =~ s/$/_Bismark_mapping_report.txt/; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; + + if ($unmapped){ + my $unmapped_file = $filename; + $unmapped_file =~ s/$/_unmapped_reads.txt/; + open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; + print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; + } + if ($ambiguous){ + my $ambiguous_file = $filename; + $ambiguous_file =~ s/$/_ambiguous_reads.txt/; + open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; + print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n"; + } + print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + + + ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time + unless (%chromosomes){ + my $cwd = getcwd; # storing the path of the current working directory + print "Current working directory is: $cwd\n\n"; + read_genome_into_memory($cwd); + } + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input file is in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); + } + ### Input file is in FastQ format + else{ + process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); + } +} + +sub start_methylation_call_procedure_paired_ends { + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + my ($dir_1,$filename_1); + + if ($sequence_file_1 =~ /\//){ + ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_1 = $sequence_file_1; + } + + my ($dir_2,$filename_2); + + if ($sequence_file_2 =~ /\//){ + ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_2 = $sequence_file_2; + } + + ### printing all alignments to a results file + my $outfile = $filename_1; + if ($bowtie2){ # SAM format is the default Bowtie 2 output + $outfile =~ s/$/_bismark_bt2_pe.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark_pe.txt/; + } + else{ # SAM format is the default Bowtie 1 output + $outfile =~ s/$/_bismark_pe.sam/; + } + + print "Writing bisulfite mapping results to $outfile\n\n"; + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!"; + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename_1; + if ($bowtie2){ + $reportfile =~ s/$/_Bismark_bt2_paired-end_mapping_report.txt/; + } + else{ + $reportfile =~ s/$/_Bismark_paired-end_mapping_report.txt/; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n"; + print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + + + ### Unmapped read output + if ($unmapped){ + my $unmapped_1 = $filename_1; + my $unmapped_2 = $filename_2; + $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; + $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; + open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; + open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; + print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; + } + + if ($ambiguous){ + my $amb_1 = $filename_1; + my $amb_2 = $filename_2; + $amb_1 =~ s/$/_ambiguous_reads_1.txt/; + $amb_2 =~ s/$/_ambiguous_reads_2.txt/; + open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; + open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; + print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n"; + } + + ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time + unless (%chromosomes){ + my $cwd = getcwd; # storing the path of the current working directory + print "Current working directory is: $cwd\n\n"; + read_genome_into_memory($cwd); + } + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input files are in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + ### Input files are in FastQ format + else{ + process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } +} + +sub print_final_analysis_report_single_end{ + my ($C_to_T_infile,$G_to_A_infile) = @_; + ### All sequences from the original sequence file have been analysed now + ### deleting temporary C->T or G->A infiles + + if ($directional){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n"; + } + else{ + warn "Could not delete temporary file $C_to_T_infile properly $!\n"; + } + } + + else{ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n"; + } + else{ + warn "Could not delete temporary files properly $!\n"; + } + } + + ### printing a final report for the alignment procedure + print REPORT "Final Alignment report\n",'='x22,"\n"; + print "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequences analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n"; + + my $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + + warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n"; + print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n"; + + ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads), + ### only calculating the percentage if there were any overruled alignments + if ($counting{low_complexity_alignments_overruled_count}){ + my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count}); + # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n"; + } + + print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + ### detailed information about Cs analysed + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage (CHG context) if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage (CHH context) if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + + if ($seqID_contains_tabs){ + warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + } +} + +sub print_final_analysis_report_paired_ends{ + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles + if ($directional){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n"; + } + } + else{ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 4){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files properly: $!\n"; + } + } + + ### printing a final report for the alignment procedure + warn "Final Alignment report\n",'='x22,"\n"; + print REPORT "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + + my $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n"; + print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n"; + + print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + + + print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + ### detailed information about Cs analysed + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage in CHG context if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage in CHH context if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + +} + +sub process_single_end_fastA_file_for_methylation_call{ + my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; + ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"zcat $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + # last if ($counting{sequences_count} > 100); + my $identifier = ; + my $sequence = ; + last unless ($identifier and $sequence); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%100000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + + $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG ">$identifier\n"; + print AMBIG "$sequence\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED ">$identifier\n"; + print UNMAPPED "$sequence\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); + +} + +sub process_single_end_fastQ_file_for_methylation_call{ + my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; + ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"zcat $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + my $identifier = ; + my $sequence = ; + my $identifier_2 = ; + my $quality_value = ; + last unless ($identifier and $sequence and $identifier_2 and $quality_value); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + chomp $quality_value; + + $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG "\@$identifier\n"; + print AMBIG "$sequence\n"; + print AMBIG $identifier_2; + print AMBIG "$quality_value\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED "\@$identifier\n"; + print UNMAPPED "$sequence\n"; + print UNMAPPED $identifier_2; + print UNMAPPED "$quality_value\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); + +} + +sub process_fastA_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the + ### converted genomes (either the C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; + open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + + my $count = 0; + + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2); + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%100000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + + $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + } + } + + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + +} + +sub process_fastQ_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both) + ### of the converted genomes (either C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; + open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + my $ident_1 = ; # not needed + my $quality_value_1 = ; # not needed + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + my $ident_2 = ; # not needed + my $quality_value_2 = ; # not needed + last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2); + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%100000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + chomp $quality_value_1; + chomp $quality_value_2; + + $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + # seq_1 + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_1 $ident_1; + print AMBIG_1 "$quality_value_1\n"; + # seq_2 + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + print AMBIG_2 $ident_2; + print AMBIG_2 "$quality_value_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + # seq_1 + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_1 $ident_1; + print UNMAPPED_1 "$quality_value_1\n"; + # seq_2 + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + print UNMAPPED_2 $ident_2; + print UNMAPPED_2 "$quality_value_2\n"; + } + } + + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + +} + +sub check_bowtie_results_single_end{ + my ($sequence,$identifier,$quality_value) = @_; + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome + foreach my $index (0..$#fhs){ + + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + ############################################################### + ### STEP I Now processing the alignment stored in last_line ### + ############################################################### + my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation + ### we only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found_1 == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + + unless($mismatch_info){ + $mismatch_info = ''; + } + + chomp $mismatch_info; + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + my $alignment_location = join (":",$chromosome,$position); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + $number_of_mismatches = undef; + ################################################################################################################################################## + ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a + ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will + ### be returned as $valid_alignment_found and it will then be processed in the next round only. + ################################################################################################################################################## + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### we only continue to extract useful information about this second alignment if 1 was returned + if ($valid_alignment_found_2 == 1){ + ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + unless($mismatch_info){ + $mismatch_info = ''; + } + chomp $mismatch_info; + + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + ### extracting the chromosome number from the bowtie output (see above) + my $alignment_location = join (":",$chromosome,$position); + ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position + ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this + ### case we are not writing the same entry out a second time. + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + #################################################################################################################################### + #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### + #################################################################################################################################### + $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + next; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + ### still within the 2nd sequence in correct orientation found + } + ### still withing the 1st sequence in correct orientation found + } + ### still within the if (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + if ($unmapped){ + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + else{ + return; + } + } + ####################################################################################################################################################### + ####################################################################################################################################################### + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### + ####################################################################################################################################################### + ####################################################################################################################################################### + ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### sorting in ascending order + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + + ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + } + } + elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ + ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and + ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a + ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome + ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite + ### reaction. E.g. + ### CAGTCACGCGCGCGCG will become + ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. + ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave + ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: + ### G->A conversion: + ### highly methylated: CAATCACACACACACA + ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce + ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the + ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted + ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. + ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions + ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of + ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. + ### In the above example the number of transliterations required to transform the actual sequence + ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) + ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) + ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments + ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed + ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be + ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded. + my @three_candidate_seqs; + foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ + my $transliterations_performed; + if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); + } + elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); + } + else{ + die "unexpected index number range $!\n"; + } + push @three_candidate_seqs,{ + index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, + bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, + mismatch_number => $mismatch_number, + chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, + position => $mismatches{$mismatch_number}->{$composite_location}->{position}, + seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, + transliterations_performed => $transliterations_performed, + }; + } + ### sorting in ascending order for the lowest number of transliterations performed + @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; + my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; + my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; + my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; + # print "$first_array_element\t$second_array_element\t$third_array_element\n"; + if (($first_array_element*2) < $second_array_element){ + $counting{low_complexity_alignments_overruled_count}++; + ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits + $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; + $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; + } + else{ + $sequence_fails = 1; + } + } + else{ + $sequence_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it will have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + + print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out +} + +sub check_bowtie_results_single_end_bowtie2{ + my ($sequence,$identifier,$quality_value) = @_; + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. + # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs + + my $alignment_ambiguous = 0; + + my %alignments = (); + + ### reading from the Bowtie 2 output filehandles + foreach my $index (0..$#fhs){ + # print "Index: $index\n"; + # print "$fhs[$index]->{last_line}\n"; + # print "$fhs[$index]->{last_seq_id}\n\n"; + + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; + + if ($fhs[$index]->{last_seq_id} eq $identifier) { + + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; + + ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag == 4){ + ## reading in the next alignment, which must be the next sequence + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + } + + # if there are one or more proper alignments we can extract the chromosome number + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score,$second_best,$MD_tag); + my @fields = split (/\t/,$fhs[$index]->{last_line}); + + foreach (11..$#fields){ + if ($fields[$_] =~ /AS:i:(.*)/){ + $alignment_score = $1; + } + elsif ($fields[$_] =~ /XS:i:(.*)/){ + $second_best = $1; + } + elsif ($fields[$_] =~ /MD:Z:(.*)/){ + $MD_tag = $1; + } + } + + # warn "First best alignment_score is: '$alignment_score'\n"; + # warn "MD tag is: '$MD_tag'\n"; + die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag); + + if (defined $second_best){ + # warn "second best alignment_score is: '$second_best'\n"; + + # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether + if ($alignment_score == $second_best){ + $alignment_ambiguous = 1; + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location = join (":",$chromosome,$position); + + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{CIGAR} = $cigar; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + } + + ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join (":",$chromosome,$position); + + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + $alignments{$alignment_location}->{CIGAR} = $cigar; + } + + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + } + } + } + } + + ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$ambiguous_read_output\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file + unless(%alignments){ + $counting{no_single_alignment_found}++; + # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$unmapped_read_output\n"; + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified + } + else{ + return 0; # default + } + } + + ####################################################################################################################################################### + + ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence altogether. + ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for + ### opening (5) and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + my $methylation_call_params; # hash reference which will store all information we need for the methylation call + my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + # if (scalar keys %alignments > 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; + # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "pos: $alignments{$alignment_location}->{position}\n"; + # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the hash with we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence altogether + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_alignment_score; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ + # print "$alignments{$alignment_location}->{alignment_score}\n"; + unless (defined $best_alignment_score){ + $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score: $best_alignment_score\n"; + } + else{ + ### if the second best alignment has the same alignment score as the first one, the sequence will get booted + if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ + # warn "Same alignment score, the sequence will get booted!\n"; + $sequence_fails = 1; + last; # exiting after the second alignment since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; + last; # exiting after processing the second alignment since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print OUT "$ambiguous_read_output\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it has a unique best alignment + $counting{unique_best_alignment_count}++; + + ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well + extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); + + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out +} + + +sub determine_number_of_transliterations_performed{ + my ($sequence,$read_conversion) = @_; + my $number_of_transliterations; + if ($read_conversion eq 'CT'){ + $number_of_transliterations = $sequence =~ tr/C/T/; + } + elsif ($read_conversion eq 'GA'){ + $number_of_transliterations = $sequence =~ tr/G/A/; + } + else{ + die "Read conversion mode of the read was not specified $!\n"; + } + return $number_of_transliterations; +} + +sub decide_whether_single_end_alignment_is_valid{ + my ($index,$identifier) = @_; + + # extracting from Bowtie 1 format + my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1]; + + ### ensuring that the entry is the correct sequence + if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation we need to read in a new line + elsif($orientation == 0){ + my $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + ($id,$strand) = (split (/\t/,$newline))[0,1]; + + ### ensuring that the next entry is still the correct sequence + if ($id eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 1; ### 2nd possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs + elsif ($orientation == 0){ + $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation + } + else{ + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round + else{ + return 0; + } +} +######################### +### BOWTIE 1 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + + # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome + + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary + ### strands are not being reported by specifying --directional + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; + + ################################################################################## + ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### + ################################################################################## + my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong + ### orientation. We only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. + ### we store the useful information in %mismatches + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + + ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches_1; + my $number_of_mismatches_2; + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "Position 1 is higher than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ################################################################################################################################################### + ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### + ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ### + ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### + ### this round ### + ################################################################################################################################################### + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + else{ + die "Either read 1 or read 2 needs to end on '/1'\n"; + } + + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### Now processing the entry we just stored in last_line_1 and last_line_2 + $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to + ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation + if ($valid_alignment_found == 1){ + ### we store the useful information in %mismatches + ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; + ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + $number_of_mismatches_1=''; + $number_of_mismatches_2=''; + ### Now extracting the number of mismatches to the converted genome + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "position 1 is greater than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ############################################################################################################################################### + ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ### + ############################################################################################################################################### + $newline_1 = $fhs[$index]->{fh}-> getline(); + $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### within the 2nd sequence pair alignment in correct orientation found + } + ### within the 1st sequence pair alignment in correct orientation found + } + ### still within the (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_pair_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; + foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ + #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; + # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; + } + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{seq_id} = $identifier; + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; + $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; + $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; + $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; + } + } + else{ + $sequence_pair_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); + + ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + + print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +######################### +### BOWTIE 2 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends_bowtie2{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + + + # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + + + my %alignments; + my $alignment_ambiguous = 0; + + ### reading from the Bowtie 2 output filehandles + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary + ### strands are not being reported when '--directional' is specified + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + + my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; + my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; + # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; + # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; + # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; + # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; + $id_1 =~ s/\/1$//; + $id_2 =~ s/\/2$//; + + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). + ### We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag_1 == 77 and $flag_2 == 141){ + ## reading in the next alignment, which must be the next sequence + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + # print "current sequence ($identifier) did not map, reading in next sequence\n"; + # print "$index\t$fhs[$index]->{last_seq_id}\n"; + # print "$index\t$fhs[$index]->{last_line_1}\n"; + # print "$index\t$fhs[$index]->{last_line_2}\n"; + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; + } + } + + ### If there are one or more proper alignments we can extract the chromosome number + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + + ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); + + my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); + my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); + + foreach (11..$#fields_1){ + if ($fields_1[$_] =~ /AS:i:(.*)/){ + $alignment_score_1 = $1; + } + elsif ($fields_1[$_] =~ /XS:i:(.*)/){ + $second_best_1 = $1; + } + elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ + $MD_tag_1 = $1; + } + } + + foreach (11..$#fields_2){ + if ($fields_2[$_] =~ /AS:i:(.*)/){ + $alignment_score_2 = $1; + } + elsif ($fields_2[$_] =~ /XS:i:(.*)/){ + $second_best_2 = $1; + } + elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ + $MD_tag_2 = $1; + } + } + + die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); + die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); + + # warn "First read 1 alignment score is: '$alignment_score_1'\n"; + # warn "First read 2 alignment score is: '$alignment_score_2'\n"; + # warn "MD tag 1 is: '$MD_tag_1'\n"; + # warn "MD tag 2 is: '$MD_tag_2'\n"; + + ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments + my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; + # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; + + if (defined $second_best_1 and defined $second_best_2){ + my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; + # warn "Second best alignment_score_1 is: '$second_best_1'\n"; + # warn "Second best alignment_score_2 is: '$second_best_2'\n"; + # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; + + # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether + if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ + $alignment_ambiguous = 1; + # print "This read will be chucked (AS==XS detected)!\n"; + + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if ($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location; + if ($position_1 <= $position_2){ + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + } + elsif($position_2 < $position_1){ + $alignment_location = join(":",$chromosome_1,$position_2,$position_1); + } + + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + } + # warn "added best of several alignments to \%alignments hash\n"; + + ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + # print "$alignment_location\n"; + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + } + + # warn "added unique alignment to \%alignments hash\n"; + + # Now reading and storing the next read pair + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + # print "$newline_1\n"; + # print "$newline_2\n"; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + if ($seq_id_1 eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + } + } + } + } + + ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$ambiguous_read_1\n"; + # print "$ambiguous_read_2\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file + unless (%alignments){ + $counting{no_single_alignment_found}++; + + # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$unmapped_read_1\n"; + # print "$unmapped_read_2\n"; + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified + } + else{ + return 0; + } + } + + ####################################################################################################################################################### + + ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence pair altogether. + ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) + ### and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference + my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + ## if (scalar keys %alignments >= 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; + # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; + # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; + # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; + # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; + # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; + # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; + # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; + # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; + # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; + # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the %alignments hash we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence pair altogether) + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_sum_of_alignment_scores; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ + # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; + unless (defined $best_sum_of_alignment_scores){ + $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; + } + else{ + ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted + if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ + # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; + $sequence_pair_fails = 1; + last; # exiting since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; + last; # exiting since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$ambiguous_read_1\n"; + # print "$ambiguous_read_2\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence pair (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence pair has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); + + ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### now we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; + # print " $sequence_2\n"; + # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; + # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; + + print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +### + +sub decide_whether_paired_end_alignment_is_valid{ + my ($index,$identifier) = @_; + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + my $seq_id_1 = $id_1; + my $seq_id_2 = $id_2; + $seq_id_1 =~ s/\/1$//; # removing the read /1 + $seq_id_2 =~ s/\/1$//; # removing the read /1 + + ### ensuring that the current entry is the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation we need to read in two new lines + elsif($orientation == 0){ + my $newline_1 = $fhs[$index]->{fh}->getline(); + my $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time) + ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1]; + ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1]; + + my $seqid; + $seq_id_1 = $id_1; + $seq_id_2 = $id_2; + # we need to capture the first read (ending on /1) + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### ensuring that the next entry is still the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + ### Writing the current sequence to last_line_1 and last_line_2 + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be + ### the next entry) + elsif ($orientation == 0){ + $newline_1 = $fhs[$index]->{fh}->getline(); + $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ($seq_id_1) = split (/\t/,$newline_1); + ($seq_id_2) = split (/\t/,$newline_2); + + $seqid = ''; + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier); + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + else { + ### assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and both last_lines (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round + else{ + return 0; + } +} + +### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the first or last position of the actually observed sequence + my $non_bisulfite_sequence_1; + my $non_bisulfite_sequence_2; + + ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic + ### sequences around! + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### for hits on the forward strand we need to capture 2 extra bases at the 3' end + + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1 + + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) reverse strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end + if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2 + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) forward strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is CT converted so we need to capture 2 extra 3' bases + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1 + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2 + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation + if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is GA converted so we need to capture 2 extra 5' bases + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2 + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; +} + +### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; + my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; + my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; + my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; +# print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence_1 = ''; + my $non_bisulfite_sequence_2 = ''; + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1; + my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1; + + # parsing CIGAR 1 string + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); + # parsing CIGAR 2 string + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + + my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output + my $indels_2 = 0; + + ### Extracting read 1 genomic sequence ### + + # extracting 2 additional bp at the 5' end (read 1) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + return; + } + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2); + } + + foreach (0..$#len_1){ + if ($ops_1[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + # warn "$non_bisulfite_sequence_1\n"; + # adjusting position + $pos_1 += $len_1[$_]; + } + elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls + $non_bisulfite_sequence_1 .= 'N' x $len_1[$_]; + # warn "$non_bisulfite_sequence_1\n"; + # position doesn't need adjusting + $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # warn "Just adjusting the position by: ",$len_1[$_],"bp\n"; + $pos_1 += $len_1[$_]; + $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + else{ + die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + } + + ### 3' end of read 1 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + return; + } + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); + } + + + ### Extracting read 2 genomic sequence ### + + ### 5' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2); + } + + foreach (0..$#len_2){ + if ($ops_2[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + # warn "$non_bisulfite_sequence_2\n"; + # adjusting position + $pos_2 += $len_2[$_]; + } + elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls + $non_bisulfite_sequence_2 .= 'N' x $len_2[$_]; + # warn "$non_bisulfite_sequence_2\n"; + # position doesn't need adjusting + $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # warn "Just adjusting the position by: ",$len_2[$_],"bp\n"; + $pos_2 += $len_2[$_]; + $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + else{ + die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + } + + ### 3' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); + } + + ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly + + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) bottom strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) top strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + ## the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1; + $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2; + $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1; + $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2; +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_result_single_end{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position. + $methylation_call_params->{$identifier}->{position} += 1; + + ### writing every uniquely mapped read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value); + print OUT "$bowtie1_output\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script + } +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_result_single_end_bowtie2{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed) + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based) + $methylation_call_params->{$identifier}->{start_seq_1} += 1; + + ### writing every single aligned read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2); + print OUT "$bowtie1_output_paired_end\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + } + +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends_bowtie2{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed) + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + +} + + +sub convert_phred64_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_solexa_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_phred_score_into_phred33_quality_string{ + my $qual = shift; + $qual = chr($qual+33); + return $qual; +} + +sub convert_phred64_quality_string_into_phred_score{ + my $string = shift; + my $qual = ord($string)-64; + return $qual; +} + +sub convert_solexa_pre1_3_quality_string_into_phred_score{ + ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10 + my $string = shift; + my $qual = ord($string)-59; + return $qual; +} + + +sub extract_corresponding_genomic_sequence_single_end { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence; + ### depending on the conversion we want to make need to capture 1 extra base at the 3' end + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 + ### + 2 extra base at the 3' end + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > + ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > + ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + else{ + die "Too many bowtie result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + + ### at this point we can also determine the end position of a read + $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); +} + + +sub extract_corresponding_genomic_sequence_single_end_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + + my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info}; + my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR}; + + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + my $non_bisulfite_sequence = ''; + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2); + } + my $indels = 0; + + foreach (0..$#len){ + if ($ops[$_] eq 'M'){ + #extracting genomic sequence + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + # adjusting position + $pos += $len[$_]; + } + elsif ($ops[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls + $non_bisulfite_sequence .= 'N' x $len[$_]; + # warn "$non_bisulfite_sequence\n"; + # position doesn't need to be adjusting + $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif ($ops[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + $pos += $len[$_]; + $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + else{ + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + } + + ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2); + # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n"; + } + + + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + } + else{ + die "Too many Bowtie 2 result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + + ### the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position} = $pos; + $methylation_call_params->{$sequence_identifier}->{indels} = $indels; +} + +### METHYLATION CALL + +sub methylation_call{ + my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_; + ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one + my @seq = split(//,$sequence_actually_observed); + my @genomic = split(//,$genomic_sequence); + # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n"; + ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either + ### CpG, CHH or CHG context + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my @match =(); + warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 + my $methyl_CHH_count = 0; + my $methyl_CHG_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CpG_count = 0; + + if ($read_conversion eq 'CT'){ + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index]) { + ### The residue can only be a C if it was not converted to T, i.e. protected my methylation + if ($genomic[$index] eq 'C') { + ### If the residue is a C we want to know if it was in CpG context or in any other context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C, methylated, in CpG context + } + + else { + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C, methylated, in CHG context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C, methylated, in CHH context + } + } + } + else { + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts + ### in the actually observed sequence + if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') { + ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C, not methylated, in CpG context + } + + else{ + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C, not methylated, in CHG context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + elsif ($read_conversion eq 'GA'){ + # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n"; + + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index+2]) { + ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation + if ($genomic[$index+2] eq 'G') { + ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need + ### to look if the base upstream is a C + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C on opposing strand, methylated, in CpG context + } + + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C on opposing strand, methylated, in CHG context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C on opposing strand, methylated, in CHH context + } + } + } + else{ + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index+2]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts + ### on the opposing strand, so G to A conversions in the actually observed sequence + if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') { + ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if + ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream! + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C on opposing strand, not methylated, in CpG context + } + + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C on opposing strand, not methylated, in CHG context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C on opposing strand, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + else{ + die "Strand conversion info is required to perform a methylation call\n"; + } + + my $methylation_call = join ("",@match); + + $counting{total_meCHH_count} += $methyl_CHH_count; + $counting{total_meCHG_count} += $methyl_CHG_count; + $counting{total_meCpG_count} += $methyl_CpG_count; + $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; + $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; + $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; + + # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; + return $methylation_call; +} + +sub read_genome_into_memory{ + ## working directoy + my $cwd = shift; + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + foreach my $chromosome_filename (@chromosome_filenames){ + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + my $sequence; + while (){ + chomp; + if ($_ =~ /^>/){ + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + } + print "\n"; + chdir $cwd or die "Failed to move to directory $cwd\n"; +} + +sub extract_chromosome_name { + ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + +sub reverse_complement{ + my $sequence = shift; + $sequence =~ tr/CATG/GTAC/; + $sequence = reverse($sequence); + return $sequence; +} + +sub biTransformFastAFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + + unless ($directional){ + print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + + my $count = 0; + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ### small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/); + + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT "$header$sequence_C_to_T"; + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA "$header$sequence_G_to_A"; + } + } + if ($directional){ + print "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastAFiles_paired_end { + my ($file,$read_number) = @_; + + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + + if ($directional){ + if ($read_number == 1){ + print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + elsif ($read_number == 2){ + print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + } + else{ # all four strand output + print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + + my $count = 0; + + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/); + + if ($read_number == 1){ + if ($bowtie2){ + $header =~ s/$/\/1\/1/; + } + else{ + $header =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $header =~ s/$/\/2\/2/; + } + else{ + $header =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + + if ($read_number == 1){ + print CTOT "$header$sequence_C_to_T"; + } + elsif ($read_number == 2){ + print GTOA "$header$sequence_G_to_A"; + } + } + else{ + print CTOT "$header$sequence_C_to_T"; + print GTOA "$header$sequence_G_to_A"; + } + } + + if ($directional){ + if ($read_number == 1){ + print "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + print "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + } + else{ + print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + return ($C_to_T_infile); + } + else{ + return ($G_to_A_infile); + } + } + else{ + return ($C_to_T_infile,$G_to_A_infile); + } +} + + +sub biTransformFastQFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + + unless ($directional){ + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + + my $count = 0; + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($identifier,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence file appears to be a FastQ file + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + + if ($directional){ + print "\nCreated C -> T converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + else{ + print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastQFiles_paired_end { + my ($file,$read_number) = @_; + my ($dir,$filename); + + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + + if ($directional){ + if ($read_number == 1){ + print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + elsif ($read_number == 2){ + print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + else{ + die "Read number needs to be 1 or 2, but was $read_number!\n\n"; + } + } + else{ + print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + + my $count = 0; + + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + ++$count; + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence= uc$sequence; # make input file case insensitive + + ## small check if the sequence file appears to be a FastQ file + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + if ($read_number == 1){ + if ($bowtie2){ + $identifier =~ s/$/\/1\/1/; + } + else{ + $identifier =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $identifier =~ s/$/\/2\/2/; + } + else{ + $identifier =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2\n"; + } + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + if ($read_number == 1){ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + } + else{ + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + else{ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + + if ($directional){ + if ($read_number == 1){ + print "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + else{ + print "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + } + else{ + print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + if ($directional){ + if ($read_number == 1){ + return ($C_to_T_infile); + } + else{ + return ($G_to_A_infile); + } + } + else{ + return ($C_to_T_infile,$G_to_A_infile); + } +} + +sub fix_IDs{ + my $id = shift; + $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores + return $id; +} + +sub ensure_sensical_alignment_orientation_single_end{ + my $index = shift; # index number if the sequence produced an alignment + my $strand = shift; + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T) + ## here we only want reads in the forward (+) orientation + if ($fhs[$index]->{name} eq 'CTreadCTgenome') { + ### if the alignment is (+) we count it, and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A) + ## here we only want reads in the forward (+) orientation + elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') { + ### if the alignment is (+) we count it and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } else{ + die "One of the above conditions must be true\n"; + } +} + +sub ensure_sensical_alignment_orientation_paired_ends{ + my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## [Index 0, sequence originated from (converted) forward strand] + ## CT converted read 1 + ## GA converted read 2 + ## CT converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 1, sequence originated from (converted) reverse strand] + ## GA converted read 1 + ## CT converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 2, sequence originated from complementary to (converted) forward strand] + ## GA converted read 1 + ## CT converted read 2 + ## CT converted genome + ## here we only want read 1 in (-) orientation and read 2 in (+) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 3, sequence originated from complementary to (converted) reverse strand] + ## CT converted read 1 + ## GA converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + else{ + die "One of the above conditions must be true\n"; + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA { + + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + if ($directional){ + print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + print "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + print "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 1 (default) | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + print "Input files are $C_to_T_infile_1 $G_to_A_infile_2 (FastQ)\n"; + } + else{ + print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + print "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; + } + else{ + print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n"; + } + + ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + print "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + print "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + print "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + print "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output + else { + print "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + + +### Bowtie 1 (default) | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + print "Input file is $C_to_T_infile (FastQ)\n"; + } + else{ + print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + print "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + print "Input file is $C_to_T_infile (FastQ)\n\n"; + } + else{ + print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n"; + } + + ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n"; + warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n"; + + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else { + last; + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output + else { + print "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +########################################################################################################################################### + +sub reset_counters_and_fhs{ + my $filename = shift; + %counting=( + total_meCHH_count => 0, + total_meCHG_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + no_single_alignment_found => 0, + unsuitable_sequence_count => 0, + genomic_sequence_could_not_be_extracted_count => 0, + unique_best_alignment_count => 0, + low_complexity_alignments_overruled_count => 0, + CT_CT_count => 0, #(CT read/CT genome, original top strand) + CT_GA_count => 0, #(CT read/GA genome, original bottom strand) + GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand) + GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand) + CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand) + GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand) + GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand) + CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand) + alignments_rejected_count => 0, # only relevant if --directional was specified + ); + + if ($directional){ + if ($filename =~ ','){ # paired-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + else{ # single-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + } + else{ + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } +} + + +sub process_command_line{ + my @bowtie_options; + my $help; + my $mates1; + my $mates2; + my $path_to_bowtie; + my $fastq; + my $fasta; + my $skip; + my $qupto; + my $phred64; + my $phred33; + my $solexa; + my $mismatches; + my $seed_length; + my $best; + my $sequence_format; + my $version; + my $quiet; + my $chunk; + my $non_directional; + my $ceiling; + my $maxins; + my $minins; + my $unmapped; + my $multi_map; + my $output_dir; + my $bowtie2; + my $vanilla; + my $sam_no_hd; + my $seed_extension_fails; + my $reseed_repetitive_seeds; + my $most_valid_alignments; + my $score_min; + my $parallel; + my $temp_dir; + + my $command_line = GetOptions ('help|man' => \$help, + '1=s' => \$mates1, + '2=s' => \$mates2, + 'path_to_bowtie=s' => \$path_to_bowtie, + 'f|fasta' => \$fasta, + 'q|fastq' => \$fastq, + 's|skip=i' => \$skip, + 'u|upto=i' => \$qupto, + 'phred33-quals' => \$phred33, + 'phred64-quals|solexa1' => \$phred64, + 'solexa-quals' => \$solexa, + 'n|seedmms=i' => \$mismatches, + 'l|seedlen=i' => \$seed_length, + 'no_best' => \$best, + 'version' => \$version, + 'quiet' => \$quiet, + 'chunkmbs=i' => \$chunk, + 'non_directional' => \$non_directional, + 'I|minins=i' => \$minins, + 'X|maxins=i' => \$maxins, + 'e|maqerr=i' => \$ceiling, + 'un|unmapped' => \$unmapped, + 'ambiguous' => \$multi_map, + 'o|output_dir=s' => \$output_dir, + 'bowtie2' => \$bowtie2, + 'vanilla' => \$vanilla, + 'sam-no-hd' => \$sam_no_hd, + 'D=i' => \$seed_extension_fails, + 'R=i' => \$reseed_repetitive_seeds, + 'score_min=s' => \$score_min, + 'most_valid_alignments=i' => \$most_valid_alignments, + 'p=i' => \$parallel, + 'temp_dir=s' => \$temp_dir, + ); + + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + if ($version){ + print << "VERSION"; + + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Version: $bismark_version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + + +VERSION + exit; + } + + + ########################## + ### PROCESSING OPTIONS ### + ########################## + + unless ($bowtie2){ + $bowtie2 = 0; + } + unless ($sam_no_hd){ + $sam_no_hd =0; + } + + ### PATH TO BOWTIE + ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (-d $path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie = "${path_to_bowtie}bowtie2"; + } + else{ + $path_to_bowtie = "${path_to_bowtie}bowtie"; + } + } + else{ + die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; + } + } + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2'; + warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } + else{ + $path_to_bowtie = 'bowtie'; + warn "Path to Bowtie specified as: $path_to_bowtie\n"; + } + } + + #################################### + ### PROCESSING ARGUMENTS + + ### GENOME FOLDER + my $genome_folder = shift @ARGV; # mandatory + unless ($genome_folder){ + warn "Genome folder was not specified!\n"; + print_helpfile(); + exit; + } + + ### checking that the genome folder, all subfolders and the required bowtie index files exist + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + + if (chdir $genome_folder){ + my $absolute_genome_folder = getcwd; ## making the genome folder path absolute + unless ($absolute_genome_folder =~/\/$/){ + $absolute_genome_folder =~ s/$/\//; + } + warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; + $genome_folder = $absolute_genome_folder; + } + else{ + die "Failed to move to $genome_folder: $!\nUSAGE: Bismark.pl [options] {-1 -2 | } [] (--help for more details)\n"; + } + + my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; + my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; + + if ($bowtie2){ ### Bowtie 2 (new) + ### checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n"; + } + } + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; + } + } + } + + else{ ### Bowtie 1 (default) + ### checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; + } + } + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; + } + } + } + + my $CT_index_basename = "${CT_dir}BS_CT"; + my $GA_index_basename = "${GA_dir}BS_GA"; + + ### INPUT OPTIONS + + ### SEQUENCE FILE FORMAT + ### exits if both fastA and FastQ were specified + if ($fasta and $fastq){ + die "Only one sequence filetype can be specified (fastA or fastQ)\n"; + } + + ### unless fastA is specified explicitely, fastQ sequence format is expected by default + if ($fasta){ + print "FastA format specified\n"; + $sequence_format = 'FASTA'; + push @bowtie_options, '-f'; + } + elsif ($fastq){ + print "FastQ format specified\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + else{ + $fastq = 1; + print "FastQ format assumed (by default)\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + + ### SKIP + if ($skip){ + warn "Skipping the first $skip reads from the input file\n"; + # push @bowtie_options,"-s $skip"; + } + + ### UPTO + if ($qupto){ + warn "Processing sequences up to read no. $qupto from the input file\n"; + if ($bowtie2){ + # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 + } + else{ + # push @bowtie_options,"--qupto $qupto"; + } + } + + ### QUALITY VALUES + if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ + die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; + } + if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values works only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred33"; + } + else{ + push @bowtie_options,"--phred33-quals"; + } + } + if ($phred64){ + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values work only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred64"; + } + else{ + push @bowtie_options,"--phred64-quals"; + } + } + else{ + $phred64 = 0; + } + + if ($solexa){ + if ($bowtie2){ + die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; + } + # Solexa to Phred value conversion works only when -q is specified + unless ($fastq){ + die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; + } + push @bowtie_options,"--solexa-quals"; + } + else{ + $solexa = 0; + } + + ### ALIGNMENT OPTIONS + + ### MISMATCHES + if (defined $mismatches){ + if ($bowtie2){ + if ($mismatches == 0 or $mismatches == 1){ + push @bowtie_options,"-N $mismatches"; + } + else{ + die "Please set the number of multiseed mismatches for Bowtie 2 with '-N ' (where can be 0 or 1)\n"; + } + } + else{ + if ($mismatches >= 0 and $mismatches <= 3){ + push @bowtie_options,"-n $mismatches"; + } + else{ + die "Please set the number of seed mismatches for Bowtie 1 with '-n ' (where can be 0,1,2 or 3)\n"; + } + } + } + else{ + unless ($bowtie2){ + push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 + } + } + + ### SEED LENGTH + if (defined $seed_length){ + if ($bowtie2){ + push @bowtie_options,"-L $seed_length"; + } + else{ + push @bowtie_options,"-l $seed_length"; + } + } + + ### MISMATCH CEILING + if (defined $ceiling){ + die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); + push @bowtie_options,"-e $ceiling"; + } + + + ### BOWTIE 2 EFFORT OPTIONS + + ### CONSECUTIVE SEED EXTENSION FAILS + if (defined $seed_extension_fails){ + die "The option '-D ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-D $seed_extension_fails"; + } + + ### RE-SEEDING REPETITIVE SEEDS + if (defined $reseed_repetitive_seeds){ + die "The option '-R ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-R $reseed_repetitive_seeds"; + } + + + ### BOWTIE 2 SCORING OPTIONS + if ($score_min){ + die "The option '--score_min ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + unless ($score_min =~ /^L,.+,.+$/){ + die "The option '--score_min ' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--score-min $score_min"; + } + else{ + if ($bowtie2){ + push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2 + } + } + + ### BOWTIE 2 PARALLELIZATION OPTIONS + if (defined $parallel){ + die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); + } + if ($bowtie2){ + if ($parallel){ + die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); + push @bowtie_options,"-p $parallel"; + push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. + print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; + sleep (2); + } + } + + ### REPORTING OPTIONS + + if ($bowtie2){ + push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default + + ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while + if(defined $most_valid_alignments){ + + warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; + # push @bowtie_options,"-M $most_valid_alignments";sleep (5); + } + # else{ + # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence + # } + } + else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 + push @bowtie_options,'-k 2'; + } + + ### --BEST + if ($bowtie2){ + if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used + die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; + } + } + else{ + # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) + unless ($best){ + push @bowtie_options,'--best'; + } + } + + ### VANILLA BISMARK (BOWTIE 1) OUTPUT + if ($vanilla){ + if ($bowtie2){ + die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; + } + } + else{ + $vanilla = 0; + } + + ### PAIRED-END MAPPING + if ($mates1){ + my @mates1 = (split (/,/,$mates1)); + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n" unless ($mates2); + my @mates2 = (split(/,/,$mates2)); + unless (scalar @mates1 == scalar @mates2){ + die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 -2 )\n"; + } + while (1){ + my $mate1 = shift @mates1; + my $mate2 = shift @mates2; + last unless ($mate1 and $mate2); + push @filenames,"$mate1,$mate2"; + } + if ($bowtie2){ + push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments + push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones + } + } + elsif ($mates2){ + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n"; + } + + ### SINGLE-END MAPPING + # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified + my $singles; + unless ($mates1 and $mates2){ + $singles = join (',',@ARGV); + unless ($singles){ + die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; + } + $singles =~ s/\s/,/g; + @filenames = (split(/,/,$singles)); + warn "\nFiles to be analysed:\n"; + warn "@filenames\n\n"; + sleep (3); + } + + ### MININUM INSERT SIZE (PAIRED-END ONLY) + if (defined $minins){ + die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--minins $minins"; + } + + ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) + if (defined $maxins){ + die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--maxins $maxins"; + } + else{ + unless ($singles){ + push @bowtie_options,'--maxins 500'; + } + } + + ### QUIET prints nothing besides alignments (suppresses warnings) + if ($quiet){ + push @bowtie_options,'--quiet'; + } + + ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments + unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option + if (defined $chunk){ + push @bowtie_options,"--chunkmbs $chunk"; + } + else{ + push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) + } + } + + + ### SUMMARY OF ALL BOWTIE OPTIONS + my $bowtie_options = join (' ',@bowtie_options); + + + ### STRAND-SPECIFIC LIBRARIES + my $directional; + if ($non_directional){ + print "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported.\n"; + sleep (3); + $directional = 0; + } + else{ + print "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!).\n"; + sleep (3); + $directional = 1; # Changed this to being the default behaviour + } + + ### UNMAPPED SEQUENCE OUTPUT + $unmapped = 0 unless ($unmapped); + + ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT + $multi_map = 0 unless ($multi_map); + + + ### OUTPUT DIRECTORY + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + + if (chdir $output_dir){ + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; + warn "Created output directory $output_dir!\n\n"; + chdir $output_dir or die "Failed to move to $output_dir\n"; + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + warn "Output will be written into the directory: $output_dir\n"; + } + else{ + $output_dir = ''; + } + + ### TEMPORARY DIRECTORY for C->T and G->A transcribed files + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($temp_dir){ + warn "\nUsing temp directory: $temp_dir\n"; + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + + if (chdir $temp_dir){ + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + else{ + mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; + warn "Created temporary directory $temp_dir!\n\n"; + chdir $temp_dir or die "Failed to move to $temp_dir\n"; + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + warn "Temporary files will be written into the directory: $temp_dir\n"; + } + else{ + $temp_dir = ''; + } + + + return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir); +} + + + +sub generate_SAM_header{ + print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order + foreach my $chr (keys %chromosomes){ + my $length = length ($chromosomes{$chr}); + print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length + } + print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, PN = program name name, VN = program version +} + +### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: +### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011) + +sub single_end_SAM_output{ + my ($id,$actual_seq,$methylation_call_params,$qual) = @_; + my $strand = $methylation_call_params->{$id}->{alignment_strand}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $start = $methylation_call_params->{$id}->{position}; + my $stop = $methylation_call_params->{$id}->{end_position}; + my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence}; + my $methcall = $methylation_call_params->{$id}->{methylation_call}; + my $read_conversion = $methylation_call_params->{$id}->{read_conversion}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + my $number_of_mismatches = $methylation_call_params->{$id}->{number_of_mismatches}; + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented value: 2**4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2**6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2**7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ##### + + my $flag; # FLAG variable used for SAM format. + if ($strand eq "+"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (OT) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + elsif ($strand eq "-"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (OB) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + else{ + die "Unexpected strand information: $strand\n\n"; + } + + ##### + + my $mapq = 255; # Assume mapping quality is unavailable + + ##### + + my $cigar; + if ($bowtie2){ + $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2 + } + else{ + $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches) + } + + ##### + + my $rnext = "*"; # Paired-end variable + + ##### + + my $pnext = 0; # Paired-end variable + + ##### + + my $tlen = 0; # Paired-end variable + + ##### + + if ($read_conversion eq 'CT'){ + $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands + } + else{ + $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries + } + + if ($strand eq '-'){ + $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand + $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence + $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well + } + + ##### + + my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string + # into the reference string. hemming_dist() + if ($bowtie2){ + $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + + my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!) + + ##### + + my $XM_tag; # Optional tag XM: Methylation Call String + if ($strand eq '+'){ + $XM_tag = "XM:Z:$methcall"; + } + elsif ($strand eq '-'){ + $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well + } + + ##### + + my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion + + ##### + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n"; +} + + +sub paired_end_SAM_output{ + my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; + my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand + my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; + my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; + my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; + my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; + my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; + my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + my $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! + my $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; + + my $id_1 = $id.'/1'; + my $id_2 = $id.'/2'; + + # Allows all degenerate nucleotide sequences in reference genome + die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; + die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i; + + my $index; # used to store the srand origin of the alignment in a less convoluted way + + if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ + $index = 0; ## this is OT (original top strand) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ + $index = 1; ## this is CTOB (complementary to OB) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ + $index = 2; ## this is CTOT (complementary to OT) + } + elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ + $index = 3; ## this is OB (original bottom) + } + else { + die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; + } + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the + ### first or last position. + + if ($index == 0 or $index == 3){ # OT or OB + $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); + } + else{ # CTOT or CTOB + $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); + } + + ##### + + my $start_read_1; + my $start_read_2; + # adjusting end positions + + if ($bowtie2){ + $start_read_1 = $methylation_call_params->{$id}->{position_1}; + $start_read_2 = $methylation_call_params->{$id}->{position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; + $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; + } + else{ # read 1 is on the - strand + $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; + $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; + } + } + + ##### + + my $end_read_1; + my $end_read_2; + # adjusting end positions + + if ($bowtie2){ + $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; + $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; + $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; + } + else{ + $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; + $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; + } + } + + ##### + + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2^^7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account + + # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) + # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) + + my $flag_1; # FLAG variable used for SAM format + my $flag_2; + + if ($index == 0){ # OT + $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) + $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) + } + elsif ($index == 1){ # CTOB + $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) + } + elsif ($index == 2){ # CTOT + $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) + $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) + } + elsif ($index == 3){ # OB + $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) + } + + ##### + + my $mapq = 255; # Mapping quality is unavailable + + ##### + + my $cigar_1; + my $cigar_2; + + if ($bowtie2){ + $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 + $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; + } + else{ + $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) + $cigar_2 = length($actual_seq_2) . "M"; + } + + ##### + + my $rnext = '='; # Chromosome of mate; applies to both reads + + ##### + + my $pnext_1 = $start_read_2; # Leftmost position of mate + my $pnext_2 = $start_read_1; + + ##### + + my $tlen_1; # signed observed Template LENgth (or inferred fragment size) + my $tlen_2; + + if ($bowtie2){ + + if ($start_read_1 <= $start_read_2){ + + # Read 1 alignment is leftmost + + if ($end_read_2 >= $end_read_1){ + + # -------------------------> read 1 reads overlapping + # <------------------------- read 2 + # + # or + # + # -------------------------> read 1 + # <----------------------- read 2 read 2 contained within read 1 + # + # or + # + # -------------------------> read 1 reads 1 and 2 exactly overlapping + # <------------------------- read 2 + # + + # dovetailing of reads is not enabled for Bowtie 2 alignments + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + } + elsif ($end_read_2 < $end_read_1){ + + # -------------------------> read 1 + # <----------- read 2 read 2 contained within read 1 + # + # or + # + # -------------------------> read 1 + # <----------- read 2 read 2 contained within read 1 + + # start and end of read 2 are fully contained within read 1 + $tlen_1 = 0; # Set as 0 when the information is unavailable + $tlen_2 = 0; # Set as 0 when the information is unavailable + } + + } + + elsif ($start_read_2 < $start_read_1){ + + if ($end_read_1 >= $end_read_2){ + + # Read 2 alignment is leftmost + + # -------------------------> read 2 reads overlapping + # <------------------------- read 1 + # + # or + # + # -------------------------> read 2 + # <----------------------- read 1 read 1 contained within read 2 + # + # + + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + elsif ($end_read_1 < $end_read_2){ + + # -------------------------> read 2 + # <----------- read 1 read 1 contained within read 2 + # + # or + # + # -------------------------> read 2 + # <----------- read 1 read 1 contained within read 2 + + # start and end of read 1 are fully contained within read 2 + $tlen_1 = 0; # Set as 0 when the information is unavailable + $tlen_2 = 0; # Set as 0 when the information is unavailable + } + } + } + + else{ # Bowtie 1 + + if ($end_read_2 >= $end_read_1){ + # Read 1 alignment is leftmost + # -------------------------> read 1 + # <------------------------- read 2 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + } + else{ + # Read 2 alignment is leftmost + # -------------------------> read 2 + # <------------------------- read 1 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + } + + ##### + + # adjusting the strand of the sequence before we use them to generate mismatch strings + if ($strand_1 eq '-'){ + $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand + $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence + $qual_1 = reverse $qual_1; # we need to reverse the quality string as well + } + if ($strand_2 eq '-'){ + $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand + $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence + $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well + } + + # print "$actual_seq_1\n$ref_seq_1\n\n"; + # print "$actual_seq_2\n$ref_seq_2\n\n"; + + ##### + + my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence + my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); + if ($bowtie2){ + $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences + my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) + my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); + + ##### + + my $XM_tag_1; # Optional tag XM: Methylation call string + my $XM_tag_2; + + if ($strand_1 eq '-'){ + $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_1 = "XM:Z:$methcall_1"; + } + + if ($strand_2 eq '-'){ + $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_2 = "XM:Z:$methcall_2"; + } + + ##### + + my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state + my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads + + ##### + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; +} + +sub revcomp{ + my $seq = shift or die "Missing seq to reverse complement\n"; + $seq = reverse $seq; + $seq =~ tr/ACTGactg/TGACTGAC/; + return $seq; +} + +sub hemming_dist{ + my $matches = 0; + my @actual_seq = split //,(shift @_); + my @ref_seq = split //,(shift @_); + foreach (0..$#actual_seq){ + ++$matches if ($actual_seq[$_] eq $ref_seq[$_]); + } + return my $hd = scalar @actual_seq - $matches; +} + +sub make_mismatch_string{ + my $actual_seq = shift or die "Missing actual sequence"; + my $ref_seq = shift or die "Missing reference sequence"; + my $XX_tag = "XX:Z:"; + my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison + my $prev_mm_pos = 0; + while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference + my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch + my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read + $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other + $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) + $prev_mm_pos = pos($tmp); # Position of last mismatch + } + my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence + $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence + return $XX_tag; +} + + + +sub print_helpfile{ + print << "HOW_TO"; + + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + + +DESCRIPTION + + +The following is a brief description of command line options and arguments to control the Bismark +bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the +reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand +version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). +Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome +(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the +forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) +are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original +sequence from the genome and determine if there were any protected C's present or not. + +As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be +re-enabled by using --non_directional. + +The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old +'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best +alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below. + + +USAGE: bismark [options] {-1 -2 | } + + +ARGUMENTS: + + The path to the folder containing the unmodified reference genome + as well as the subfolders created by the Bismark_Genome_Preparation + script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/). + Bismark expects one or more fastA files in this folder (file extension: .fa + or .fasta). The path can be relative or absolute. + +-1 Comma-separated list of files containing the #1 mates (filename usually includes + "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. Bismark will produce one mapping result + and one report file per paired-end input file pair. + +-2 Comma-separated list of files containing the #2 mates (filename usually includes + "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. + + A comma- or space-separated list of files containing the reads to be aligned (e.g. + lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will + produce one mapping result and one report file per input file. + + +OPTIONS: + + +Input: + +-q/--fastq The query input files (specified as , or are FASTQ + files (usually having extension .fg or .fastq). This is the default. See also + --solexa-quals. + +-f/--fasta The query input files (specified as , or are FASTA + files (usually havin extension .fa, .mfa, .fna or similar). All quality values + are assumed to be 40 on the Phred scale. + +-s/--skip Skip (i.e. do not align) the first reads or read pairs from the input. + +-u/--upto Only aligns the first reads or read pairs from the input. Default: no limit. + +--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. + +--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. + +--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled + (which can't). The formula for conversion is: + phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This + is usually the right option for use with (unconverted) reads emitted by the GA + Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. + +--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) + reads emitted by GA Pipeline version 1.3 or later. Default: off. + +--path_to_bowtie The full path to the Bowtie (1 or 2) installation on your system. If not + specified it is assumed that Bowtie (1 or 2) is in the PATH. + + +Alignment: + +-n/--seedmms The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs + of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the + default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). + +-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to + which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for + larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). + +-e/--maqerr Maximum permitted total of quality values at all mismatched read positions throughout + the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds + quality values to the nearest 10 and saturates at 30. This value is not relevant for + Bowtie 2. + +--chunkmbs The number of megabytes of memory a given thread is given to store path descriptors in + --best mode. Best-first search must keep track of many paths at once to ensure it is + always extending the path with the lowest cumulative cost. Bowtie tries to minimize the + memory impact of the descriptors, but they can still grow very large in some cases. If + you receive an error message saying that chunk memory has been exhausted in --best mode, + try adjusting this parameter up to dedicate more memory to the descriptors. This value + is not relevant for Bowtie 2. Default: 512. + +-I/--minins The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and + a paired-end alignment consists of two 20-bp alignments in the appropriate orientation + with a 20-bp gap between them, that alignment is considered valid (as long as -X is also + satisfied). A 19-bp gap would not be valid in that case. Default: 0. + +-X/--maxins The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and + a paired-end alignment consists of two 20-bp alignments in the proper orientation with a + 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). + A 61-bp gap would not be valid in that case. Default: 500. + + +Bowtie 1 Reporting: + +-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option + will be used by default. + +--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum + (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in + terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred + quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both + have Phred quality 10. When --best is not specified, Bowtie may report alignments that + are sub-optimal in terms of stratum and/or quality (though an effort is made to report + the best alignment). --best mode also removes all strand bias. Note that --best does not + affect which alignments are considered "valid" by Bowtie, only which valid alignments + are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified. + Default: on. + +--no_best Disables the --best option which is on by default. This can speed up the alignment process, + e.g. for testing purposes, but for credible results it is not recommended to disable --best. + + +Output: + +--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four + bisulfite strands will be reported. Default: OFF. + + (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary + to the original strands are merely theoretical and should not exist in reality. Specifying directional + alignments (which is the default) will only run 2 alignment threads to the original top (OT) + or bottom (OB) strands in parallel and report these alignments. This is the recommended option + for sprand-specific libraries). + +--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are + split up into several smaller files to run concurrently and the output files are to be merged. + +--quiet Print nothing besides alignments. + +--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead + of SAM format output. + +-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will + appear as they did in the input, without any translation of quality values that may have + taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 + and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads + with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) + are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. + +--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest + mismatches or other reads that fail to align uniquely to a file in the output directory. + Written reads will appear as they did in the input, without any of the translation of quality + values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two + parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and + _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. + +-o/--output_dir Write all output files into this directory. By default the output files will be written into + the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt + to create it first. The path to the output folder can be either relative or absolute. + +--temp_dir Write temporary files to this directory instead of into the same directory as the input files. If + the specified folder does not exist, Bismark will attempt to create it first. The path to the + temporary folder can be either relative or absolute. + + + +Other: + +-h/--help Displays this help file. + +-v/--version Displays version information. + + +BOWTIE 2 SPECIFIC OPTIONS + +--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end + alignments, i.e. searches for alignments involving all read characters (also called + untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter + and/or quality trimmed where appropriate. Default: off. + +Bowtie 2 alignment options: + +-N Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. + Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) + but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for + Bowtie 1 see -n). + +-L Sets the length of the seed substrings to align during multiseed alignment. Smaller values + make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is + used by default, which sets -L to 20. This option is only available for Bowtie 2 (for + Bowtie 1 see -l). + +--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched + position to be the highest possible, regardless of the actual value. I.e. input is treated + as though all quality values are high. This is also the default behavior when the input + doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. + + +Bowtie 2 paired-end options: + +--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if + it cannot find a concordant or discordant alignment for a pair. This option is invariable and + and on by default. + +--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. + A discordant alignment is an alignment where both mates align uniquely, but that does not + satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior + and it is on by default. + + +Bowtie 2 effort options: + +-D Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using + the alignments found so far. A seed extension "fails" if it does not yield a new best or a + new second-best alignment. Default: 15. + +-R is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. + When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of + mismatches allowed) at different offsets and searches for more alignments. A read is considered + to have repetitive seeds if the total number of seed hits divided by the number of seeds + that aligned at least once is greater than 300. Default: 2. + +Bowtie 2 parallelization options: + + +-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores + and synchronize when parsing reads and outputting alignments. Searching for alignments is highly + parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. + E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint + by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads + library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will + automatically use the option '--reorder', which guarantees that output SAM records are printed in + an order corresponding to the order of the reads in the original input file, even when -p is set + greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and + setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then + if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally + correspond to input order in that case. + +Bowtie 2 Scoring options: + +--score_min Sets a function governing the minimum alignment score needed for an alignment to be considered + "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying + L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. + See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is + L,0,-0.2. + + +Bowtie 2 Reporting options: + +-most_valid_alignments This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is + deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the + default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the + effort expended to find valid alignments. + + For reference, this used to be the old (now deprecated) description of -M: + Bowtie 2 searches for at most +1 distinct, valid alignments for each read. The search terminates when it + can't find more distinct valid alignments, or when it finds +1 distinct alignments, whichever + happens first. Only the best alignment is reported. Information from the other alignments is used to + estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes + Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that + aligns many places. For reads that have more than +1 distinct, valid alignments, Bowtie 2 does not + guarantee that the alignment reported is the best possible in terms of alignment score. -M is + always used and its default value is set to 10. + + +'VANILLA' Bismark OUTPUT: + +Single-end output format (tab-separated): + + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(11) + + +Paired-end output format (tab-separated): + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(10) +(11) +(12) +(14) +(15) + + +Bismark SAM OUTPUT (default): + + (1) QNAME (seq-ID) + (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!)) + (3) RNAME (chromosome) + (4) POS (start position) + (5) MAPQ (always 255) + (6) CIGAR + (7) RNEXT + (8) PNEXT + (9) TLEN +(10) SEQ +(11) QUAL (Phred33 scale) +(12) NM-tag (edit distance to the reference) +(13) XX-tag (base-by-base mismatches to the reference. This does not include indels) +(14) XM-tag (methylation call string) +(15) XR-tag (read conversion state for the alignment) +(16) XG-tag (genome conversion state for the alignment) + +Each read of paired-end alignments is written out in a separate line in the above format. + + +This script was last edited on 31 July 2012. + +HOW_TO +}