comparison methylation_analysis_bismark/methylation_analysis/bismark @ 10:2432df265dad draft

Uploaded
author fcaramia
date Wed, 12 Dec 2012 19:45:04 -0500
parents
children
comparison
equal deleted inserted replaced
9:5b208d4d89e5 10:2432df265dad
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use IO::Handle;
5 use Cwd;
6 $|++;
7 use Getopt::Long;
8
9
10 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk)
11
12 ## This program is free software: you can redistribute it and/or modify
13 ## it under the terms of the GNU General Public License as published by
14 ## the Free Software Foundation, either version 3 of the License, or
15 ## (at your option) any later version.
16
17 ## This program is distributed in the hope that it will be useful,
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ## GNU General Public License for more details.
21
22 ## You should have received a copy of the GNU General Public License
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24
25
### Remember the directory we were started from (the per-file loop returns to it) and keep a
### copy of the command line exactly as it was typed, before any option is rewritten below
my $parent_dir      = getcwd;
my $bismark_version = 'v0.7.6';
my $command_line    = join (" ",@ARGV);

### The '.' in the option name --solexa1.3-quals causes Getopt::Long to fail, so the option is
### renamed to --phred64-quals in place before the command line is processed
for (@ARGV){
  $_ = '--phred64-quals' if $_ eq '--solexa1.3-quals';
}

my @filenames; # will be populated by processing the command line

my (
  $genome_folder,
  $CT_index_basename,
  $GA_index_basename,
  $path_to_bowtie,
  $sequence_file_format,
  $bowtie_options,
  $directional,
  $unmapped,
  $ambiguous,
  $phred64,
  $solexa,
  $output_dir,
  $bowtie2,
  $vanilla,
  $sam_no_hd,
  $skip,
  $upto,
  $temp_dir,
) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the genome
my %counting;    # counting various events

### set to 0 at the start of every input file; reported on in the final single-end report
my $seqID_contains_tabs;
45
### Main loop: every entry of @filenames is one alignment job. An entry containing a comma
### ("file_1,file_2") is run as a paired-end job, anything else as a single-end job.
### For each job the input is bisulfite-converted in silico, aligned with Bowtie or Bowtie 2,
### and then handed to the matching methylation call procedure.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### any format other than 'FASTA' is treated as FastQ
  my $is_fastA     = ($sequence_file_format eq 'FASTA');
  my $format_label = $is_fastA ? 'FastA' : 'FastQ';

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){

    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';

    print "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    print "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";
    print "Input files are in $format_label format\n";

    ### selecting the conversion and alignment routines once, instead of repeating the whole
    ### block for every format/aligner combination
    my $transform = $is_fastA ? \&biTransformFastAFiles_paired_end
                              : \&biTransformFastQFiles_paired_end;
    my $align;
    if ($is_fastA){
      $align = $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                        : \&paired_end_align_fragments_to_bisulfite_genome_fastA;
    }
    else{
      $align = $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                        : \&paired_end_align_fragments_to_bisulfite_genome_fastQ;
    }

    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    if ($directional){
      ### only the first returned file name is kept per mate; the G->A version of read 1 and
      ### the C->T version of read 2 stay undefined
      ($C_to_T_infile_1) = $transform->($filename_1,1); # also passing the read number
      ($G_to_A_infile_2) = $transform->($filename_2,2);
    }
    else{
      ($C_to_T_infile_1,$G_to_A_infile_1) = $transform->($filename_1,1); # also passing the read number
      ($C_to_T_infile_2,$G_to_A_infile_2) = $transform->($filename_2,2);
    }

    ### instances 0 and 3 use CT-read1/GA-read2, instances 1 and 2 use GA-read1/CT-read2.
    ### In directional mode the latter two file names are undef, so instances 1 and 2 end up
    ### with undefined input files.
    foreach my $index (0,3){
      $fhs[$index]->{inputfile_1} = $C_to_T_infile_1;
      $fhs[$index]->{inputfile_2} = $G_to_A_infile_2;
    }
    foreach my $index (1,2){
      $fhs[$index]->{inputfile_1} = $G_to_A_infile_1;
      $fhs[$index]->{inputfile_2} = $C_to_T_infile_2;
    }

    $align->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    print "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
    print "Input file is in $format_label format\n";

    my $transform = $is_fastA ? \&biTransformFastAFiles
                              : \&biTransformFastQFiles;
    my $align;
    if ($is_fastA){
      $align = $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                        : \&single_end_align_fragments_to_bisulfite_genome_fastA;
    }
    else{
      $align = $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                        : \&single_end_align_fragments_to_bisulfite_genome_fastQ;
    }

    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    if ($directional){
      ### instances 2 and 3 are deliberately left untouched in directional mode
      ($C_to_T_infile) = $transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
    }
    else{
      ($C_to_T_infile,$G_to_A_infile) = $transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
    }

    ### Creating 4 different bowtie filehandles and storing the first entry
    $align->($C_to_T_infile,$G_to_A_infile);

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
  }
}
204
### Prepares all output files for one single-end input file and starts the methylation call.
###
### Arguments:
###   $sequence_file  - the original sequence file (may include a directory part)
###   $C_to_T_infile  - name of the C->T converted version of the input
###   $G_to_A_infile  - name of the G->A converted version of the input
###                     (both are only passed on to the methylation call routine)
###
### Opens the bareword filehandles OUT and REPORT, plus UNMAPPED / AMBIG when the respective
### option is set. They are not closed here; the report and methylation call routines write to them.
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### all output names are derived from the input file name without its directory part
  (my $filename = $sequence_file) =~ s{^.*/}{};

  ### printing all alignments to a results file
  my $outfile;
  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile = "${filename}_bt2_bismark.sam";
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile = "${filename}_bismark.txt";
  }
  else{ # SAM is the default output
    $outfile = "${filename}_bismark.sam";
  }
  print "Writing bisulfite mapping results to $output_dir$outfile\n\n";
  ### error messages report the full path that was actually attempted
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $output_dir$outfile: $!\n";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $bowtie2 ? "${filename}_bt2_Bismark_mapping_report.txt"
                            : "${filename}_Bismark_mapping_report.txt";

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $output_dir$reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  if ($unmapped){
    my $unmapped_file = "${filename}_unmapped_reads.txt";
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $output_dir$unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }
  if ($ambiguous){
    my $ambiguous_file = "${filename}_ambiguous_reads.txt";
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $output_dir$ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### a SAM header is written unless vanilla output or --sam-no-hd style suppression is active
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
285
### Prepares all output files for one pair of input files and starts the paired-end methylation call.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2    - the original mate 1 / mate 2 files (may include a directory part)
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2    - names of the converted versions of both mates
###                                           (only passed on to the methylation call routine)
###
### The results and report files are named after mate 1. Opens the bareword filehandles OUT and
### REPORT, plus UNMAPPED_1/2 and AMBIG_1/2 when the respective option is set; none of them is
### closed here. Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### all output names are derived from the input file names without their directory part
  (my $filename_1 = $sequence_file_1) =~ s{^.*/}{};
  (my $filename_2 = $sequence_file_2) =~ s{^.*/}{};

  ### printing all alignments to a results file
  my $outfile;
  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile = "${filename_1}_bismark_bt2_pe.sam";
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile = "${filename_1}_bismark_pe.txt";
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile = "${filename_1}_bismark_pe.sam";
  }

  ### console and error messages show the full path, matching the single-end procedure
  print "Writing bisulfite mapping results to $output_dir$outfile\n\n";
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $output_dir$outfile: $!\n";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $bowtie2 ? "${filename_1}_Bismark_bt2_paired-end_mapping_report.txt"
                            : "${filename_1}_Bismark_paired-end_mapping_report.txt";

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $output_dir$reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = "${filename_1}_unmapped_reads_1.txt";
    my $unmapped_2 = "${filename_2}_unmapped_reads_2.txt";
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $output_dir$unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $output_dir$unmapped_2: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_1 and $output_dir$unmapped_2\n";
  }

  ### Ambiguous read output
  if ($ambiguous){
    my $amb_1 = "${filename_1}_ambiguous_reads_1.txt";
    my $amb_2 = "${filename_2}_ambiguous_reads_2.txt";
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $output_dir$amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $output_dir$amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$amb_1 and $output_dir$amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
384
### Deletes the temporary converted input file(s) of a finished single-end run and prints the
### final alignment and cytosine methylation summary.
###
### Arguments: the names of the temporary C->T and G->A files (relative to $temp_dir). In
### directional mode only the C->T file is deleted.
###
### Output goes to the already opened REPORT filehandle and, in parallel, to STDOUT (print) or
### STDERR (warn). All figures are read from the file-level %counting hash.
### A run in which no sequence was analysed is reported with a mapping efficiency of 0.0%
### instead of aborting with a division by zero.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ### All sequences from the original sequence file have been analysed now
  ### deleting temporary C->T or G->A infiles
  if ($directional){
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $C_to_T_infile properly $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary files properly $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  my $alignment_header = "Final Alignment report\n" . ('=' x 22) . "\n";
  print REPORT $alignment_header;
  print $alignment_header;

  ### printing a final report for the methylation call procedure
  warn "Sequences analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";

  ### guarding against empty input (or --skip beyond the end of the file)
  my $percent_alignable_sequences = $counting{sequences_count}
    ? sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count})
    : sprintf ("%.1f",0);

  my $efficiency = "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
  warn "$efficiency\n"; # the console copy carries one extra blank line
  print REPORT $efficiency;

  ### the same alignment details go to STDOUT and to the report
  my $alignment_details = join ("",
    "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequences with unique best (first) alignment came from the bowtie output:\n",
    join ("\n",
      "CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)",
      "CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)",
      "GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)",
      "GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),
    "\n\n");
  print $alignment_details;
  print REPORT $alignment_details;

  if ($directional){
    my $rejected = "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print $rejected;
    print REPORT $rejected;
  }

  ### detailed information about Cs analysed
  my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};

  ### every value is separated from its label by a single tab, in the report as well as on the console
  my $methylation_summary = join ("",
    "Final Cytosine Methylation Report\n", ('=' x 33), "\n",
    "Total number of C's analysed:\t$total_number_of_C\n\n",
    "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n",
    "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n",
    "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n",
    "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n",
    "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n",
    "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n");
  warn $methylation_summary;
  print REPORT $methylation_summary;

  ### printing the methylation percentage per context; a percentage can only be given if at
  ### least one cytosine was seen in that context
  foreach my $context (qw(CpG CHG CHH)){
    my $methylated   = $counting{"total_me${context}_count"};
    my $unmethylated = $counting{"total_unmethylated_${context}_count"};
    my $line_end     = ($context eq 'CHH') ? "\n\n\n" : "\n"; # the last context closes the section

    my $message;
    if (($methylated+$unmethylated) > 0){
      my $percent = sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
      $message = "C methylated in $context context:\t${percent}%$line_end";
    }
    else{
      $message = "Can't determine percentage of methylated Cs in $context context if value was 0$line_end";
    }
    warn $message;
    print REPORT $message;
  }

  if ($seqID_contains_tabs){
    my $tab_warning = "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
    warn $tab_warning;
    print REPORT $tab_warning;
  }
}
522
### Deletes the temporary converted input files of a finished paired-end run and prints the
### final alignment and cytosine methylation summary.
###
### Arguments: the names of the temporary C->T and G->A files of mate 1 and mate 2 (relative to
### $temp_dir). In directional mode only the C->T file of mate 1 and the G->A file of mate 2
### are deleted.
###
### Output goes to the already opened REPORT filehandle and, in parallel, to STDOUT (print) or
### STDERR (warn). All figures are read from the file-level %counting hash.
### A run in which no sequence pair was analysed is reported with a mapping efficiency of 0.0%
### instead of aborting with a division by zero.
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
  if ($directional){
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
    }
    else{
      warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
    if ($deletion_successful == 4){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
    }
    else{
      warn "Could not delete temporary files properly: $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  my $alignment_header = "Final Alignment report\n" . ('=' x 22) . "\n";
  warn $alignment_header;
  print REPORT $alignment_header;

  ### printing a final report for the methylation call procedure
  warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";

  ### guarding against empty input (or --skip beyond the end of the files)
  my $percent_alignable_sequence_pairs = $counting{sequences_count}
    ? sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count})
    : sprintf ("%.1f",0);

  my $efficiency = "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n";
  print "$efficiency\n"; # the console copy carries one extra blank line
  print REPORT $efficiency;

  ### the same alignment details go to STDOUT and to the report
  my $alignment_details = join ("",
    "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n",
    join ("\n",
      "CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)",
      "GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)",
      "GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)",
      "CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),
    "\n\n");
  print $alignment_details;
  print REPORT $alignment_details;

  if ($directional){
    my $rejected = "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print $rejected;
    print REPORT $rejected;
  }

  ### detailed information about Cs analysed
  my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};

  my $methylation_summary = join ("",
    "Final Cytosine Methylation Report\n", ('=' x 33), "\n",
    "Total number of C's analysed:\t$total_number_of_C\n\n",
    "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n",
    "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n",
    "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n",
    "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n",
    "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n",
    "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n");
  warn $methylation_summary;
  print REPORT $methylation_summary;

  ### printing the methylation percentage per context; a percentage can only be given if at
  ### least one cytosine was seen in that context
  foreach my $context (qw(CpG CHG CHH)){
    my $methylated   = $counting{"total_me${context}_count"};
    my $unmethylated = $counting{"total_unmethylated_${context}_count"};
    my $line_end     = ($context eq 'CHH') ? "\n\n\n" : "\n"; # the last context closes the section

    my $message;
    if (($methylated+$unmethylated) > 0){
      my $percent = sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
      $message = "C methylated in $context context:\t${percent}%$line_end";
    }
    else{
      $message = "Can't determine percentage of methylated Cs in $context context if value was 0$line_end";
    }
    warn $message;
    print REPORT $message;
  }

}
647
### Reads a single-end FastA file record by record (one header line followed by one sequence line) and passes
### every read on to the Bowtie 1 or Bowtie 2 result checker, which compares it to the alignments found for the
### C->T and/or G->A converted versions of the read.
###
### Arguments: $sequence_file                 - FastA file with the original reads; names ending in .gz are read via zcat
###            $C_to_T_infile, $G_to_A_infile - only handed on to print_final_analysis_report_single_end()
###
### Uses the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, updates $counting{sequences_count}
### and writes to the AMBIG / UNMAPPED filehandles, which have to be open already if the respective option is set.
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### The list form of a piped open passes the file name to zcat directly (no shell is involved), so that file
  ### names containing spaces or shell metacharacters are handled correctly
  my $gzipped = ($sequence_file =~ /\.gz$/) ? 1 : 0;
  if ($gzipped){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence   = <IN>;
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' is not mistaken for the end of the file
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip: ignoring the first $skip records; --upto: stop once $upto records have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### the checkers work on the upper-cased sequence; no quality string is passed for FastA reads
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }
    $return = 0 unless ($return);

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }
    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the input (and reaping the zcat process, if any). A zcat pipe that was abandoned early because of
  ### --upto reports a non-zero exit status when it is closed, which is expected and not worth a warning
  unless (close IN){
    warn "Closing $sequence_file failed: ".($! || "exit status $?")."\n" unless ($gzipped and $upto);
  }

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
}
720
### Reads a single-end FastQ file record by record (4 lines per read: ID, sequence, second ID line, qualities) and
### passes every read on to the Bowtie 1 or Bowtie 2 result checker, which compares it to the alignments found for
### the C->T and/or G->A converted versions of the read.
###
### Arguments: $sequence_file                 - FastQ file with the original reads; names ending in .gz are read via zcat
###            $C_to_T_infile, $G_to_A_infile - only handed on to print_final_analysis_report_single_end()
###
### Uses the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, updates $counting{sequences_count}
### and writes to the AMBIG / UNMAPPED filehandles, which have to be open already if the respective option is set.
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### The list form of a piped open passes the file name to zcat directly (no shell is involved), so that file
  ### names containing spaces or shell metacharacters are handled correctly
  my $gzipped = ($sequence_file =~ /\.gz$/) ? 1 : 0;
  if ($gzipped){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier_2  = <IN>;
    my $quality_value = <IN>;
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' is not mistaken for the end of the file
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip: ignoring the first $skip records; --upto: stop once $upto records have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    ### the checkers work on the upper-cased sequence
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }
    $return = 0 unless ($return);

    # print the sequence to ambiguous.out if --ambiguous was specified
    # ($identifier_2 was not chomped and still carries its own line break)
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }
    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the input (and reaping the zcat process, if any). A zcat pipe that was abandoned early because of
  ### --upto reports a non-zero exit status when it is closed, which is expected and not worth a warning
  unless (close IN){
    warn "Closing $sequence_file failed: ".($! || "exit status $?")."\n" unless ($gzipped and $upto);
  }

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
}
800
### Processes the two FastA files of a paired-end experiment jointly, one read pair at a time, and passes every pair
### on to the Bowtie 1 or Bowtie 2 paired-end result checker. The sequence identifier per definition needs to be the
### same for both reads of a pair; only the ID of read 1 is handed to the checker.
###
### Arguments: $sequence_file_1, $sequence_file_2 - FastA files with reads 1 and 2; names ending in .gz are read via zcat
###            the four remaining arguments       - only handed on to print_final_analysis_report_paired_ends()
###
### Uses the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, updates $counting{sequences_count}
### and writes to the AMBIG_1/_2 and UNMAPPED_1/_2 filehandles, which have to be open already if the respective
### option is set.
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### Deciding for each file individually whether it is gzipped, so that a pair consisting of one compressed and one
  ### plain file is read correctly. The list form of a piped open passes the file name to zcat without involving a
  ### shell, so that file names containing spaces or shell metacharacters are handled correctly
  my $gzipped_1 = ($sequence_file_1 =~ /\.gz$/) ? 1 : 0;
  my $gzipped_2 = ($sequence_file_2 =~ /\.gz$/) ? 1 : 0;

  if ($gzipped_1){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }
  if ($gzipped_2){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1   = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2   = <IN2>;
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' is not mistaken for the end of the file
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### --skip: ignoring the first $skip read pairs; --upto: stop once $upto read pairs have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    ### keeping the complete header lines (including '>' and the line break) for the ambiguous/unmapped output
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### the checkers work on the upper-cased sequences; no quality strings are passed for FastA reads
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1);
    }
    $return = 0 unless ($return);

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }
    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the inputs (and reaping the zcat processes, if any). A zcat pipe that was abandoned early because of
  ### --upto reports a non-zero exit status when it is closed, which is expected and not worth a warning
  unless (close IN1){
    warn "Closing $sequence_file_1 failed: ".($! || "exit status $?")."\n" unless ($gzipped_1 and $upto);
  }
  unless (close IN2){
    warn "Closing $sequence_file_2 failed: ".($! || "exit status $?")."\n" unless ($gzipped_2 and $upto);
  }

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
}
892
### Processes the two FastQ files of a paired-end experiment jointly, one read pair at a time (4 lines per read), and
### passes every pair on to the Bowtie 1 or Bowtie 2 paired-end result checker. The sequence identifier per
### definition needs to be the same for both reads of a pair; only the ID of read 1 is handed to the checker.
###
### Arguments: $sequence_file_1, $sequence_file_2 - FastQ files with reads 1 and 2; names ending in .gz are read via zcat
###            the four remaining arguments       - only handed on to print_final_analysis_report_paired_ends()
###
### Uses the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, updates $counting{sequences_count}
### and writes to the AMBIG_1/_2 and UNMAPPED_1/_2 filehandles, which have to be open already if the respective
### option is set.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### Deciding for each file individually whether it is gzipped, so that a pair consisting of one compressed and one
  ### plain file is read correctly. The list form of a piped open passes the file name to zcat without involving a
  ### shell, so that file names containing spaces or shell metacharacters are handled correctly
  my $gzipped_1 = ($sequence_file_1 =~ /\.gz$/) ? 1 : 0;
  my $gzipped_2 = ($sequence_file_2 =~ /\.gz$/) ? 1 : 0;

  if ($gzipped_1){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }
  if ($gzipped_2){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1    = <IN1>;
    my $sequence_1      = <IN1>;
    my $ident_1         = <IN1>; # second ID line ('+' line); only needed for the ambiguous/unmapped output
    my $quality_value_1 = <IN1>;
    # reading from the second input file
    my $identifier_2    = <IN2>;
    my $sequence_2      = <IN2>;
    my $ident_2         = <IN2>; # second ID line ('+' line); only needed for the ambiguous/unmapped output
    my $quality_value_2 = <IN2>;
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' is not mistaken for the end of the file. The quality line is
    ### the last line of a record, so the second ID lines do not need to be tested separately
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1
                 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### --skip: ignoring the first $skip read pairs; --upto: stop once $upto read pairs have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    ### keeping the complete ID lines (including '@' and the line break) for the ambiguous/unmapped output
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    ### the checkers work on the upper-cased sequences
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }
    $return = 0 unless ($return);

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    # ($ident_1 and $ident_2 were not chomped and still carry their own line breaks)
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }
    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the inputs (and reaping the zcat processes, if any). A zcat pipe that was abandoned early because of
  ### --upto reports a non-zero exit status when it is closed, which is expected and not worth a warning
  unless (close IN1){
    warn "Closing $sequence_file_1 failed: ".($! || "exit status $?")."\n" unless ($gzipped_1 and $upto);
  }
  unless (close IN2){
    warn "Closing $sequence_file_2 failed: ".($! || "exit status $?")."\n" unless ($gzipped_2 and $upto);
  }

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
}
1002
### Decides for a single read whether Bowtie 1 produced a unique best alignment in any of the alignment processes
### stored in @fhs and, if so, extracts the genomic sequence, performs the methylation call and prints the result.
###
### Arguments: $sequence      - the (upper-cased) read sequence
###            $identifier    - the read ID as it appears in the first column of the Bowtie output
###            $quality_value - quality string of the read (optional; FastA reads get Phred 40, i.e. 'I', throughout)
###
### Returns:   2              - no unique best alignment and --ambiguous was specified (caller prints to the ambiguous file)
###            1              - no alignment at all, or no unique best alignment, and --un was specified (caller prints
###                             to the unmapped file)
###            0 / empty list - in all other cases (nothing to be printed by the caller)
###
### The Bowtie output is consumed on the fly: for every index, @fhs holds the most recently read line (last_line)
### and its read ID (last_seq_id); both are set to undef once the end of the respective output has been reached.
sub check_bowtie_results_single_end{
  my ($sequence,$identifier,$quality_value) = @_;

  ### FastA sequences get assigned a quality value of Phred 40 throughout. Testing for definedness/length rather
  ### than for truth, as the quality string '0' of a 1 bp read is a valid quality string
  unless (defined $quality_value and length $quality_value){
    $quality_value = 'I' x length($sequence);
  }

  ### %mismatches: number of mismatches => alignment location ("chromosome:position") => alignment details
  my %mismatches = ();

  ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
  foreach my $index (0..$#fhs){

    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
    ### skipping this index if its current alignment belongs to a different sequence
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    ###############################################################
    ### STEP I Now processing the alignment stored in last_line ###
    ###############################################################
    ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs,
    ### both in the wrong orientation. We only continue to extract useful information about this alignment if 1 was returned
    next unless (decide_whether_single_end_alignment_is_valid($index,$identifier) == 1);
    _store_bowtie1_single_end_alignment(\%mismatches,$index);

    ##############################################################################################################
    ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second
    ### alignment of the same sequence or a new sequence. In either case the next line is stored in @fhs
    ### ->{last_line}. In case the alignment already belongs to the next sequence, a 0 will be returned by the
    ### validity check and it will then be processed in the next round only.
    ##############################################################################################################
    next unless (_fetch_next_bowtie1_line($index)); # end of bowtie output for this index
    next unless (decide_whether_single_end_alignment_is_valid($index,$identifier) == 1);
    _store_bowtie1_single_end_alignment(\%mismatches,$index);

    ##############################################################################################################
    ### STEP III Now reading in one more line which has to be the next alignment to be analysed, i.e. it has to
    ### belong to a different sequence. Adding it to @fhs ->{last_line}
    ##############################################################################################################
    if (_fetch_next_bowtie1_line($index)){
      die "The same seq ID occurred more than twice in a row\n" if ($fhs[$index]->{last_seq_id} eq $identifier);
    }
  }

  ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
  unless (%mismatches){
    $counting{no_single_alignment_found}++;
    return 1 if ($unmapped); ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
    return;
  }

  ################################################################################################################
  ### We are now looking if there is a unique best alignment for a certain sequence. Only the alignments with the
  ### lowest number of mismatches are considered. If there is only one single best position we are going to store
  ### the alignment information in $methylation_call_params, if there are multiple hits with the same number of
  ### (lowest) mismatches we are discarding the sequence altogether (with the exception described below)
  ################################################################################################################
  ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
  my $sequence_fails = 0;
  ### Hash reference which will store all information we need for the methylation call
  my $methylation_call_params;

  my ($lowest_mismatch_number) = sort {$a<=>$b} keys %mismatches;
  my $best_alignments = $mismatches{$lowest_mismatch_number};
  my @best_locations  = keys %{$best_alignments};

  ### if there is only 1 entry with the lowest number of mismatches we accept it as the best alignment
  if (scalar(@best_locations) == 1){
    my $unique_best_alignment = $best_alignments->{$best_locations[0]};
    $methylation_call_params->{$identifier}->{bowtie_sequence}      = $unique_best_alignment->{bowtie_sequence};
    $methylation_call_params->{$identifier}->{chromosome}           = $unique_best_alignment->{chromosome};
    $methylation_call_params->{$identifier}->{position}             = $unique_best_alignment->{position};
    $methylation_call_params->{$identifier}->{index}                = $unique_best_alignment->{index};
    $methylation_call_params->{$identifier}->{number_of_mismatches} = $lowest_mismatch_number;
  }
  elsif (scalar(@best_locations) == 3){
    ### If there are 3 alignments with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3
    ### alignments are unique best hits and come from different alignment processes (== indices) or (ii) one
    ### alignment process (== index) gives a unique best alignment, whereas a second one produces 2 (or potentially
    ### many) alignments for the same sequence but in a different conversion state or against a different genome
    ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have
    ### been converted to Ts in the bisulfite reaction. E.g.
    ###   CAGTCACGCGCGCGCG will become
    ###   TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in
    ### the CT->CT alignment condition. If the same read is G->A transformed as well, however, the resulting
    ### sequence depends on the methylation state of the original sequence:
    ###   highly methylated: CAATCACACACACACA
    ###   highly converted : TAATTATATATATATA <== reduced complexity (only 2 bases left and not 3), which is more
    ### likely to align to a low complexity genomic region than the one above. This would normally lead to the
    ### entire sequence being kicked out, as there would be 3 alignments with the same number of lowest mismatches.
    ### In turn, highly methylated (and thereby not converted) sequences would be more likely to pass the alignment
    ### step, creating a bias for methylated reads compared to their non-methylated counterparts.
    ### We do not want any bias whatsoever. Therefore we look at the number of transliterations which were needed
    ### to transform the actual sequence into the version that produced each alignment. In the above example:
    ###   C->T version: TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0 (assuming this gives the correct alignment)
    ###   G->A version: TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6 (assuming this gives multiple wrong alignments)
    ### As a threshold which does scale, the alignment with the lowest number of transliterations is accepted if
    ### this number x 2 is still smaller than the number of transliterations of the second best candidate.
    ### Everything else will be flagged with $sequence_fails = 1 and discarded.
    my @three_candidate_seqs;
    foreach my $composite_location (@best_locations){
      my $candidate = $best_alignments->{$composite_location};
      my $transliterations_performed;
      ### indices 0 and 1 are treated as alignments of the C->T converted read, indices 2 and 3 as G->A converted
      if ($candidate->{index} == 0 or $candidate->{index} == 1){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
      }
      elsif ($candidate->{index} == 2 or $candidate->{index} == 3){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
      }
      else{
        die "unexpected index number range: $candidate->{index}\n";
      }
      push @three_candidate_seqs,{
                                  index                      => $candidate->{index},
                                  bowtie_sequence            => $candidate->{bowtie_sequence},
                                  mismatch_number            => $lowest_mismatch_number,
                                  chromosome                 => $candidate->{chromosome},
                                  position                   => $candidate->{position},
                                  seq_id                     => $candidate->{seq_id},
                                  transliterations_performed => $transliterations_performed,
                                 };
    }
    ### sorting in ascending order for the lowest number of transliterations performed
    @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
    my $first_array_element  = $three_candidate_seqs[0]->{transliterations_performed};
    my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};

    if (($first_array_element*2) < $second_array_element){
      $counting{low_complexity_alignments_overruled_count}++;
      ### taking the index with the unique best hit and overruling low complexity alignments with 2 hits
      $methylation_call_params->{$identifier}->{bowtie_sequence}      = $three_candidate_seqs[0]->{bowtie_sequence};
      $methylation_call_params->{$identifier}->{chromosome}           = $three_candidate_seqs[0]->{chromosome};
      $methylation_call_params->{$identifier}->{position}             = $three_candidate_seqs[0]->{position};
      $methylation_call_params->{$identifier}->{index}                = $three_candidate_seqs[0]->{index};
      $methylation_call_params->{$identifier}->{number_of_mismatches} = $lowest_mismatch_number;
    }
    else{
      $sequence_fails = 1;
    }
  }
  else{
    $sequence_fails = 1;
  }

  ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
  if ($sequence_fails == 1){
    $counting{unsuitable_sequence_count}++;
    if ($ambiguous){
      return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
    }
    if ($unmapped){
      return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
    }
    else{
      return 0; # => exits to next sequence (default)
    }
  }

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top
  ### strand or the original bottom strand. We will therefore discard all alignments to strands complementary to the
  ### original strands (indices 2 and 3), as they should not exist in reality due to the library preparation protocol
  if ($directional){
    if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it will have a unique best alignment
  $counting{unique_best_alignment_count}++;
  extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);

  ### check to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});

  print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
  return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
}

### Private helper of check_bowtie_results_single_end: parses the Bowtie 1 alignment currently stored in
### $fhs[$index]->{last_line} and files it in the hash referenced by $mismatches under
### {number of mismatches}->{"chromosome:position"}, unless an alignment with the same number of mismatches has been
### stored for this location already. Dies if the chromosome name or the mismatch field cannot be interpreted.
sub _store_bowtie1_single_end_alignment{
  my ($mismatches,$index) = @_;

  ### the fields used are: read ID, chromosome name, position, aligned sequence and the mismatch descriptors
  my ($id,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,2,3,4,7];

  unless ($mismatch_info){
    $mismatch_info = '';
  }
  chomp $mismatch_info;

  ### the chromosome name is the name of the converted reference without its _CT_converted/_GA_converted suffix
  my $chromosome = $mapped_chromosome;
  unless ($chromosome =~ s/_(CT|GA)_converted$//){
    die "Chromosome number extraction failed for $mapped_chromosome\n";
  }

  ### Now extracting the number of mismatches to the converted genome (comma separated mismatch descriptors)
  my $number_of_mismatches;
  if ($mismatch_info eq ''){
    $number_of_mismatches = 0;
  }
  elsif ($mismatch_info =~ /^\d/){
    my @mismatch_descriptors = split (/,/,$mismatch_info);
    $number_of_mismatches = scalar @mismatch_descriptors;
  }
  else{
    die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
  }

  ### creating a composite location variable from $chromosome and $position
  my $alignment_location = join (":",$chromosome,$position);

  ### If a sequence aligns to exactly the same location twice (with the same number of mismatches) the sequence does
  ### either not contain any C or G, or all the Cs (or Gs on the reverse strand) were methylated and therefore
  ### protected; this also covers two differently converted sequences aligning to the same position of differently
  ### converted genomes. It is not needed to overwrite the entry with a second one for the same location (the only
  ### thing which would change is the index number of the alignment)
  unless (exists $mismatches->{$number_of_mismatches}->{$alignment_location}){
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{seq_id}          = $id;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{index}           = $index;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{chromosome}      = $chromosome;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{position}        = $position;
  }
}

### Private helper of check_bowtie_results_single_end: reads the next line from the Bowtie filehandle of $index and
### stores it, together with its read ID (first tab separated field), in $fhs[$index]. Returns 1 if a line could be
### read; at the end of the Bowtie output last_seq_id and last_line are set to undef and 0 is returned.
sub _fetch_next_bowtie1_line{
  my ($index) = @_;

  my $newline = $fhs[$index]->{fh}->getline();
  if ($newline){
    my ($seq_id) = split (/\t/,$newline);
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $newline;
    return 1;
  }

  # assigning undef to last_seq_id and last_line (end of bowtie output)
  $fhs[$index]->{last_seq_id} = undef;
  $fhs[$index]->{last_line}   = undef;
  return 0;
}
1302
### Collects the Bowtie 2 alignments of one single-end read from all alignment instances in @fhs, decides whether the
### read has a unique best alignment and, if so, performs the methylation call and prints the mapping result.
###
### Arguments:
###   $sequence       the read sequence as it appears in the sequence file
###   $identifier     the read ID; it has to match the read ID in the first column of the Bowtie 2 output exactly. (As of
###                   Bowtie 2 2.0.0 beta7, Bowtie 2 no longer removes a trailing /1 or /2 from the names of unpaired
###                   reads, so the ID is not modified here)
###   $quality_value  the quality string of the read; FastA reads (undefined or empty quality) get assigned a
###                   quality value of Phred 40 ('I') throughout
###
### Return value (tells the caller where the read should be printed in addition):
###   0  nothing else to do (default; also returned after a successful methylation call)
###   1  print the read to the unmapped reads file ('--unmapped' was specified)
###   2  print the read to the ambiguous reads file ('--ambiguous' was specified)
###
### Side effects: advances the Bowtie 2 filehandles in @fhs past all alignments of $identifier, updates the %counting
### statistics and dies on inconsistent Bowtie 2 output.
###
### SAM format specifications for Bowtie 2
### (1) Name of read that aligned
### (2) Sum of all applicable flags. Flags relevant to Bowtie are:
###       1 The read is one of a pair
###       2 The alignment is one end of a proper paired-end alignment
###       4 The read has no reported alignments
###       8 The read is one of a pair and has no reported alignments
###      16 The alignment is to the reverse reference strand
###      32 The other mate in the paired-end alignment is aligned to the reverse reference strand
###      64 The read is mate 1 in a pair
###     128 The read is mate 2 in a pair
###     256 The read has multiple mapping states
### (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
### (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
### (5) Mapping quality (255 means MAPQ is not available)
### (6) CIGAR string representation of alignment (* if unavailable)
### (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
### (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
### (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
### (10) Read sequence (reverse-complemented if aligned to the reverse strand)
### (11) ASCII-encoded read qualities (reversed if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
### (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
###      AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
###      XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
###      YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
###      XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
###      XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
###      XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
###      XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
###      NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
###      YF:Z:<N> String indicating reason why the read was filtered out. Only appears for reads that were filtered out.
###      MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
sub check_bowtie_results_single_end_bowtie2{
    my ($sequence,$identifier,$quality_value) = @_;

    ### FastA sequences get assigned a quality value of Phred 40 throughout. Testing for a defined, non-empty string
    ### (instead of plain truth) keeps a legitimate one-character quality string of '0' intact
    unless (defined $quality_value and length $quality_value){
        $quality_value = 'I' x length($sequence);
    }

    my $alignment_ambiguous = 0; # set to 1 as soon as one instance reports two equally good alignments
    my %alignments = ();         # candidate alignments of this read, keyed by 'chromosome:position'

    ### reading from the Bowtie 2 output filehandles
    foreach my $index (0..$#fhs){

        ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
        next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});

        ### nothing to do for this instance unless its current line belongs to the read we are looking at
        next unless ($fhs[$index]->{last_seq_id} eq $identifier);

        my @fields = split (/\t/,$fhs[$index]->{last_line});
        my ($id,$flag,$mapped_chromosome,$position,$cigar,$bowtie_sequence) = @fields[0,1,2,3,5,9];

        ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4.
        ### We can store the next alignment (which must belong to the next sequence) and move on to the next Bowtie 2 instance
        if ($flag == 4){
            if (_store_next_bowtie2_line_single_end($index) and $fhs[$index]->{last_seq_id} eq $identifier){
                die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
            }
            next; # next instance
        }

        ### if there are one or more proper alignments we can extract the chromosome name by removing the suffix
        ### of the bisulfite converted reference
        unless ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
            die "Chromosome number extraction failed for $mapped_chromosome\n";
        }
        my $chromosome = $mapped_chromosome;

        ### We will use the optional fields to determine the best alignment. The tags are anchored to the start of
        ### the field so that they cannot be picked up from within the value of some other tag
        my ($alignment_score,$second_best,$MD_tag);
        foreach my $optional_field (@fields[11..$#fields]){
            if ($optional_field =~ /^AS:i:(.*)/){
                $alignment_score = $1;
            }
            elsif ($optional_field =~ /^XS:i:(.*)/){
                $second_best = $1;
            }
            elsif ($optional_field =~ /^MD:Z:(.*)/){
                $MD_tag = $1;
            }
        }

        unless (defined $alignment_score and defined $MD_tag){
            ### at least one of the two values is undefined here, so we must not interpolate them directly
            my $reported_score  = defined $alignment_score ? $alignment_score : 'undefined';
            my $reported_MD_tag = defined $MD_tag ? $MD_tag : 'undefined';
            die "Failed to extract alignment score ($reported_score) and MD tag ($reported_MD_tag)!\n";
        }

        ### If the first alignment score is the same as the alignment score of the second best hit we are going to
        ### boot this sequence altogether. We still need to read and discard all additional alignments of this read
        ### until we reach the next sequence, and we need to keep going through the remaining instances as well
        if (defined $second_best and $alignment_score == $second_best){
            $alignment_ambiguous = 1;
            _skip_remaining_bowtie2_lines_single_end($index,$identifier);
            next;
        }

        ### There is either no second best hit, or the second best hit has a lower alignment score than the current
        ### alignment, so we can safely store the current alignment.
        ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
        ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
        ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
        ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
        my $alignment_location = join (":",$chromosome,$position);
        unless (exists $alignments{$alignment_location}){
            $alignments{$alignment_location} = {
                seq_id          => $id,
                alignment_score => $alignment_score,
                bowtie_sequence => $bowtie_sequence,
                index           => $index,
                chromosome      => $chromosome,
                position        => $position,
                CIGAR           => $cigar,
                MD_tag          => $MD_tag,
            };
        }

        if (defined $second_best){
            ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
            _skip_remaining_bowtie2_lines_single_end($index,$identifier);
        }
        else{
            ### without a second best hit the next line has to belong to the next sequence
            if (_store_next_bowtie2_line_single_end($index) and $fhs[$index]->{last_seq_id} eq $identifier){
                die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
            }
        }
    }

    ### If one of the instances reported several equally good alignments for the read we can return already now. The
    ### caller prints the read sequence out if --ambiguous or --unmapped was specified
    if ($alignment_ambiguous == 1){
        $counting{unsuitable_sequence_count}++;
        if ($ambiguous){
            return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
        }
        elsif ($unmapped){
            return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
        }
        else{
            return 0;
        }
    }

    ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
    unless (%alignments){
        $counting{no_single_alignment_found}++;
        if ($unmapped){
            return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
        }
        else{
            return 0; # default
        }
    }

    #######################################################################################################################################################

    ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
    ### single best position we are going to store the alignment information in $methylation_call_params. If there are multiple hits with the same (highest)
    ### alignment score we are discarding the sequence altogether.
    ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
    ### opening (5) and extending (3 per bp) the gap.

    #######################################################################################################################################################

    my $number_of_hits = scalar keys %alignments;
    unless ($number_of_hits >= 1 and $number_of_hits <= 4){
        die "There are too many potential hits for this sequence (1-4 expected, but found: $number_of_hits)\n";
    }

    ### sorting the alignment locations by descending alignment score; only the best two are of interest. A single
    ### entry is accepted as the best alignment straight away
    my ($best_location,$runner_up_location) = sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments;

    ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
    if (defined $runner_up_location and $alignments{$runner_up_location}->{alignment_score} == $alignments{$best_location}->{alignment_score}){
        $counting{unsuitable_sequence_count}++;
        if ($ambiguous){
            return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
        }
        elsif ($unmapped){
            return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
        }
        else{
            return 0; # => exits to next sequence (default)
        }
    }

    ### hash reference which stores all information we need for the methylation call of the unique best alignment
    my $methylation_call_params;
    foreach my $key (qw(bowtie_sequence chromosome position index alignment_score MD_tag CIGAR)){
        $methylation_call_params->{$identifier}->{$key} = $alignments{$best_location}->{$key};
    }

    ### --DIRECTIONAL
    ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
    ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
    if ($directional){
        if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
            $counting{alignments_rejected_count}++;
            return 0;
        }
    }

    ### If the sequence has not been rejected so far it has a unique best alignment
    $counting{unique_best_alignment_count}++;

    ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
    extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);

    ### check to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
    if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
        warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
        $counting{genomic_sequence_could_not_be_extracted_count}++;
        return 0;
    }

    ### otherwise we are set to perform the actual methylation call
    $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
    print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
    return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
}

### Reads the next line of Bowtie 2 output of the alignment instance $index and stores it (chomped) as last_line,
### together with its read ID (the first tab-separated field) as last_seq_id, in @fhs. Once the Bowtie 2 output is
### exhausted both entries are set to undef. Returns 1 if a line was read and 0 at the end of the output.
sub _store_next_bowtie2_line_single_end{
    my ($index) = @_;

    my $newline = $fhs[$index]->{fh}->getline();
    if ($newline){
        chomp $newline;
        my ($seq_id) = split (/\t/,$newline);
        $fhs[$index]->{last_seq_id} = $seq_id;
        $fhs[$index]->{last_line} = $newline;
        return 1;
    }

    # assigning undef to last_seq_id and last_line (end of Bowtie 2 output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0;
}

### Reads and discards the Bowtie 2 output of the alignment instance $index until the stored line belongs to a read
### other than $identifier, or until the end of the alignment output has been reached.
sub _skip_remaining_bowtie2_lines_single_end{
    my ($index,$identifier) = @_;

    until ($fhs[$index]->{last_seq_id} ne $identifier){
        last unless _store_next_bowtie2_line_single_end($index); # break free in case we have reached the end of the alignment output
    }
    return;
}
1678
1679
### Determines how many bases of a read are affected by its in-silico bisulfite conversion.
###
### Arguments:
###   $sequence         the read sequence (the caller's copy is never modified)
###   $read_conversion  'CT' to count the Cs (C->T conversion) or 'GA' to count the Gs (G->A conversion)
###
### Returns the number of Cs or Gs found in $sequence, respectively (0 if there are none).
### Dies if $read_conversion is undefined or is neither 'CT' nor 'GA'.
sub determine_number_of_transliterations_performed{
    my ($sequence,$read_conversion) = @_;

    ### an undefined conversion mode is handled like any other unknown mode, without 'uninitialized' warnings
    my $conversion = defined $read_conversion ? $read_conversion : '';

    ### tr/// with an empty replacement list leaves the string untouched and returns the number of matching
    ### characters, which is exactly the number of transliterations the conversion would perform
    if ($conversion eq 'CT'){
        return ($sequence =~ tr/C//);
    }
    if ($conversion eq 'GA'){
        return ($sequence =~ tr/G//);
    }

    ### No system call has failed at this point, so $! must not be part of the message (it would merely append
    ### a stale and misleading OS error text)
    die "Read conversion mode of the read was not specified\n";
}
1694
### Decides whether the Bowtie 1 alignment instance $index holds an alignment of the read $identifier in a sensible
### orientation. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
### sensible alignments; the orientation check itself is done by ensure_sensical_alignment_orientation_single_end,
### which is expected to return 1 (correct orientation) or 0 (wrong orientation).
###
### Arguments:
###   $index       index of the alignment instance in @fhs
###   $identifier  ID of the read which is currently being processed
###
### Returns 1 if $fhs[$index]->{last_line} holds an alignment of $identifier in the correct orientation when the
### function returns, and 0 otherwise. In the latter case last_seq_id/last_line hold the first line that was read
### which belongs to a different read, or undef if the end of the bowtie output has been reached.
### Dies if the orientation check returns anything but 0 or 1, or if the same read ID appears 3 times in a row.
sub decide_whether_single_end_alignment_is_valid{
    my ($index,$identifier) = @_;

    ### stores a read ID and its bowtie output line in @fhs; (undef,undef) marks the end of the bowtie output
    my $store_alignment = sub {
        my ($seq_id,$line) = @_;
        $fhs[$index]->{last_seq_id} = $seq_id;
        $fhs[$index]->{last_line} = $line;
    };

    # extracting from Bowtie 1 format
    my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

    ### ensuring that the entry is the correct sequence. Otherwise the sequence stored in @fhs as last_line is
    ### already the next sequence to be analysed -> analyse next round
    return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

    ### checking the orientation of the alignment. If the orientation was correct we can move on
    my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
    return 1 if ($orientation == 1); ### 1st possibility for a sequence to pass
    die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

    ### The alignment was in the wrong orientation, so we need to read in a new line
    my $newline = $fhs[$index]->{fh}->getline();
    unless ($newline){
        $store_alignment->(undef,undef); # end of bowtie output
        return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
    }
    ($id,$strand) = (split (/\t/,$newline))[0,1];

    ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs
    unless ($id eq $identifier){
        $store_alignment->($id,$newline);
        return 0; # processing the new alignment result only in the next round
    }

    ### the next entry is still the correct sequence, so we are checking the orientation again
    $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
    if ($orientation == 1){
        $store_alignment->($id,$newline);
        return 1; ### 2nd possibility for a sequence to pass
    }
    die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

    ### The alignment was in the wrong orientation again, so we need to read in yet another new line and store it in @fhs
    $newline = $fhs[$index]->{fh}->getline();
    unless ($newline){
        $store_alignment->(undef,undef); # end of bowtie output
        return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
    }

    ### The next line must not have the same seq ID again. No system call has failed here, so $! is deliberately
    ### not part of the error message
    my ($seq_id) = split (/\t/,$newline);
    die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seq_id eq $identifier);

    ### overwriting the current seq-ID and bowtie output with the same fields of the just read next entry
    $store_alignment->($seq_id,$newline);
    return 0; # not processing anything this round as the alignments of this read were in the wrong orientation
}
1772 #########################
1773 ### BOWTIE 1 | PAIRED-END
1774 #########################
1775
1776 sub check_bowtie_results_paired_ends{
1777 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
1778
1779 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
1780 unless ($quality_value_1){
1781 $quality_value_1 = 'I'x(length$sequence_1);
1782 }
1783 unless ($quality_value_2){
1784 $quality_value_2 = 'I'x(length$sequence_2);
1785 }
1786
1787 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
1788
1789 my %mismatches = ();
1790 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
1791
1792
1793 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
1794 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
1795 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
1796 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
1797 ### strands are not being reported by specifying --directional
1798
1799 foreach my $index (0,3,1,2){
1800 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
1801 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
1802 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
1803 if ($fhs[$index]->{last_seq_id} eq $identifier) {
1804 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
1805
1806 ##################################################################################
1807 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
1808 ##################################################################################
1809 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
1810 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong
1811 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
1812 if ($valid_alignment_found == 1){
1813 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
1814 ### we store the useful information in %mismatches
1815 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
1816 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
1817 chomp $mismatch_info_1;
1818 chomp $mismatch_info_2;
1819
1820 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
1821 my ($chromosome_1,$chromosome_2);
1822 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
1823 $chromosome_1 = $mapped_chromosome_1;
1824 }
1825 else{
1826 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
1827 }
1828 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
1829 $chromosome_2 = $mapped_chromosome_2;
1830 }
1831 else{
1832 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
1833 }
1834
1835 ### Now extracting the number of mismatches to the converted genome
1836 my $number_of_mismatches_1;
1837 my $number_of_mismatches_2;
1838 if ($mismatch_info_1 eq ''){
1839 $number_of_mismatches_1 = 0;
1840 }
1841 elsif ($mismatch_info_1 =~ /^\d/){
1842 my @mismatches = split (/,/,$mismatch_info_1);
1843 $number_of_mismatches_1 = scalar @mismatches;
1844 }
1845 else{
1846 die "Something weird is going on with the mismatch field\n";
1847 }
1848 if ($mismatch_info_2 eq ''){
1849 $number_of_mismatches_2 = 0;
1850 }
1851 elsif ($mismatch_info_2 =~ /^\d/){
1852 my @mismatches = split (/,/,$mismatch_info_2);
1853 $number_of_mismatches_2 = scalar @mismatches;
1854 }
1855 else{
1856 die "Something weird is going on with the mismatch field\n";
1857 }
1858 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
1859 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
1860 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1861 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
1862 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
1863 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
1864 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
1865 ### strand) were methylated and therefore protected. There is no need to overwrite the same positional entry with a second entry for the same
1866 ### location (the genomic sequence extraction and methylation call would not be affected by this; the only thing which would change is the index
1867 ### number for the found alignment)
1868 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
1869 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
1870 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
1871 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
1872 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
1873 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
1874 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
1875 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
1876 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
1877 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
1878 }
1879 ###################################################################################################################################################
1880 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
1881 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line_2. ###
1882 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
1883 ### this round ###
1884 ###################################################################################################################################################
1885 my $newline_1 = $fhs[$index]->{fh}-> getline();
1886 my $newline_2 = $fhs[$index]->{fh}-> getline();
1887
1888 if ($newline_1 and $newline_2){
1889 my ($seq_id_1) = split (/\t/,$newline_1);
1890 my ($seq_id_2) = split (/\t/,$newline_2);
1891
1892 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
1893 $fhs[$index]->{last_seq_id} = $seq_id_1;
1894 }
1895 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
1896 $fhs[$index]->{last_seq_id} = $seq_id_2;
1897 }
1898 else{
1899 die "Either read 1 or read 2 needs to end on '/1'\n";
1900 }
1901
1902 $fhs[$index]->{last_line_1} = $newline_1;
1903 $fhs[$index]->{last_line_2} = $newline_2;
1904 }
1905 else {
1906 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
1907 $fhs[$index]->{last_seq_id} = undef;
1908 $fhs[$index]->{last_line_1} = undef;
1909 $fhs[$index]->{last_line_2} = undef;
1910 next; # jumping to the next index
1911 }
1912 ### Now processing the entry we just stored in last_line_1 and last_line_2
1913 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
1914 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
1915 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
1916 if ($valid_alignment_found == 1){
1917 ### we store the useful information in %mismatches
1918 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
1919 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
1920 chomp $mismatch_info_1;
1921 chomp $mismatch_info_2;
1922 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
1923 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
1924 $chromosome_1 = $mapped_chromosome_1;
1925 }
1926 else{
1927 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
1928 }
1929 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
1930 $chromosome_2 = $mapped_chromosome_2;
1931 }
1932 else{
1933 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
1934 }
1935
1936 $number_of_mismatches_1='';
1937 $number_of_mismatches_2='';
1938 ### Now extracting the number of mismatches to the converted genome
1939 if ($mismatch_info_1 eq ''){
1940 $number_of_mismatches_1 = 0;
1941 }
1942 elsif ($mismatch_info_1 =~ /^\d/){
1943 my @mismatches = split (/,/,$mismatch_info_1);
1944 $number_of_mismatches_1 = scalar @mismatches;
1945 }
1946 else{
1947 die "Something weird is going on with the mismatch field\n";
1948 }
1949 if ($mismatch_info_2 eq ''){
1950 $number_of_mismatches_2 = 0;
1951 }
1952 elsif ($mismatch_info_2 =~ /^\d/){
1953 my @mismatches = split (/,/,$mismatch_info_2);
1954 $number_of_mismatches_2 = scalar @mismatches;
1955 }
1956 else{
1957 die "Something weird is going on with the mismatch field\n";
1958 }
1959 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
1960 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
1961 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1962 die "position 1 is greater than position 2" if ($position_1 > $position_2);
1963 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
1964 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
1965 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
1966 ### strand) were methylated and therefore protected. There is no need to overwrite the same positional entry with a second entry for the same
1967 ### location (the genomic sequence extraction and methylation call would not be affected by this; the only thing which would change is the index
1968 ### number for the found alignment)
1969 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
1970 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
1971 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
1972 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
1973 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
1974 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
1975 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
1976 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
1977 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
1978 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
1979 }
1980 ###############################################################################################################################################
1981 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2 ###
1982 ###############################################################################################################################################
1983 $newline_1 = $fhs[$index]->{fh}-> getline();
1984 $newline_2 = $fhs[$index]->{fh}-> getline();
1985
1986 if ($newline_1 and $newline_2){
1987 my ($seq_id_1) = split (/\t/,$newline_1);
1988 my ($seq_id_2) = split (/\t/,$newline_2);
1989
1990 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
1991 $fhs[$index]->{last_seq_id} = $seq_id_1;
1992 }
1993 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
1994 $fhs[$index]->{last_seq_id} = $seq_id_2;
1995 }
1996 $fhs[$index]->{last_line_1} = $newline_1;
1997 $fhs[$index]->{last_line_2} = $newline_2;
1998 }
1999 else {
2000 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
2001 $fhs[$index]->{last_seq_id} = undef;
2002 $fhs[$index]->{last_line_1} = undef;
2003 $fhs[$index]->{last_line_2} = undef;
2004 next; # jumping to the next index
2005 }
2006 ### within the 2nd sequence pair alignment in correct orientation found
2007 }
2008 ### within the 1st sequence pair alignment in correct orientation found
2009 }
2010 ### still within the (last_seq_id eq identifier) condition
2011 }
2012 ### still within foreach index loop
2013 }
2014 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
2015 unless(%mismatches){
2016 $counting{no_single_alignment_found}++;
2017 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
2018 }
2019 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2020 my $sequence_pair_fails = 0;
2021 ### Declaring an empty hash reference which will store all information we need for the methylation call
2022 my $methylation_call_params; # hash reference!
2023 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
2024 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
2025 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
2026 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
2027 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
2028 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
2029 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
2030 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
2031 }
2032 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
2033 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
2034 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
2035 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
2036 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
2037 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
2038 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
2039 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
2040 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
2041 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
2042 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
2043 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
2044 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
2045 }
2046 }
2047 else{
2048 $sequence_pair_fails = 1;
2049 }
2050 ### after processing the alignment with the lowest number of mismatches we exit
2051 last;
2052 }
2053 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
2054 if ($sequence_pair_fails == 1){
2055 $counting{unsuitable_sequence_count}++;
2056 if ($ambiguous){
2057 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
2058 }
2059 if ($unmapped){
2060 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
2061 }
2062 else{
2063 return 0; # => exits to next sequence (default)
2064 }
2065 }
2066
2067 ### --DIRECTIONAL
2068 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2069 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2070 if ($directional){
2071 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
2072 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2073 $counting{alignments_rejected_count}++;
2074 return 0;
2075 }
2076 }
2077
2078 ### If the sequence has not been rejected so far it does have a unique best alignment
2079 $counting{unique_best_alignment_count}++;
2080 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
2081
2082 ### check whether the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call
2083 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2084 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
2085 $counting{genomic_sequence_could_not_be_extracted_count}++;
2086 return 0;
2087 }
2088 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2089 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
2090 $counting{genomic_sequence_could_not_be_extracted_count}++;
2091 return 0;
2092 }
2093
2094 ### otherwise we are set to perform the actual methylation call
2095 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
2096 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
2097
2098 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
2099 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
2100 }
2101
2102 #########################
2103 ### BOWTIE 2 | PAIRED-END
2104 #########################
2105
2106 sub check_bowtie_results_paired_ends_bowtie2{
2107 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
2108
2109 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
2110 unless ($quality_value_1){
2111 $quality_value_1 = 'I'x(length$sequence_1);
2112 }
2113
2114 unless ($quality_value_2){
2115 $quality_value_2 = 'I'x(length$sequence_2);
2116 }
2117
2118
2119 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
2120
2121
2122 my %alignments;
2123 my $alignment_ambiguous = 0;
2124
2125 ### reading from the Bowtie 2 output filehandles
2126
2127 ### For paired-end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single-end case.
2128 ### Alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
2129 ### This is needed so that reads which contain either no C or G at all, or only protected Cs, are reported to the original strands (OT and OB)
2130 ### before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
2131 ### strands are not being reported when '--directional' is specified
2132
2133 foreach my $index (0,3,1,2){
2134 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2135 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
2136
2137 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
2138 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2139
2140 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
2141 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
2142 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
2143 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
2144 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
2145 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
2146 $id_1 =~ s/\/1$//;
2147 $id_2 =~ s/\/2$//;
2148
2149 # SAM format specifications for Bowtie 2
2150 # (1) Name of read that aligned
2151 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
2152 # 1 The read is one of a pair
2153 # 2 The alignment is one end of a proper paired-end alignment
2154 # 4 The read has no reported alignments
2155 # 8 The read is one of a pair and has no reported alignments
2156 # 16 The alignment is to the reverse reference strand
2157 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
2158 # 64 The read is mate 1 in a pair
2159 # 128 The read is mate 2 in a pair
2160 # 256 The read has multiple mapping states
2161 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
2162 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
2163 # (5) Mapping quality (255 means MAPQ is not available)
2164 # (6) CIGAR string representation of alignment (* if unavailable)
2165 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
2166 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
2167 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
2168 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
2169 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
2170 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
2171 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
2172 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
2173 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
2174 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
2175 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
2176 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2177 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2178 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
2179 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
2180 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
2181
2182 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
2183 ### We can store the next alignment and move on to the next Bowtie 2 instance
2184 if ($flag_1 == 77 and $flag_2 == 141){
2185 ## reading in the next alignment, which must be the next sequence
2186 my $newline_1 = $fhs[$index]->{fh}-> getline();
2187 my $newline_2 = $fhs[$index]->{fh}-> getline();
2188
2189 if ($newline_1 and $newline_2){
2190 chomp $newline_1;
2191 chomp $newline_2;
2192 my ($seq_id_1) = split (/\t/,$newline_1);
2193 my ($seq_id_2) = split (/\t/,$newline_2);
2194 $seq_id_1 =~ s/\/1$//;
2195 $seq_id_2 =~ s/\/2$//;
2196 $fhs[$index]->{last_seq_id} = $seq_id_1;
2197 $fhs[$index]->{last_line_1} = $newline_1;
2198 $fhs[$index]->{last_line_2} = $newline_2;
2199
2200 # print "current sequence ($identifier) did not map, reading in next sequence\n";
2201 # print "$index\t$fhs[$index]->{last_seq_id}\n";
2202 # print "$index\t$fhs[$index]->{last_line_1}\n";
2203 # print "$index\t$fhs[$index]->{last_line_2}\n";
2204 next; # next instance
2205 }
2206 else{
2207 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2208 $fhs[$index]->{last_seq_id} = undef;
2209 $fhs[$index]->{last_line_1} = undef;
2210 $fhs[$index]->{last_line_2} = undef;
2211 next;
2212 }
2213 }
2214
2215 ### If there are one or more proper alignments we can extract the chromosome number
2216 my ($chromosome_1,$chromosome_2);
2217 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
2218 $chromosome_1 = $mapped_chromosome_1;
2219 }
2220 else{
2221 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
2222 }
2223 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
2224 $chromosome_2 = $mapped_chromosome_2;
2225 }
2226 else{
2227 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
2228 }
2229
2230 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
2231
2232 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
2233 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
2234
2235 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
2236 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
2237
2238 foreach (11..$#fields_1){
2239 if ($fields_1[$_] =~ /AS:i:(.*)/){
2240 $alignment_score_1 = $1;
2241 }
2242 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
2243 $second_best_1 = $1;
2244 }
2245 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
2246 $MD_tag_1 = $1;
2247 }
2248 }
2249
2250 foreach (11..$#fields_2){
2251 if ($fields_2[$_] =~ /AS:i:(.*)/){
2252 $alignment_score_2 = $1;
2253 }
2254 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
2255 $second_best_2 = $1;
2256 }
2257 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
2258 $MD_tag_2 = $1;
2259 }
2260 }
2261
2262 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
2263 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
2264
2265 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
2266 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
2267 # warn "MD tag 1 is: '$MD_tag_1'\n";
2268 # warn "MD tag 2 is: '$MD_tag_2'\n";
2269
2270 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
2271 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
2272 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
2273
2274 if (defined $second_best_1 and defined $second_best_2){
2275 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
2276 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
2277 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
2278 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
2279
2280 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
2281 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
2282 $alignment_ambiguous = 1;
2283 # print "This read will be chucked (AS==XS detected)!\n";
2284
2285 ## need to read and discard all additional ambiguous reads until we reach the next sequence
2286 until ($fhs[$index]->{last_seq_id} ne $identifier){
2287 my $newline_1 = $fhs[$index]->{fh}-> getline();
2288 my $newline_2 = $fhs[$index]->{fh}-> getline();
2289 if ($newline_1 and $newline_2){
2290 chomp $newline_1;
2291 chomp $newline_2;
2292 my ($seq_id_1) = split (/\t/,$newline_1);
2293 my ($seq_id_2) = split (/\t/,$newline_2);
2294 $seq_id_1 =~ s/\/1$//;
2295 $seq_id_2 =~ s/\/2$//;
2296 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2297
2298 $fhs[$index]->{last_seq_id} = $seq_id_1;
2299 $fhs[$index]->{last_line_1} = $newline_1;
2300 $fhs[$index]->{last_line_2} = $newline_2;
2301 }
2302 else{
2303 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2304 $fhs[$index]->{last_seq_id} = undef;
2305 $fhs[$index]->{last_line_1} = undef;
2306 $fhs[$index]->{last_line_2} = undef;
2307 last; # break free if the end of the alignment output was reached
2308 }
2309 }
2310 # if ($fhs[$index]->{last_seq_id}){
2311 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2312 # }
2313 }
2314 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
2315
2316 my $alignment_location;
2317 if ($position_1 <= $position_2){
2318 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2319 }
2320 elsif($position_2 < $position_1){
2321 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
2322 }
2323
2324 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
2325 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, there is no need to overwrite
2326 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2327 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2328
2329 unless (exists $alignments{$alignment_location}){
2330 $alignments{$alignment_location}->{seq_id} = $id_1;
2331 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2332 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2333 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2334 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2335 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2336 $alignments{$alignment_location}->{index} = $index;
2337 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2338 $alignments{$alignment_location}->{position_1} = $position_1;
2339 $alignments{$alignment_location}->{position_2} = $position_2;
2340 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2341 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2342 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2343 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2344 $alignments{$alignment_location}->{flag_1} = $flag_1;
2345 $alignments{$alignment_location}->{flag_2} = $flag_2;
2346 }
2347 # warn "added best of several alignments to \%alignments hash\n";
2348
2349 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
2350 until ($fhs[$index]->{last_seq_id} ne $identifier){
2351 my $newline_1 = $fhs[$index]->{fh}-> getline();
2352 my $newline_2 = $fhs[$index]->{fh}-> getline();
2353 if ($newline_1 and $newline_2){
2354 chomp $newline_1;
2355 chomp $newline_2;
2356 my ($seq_id_1) = split (/\t/,$newline_1);
2357 my ($seq_id_2) = split (/\t/,$newline_2);
2358 $seq_id_1 =~ s/\/1$//;
2359 $seq_id_2 =~ s/\/2$//;
2360 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2361
2362 $fhs[$index]->{last_seq_id} = $seq_id_1;
2363 $fhs[$index]->{last_line_1} = $newline_1;
2364 $fhs[$index]->{last_line_2} = $newline_2;
2365 }
2366 else{
2367 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2368 $fhs[$index]->{last_seq_id} = undef;
2369 $fhs[$index]->{last_line_1} = undef;
2370 $fhs[$index]->{last_line_2} = undef;
2371 last; # break free if the end of the alignment output was reached
2372 }
2373 }
2374 # if($fhs[$index]->{last_seq_id}){
2375 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
2376 # }
2377 }
2378 }
2379 else{ # there is no second best hit, so we can just store this one and read in the next sequence
2380
2381 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2382 # print "$alignment_location\n";
2383 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2384 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2385 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2386 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2387
2388 unless (exists $alignments{$alignment_location}){
2389 $alignments{$alignment_location}->{seq_id} = $id_1;
2390 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2391 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2392 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2393 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2394 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2395 $alignments{$alignment_location}->{index} = $index;
2396 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2397 $alignments{$alignment_location}->{position_1} = $position_1;
2398 $alignments{$alignment_location}->{position_2} = $position_2;
2399 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2400 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2401 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2402 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2403 $alignments{$alignment_location}->{flag_1} = $flag_1;
2404 $alignments{$alignment_location}->{flag_2} = $flag_2;
2405 }
2406
2407 # warn "added unique alignment to \%alignments hash\n";
2408
2409 # Now reading and storing the next read pair
2410 my $newline_1 = $fhs[$index]->{fh}-> getline();
2411 my $newline_2 = $fhs[$index]->{fh}-> getline();
2412 if ($newline_1 and $newline_2){
2413 chomp $newline_1;
2414 chomp $newline_2;
2415 # print "$newline_1\n";
2416 # print "$newline_2\n";
2417 my ($seq_id_1) = split (/\t/,$newline_1);
2418 my ($seq_id_2) = split (/\t/,$newline_2);
2419 $seq_id_1 =~ s/\/1$//;
2420 $seq_id_2 =~ s/\/2$//;
2421 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2422
2423 $fhs[$index]->{last_seq_id} = $seq_id_1;
2424 $fhs[$index]->{last_line_1} = $newline_1;
2425 $fhs[$index]->{last_line_2} = $newline_2;
2426
2427 if ($seq_id_1 eq $identifier){
2428 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2429 }
2430 }
2431 else{
2432 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2433 $fhs[$index]->{last_seq_id} = undef;
2434 $fhs[$index]->{last_line_1} = undef;
2435 $fhs[$index]->{last_line_2} = undef;
2436 }
2437 }
2438 }
2439 }
2440
2441 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
2442 if ($alignment_ambiguous == 1){
2443 $counting{unsuitable_sequence_count}++;
2444 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2445 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2446 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2447 # print "$ambiguous_read_1\n";
2448 # print "$ambiguous_read_2\n";
2449
2450 if ($ambiguous){
2451 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2452 }
2453 elsif ($unmapped){
2454 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2455 }
2456 else{
2457 return 0;
2458 }
2459 }
2460
2461 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
2462 unless (%alignments){
2463 $counting{no_single_alignment_found}++;
2464
2465 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2466 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2467 # print "$unmapped_read_1\n";
2468 # print "$unmapped_read_2\n";
2469 if ($unmapped){
2470 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
2471 }
2472 else{
2473 return 0;
2474 }
2475 }
2476
2477 #######################################################################################################################################################
2478
2479 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
2480 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
2481 ### alignment score we are discarding the sequence pair altogether.
2482 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
2483 ### and extending (3 per bp) the gap.
2484
2485 #######################################################################################################################################################
2486
2487 ### Declaring an empty hash reference which will store all information we need for the methylation call
2488 my $methylation_call_params; # hash reference
2489 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2490
2491 ### print contents of %alignments for debugging
2492 ## if (scalar keys %alignments >= 1){
2493 # print "\n******\n";
2494 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
2495 # print "Loc: $alignment_location\n";
2496 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
2497 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
2498 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
2499 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
2500 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
2501 # print "Index $alignments{$alignment_location}->{index}\n";
2502 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
2503 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
2504 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
2505 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
2506 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
2507 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
2508 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
2509 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
2510 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
2511 # }
2512 # print "\n******\n";
2513 # }
2514
2515 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
2516 if (scalar keys %alignments == 1){
2517 for my $unique_best_alignment (keys %alignments){
2518 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
2519 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
2520 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
2521 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
2522 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
2523 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
2524 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
2525 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
2526 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
2527 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
2528 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
2529 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
2530 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
2531 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
2532 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
2533 }
2534 }
2535
2536 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
2537 ### we boot the sequence pair altogether)
2538 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
2539 my $best_sum_of_alignment_scores;
2540 my $best_alignment_location;
2541 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
2542 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
2543 unless (defined $best_sum_of_alignment_scores){
2544 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
2545 $best_alignment_location = $alignment_location;
2546 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
2547 }
2548 else{
2549 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
2550 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
2551 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
2552 $sequence_pair_fails = 1;
2553 last; # exiting since we know that the sequence has ambiguous alignments
2554 }
2555 ### else we are going to store the best alignment for further processing
2556 else{
2557 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
2558 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
2559 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
2560 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
2561 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
2562 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
2563 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
2564 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
2565 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
2566 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
2567 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
2568 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
2569 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
2570 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
2571 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
2572 last; # exiting since the sequence produced a unique best alignment
2573 }
2574 }
2575 }
2576 }
2577 else{
2578 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
2579 }
2580
2581 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
2582 if ($sequence_pair_fails == 1){
2583 $counting{unsuitable_sequence_count}++;
2584
2585 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2586 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2587 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2588 # print "$ambiguous_read_1\n";
2589 # print "$ambiguous_read_2\n";
2590
2591 if ($ambiguous){
2592 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2593 }
2594 elsif ($unmapped){
2595 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2596 }
2597 else{
2598 return 0; # => exits to next sequence pair (default)
2599 }
2600 }
2601
2602 ### --DIRECTIONAL
2603 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2604 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2605 if ($directional){
2606 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
2607 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2608 $counting{alignments_rejected_count}++;
2609 return 0;
2610 }
2611 }
2612
2613 ### If the sequence pair has not been rejected so far it does have a unique best alignment
2614 $counting{unique_best_alignment_count}++;
2615 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
2616
2617 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
2618 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2619 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
2620 $counting{genomic_sequence_could_not_be_extracted_count}++;
2621 return 0;
2622 }
2623 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2624 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
2625 $counting{genomic_sequence_could_not_be_extracted_count}++;
2626 return 0;
2627 }
2628
2629 ### now we are set to perform the actual methylation call
2630 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
2631 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
2632 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
2633 # print " $sequence_2\n";
2634 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
2635 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
2636
2637 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
2638 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
2639 }
2640
2641 ###
2642
### Decides whether the paired-end alignment currently buffered in $fhs[$index] (last_line_1 and
### last_line_2) belongs to the read pair $identifier AND is in a sensible orientation.
###
### Arguments:
###   $index      - position of the alignment instance in the file-level @fhs array
###   $identifier - ID of the read pair currently being processed (without a trailing /1)
###
### Returns 1 if $fhs[$index]->{last_line_1} and {last_line_2} hold a valid alignment of $identifier
### when the function returns, and 0 otherwise.
###
### Side effects: up to two further line pairs may be read from $fhs[$index]->{fh}. Whenever a new pair
### is read it replaces last_seq_id/last_line_1/last_line_2 (stored exactly as read, i.e. not chomped),
### or all three are set to undef once the end of the alignment output has been reached.
###
### Dies if the orientation check returns anything other than 0 or 1, if neither read ID of a newly read
### pair ends on /1, or if the same sequence ID turns up in three consecutive line pairs.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### only the read ID (field 0) and the strand (field 1) of each buffered line are needed here
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  ### either line of a pair may carry the /1 read, so the /1 tag is removed from both IDs
  my $seq_id_1 = $id_1;
  my $seq_id_2 = $id_2;
  $seq_id_1 =~ s/\/1$//;
  $seq_id_2 =~ s/\/1$//;

  ### stores a line pair (or undef,undef,undef at the end of the output) as the buffered entry of this instance
  my $remember_pair = sub {
    my ($seq_id,$line_1,$line_2) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line_1} = $line_1;
    $fhs[$index]->{last_line_2} = $line_2;
    return;
  };

  ### removes the /1 tag from whichever of $seq_id_1/$seq_id_2 carries it (modifying that variable in
  ### place) and returns the resulting pair ID
  my $pair_id = sub {
    return $seq_id_1 if ($seq_id_1 =~ s/\/1$//);
    return $seq_id_2 if ($seq_id_2 =~ s/\/1$//);
    die "One of the two reads needs to end on /1!!";
  };

  ### the buffered pair is already the next sequence pair to be analysed -> analyse next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    return 0;
  }

  ### checking the orientation of the alignment
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output
    $remember_pair->(undef,undef,undef);
    return 0; # not processing anything as the buffered alignment was in the wrong orientation
  }

  ### extract ID and strand again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  $seq_id_1 = $id_1;
  $seq_id_2 = $id_2;
  my $seqid = $pair_id->();

  ### the pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $remember_pair->($seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### still the same sequence pair, checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    $remember_pair->($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### Wrong orientation again: read in yet another 2 lines and store them in @fhs (this must be the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output
    $remember_pair->(undef,undef,undef);
    return 0; # not processing anything as both alignments of this pair were in the wrong orientation
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);
  $seqid = $pair_id->();

  ### the next 2 lines must not have the same seq ID again; otherwise they become the buffered entry
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $remember_pair->($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as the alignments of $identifier were in the wrong orientation
}
2763
2764 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
2765
### Extracts the genomic sequence underlying both reads of a paired-end alignment (plus 2 extra bases
### each) from %chromosomes and records the strand and conversion information for the methylation call.
###
### Arguments:
###   $sequence_identifier     - key of the read pair inside $methylation_call_params
###   $methylation_call_params - hash reference. The entry for $sequence_identifier is read for index,
###                              chromosome, start_seq_1, start_seq_2, bowtie_sequence_1 and bowtie_sequence_2
###                              (the start positions are used directly as substr offsets), and is extended
###                              with alignment_read_1/_2, genome_conversion, read_conversion_1/_2 and
###                              unmodified_genomic_sequence_1/_2.
###
### Side effects: increments one of the four strand counters in %counting.
### Dies if index is not 0, 1, 2 or 3.
###
### An unmodified_genomic_sequence is set to '' whenever its window (read length + 2) does not lie
### completely within the chromosome sequence. This now applies to every extraction; in particular a
### start position < 2 on the GA converted genome no longer yields a negative substr offset, which Perl
### would count from the END of the chromosome.
### NOTE(review): assumes callers reject a pair whose extracted sequence is not read length + 2 long
### (as the Bowtie 2 paired-end caller does) - confirm for the Bowtie 1 caller.
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;

  my $params = ($methylation_call_params->{$sequence_identifier} ||= {});

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
  ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
  my ($count_key,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion);

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($params->{index} == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    ($count_key,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('CT_GA_CT_count','+','-','CT','GA','CT');
  }
  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($params->{index} == 1){
    ### [Index 1, sequence originated from complementary to (converted) reverse strand]
    ($count_key,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('GA_CT_GA_count','+','-','GA','CT','GA');
  }
  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($params->{index} == 2){
    ### [Index 2, sequence originated from the complementary to (converted) forward strand]
    ($count_key,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('GA_CT_CT_count','-','+','GA','CT','CT');
  }
  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($params->{index} == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    ($count_key,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('CT_GA_GA_count','-','+','CT','GA','GA');
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$count_key}++;

  my $chromosome_sequence = $chromosomes{$params->{chromosome}};
  my $chromosome_length   = defined $chromosome_sequence ? length($chromosome_sequence) : 0;

  ### Returns the read_length+2 bases of the chromosome starting at $offset, or '' if that window does
  ### not lie completely within the chromosome
  my $genomic_window = sub {
    my ($offset,$read_length) = @_;
    my $window_length = $read_length + 2;
    return '' if ($offset < 0 or $offset + $window_length > $chromosome_length);
    return substr ($chromosome_sequence,$offset,$window_length);
  };

  ### We extract the sequence from the genomic sequence with +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
  ### if the C happens to be at the first or last position of the actually observed sequence.
  ### For alignments to the CT converted genome (index 0 and 2) the 2 extra bases are taken behind the hit (same start position),
  ### for alignments to the GA converted genome (index 1 and 3) they are taken in front of the hit (start position - 2).
  my $start_shift = ($genome_conversion eq 'GA') ? -2 : 0;

  ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
  ### the + alignment: start_seq_1/bowtie_sequence_1 always describe the forward hit, start_seq_2/bowtie_sequence_2 the reverse strand hit
  my $forward_hit = $genomic_window->($params->{start_seq_1} + $start_shift,length($params->{bowtie_sequence_1}));
  my $reverse_hit = $genomic_window->($params->{start_seq_2} + $start_shift,length($params->{bowtie_sequence_2}));

  ### the reverse strand sequence needs to be reverse complemented (nothing to do if it could not be extracted)
  $reverse_hit = reverse_complement($reverse_hit) if (length $reverse_hit);

  ### We however always read in sequences read 1 then read 2, so if read 2 is the + alignment (index 2 and 3) we need to swap the extracted genomic
  ### sequences around so that non_bisulfite_sequence_1 corresponds to read 1 and non_bisulfite_sequence_2 to read 2
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2) = ($alignment_read_1 eq '+') ? ($forward_hit,$reverse_hit) : ($reverse_hit,$forward_hit);

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  $params->{alignment_read_1} = $alignment_read_1;
  $params->{alignment_read_2} = $alignment_read_2;
  $params->{genome_conversion} = $genome_conversion;
  $params->{read_conversion_1} = $read_conversion_info_1;
  $params->{read_conversion_2} = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
2906
2907 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
2908
### Extracts the unconverted genomic sequence underlying both reads of a Bowtie 2 paired-end alignment and stores it,
### together with the strand/conversion 'memory' needed for the methylation call, in $methylation_call_params.
###
### Arguments:
###   $sequence_identifier      key of the read pair in %$methylation_call_params
###   $methylation_call_params  hash ref; the entry of the pair has to provide CIGAR_1, CIGAR_2, position_1, position_2
###                             (1-based), chromosome and index (0-3, selecting one of the 4 possible converted strands)
###
### On success the entry gains alignment_read_1/2, genome_conversion, read_conversion_1/2, unmodified_genomic_sequence_1/2,
### end_position_1/2 and indels_1/2, and the matching paired-end counter in %counting is incremented.
###
### Each genomic sequence is extended by 2 bases (upstream for index 1 and 3, downstream for index 0 and 2) so that a C at
### the very edge of a read can still be classified as CpG, CHG or CHH. If these 2 bases would fall outside of the
### chromosome the sub returns early: only the unmodified_genomic_sequence_N of the read that hit the edge is stored
### (without the 2 extra bases) and none of the other fields are set.
###
### Dies if a CIGAR string has a non-matching number of lengths and operations, contains operations other than M, I and D,
### or if index is not one of 0-3.
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;

  my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
  my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};

  ### shorthands for the entry of this read pair
  my $pair       = $methylation_call_params->{$sequence_identifier};
  my $index      = $pair->{index};
  my $chromosome = $pair->{chromosome};

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my $alignment_read_1;
  my $alignment_read_2;
  my $read_conversion_info_1;
  my $read_conversion_info_2;
  my $genome_conversion;

  ### the genomic sequences are built up piece by piece: (2 upstream bases) + CIGAR walk + (2 downstream bases)
  my $non_bisulfite_sequence_1 = '';
  my $non_bisulfite_sequence_2 = '';

  ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
  my $pos_1 = $pair->{position_1}-1;
  my $pos_2 = $pair->{position_2}-1;

  # parsing CIGAR 1 string
  my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
  my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
  shift @ops_1; # remove the empty first element
  die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
  # parsing CIGAR 2 string
  my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
  my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
  shift @ops_2; # remove the empty first element
  die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);

  ### inserted/deleted bases are counted so that they can be added to the hamming distance (NM field of the SAM output)
  my $indels_1 = 0;
  my $indels_2 = 0;

  ### index 1 and 3: the 2 extra bases are taken upstream of the alignment start
  ### index 0 and 2: the 2 extra bases are taken downstream of the alignment end
  my $extra_bases_upstream   = ( ($index == 1) || ($index == 3) );
  my $extra_bases_downstream = ( ($index == 0) || ($index == 2) );

  ### Extracting read 1 genomic sequence ###

  if ($extra_bases_upstream){
    # A read starting at the 0-based position 2 still has exactly 2 bases upstream of it, so the test has to be >= 0
    # (the same test is used for read 2 below). Otherwise we are right at the edge of a chromosome and give up.
    unless ( ($pos_1-2) >= 0){
      $pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      return;
    }
    $non_bisulfite_sequence_1 .= substr ($chromosomes{$chromosome},$pos_1-2,2);
  }

  foreach my $i (0..$#len_1){
    if ($ops_1[$i] eq 'M'){
      # (mis)match: copying the genomic sequence and moving on along the genome
      $non_bisulfite_sequence_1 .= substr ($chromosomes{$chromosome},$pos_1,$len_1[$i]);
      $pos_1 += $len_1[$i];
    }
    elsif ($ops_1[$i] eq 'I'){
      # insertion in the read: padding with Ns (never used to infer methylation calls), the genomic position stays put
      $non_bisulfite_sequence_1 .= 'N' x $len_1[$i];
      $indels_1 += $len_1[$i];
    }
    elsif ($ops_1[$i] eq 'D'){
      # deletion in the read: no genomic sequence is added, only the genomic position is adjusted
      $pos_1 += $len_1[$i];
      $indels_1 += $len_1[$i];
    }
    elsif ($cigar_1 =~ tr/NSHPX=//){ # tr/// takes a plain character list; these operations are illegal for standard mapping
      die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
    }
    else{
      die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
    }
  }

  if ($extra_bases_downstream){
    # giving up if the 2 extra bases would run over the end of the chromosome
    unless (length($chromosomes{$chromosome}) >= $pos_1+2){
      $pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      return;
    }
    $non_bisulfite_sequence_1 .= substr ($chromosomes{$chromosome},$pos_1,2);
  }

  ### Extracting read 2 genomic sequence ###

  if ($extra_bases_upstream){
    # giving up if there are fewer than 2 bases upstream of the alignment start
    unless ( ($pos_2-2) >= 0){
      $pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_2 .= substr ($chromosomes{$chromosome},$pos_2-2,2);
  }

  foreach my $i (0..$#len_2){
    if ($ops_2[$i] eq 'M'){
      # (mis)match: copying the genomic sequence and moving on along the genome
      $non_bisulfite_sequence_2 .= substr ($chromosomes{$chromosome},$pos_2,$len_2[$i]);
      $pos_2 += $len_2[$i];
    }
    elsif ($ops_2[$i] eq 'I'){
      # insertion in the read: padding with Ns (never used to infer methylation calls), the genomic position stays put
      $non_bisulfite_sequence_2 .= 'N' x $len_2[$i];
      $indels_2 += $len_2[$i];
    }
    elsif ($ops_2[$i] eq 'D'){
      # deletion in the read: no genomic sequence is added, only the genomic position is adjusted
      $pos_2 += $len_2[$i];
      $indels_2 += $len_2[$i];
    }
    elsif ($cigar_2 =~ tr/NSHPX=//){ # tr/// takes a plain character list; these operations are illegal for standard mapping
      die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
    }
    else{
      die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
    }
  }

  if ($extra_bases_downstream){
    # giving up if the 2 extra bases would run over the end of the chromosome
    unless (length($chromosomes{$chromosome}) >= $pos_2+2){
      $pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_2 .= substr ($chromosomes{$chromosome},$pos_2,2);
  }

  ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of
  ### whether read 1 or read 2 was the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'CT';
    ### Read 1 is the forward hit, Read 2 is on the reverse strand and needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) bottom strand]
    $counting{GA_CT_GA_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'GA';
    ### Read 1 is the forward hit, Read 2 is on the reverse strand and needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) top strand]
    $counting{GA_CT_CT_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'CT';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'GA';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  $pair->{alignment_read_1} = $alignment_read_1;
  $pair->{alignment_read_2} = $alignment_read_2;
  $pair->{genome_conversion} = $genome_conversion;
  $pair->{read_conversion_1} = $read_conversion_info_1;
  $pair->{read_conversion_2} = $read_conversion_info_2;
  $pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
  ## after walking the CIGAR string $pos holds the end position of a read
  $pair->{end_position_1} = $pos_1;
  $pair->{end_position_2} = $pos_2;
  $pair->{indels_1} = $indels_1;
  $pair->{indels_2} = $indels_2;
}
3132
3133 ##########################################
3134 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
3135 ##########################################
3136
### Writes a single-end Bowtie 1 alignment and its methylation call to the output.
###
### Arguments: read identifier, read sequence, hash ref holding the alignment details of the read, quality string.
### Depending on $phred64/$solexa the quality string is first converted to Sanger encoding (Phred 33 scale).
### Note that the stored start position of the read is incremented by 1 as a side effect.
sub print_bisulfite_mapping_result_single_end{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is written out in Sanger encoding (Phred 33 scale); --phred64 takes precedence over --solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position of the read is shifted by +1 bp
  $methylation_call_params->{$identifier}->{position} += 1;

  ### SAM output (at the end of the script) unless the old custom format was requested
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value) unless ($vanilla);

  ### vanilla output: one tab-delimited line per uniquely mapped read
  my $read = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    @{$read}{qw(alignment_strand chromosome position end_position)},
    $sequence,
    @{$read}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
    $quality_value,
  );
  print OUT join("\t",@columns)."\n";
}
3160
3161 ##########################################
3162 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
3163 ##########################################
3164
### Writes a single-end Bowtie 2 alignment and its methylation call to the SAM output.
###
### Arguments: read identifier, read sequence, hash ref holding the alignment details of the read, quality string.
### Depending on $phred64/$solexa the quality string is first converted to Sanger encoding (Phred 33 scale).
sub print_bisulfite_mapping_result_single_end_bowtie2{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is written out in Sanger encoding (Phred 33 scale); --phred64 takes precedence over --solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### every mapped read goes to the SAM output file (unmapped and ambiguous reads were already printed)
  single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
}
3179
3180 ##########################################
3181 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
3182 ##########################################
3183
### Writes a paired-end Bowtie 1 alignment and the methylation calls of both reads to the output.
###
### Arguments: read pair identifier, sequence of read 1, sequence of read 2, hash ref holding the alignment details
### of the pair, quality string of read 1, quality string of read 2.
### Depending on $phred64/$solexa both quality strings are first converted to Sanger encoding (Phred 33 scale).
### Note that the stored start position (start_seq_1) is incremented by 1 as a side effect.
sub print_bisulfite_mapping_results_paired_ends{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are written out in Sanger encoding (Phred 33 scale); --phred64 takes precedence over --solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    foreach my $quality ($quality_value_1,$quality_value_2){ # the loop variable aliases the two quality strings
      $quality = $to_phred33->($quality);
    }
  }

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp (the end position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;

  ### SAM output (at the end of the script) unless the old custom format was requested
  return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) unless ($vanilla);

  ### vanilla output: one tab-delimited line per aligned read pair
  my $pair = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
    $sequence_1,
    @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
    $sequence_2,
    @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
    $quality_value_1,
    $quality_value_2,
  );
  print OUT join("\t",@columns)."\n";
}
3210
3211 ##########################################
3212 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
3213 ##########################################
3214
### Writes a paired-end Bowtie 2 alignment and the methylation calls of both reads to the SAM output.
###
### Arguments: read pair identifier, sequence of read 1, sequence of read 2, hash ref holding the alignment details
### of the pair, quality string of read 1, quality string of read 2.
### Depending on $phred64/$solexa both quality strings are first converted to Sanger encoding (Phred 33 scale).
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are written out in Sanger encoding (Phred 33 scale); --phred64 takes precedence over --solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    foreach my $quality ($quality_value_1,$quality_value_2){ # the loop variable aliases the two quality strings
      $quality = $to_phred33->($quality);
    }
  }

  ### every aligned read pair goes to the SAM output file (unmapped and ambiguous reads were already printed)
  paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
}
3232
3233
### Re-encodes a quality string character by character from the Phred 64 scale to the Phred 33 scale.
### Takes the Phred 64 quality string and returns the re-encoded string of the same length.
sub convert_phred64_quals_to_phred33{
  my ($phred64_quality) = @_;

  ### each character is first turned into its Phred score and then back into a Phred 33 character
  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_phred64_quality_string_into_phred_score($_) )
  } split (//,$phred64_quality);

  return join ("",@phred33_characters);
}
3249
### Re-encodes a Solexa (pre 1.3) quality string character by character to the Phred 33 scale.
### Takes the Solexa quality string and returns the re-encoded string of the same length.
sub convert_solexa_quals_to_phred33{
  my ($solexa_quality) = @_;

  ### each character is first turned into its (approximate) Phred score and then back into a Phred 33 character
  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_solexa_pre1_3_quality_string_into_phred_score($_) )
  } split (//,$solexa_quality);

  return join ("",@phred33_characters);
}
3265
### Turns a numeric Phred score into its quality character on the Phred 33 scale (the character with code score+33).
sub convert_phred_score_into_phred33_quality_string{
  my ($phred_score) = @_;
  return chr($phred_score+33);
}
3271
### Turns a single quality character on the Phred 64 scale into its numeric Phred score (character code minus 64).
sub convert_phred64_quality_string_into_phred_score{
  my ($quality_character) = @_;
  return ord($quality_character)-64;
}
3277
### Turns a single Solexa (pre 1.3) quality character into an approximate Phred score (character code minus 59).
### We just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a
### minute difference for values between 0 and 10.
### NOTE(review): Solexa qualities are commonly described as using an offset of 64 - confirm that 59 is intended here
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  my ($quality_character) = @_;
  return ord($quality_character)-59;
}
3284
3285
### Extracts the unconverted genomic sequence underlying a single-end Bowtie 1 alignment and stores it, together with
### the strand/conversion 'memory' needed for the methylation call, in $methylation_call_params.
###
### Arguments:
###   $sequence_identifier      key of the read in %$methylation_call_params
###   $methylation_call_params  hash ref; the entry of the read has to provide index (0-3), chromosome, position
###                             (used as a 0-based offset into the chromosome sequence) and bowtie_sequence
###
### The entry gains alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence and end_position,
### and the matching counter in %counting is incremented. The genomic sequence is 2 bases longer than the read so that
### a C at the last position of the read can still be classified as CpG, CHG or CHH; it is left empty ('') if these
### extra bases would fall outside of the chromosome. Dies if index is not one of 0-3.
sub extract_corresponding_genomic_sequence_single_end {
  my ($sequence_identifier,$methylation_call_params) = @_;

  my $index = $methylation_call_params->{$sequence_identifier}->{index};
  my $read  = $methylation_call_params->{$sequence_identifier}; # shorthand for the entry of this read

  ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands.
  ### One row per index: counter, alignment strand, read conversion, genome conversion,
  ###                    side of the forward strand on which the 2 extra bases are taken, reverse complement needed?
  my @strand_settings = (
    ['CT_CT_count','+','CT','CT','downstream',0], # Index 0: CT converted read vs. CT converted genome, (converted) forward strand
    ['CT_GA_count','-','CT','GA','upstream',  1], # Index 1: CT converted read vs. GA converted genome, (converted) reverse strand
    ['GA_CT_count','-','GA','CT','downstream',1], # Index 2: GA converted read vs. CT converted genome, complementary to (converted) forward strand
    ['GA_GA_count','+','GA','GA','upstream',  0], # Index 3: GA converted read vs. GA converted genome, complementary to (converted) reverse strand
  );

  my $settings;
  foreach my $candidate (0..$#strand_settings){
    if ($index == $candidate){
      $settings = $strand_settings[$candidate];
      last;
    }
  }
  die "Too many bowtie result filehandles\n" unless ($settings);

  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases,$reverse_complement_needed) = @{$settings};
  $counting{$counter}++;

  my $chromosome  = $read->{chromosome};
  my $position    = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
  my $start;
  my $extraction_possible;
  if ($extra_bases eq 'downstream'){
    ### + 2 extra bases at the 3' end of the forward strand
    $start = $position;
    $extraction_possible = (length($chromosomes{$chromosome}) > $position+$read_length+1);
  }
  else{
    ### + 2 extra bases at the 5' end of the forward strand
    $start = $position-2;
    $extraction_possible = ($position-2 >= 0);
  }

  my $non_bisulfite_sequence = '';
  if ($extraction_possible){
    $non_bisulfite_sequence = substr ($chromosomes{$chromosome},$start,$read_length+2);
    ### reads aligning in - orientation are compared against the reverse complement of the forward strand sequence
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($reverse_complement_needed);
  }

  $read->{alignment_strand} = $alignment_strand;
  $read->{read_conversion} = $read_conversion_info;
  $read->{genome_conversion} = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $position+$read_length;
}
3389
3390
3391 sub extract_corresponding_genomic_sequence_single_end_bowtie2{
3392 my ($sequence_identifier,$methylation_call_params) = @_;
3393
3394 my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info};
3395 my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
3396
3397 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
3398 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
3399
3400 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
3401 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
3402 my $alignment_strand;
3403 my $read_conversion_info;
3404 my $genome_conversion;
3405 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
3406 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
3407 my $non_bisulfite_sequence = '';
3408
3409 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
3410 my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
3411
3412 # parsing CIGAR string
3413 my @len = split (/\D+/,$cigar); # storing the length per operation
3414 my @ops = split (/\d+/,$cigar); # storing the operation
3415 shift @ops; # remove the empty first element
3416 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
3417
3418 ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
3419 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
3420 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3421 unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise
3422 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
3423 return;
3424 }
3425 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
3426 }
3427 my $indels = 0;
3428
3429 foreach (0..$#len){
3430 if ($ops[$_] eq 'M'){
3431 #extracting genomic sequence
3432 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
3433 # adjusting position
3434 $pos += $len[$_];
3435 }
3436 elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
3437 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
3438 $non_bisulfite_sequence .= 'N' x $len[$_];
3439 # warn "$non_bisulfite_sequence\n";
3440 # position doesn't need to be adjusting
3441 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
3442 }
3443 elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
3444 # we do not add any genomic sequence but only adjust the position
3445 $pos += $len[$_];
3446 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
3447 }
3448 elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
3449 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
3450 }
3451 else{
3452 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
3453 }
3454 }
3455
3456 ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
3457 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
3458 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3459 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise
3460 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
3461 return;
3462 }
3463 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
3464 # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
3465 }
3466
3467
3468
3469 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
3470 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
3471 ### [Index 0, sequence originated from (converted) forward strand]
3472 $counting{CT_CT_count}++;
3473 $alignment_strand = '+';
3474 $read_conversion_info = 'CT';
3475 $genome_conversion = 'CT';
3476 }
3477
3478 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
3479 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
3480 ### [Index 1, sequence originated from (converted) reverse strand]
3481 $counting{CT_GA_count}++;
3482 $alignment_strand = '-';
3483 $read_conversion_info = 'CT';
3484 $genome_conversion = 'GA';
3485
3486 ### reverse complement!
3487 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
3488 }
3489
3490 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
3491 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
3492 ### [Index 2, sequence originated from complementary to (converted) forward strand]
3493 $counting{GA_CT_count}++;
3494 $alignment_strand = '-';
3495 $read_conversion_info = 'GA';
3496 $genome_conversion = 'CT';
3497
3498 ### reverse complement!
3499 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
3500 }
3501
3502 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
3503 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
3504 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
3505 $counting{GA_GA_count}++;
3506 $alignment_strand = '+';
3507 $read_conversion_info = 'GA';
3508 $genome_conversion = 'GA';
3509
3510 }
3511 else{
3512 die "Too many Bowtie 2 result filehandles\n";
3513 }
3514
3515 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
3516 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
3517 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
3518 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
3519
3520 ### the end position of a read is stored in $pos
3521 $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
3522 $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
3523 }
3524
3525 ### METHYLATION CALL
3526
sub methylation_call{
  ### Compares the actually observed read sequence with the corresponding genomic sequence, base by base, and
  ### returns a methylation call string that holds one character per base of the read.
  ###
  ### Arguments:
  ###   $identifier                 - read ID (not used for the call itself)
  ###   $sequence_actually_observed - the read sequence
  ###   $genomic_sequence           - the genomic sequence the read aligned to, plus 2 extra bases of sequence context
  ###                                 (compared at the end for 'CT' reads, at the start for 'GA' reads)
  ###   $read_conversion            - 'CT' or 'GA'
  ###
  ### Side effect: adds the per-read context counts to the global %counting hash.
  ### Dies if $read_conversion is neither 'CT' nor 'GA'.
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq     = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);

  ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
  ### CpG, CHH or CHG context

  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  #################################################################

  ### the genomic sequence is expected to be exactly 2 bases longer than the read (numeric comparison)
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq == (scalar @genomic - 2));

  ### Both read conversions follow the same logic and only differ in where to look in @genomic, relative to the read index:
  ###   target    - offset of the genomic base that is compared with the read base
  ###   first     - offset of the neighbouring base that decides on CpG context
  ###   second    - offset of the next base along that decides between CHG and CHH context
  ###   cytosine  - genomic base representing a cytosine (a G if the C is on the opposing strand)
  ###   converted - read base seen if that cytosine was bisulfite converted (i.e. was not methylated)
  ###   partner   - neighbouring base that represents a G (a C if we look at the opposing strand)
  my %layout_for = (
		    CT => { target => 0, first => 1, second => 2, cytosine => 'C', converted => 'T', partner => 'G' },
		    GA => { target => 2, first => 1, second => 0, cytosine => 'G', converted => 'A', partner => 'C' },
		   );

  my $layout = defined $read_conversion ? $layout_for{$read_conversion} : undef;
  die "Strand conversion info is required to perform a methylation call\n" unless ($layout);

  my %count_of = map { $_ => 0 } qw(Z z X x H h);
  my @match;

  for my $index (0..$#seq) {
    my $read_base    = $seq[$index];
    my $genomic_base = $genomic[$index + $layout->{target}];

    ### an unchanged cytosine was protected by methylation
    my $methylated = ($read_base eq $genomic_base);

    ### for the methylation call we are only interested in genomic cytosines that were either retained or bisulfite converted
    ### in the read; all other matches or mismatches are reported as '.'
    unless ($genomic_base eq $layout->{cytosine} and ($methylated or $read_base eq $layout->{converted})){
      push @match,'.';
      next;
    }

    ### determining the sequence context of the cytosine
    my $call;
    if ($genomic[$index + $layout->{first}] eq $layout->{partner}){
      $call = 'Z'; # CpG context
    }
    elsif ($genomic[$index + $layout->{second}] eq $layout->{partner}){
      $call = 'X'; # CHG context
    }
    else{
      $call = 'H'; # CHH context
    }

    ### lower case letters denote converted, i.e. unmethylated, cytosines
    $call = lc $call unless ($methylated);

    ++$count_of{$call};
    push @match,$call;
  }

  my $methylation_call = join ("",@match);

  $counting{total_meCHH_count} += $count_of{H};
  $counting{total_meCHG_count} += $count_of{X};
  $counting{total_meCpG_count} += $count_of{Z};
  $counting{total_unmethylated_CHH_count} += $count_of{h};
  $counting{total_unmethylated_CHG_count} += $count_of{x};
  $counting{total_unmethylated_CpG_count} += $count_of{z};

  return $methylation_call;
}
3711
sub read_genome_into_memory{
  ### Reads all FastA files (*.fa, or *.fasta if there are no *.fa files) in the global $genome_folder and stores
  ### their upper-cased sequences in the global %chromosomes hash, keyed by chromosome name. Multi-FastA files are
  ### supported. Takes the directory to change back to once the genome has been read.
  ### Dies if no sequence files are found, if a file cannot be read or if a chromosome name occurs more than once.

  ## working directory
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  @chromosome_filenames = <*.fasta> unless (@chromosome_filenames);

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open with a lexical filehandle so that unusual filenames cannot be misread as an open mode
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    die "The sequence file $chromosome_filename is empty\n" unless (defined $first_line);
    chomp $first_line;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting with an empty string (rather than undef) so that entries without any sequence can be reported cleanly
    my $sequence = '';

    ### reading into a lexical variable so that the caller's $_ is left alone
    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ tr/\r//d; # carriage returns of files with Windows line endings are not part of the sequence

      if ($line =~ /^>/){
	### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
	if (exists $chromosomes{$chromosome_name}){
	  print "chr $chromosome_name (",length $sequence ," bp)\n";
	  die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
	}
	else {
	  if (length($sequence) == 0){
	    warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
	  }
	  print "chr $chromosome_name (",length $sequence ," bp)\n";
	  $chromosomes{$chromosome_name} = $sequence;
	}
	### resetting the sequence variable
	$sequence = '';
	### setting new chromosome name
	$chromosome_name = extract_chromosome_name($line);
      }
      else{
	$sequence .= uc $line;
      }
    }

    close ($chr_in) or die "Failed to close sequence file $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
	warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
3781
sub extract_chromosome_name {
  ### Returns the chromosome name of a FastA header line, i.e. the first whitespace-delimited string after the
  ### leading '>' (the original comment notes that Bowtie seems to name its chromosomes the same way).
  ### Dies if the line does not start with '>'.
  my $fasta_header = shift;

  unless ($fasta_header =~ s/^>//){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  my @header_fields = split (/\s+/,$fasta_header);
  return $header_fields[0];
}
3793
sub reverse_complement{
  ### Returns the reverse complement of an upper case DNA sequence; any characters other than A, C, G and T
  ### (e.g. N) keep their identity and are only reversed.
  my $sequence = shift;

  ### reversing first and complementing afterwards yields the same string as the other way round
  (my $revcomp = reverse($sequence)) =~ tr/ACGT/TGCA/;
  return $revcomp;
}
3800
sub biTransformFastAFiles {
  ### Writes bisulfite converted versions of a single-end FastA file to the global $temp_dir: a C->T converted
  ### version, and unless $directional is set also a G->A converted version. Input files ending in .gz are read
  ### through zcat. Honours the global $skip and $upto settings.
  ### Returns the names of the C->T and G->A files (without $temp_dir); the G->A name is returned even if
  ### that file was not written.
  my $file = shift;

  ### the converted files are named after the input file without its leading path
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### Lexical filehandles and explicit open modes. The list form of the pipe open hands the filename to zcat
  ### directly, so that spaces or shell metacharacters in the filename are harmless.
  my $in;
  if ($file =~ /\.gz$/){
    ### gzipped version of the infile
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fa';
  my $G_to_A_infile = $filename.'_G_to_A.fa';

  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (my $ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  my $gtoa; # stays undefined for directional libraries
  unless ($directional){
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    ### each record is expected to consist of a header line and a single sequence line
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs has already replaced tabs with underscores at this point, so this looks like it can no longer trigger
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    if ($gtoa){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### The result of closing the input is deliberately not checked: leaving the loop early (--upto) makes zcat exit
  ### with a non-zero status. Closing the output files flushes them, so write errors are reported here.
  close ($in);
  close ($ctot) or die "Failed to close $temp_dir$C_to_T_infile: $!\n";
  if ($gtoa){
    close ($gtoa) or die "Failed to close $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
3884
sub biTransformFastAFiles_paired_end {
  ### Writes bisulfite converted versions of one file of a paired-end FastA data set to the global $temp_dir.
  ### For directional libraries read 1 is C->T converted and read 2 is G->A converted; otherwise both versions
  ### are written for either read. The read number is appended to each read ID (/1 or /2, and /1/1 or /2/2
  ### if $bowtie2 is set). Honours the global $skip and $upto settings.
  ### Returns the name(s) of the written file(s) without $temp_dir. Dies if $read_number is neither 1 nor 2.
  my ($file,$read_number) = @_;

  ### the converted files are named after the input file without its leading path
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### Lexical filehandles and explicit open modes. The list form of the pipe open hands the filename to zcat
  ### directly, so that spaces or shell metacharacters in the filename are harmless.
  my $in;
  if ($file =~ /\.gz$/){
    ### gzipped version of the infile
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fa';
  my $G_to_A_infile = $filename.'_G_to_A.fa';

  ### only the filehandles that are needed get opened, the others stay undefined
  my ($ctot,$gtoa);

  if ($directional){
    if ($read_number == 1){
      print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
  }
  else{ # all four strand output
    print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;

  while (1){
    ### each record is expected to consist of a header line and a single sequence line
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs has already replaced tabs with underscores at this point, so this looks like it can no longer trigger
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    ### the read number is appended to the read ID (in front of the trailing newline)
    my $suffix;
    if ($read_number == 1){
      $suffix = $bowtie2 ? '/1/1' : '/1';
    }
    elsif ($read_number == 2){
      $suffix = $bowtie2 ? '/2/2' : '/2';
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
    $header =~ s/$/$suffix/;

    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    ### writing to whichever files were opened above (directional: C->T for read 1, G->A for read 2; otherwise both)
    print {$ctot} "$header$sequence_C_to_T" if ($ctot);
    print {$gtoa} "$header$sequence_G_to_A" if ($gtoa);
  }

  ### The result of closing the input is deliberately not checked: leaving the loop early (--upto) makes zcat exit
  ### with a non-zero status. Closing the output files flushes them, so write errors are reported here.
  close ($in);
  if ($ctot){
    close ($ctot) or die "Failed to close $temp_dir$C_to_T_infile: $!\n";
  }
  if ($gtoa){
    close ($gtoa) or die "Failed to close $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    if ($read_number == 1){
      print "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
    }
    else{
      print "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
    }
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }

  if ($directional){
    if ($read_number == 1){
      return ($C_to_T_infile);
    }
    else{
      return ($G_to_A_infile);
    }
  }
  else{
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
4028
4029
sub biTransformFastQFiles {
  ### Writes bisulfite converted versions of a single-end FastQ file to the global $temp_dir: a C->T converted
  ### version, and unless $directional is set also a G->A converted version. Input files ending in .gz are read
  ### through zcat. Honours the global $skip and $upto settings.
  ### Returns the name of the C->T file and the name of the G->A file (without $temp_dir). For directional
  ### libraries no G->A file is written and the second return value is just the input file name.
  my $file = shift;

  ### the converted files are named after the input file without its leading path
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### Lexical filehandles and explicit open modes. The list form of the pipe open hands the filename to zcat
  ### directly, so that spaces or shell metacharacters in the filename are harmless.
  my $in;
  if ($file =~ /\.gz$/){
    ### gzipped version of the infile
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fastq';
  my $G_to_A_infile = $filename; # only receives its suffix if the file is actually written

  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (my $ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  my $gtoa; # stays undefined for directional libraries
  unless ($directional){
    $G_to_A_infile .= '_G_to_A.fastq';
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    ### a FastQ record consists of 4 lines
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs has already replaced tabs with underscores at this point, so this looks like it can no longer trigger
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

    if ($gtoa){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The result of closing the input is deliberately not checked: leaving the loop early (--upto) makes zcat exit
  ### with a non-zero status. Closing the output files flushes them, so write errors are reported here.
  close ($in);
  close ($ctot) or die "Failed to close $temp_dir$C_to_T_infile: $!\n";
  if ($gtoa){
    close ($gtoa) or die "Failed to close $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
4120
sub biTransformFastQFiles_paired_end {
  ### Writes bisulfite converted versions of one file of a paired-end FastQ data set to the global $temp_dir.
  ### For directional libraries read 1 is C->T converted and read 2 is G->A converted; otherwise both versions
  ### are written for either read. The read number is appended to each read ID (/1 or /2, and /1/1 or /2/2
  ### if $bowtie2 is set). Honours the global $skip and $upto settings.
  ### Returns the name(s) of the written file(s) without $temp_dir. Dies if $read_number is neither 1 nor 2.
  my ($file,$read_number) = @_;

  ### the converted files are named after the input file without its leading path
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### Lexical filehandles and explicit open modes. The list form of the pipe open hands the filename to zcat
  ### directly, so that spaces or shell metacharacters in the filename are harmless.
  my $in;
  if ($file =~ /\.gz$/){
    ### gzipped version of the infile
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fastq';
  my $G_to_A_infile = $filename.'_G_to_A.fastq';

  ### only the filehandles that are needed get opened, the others stay undefined
  my ($ctot,$gtoa);

  if ($directional){
    if ($read_number == 1){
      print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was $read_number!\n\n";
    }
  }
  else{
    print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;

  while (1){
    ### a FastQ record consists of 4 lines
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }
    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    ### the read number is appended to the read ID (in front of the trailing newline)
    my $suffix;
    if ($read_number == 1){
      $suffix = $bowtie2 ? '/1/1' : '/1';
    }
    elsif ($read_number == 2){
      $suffix = $bowtie2 ? '/2/2' : '/2';
    }
    else{
      die "Read number needs to be 1 or 2\n";
    }
    $identifier =~ s/$/$suffix/;

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    ### writing to whichever files were opened above (directional: C->T for read 1, G->A for read 2; otherwise both)
    print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score) if ($ctot);
    print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score) if ($gtoa);
  }

  ### The result of closing the input is deliberately not checked: leaving the loop early (--upto) makes zcat exit
  ### with a non-zero status. Closing the output files flushes them, so write errors are reported here.
  close ($in);
  if ($ctot){
    close ($ctot) or die "Failed to close $temp_dir$C_to_T_infile: $!\n";
  }
  if ($gtoa){
    close ($gtoa) or die "Failed to close $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    if ($read_number == 1){
      print "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
    }
    else{
      print "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    }
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  if ($directional){
    if ($read_number == 1){
      return ($C_to_T_infile);
    }
    else{
      return ($G_to_A_infile);
    }
  }
  else{
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
4260
sub fix_IDs{
  ### Returns a copy of a read ID line in which every run of spaces and/or tabs is replaced by a single
  ### underscore; all other characters (including a trailing newline) are left untouched.
  my $id = shift;

  ### transliterating spaces and tabs into underscores, squashing each run of replaced characters into one
  $id =~ tr/ \t/_/s;
  return $id;
}
4266
sub ensure_sensical_alignment_orientation_single_end{
  ### Checks whether a single-end alignment has the strand orientation that is possible for the alignment
  ### process it came from.
  ###
  ### Arguments:
  ###   $index  - index of the alignment process in the global @fhs array
  ###   $strand - strand of the alignment ('+' or '-')
  ###
  ### Returns 1 for a sensical orientation and increments the 'seen' counter of the process; returns 0 and
  ### increments its 'wrong_strand' counter for the opposite orientation. Any other strand value returns 0
  ### without counting anything. Dies if the name of the process is not one of the four known ones.
  my $index  = shift; # index number if the sequence produced an alignment
  my $strand = shift;

  ##############################################################################################################
  ## The only sensical orientation for each combination of converted read and converted genome:
  ##   CTreadCTgenome  FORWARD converted read against FORWARD converted genome  => (+)
  ##   CTreadGAgenome  FORWARD converted read against reverse converted genome  => (-)
  ##   GAreadCTgenome  Reverse converted read against FORWARD converted genome  => (-)
  ##   GAreadGAgenome  Reverse converted read against reverse converted genome  => (+)
  ##############################################################################################################
  my %sensical_strand_for = (
			     CTreadCTgenome => '+',
			     CTreadGAgenome => '-',
			     GAreadCTgenome => '-',
			     GAreadGAgenome => '+',
			    );

  my $name = $fhs[$index]->{name};
  unless (defined $name and exists $sensical_strand_for{$name}){
    die "One of the above conditions must be true\n";
  }

  ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one
  my $orientation = 0;

  if ($strand eq $sensical_strand_for{$name}){
    $fhs[$index]->{seen}++;
    $orientation = 1;
  }
  elsif ($strand eq '+' or $strand eq '-'){
    ### the alignment is on the opposite strand and thus nonsensical
    $fhs[$index]->{wrong_strand}++;
  }

  ### always returning explicitly, also for a strand that is neither '+' nor '-'
  return $orientation;
}
4339
sub ensure_sensical_alignment_orientation_paired_ends{
  ### Checks whether a paired-end alignment reported by the alignment process $fhs[$index] has the
  ### read orientation that is accepted for that process.
  ###
  ### Arguments:
  ###   $index              - position of the alignment process in @fhs
  ###   $id_1, $strand_1    - identifier (has to end in 1 or 2) and strand ('+'/'-') of the first reported read
  ###   $id_2, $strand_2    - identifier and strand of the second reported read
  ###
  ### Returns 1 (and increments the 'seen' counter of the process) for an accepted orientation,
  ### returns 0 (and increments the 'wrong_strand' counter) for the nonsensical one.
  ### Dies if the process name is unknown, or if the pair is not one read on (+) followed by the other read on (-).
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  my $process_name = $fhs[$index]->{name};

  ### Which read number has to be the (+) read of the pair for this alignment process:
  ###   CTread1GAread2CTgenome, GAread1CTread2GAgenome => read 1 (+) / read 2 (-)
  ###   GAread1CTread2CTgenome, CTread1GAread2GAgenome => read 2 (+) / read 1 (-)
  my $wanted_plus_read;
  if ($process_name eq 'CTread1GAread2CTgenome' or $process_name eq 'GAread1CTread2GAgenome') {
    $wanted_plus_read = 1;
  }
  elsif ($process_name eq 'GAread1CTread2CTgenome' or $process_name eq 'CTread1GAread2GAgenome') {
    $wanted_plus_read = 2;
  }
  else {
    die "One of the above conditions must be true\n";
  }

  ### Which read number actually is the (+) read. The first reported read always has to be on the (+) strand
  ### and the second one on the (-) strand, everything else is rejected as impossible.
  my $observed_plus_read;
  if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
    $observed_plus_read = 1;
  }
  elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
    $observed_plus_read = 2;
  }
  else {
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  ### correct orientation: count it and report success
  if ($observed_plus_read == $wanted_plus_read) {
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### the two reads are swapped relative to what this process accepts: nonsensical alignment
  $fhs[$index]->{wrong_strand}++;
  return 0;
}
4436
4437 #####################################################################################################################################################
4438
4439 ### Bowtie 1 (default) | PAIRED-END | FASTA
4440
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
  ### Starts one Bowtie (1) paired-end alignment per entry of @fhs (FastA input) by opening a pipe that reads
  ### from the Bowtie process, and stores the first two output lines of each process.
  ###
  ### The four file name arguments are only used for the status message; the files that are actually handed
  ### to Bowtie are taken from $fh->{inputfile_1} and $fh->{inputfile_2} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_seq_id, last_line_1 and last_line_2 are set; all three are undef if the
  ### process was skipped or did not produce two lines of output.
  ### Dies if the pipe cannot be opened, or if neither of the first two IDs carries the read 1 tag (/1).
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $input_message = $directional
    ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"
    : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
  print $input_message;

  ## Up to 4 instances of Bowtie are started below
  my $start_message = $directional
    ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $start_message;

  for my $fh (@fhs) {

    ### in directional mode, processes without an input file are not started at all
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    ### restricting Bowtie to the one read orientation that makes sense for this process
    my $bt_options = $bowtie_options;
    my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### a paired-end alignment consists of two consecutive output lines
    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    ### no (complete) alignment in the output: initialise everything as undefined
    unless ($line_1 and $line_2) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    chomp ($line_1, $line_2);
    my $id_1 = (split /\t/, $line_1)[0]; # sequence identifier = first column of the first line
    my $id_2 = (split /\t/, $line_2)[0]; # sequence identifier = first column of the second line

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be either
    ### read 1 or read 2. The ID of whichever line is read 1 (with its /1 tag removed) becomes last_seq_id
    if ($id_1 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    ### the lines are stored in the order in which Bowtie reported them
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
4519
4520 ### Bowtie 2 | PAIRED-END | FASTA
4521
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  ### Starts one Bowtie 2 paired-end alignment per entry of @fhs (FastA input) by opening a pipe that reads
  ### from the Bowtie 2 process, skips the SAM header and stores the first two alignment lines of each process.
  ###
  ### The four file name arguments are only used for the status message; the files that are actually handed
  ### to Bowtie 2 are taken from $fh->{inputfile_1} and $fh->{inputfile_2} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_line_1 and last_line_2 are set, and last_seq_id if one of the two IDs
  ### carries the read 1 tag. All three are undef if the process was skipped or produced no alignment lines.
  ### Dies if the pipe cannot be opened. Note that the global $_ is used as read buffer for the header lines.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $input_message = $directional
    ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"
    : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
  print $input_message;

  ## Up to 4 instances of Bowtie 2 are started below
  my $start_message = $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $start_message;

  for my $fh (@fhs) {

    ### in directional mode, processes without an input file are not started at all
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    ### restricting Bowtie 2 to the one read orientation that makes sense for this process
    my $bt2_options = $bowtie_options;
    my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format: reading on until the first line that is not a header line (headers start
    ### with @), or until there is no more output
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    my $line_1 = $_;
    my $line_2 = $fh->{fh}->getline();

    ### no (complete) alignment in the output: initialise everything as undefined
    unless ($line_1 and $line_2) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    chomp ($line_1, $line_2);
    my $id_1 = (split /\t/, $line_1)[0]; # sequence identifier = first column of the first line
    my $id_2 = (split /\t/, $line_2)[0]; # sequence identifier = first column of the second line

    ### The ID of whichever line is read 1 (with its /1 tag removed) becomes last_seq_id. Bowtie 2 clips off
    ### /1 or /2 line endings itself, so /1/1 or /2/2 had been added to start with
    if ($id_1 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      ### NOTE(review): this only warns and leaves last_seq_id unset, whereas the other three paired-end
      ### subs die at this point - confirm whether the softer handling is intended
      warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    ### the lines are stored in the order in which Bowtie 2 reported them
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
4609
4610 ### Bowtie 1 (default) | PAIRED-END | FASTQ
4611
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Starts one Bowtie (1) paired-end alignment per entry of @fhs (FastQ input) by opening a pipe that reads
  ### from the Bowtie process, and stores the first two output lines of each process.
  ###
  ### The four file name arguments are only used for the status message; the files that are actually handed
  ### to Bowtie are taken from $fh->{inputfile_1} and $fh->{inputfile_2} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_seq_id, last_line_1 and last_line_2 are set; all three are undef if the
  ### process was skipped or did not produce two lines of output.
  ### Dies if the pipe cannot be opened, or if neither of the first two IDs carries the read 1 tag (/1).
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $input_message = $directional
    ? "Input files are $C_to_T_infile_1 $G_to_A_infile_2 (FastQ)\n"
    : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  print $input_message;

  ## Up to 4 instances of Bowtie are started below
  my $start_message = $directional
    ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $start_message;

  for my $fh (@fhs) {

    ### in directional mode, processes without an input file are not started at all
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    ### restricting Bowtie to the one read orientation that makes sense for this process
    my $bt_options = $bowtie_options;
    my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### a paired-end alignment consists of two consecutive output lines
    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    ### no (complete) alignment in the output: initialise everything as undefined
    unless ($line_1 and $line_2) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    chomp ($line_1, $line_2);
    my $id_1 = (split /\t/, $line_1)[0]; # sequence identifier = first column of the first line
    my $id_2 = (split /\t/, $line_2)[0]; # sequence identifier = first column of the second line

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be either
    ### read 1 or read 2. The ID of whichever line is read 1 (with its /1 tag removed) becomes last_seq_id
    if ($id_1 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    ### the lines are stored in the order in which Bowtie reported them
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
4689
4690 ### Bowtie 2 | PAIRED-END | FASTQ
4691
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Starts one Bowtie 2 paired-end alignment per entry of @fhs (FastQ input) by opening a pipe that reads
  ### from the Bowtie 2 process, skips the SAM header and stores the first two alignment lines of each process.
  ###
  ### The four file name arguments are only used for the status message; the files that are actually handed
  ### to Bowtie 2 are taken from $fh->{inputfile_1} and $fh->{inputfile_2} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_seq_id, last_line_1 and last_line_2 are set; all three are undef if the
  ### process was skipped or produced no alignment lines.
  ### Dies if the pipe cannot be opened, or if neither of the first two IDs carries the read 1 tag (/1).
  ### Note that the global $_ is used as read buffer for the header lines.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $input_message = $directional
    ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"
    : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  print $input_message;

  ## Up to 4 instances of Bowtie 2 are started below
  my $start_message = $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $start_message;

  for my $fh (@fhs) {

    ### in directional mode, processes without an input file are not started at all
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    ### restricting Bowtie 2 to the one read orientation that makes sense for this process
    my $bt2_options = $bowtie_options;
    my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format: reading on until the first line that is not a header line (headers start
    ### with @), or until there is no more output
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    my $line_1 = $_;
    my $line_2 = $fh->{fh}->getline();

    ### no (complete) alignment in the output: initialise everything as undefined
    unless ($line_1 and $line_2) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    chomp ($line_1, $line_2);
    my $id_1 = (split /\t/, $line_1)[0]; # sequence identifier = first column of the first line
    my $id_2 = (split /\t/, $line_2)[0]; # sequence identifier = first column of the second line

    ### The ID of whichever line is read 1 (with its /1 tag removed) becomes last_seq_id. Bowtie 2 clips off
    ### /1 or /2 line endings itself, so /1/1 or /2/2 had been added to start with
    if ($id_1 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    ### the lines are stored in the order in which Bowtie 2 reported them
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
4780
4781 #####################################################################################################################################################
4782
4783 ### Bowtie 1 (default) | SINGLE-END | FASTA
sub single_end_align_fragments_to_bisulfite_genome_fastA {
  ### Starts one Bowtie (1) single-end alignment per entry of @fhs (FastA input) by opening a pipe that reads
  ### from the Bowtie process, and stores the first output line of each process.
  ###
  ### The two file name arguments are only used for the status message; the file that is actually handed to
  ### Bowtie is taken from $fh->{inputfile} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_seq_id and last_line are set (both undef if there was no output).
  ### Dies if the pipe cannot be opened. Note that the global $_ is used as read buffer.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  my $input_message = $directional
    ? "Input file is $C_to_T_infile (FastA)\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  print $input_message;

  ## Up to 4 instances of Bowtie are started below
  my $start_message = $directional
    ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $start_message;

  for my $fh (@fhs) {

    ### restricting Bowtie to the one read orientation that makes sense for this process
    my $bt_options = $bowtie_options;
    my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    $_ = $fh->{fh}->getline();

    ### no output at all: initialise last_seq_id and last_line as undefined
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      @{$fh}{qw(last_seq_id last_line)} = (undef, undef);
      next;
    }

    ### otherwise the first line of the output is stored, together with its first column (= the sequence identifier)
    chomp $_;
    $fh->{last_seq_id} = (split /\t/, $_)[0];
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
4832
4833 ### Bowtie 2 | SINGLE-END | FASTA
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  ### Starts one Bowtie 2 single-end alignment per entry of @fhs (FastA input) by opening a pipe that reads
  ### from the Bowtie 2 process, skips the SAM header and stores the first result line of each process.
  ###
  ### The two file name arguments are only used for the status message; the file that is actually handed to
  ### Bowtie 2 is taken from $fh->{inputfile} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_seq_id and last_line are set (both undef if there was no result line).
  ### Dies if the pipe cannot be opened. Note that the global $_ is used as read buffer.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  my $input_message = $directional
    ? "Input file is $C_to_T_infile (FastA)\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  print $input_message;

  ## Up to 4 instances of Bowtie 2 are started below
  my $start_message = $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $start_message;

  for my $fh (@fhs) {

    ### restricting Bowtie 2 to the one read orientation that makes sense for this process
    my $bt2_options = $bowtie_options;
    my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format: reading on until the first line that is not a header line (headers start
    ### with @), or until there is no more output
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    ### no result line: initialise last_seq_id and last_line as undefined. This should only happen at the
    ### end of a file for Bowtie 2 output
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      @{$fh}{qw(last_seq_id last_line)} = (undef, undef);
      next;
    }

    ### Bowtie 2 outputs a result line even for sequences without any alignments, so the first result line is
    ### stored together with its first column (= the sequence identifier)
    chomp $_;
    $fh->{last_seq_id} = (split /\t/, $_)[0];
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
4892
4893
4894 ### Bowtie 1 (default) | SINGLE-END | FASTQ
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Starts one Bowtie (1) single-end alignment per entry of @fhs (FastQ input) by opening a pipe that reads
  ### from the Bowtie process, and stores the first output line of each process.
  ###
  ### The two file name arguments are only used for the status message; the file that is actually handed to
  ### Bowtie is taken from $fh->{inputfile} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_seq_id and last_line are set (both undef if there was no output).
  ### Dies if the pipe cannot be opened. Note that the global $_ is used as read buffer.
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    print "Input file is $C_to_T_infile (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
  ## the data structure above
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    ### Bowtie has to be started with $bt_options (and not with the plain $bowtie_options), otherwise the
    ### --norc/--nofw restriction announced in the message above would never reach Bowtie. This is also what
    ### the FastA version of this sub does
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # if Bowtie produces an alignment we store the first line of the output
    $_ = $fh->{fh}->getline();
    if ($_) {
      chomp;
      my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $_;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
4942
4943 ### Bowtie 2 | SINGLE-END | FASTQ
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Starts one Bowtie 2 single-end alignment per entry of @fhs (FastQ input) by opening a pipe that reads
  ### from the Bowtie 2 process, skips the SAM header and stores the first result line of each process.
  ###
  ### The two file name arguments are only used for the status message; the file that is actually handed to
  ### Bowtie 2 is taken from $fh->{inputfile} (prefixed with $temp_dir).
  ###
  ### For each process the keys last_seq_id and last_line are set (both undef if there was no result line).
  ### Dies if the pipe cannot be opened. Note that the global $_ is used as read buffer.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  my $input_message = $directional
    ? "Input file is $C_to_T_infile (FastQ)\n\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
  print $input_message;

  ## Up to 4 instances of Bowtie 2 are started below
  my $start_message = $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $start_message;

  for my $fh (@fhs) {

    ### restricting Bowtie 2 to the one read orientation that makes sense for this process
    my $bt2_options = $bowtie_options;
    my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format: reading on until the first line that is not a header line (headers start
    ### with @), or until there is no more output
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    ### no result line: initialise last_seq_id and last_line as undefined. This should only happen at the
    ### end of a file for Bowtie 2 output
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      @{$fh}{qw(last_seq_id last_line)} = (undef, undef);
      next;
    }

    ### Bowtie 2 outputs a result line even for sequences without any alignments, so the first result line is
    ### stored together with its first column (= the sequence identifier)
    chomp $_;
    $fh->{last_seq_id} = (split /\t/, $_)[0];
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
5001
5002 ###########################################################################################################################################
5003
sub reset_counters_and_fhs{
  ### Re-initialises the two global data structures that are filled while one input file (or one pair of
  ### input files) is processed:
  ###   %counting - all counters are set back to 0
  ###   @fhs      - one fresh hash per alignment process (name, strand_identity, bisulfiteIndex and the
  ###               counters seen and wrong_strand)
  ###
  ### $filename is only inspected for a comma, which marks paired-end input. @fhs receives just the two
  ### CT-read processes for directional single-end input, and all four processes in every other case.
  my $filename = shift;

  my @counter_names = (
    ### methylation calls
    'total_meCHH_count',
    'total_meCHG_count',
    'total_meCpG_count',
    'total_unmethylated_CHH_count',
    'total_unmethylated_CHG_count',
    'total_unmethylated_CpG_count',
    ### sequence and alignment statistics
    'sequences_count',
    'no_single_alignment_found',
    'unsuitable_sequence_count',
    'genomic_sequence_could_not_be_extracted_count',
    'unique_best_alignment_count',
    'low_complexity_alignments_overruled_count',
    ### single-end strand counts
    'CT_CT_count',    #(CT read/CT genome, original top strand)
    'CT_GA_count',    #(CT read/GA genome, original bottom strand)
    'GA_CT_count',    #(GA read/CT genome, complementary to original top strand)
    'GA_GA_count',    #(GA read/GA genome, complementary to original bottom strand)
    ### paired-end strand counts
    'CT_GA_CT_count', #(CT read1/GA read2/CT genome, original top strand)
    'GA_CT_GA_count', #(GA read1/CT read2/GA genome, complementary to original bottom strand)
    'GA_CT_CT_count', #(GA read1/CT read2/CT genome, complementary to original top strand)
    'CT_GA_GA_count', #(CT read1/GA read2/GA genome, original bottom strand)
    'alignments_rejected_count', # only relevant if --directional was specified
  );
  %counting = map { ($_ => 0) } @counter_names;

  ### builds the record of a single alignment process with its counters at 0
  my $new_process = sub {
    my ($name,$strand_identity,$index_basename) = @_;
    return {
      name            => $name,
      strand_identity => $strand_identity,
      bisulfiteIndex  => $index_basename,
      seen            => 0,
      wrong_strand    => 0,
    };
  };

  my @all_processes = (
    $new_process->('CTreadCTgenome', 'con ori forward',       $CT_index_basename),
    $new_process->('CTreadGAgenome', 'con ori reverse',       $GA_index_basename),
    $new_process->('GAreadCTgenome', 'compl ori con forward', $CT_index_basename),
    $new_process->('GAreadGAgenome', 'compl ori con reverse', $GA_index_basename),
  );

  ### a file name without a comma is a single-end file
  my $directional_single_end = ($directional and $filename !~ /,/);

  @fhs = $directional_single_end ? @all_processes[0,1] : @all_processes;
}
5105
5106
### Parses and validates the command line and translates the Bismark options into the
### matching Bowtie 1 / Bowtie 2 options.
###
### Side effects: consumes @ARGV (options via GetOptions, then the genome folder, then any
### single-end file names), fills the file-level @filenames (one entry per single-end file,
### or "mate1,mate2" per paired-end pair), changes the working directory several times and
### prints progress messages. --help and --version print their text and exit; invalid
### input dies with an explanatory message.
###
### Returns the list
###   ($genome_folder, $CT_index_basename, $GA_index_basename, $path_to_bowtie,
###    $sequence_format, $bowtie_options, $directional, $unmapped, $multi_map, $phred64,
###    $solexa, $output_dir, $bowtie2, $vanilla, $sam_no_hd, $skip, $qupto, $temp_dir)
sub process_command_line{
  my @bowtie_options;
  my ($help,$version,$quiet);
  my ($mates1,$mates2);
  my $path_to_bowtie;
  my ($fastq,$fasta,$sequence_format);
  my ($skip,$qupto);
  my ($phred64,$phred33,$solexa);
  my ($mismatches,$seed_length,$ceiling,$best,$chunk);
  my ($minins,$maxins);
  my $non_directional;
  my ($unmapped,$multi_map);
  my ($output_dir,$temp_dir);
  my ($bowtie2,$vanilla,$sam_no_hd);
  my ($seed_extension_fails,$reseed_repetitive_seeds,$most_valid_alignments,$score_min,$parallel);

  ### Appends a trailing '/' to a path unless it already ends in one
  my $with_trailing_slash = sub {
    my ($path) = @_;
    $path =~ s/$/\// unless ($path =~ /\/$/);
    return $path;
  };

  ### Moves into $dir (creating it first if it does not exist yet) and returns its absolute
  ### path with a trailing '/'. $kind ('output' or 'temporary') is only used for the message
  my $resolve_directory = sub {
    my ($dir,$kind) = @_;
    $dir = $with_trailing_slash->($dir);
    unless (chdir $dir){
      mkdir $dir or die "Unable to create directory $dir $!\n";
      warn "Created $kind directory $dir!\n\n";
      chdir $dir or die "Failed to move to $dir\n";
    }
    return $with_trailing_slash->(getcwd()); # making the path absolute
  };

  # named $options_ok so that it does not shadow the file-level $command_line
  my $options_ok = GetOptions ('help|man'                => \$help,
                               '1=s'                     => \$mates1,
                               '2=s'                     => \$mates2,
                               'path_to_bowtie=s'        => \$path_to_bowtie,
                               'f|fasta'                 => \$fasta,
                               'q|fastq'                 => \$fastq,
                               's|skip=i'                => \$skip,
                               'u|upto=i'                => \$qupto,
                               'phred33-quals'           => \$phred33,
                               'phred64-quals|solexa1'   => \$phred64,
                               'solexa-quals'            => \$solexa,
                               'n|seedmms=i'             => \$mismatches,
                               'l|seedlen=i'             => \$seed_length,
                               'no_best'                 => \$best,
                               'version'                 => \$version,
                               'quiet'                   => \$quiet,
                               'chunkmbs=i'              => \$chunk,
                               'non_directional'         => \$non_directional,
                               'I|minins=i'              => \$minins,
                               'X|maxins=i'              => \$maxins,
                               'e|maqerr=i'              => \$ceiling,
                               'un|unmapped'             => \$unmapped,
                               'ambiguous'               => \$multi_map,
                               'o|output_dir=s'          => \$output_dir,
                               'bowtie2'                 => \$bowtie2,
                               'vanilla'                 => \$vanilla,
                               'sam-no-hd'               => \$sam_no_hd,
                               'D=i'                     => \$seed_extension_fails,
                               'R=i'                     => \$reseed_repetitive_seeds,
                               'score_min=s'             => \$score_min,
                               'most_valid_alignments=i' => \$most_valid_alignments,
                               'p=i'                     => \$parallel,
                               'temp_dir=s'              => \$temp_dir,
                              );

  ### EXIT ON ERROR if there were errors with any of the supplied options
  die "Please respecify command line options\n" unless ($options_ok);

  ### HELPFILE
  if ($help){
    print_helpfile();
    exit;
  }
  if ($version){
    print << "VERSION";


Bismark - Bisulfite Mapper and Methylation Caller.

Bismark Version: $bismark_version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
www.bioinformatics.babraham.ac.uk/projects/


VERSION
    exit;
  }


  ##########################
  ### PROCESSING OPTIONS ###
  ##########################

  # callers receive 0 rather than undef for switches that were not set
  $bowtie2   ||= 0;
  $sam_no_hd ||= 0;

  ### PATH TO BOWTIE
  ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
  if ($path_to_bowtie){
    $path_to_bowtie = $with_trailing_slash->($path_to_bowtie);
    unless (-d $path_to_bowtie){
      die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
    }
    $path_to_bowtie .= $bowtie2 ? 'bowtie2' : 'bowtie';
  }
  elsif ($bowtie2){
    $path_to_bowtie = 'bowtie2';
    warn "Path to Bowtie 2 specified as: $path_to_bowtie\n";
  }
  else{
    $path_to_bowtie = 'bowtie';
    warn "Path to Bowtie specified as: $path_to_bowtie\n";
  }

  ####################################
  ### PROCESSING ARGUMENTS

  ### GENOME FOLDER
  my $genome_folder = shift @ARGV; # mandatory
  unless ($genome_folder){
    warn "Genome folder was not specified!\n";
    print_helpfile();
    exit;
  }

  ### checking that the genome folder, all subfolders and the required bowtie index files exist
  $genome_folder = $with_trailing_slash->($genome_folder);

  if (chdir $genome_folder){
    my $absolute_genome_folder = $with_trailing_slash->(getcwd()); ## making the genome folder path absolute
    warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder')\n";
    $genome_folder = $absolute_genome_folder;
  }
  else{
    die "Failed to move to $genome_folder: $!\nUSAGE: Bismark.pl [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
  }

  my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
  my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";

  ### checking the integrity of $CT_dir and $GA_dir: each one has to contain all six index files.
  ### Every row holds: directory, index basename, error message (the %s is replaced with the missing file)
  my $index_extension = $bowtie2 ? 'bt2' : 'ebwt';
  my @index_checks;
  if ($bowtie2){ ### Bowtie 2 (new)
    @index_checks = (
      [$CT_dir,'BS_CT',"The Bowtie 2 index of the C->T converted genome seems to be faulty (%s). Please run the bismark_genome_preparation before running Bismark.\n"],
      [$GA_dir,'BS_GA',"The Bowtie 2 index of the G->A converted genome seems to be faulty (%s). Please run bismark_genome_preparation before running Bismark.\n"],
    );
  }
  else{ ### Bowtie 1 (default)
    @index_checks = (
      [$CT_dir,'BS_CT',"The Bowtie index of the C->T converted genome seems to be faulty (%s). Please run bismark_genome_preparation before running Bismark.\n"],
      [$GA_dir,'BS_GA',"The Bowtie index of the G->A converted genome seems to be faulty (%s). Please run bismark_genome_preparation before running Bismark.\n"],
    );
  }

  foreach my $check (@index_checks){
    my ($index_dir,$index_name,$message) = @{$check};
    chdir $index_dir or die "Failed to move to directory $index_dir: $!\n";
    foreach my $part (qw(1 2 3 4 rev.1 rev.2)){
      my $file = "$index_name.$part.$index_extension";
      die sprintf($message,$file) unless (-f $file);
    }
  }

  my $CT_index_basename = "${CT_dir}BS_CT";
  my $GA_index_basename = "${GA_dir}BS_GA";

  ### INPUT OPTIONS

  ### SEQUENCE FILE FORMAT
  ### exits if both fastA and FastQ were specified
  if ($fasta and $fastq){
    die "Only one sequence filetype can be specified (fastA or fastQ)\n";
  }

  ### unless fastA is specified explicitly, fastQ sequence format is expected by default
  if ($fasta){
    print "FastA format specified\n";
    $sequence_format = 'FASTA';
    push @bowtie_options, '-f';
  }
  elsif ($fastq){
    print "FastQ format specified\n";
    $sequence_format = 'FASTQ';
    push @bowtie_options, '-q';
  }
  else{
    $fastq = 1;
    print "FastQ format assumed (by default)\n";
    $sequence_format = 'FASTQ';
    push @bowtie_options, '-q';
  }

  ### SKIP and UPTO
  ### both values are only announced here and handed back to the caller; they are not added to the Bowtie options
  if ($skip){
    warn "Skipping the first $skip reads from the input file\n";
  }
  if ($qupto){
    warn "Processing sequences up to read no. $qupto from the input file\n";
  }

  ### QUALITY VALUES
  if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
    die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)\n";
  }
  if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values works only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options, ($bowtie2 ? '--phred33' : '--phred33-quals');
  }
  if ($phred64){
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values work only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options, ($bowtie2 ? '--phred64' : '--phred64-quals');
  }
  else{
    $phred64 = 0;
  }

  if ($solexa){
    if ($bowtie2){
      die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
    }
    # Solexa to Phred value conversion works only when -q is specified
    unless ($fastq){
      die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options,"--solexa-quals";
  }
  else{
    $solexa = 0;
  }

  ### ALIGNMENT OPTIONS

  ### MISMATCHES
  if (defined $mismatches){
    if ($bowtie2){
      unless ($mismatches == 0 or $mismatches == 1){
        die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
      }
      push @bowtie_options,"-N $mismatches";
    }
    else{
      unless ($mismatches >= 0 and $mismatches <= 3){
        die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
      }
      push @bowtie_options,"-n $mismatches";
    }
  }
  elsif (!$bowtie2){
    push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
  }

  ### SEED LENGTH
  if (defined $seed_length){
    push @bowtie_options, ($bowtie2 ? "-L $seed_length" : "-l $seed_length");
  }

  ### MISMATCH CEILING
  if (defined $ceiling){
    die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
    push @bowtie_options,"-e $ceiling";
  }


  ### BOWTIE 2 EFFORT OPTIONS

  ### CONSECUTIVE SEED EXTENSION FAILS
  if (defined $seed_extension_fails){
    die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-D $seed_extension_fails";
  }

  ### RE-SEEDING REPETITIVE SEEDS
  if (defined $reseed_repetitive_seeds){
    die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-R $reseed_repetitive_seeds";
  }


  ### BOWTIE 2 SCORING OPTIONS
  if ($score_min){
    die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($score_min =~ /^L,.+,.+$/){
      die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--score-min $score_min";
  }
  elsif ($bowtie2){
    push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
  }

  ### BOWTIE 2 PARALLELIZATION OPTIONS
  ### tested with 'defined' so that '-p 0' is rejected like every other value below 2 instead of being silently ignored
  if (defined $parallel){
    die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!\n" unless ($bowtie2);
    die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
    push @bowtie_options,"-p $parallel";
    push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is absolutely required for parallelization to work.
    print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
    sleep (2);
  }

  ### REPORTING OPTIONS

  if ($bowtie2){
    push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default

    ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7, so it is only reported and no longer passed on to Bowtie 2
    if (defined $most_valid_alignments){
      warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
    }
  }
  else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
    push @bowtie_options,'-k 2';
  }

  ### --BEST
  if ($bowtie2){
    if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
      die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
    }
  }
  elsif (!$best){
    # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
    push @bowtie_options,'--best';
  }

  ### VANILLA BISMARK (BOWTIE 1) OUTPUT
  if ($vanilla){
    if ($bowtie2){
      die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
    }
  }
  else{
    $vanilla = 0;
  }

  ### PAIRED-END MAPPING
  if ($mates1){
    my @mates1 = (split (/,/,$mates1));
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
    my @mates2 = (split(/,/,$mates2));
    unless (scalar @mates1 == scalar @mates2){
      die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
    }
    ### pairing the files up by position. An empty file name (e.g. from '-1 a,,b') is an error; stopping at the
    ### first empty name instead would silently drop all remaining pairs
    foreach my $pair (0..$#mates1){
      my ($mate1,$mate2) = ($mates1[$pair],$mates2[$pair]);
      unless (length($mate1) and length($mate2)){
        my $pair_no = $pair + 1;
        die "Paired-end mapping requires a file name for every mate (pair no. $pair_no has an empty file name), please respecify! (format: -1 <mates1> -2 <mates2>)\n";
      }
      push @filenames,"$mate1,$mate2";
    }
    if ($bowtie2){
      push @bowtie_options,'--no-mixed';     ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
      push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
    }
  }
  elsif ($mates2){
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
  }

  ### SINGLE-END MAPPING
  # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
  my $singles;
  unless ($mates1 and $mates2){
    $singles = join (',',@ARGV);
    unless ($singles){
      die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
    }
    $singles =~ s/\s/,/g;
    @filenames = (split(/,/,$singles));
    warn "\nFiles to be analysed:\n";
    warn "@filenames\n\n";
    sleep (3);
  }

  ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $minins){
    die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--minins $minins";
  }

  ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $maxins){
    die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--maxins $maxins";
  }
  elsif (!$singles){
    push @bowtie_options,'--maxins 500';
  }

  ### QUIET prints nothing besides alignments (suppresses warnings)
  if ($quiet){
    push @bowtie_options,'--quiet';
  }

  ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
  unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
    if (defined $chunk){
      push @bowtie_options,"--chunkmbs $chunk";
    }
    else{
      push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
    }
  }


  ### SUMMARY OF ALL BOWTIE OPTIONS
  my $bowtie_options = join (' ',@bowtie_options);


  ### STRAND-SPECIFIC LIBRARIES
  my $directional;
  if ($non_directional){
    print "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported.\n";
    sleep (3);
    $directional = 0;
  }
  else{
    print "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!).\n";
    sleep (3);
    $directional = 1; # Changed this to being the default behaviour
  }

  ### UNMAPPED SEQUENCE OUTPUT
  $unmapped ||= 0;

  ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
  $multi_map ||= 0;


  ### OUTPUT DIRECTORY
  ### relative paths are resolved against the directory Bismark was started from

  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($output_dir){
    $output_dir = $resolve_directory->($output_dir,'output');
    warn "Output will be written into the directory: $output_dir\n";
  }
  else{
    $output_dir = '';
  }

  ### TEMPORARY DIRECTORY for C->T and G->A transcribed files

  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($temp_dir){
    warn "\nUsing temp directory: $temp_dir\n";
    $temp_dir = $resolve_directory->($temp_dir,'temporary');
    warn "Temporary files will be written into the directory: $temp_dir\n";
  }
  else{
    $temp_dir = '';
  }


  return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir);
}
5690
5691
5692
### Writes the SAM header to the OUT filehandle: one @HD line, one @SQ line for every entry
### of the file-level %chromosomes hash (the length is taken from the stored sequence) and
### one @PG line holding the Bismark version and the command line Bismark was started with.
sub generate_SAM_header{
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";          # @HD = header, VN = version, SO = sort order

  # Perl does not return hash keys in a fixed order, so the names are sorted to get the
  # same header for the same genome on every run
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr});
    print OUT "\@SQ\tSN:$chr\tLN:$length\n";        # @SQ = sequence, SN = seq name, LN = length
  }

  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, VN = program version, CL = command line
}
5701
5702 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
5703 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
5704
### Prints one single-end alignment as a SAM line to the OUT filehandle.
###
### Arguments:
###   $id                      - read identifier (used as QNAME and as key into $methylation_call_params)
###   $actual_seq              - the read sequence
###   $methylation_call_params - hash reference; the entry for $id supplies alignment_strand, chromosome,
###                              position, unmodified_genomic_sequence, methylation_call, read_conversion
###                              and genome_conversion (plus CIGAR and indels when Bowtie 2 is used)
###   $qual                    - quality string of the read
###
### Dies if the strand or the read/genome conversion combination is not one of the four expected ones.
sub single_end_SAM_output{
  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;

  my $params            = $methylation_call_params->{$id};
  my $strand            = $params->{alignment_strand};
  my $chr               = $params->{chromosome};
  my $start             = $params->{position};
  my $ref_seq           = $params->{unmodified_genomic_sequence};
  my $methcall          = $params->{methylation_call};
  my $read_conversion   = $params->{read_conversion};
  my $genome_conversion = $params->{genome_conversion};

  ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
  ## FLAG: bitwise FLAG. Each bit is explained in the following table:
  ## Bit    Description                                                Comment                                Value
  ## 0x1    template has multiple segments in sequencing               0: single-end 1: paired end            value: 2**0 (  1)
  ## 0x2    each segment properly aligned according to the aligner     true only for paired-end alignments    value: 2**1 (  2)
  ## 0x4    segment unmapped                                           ---                                    ---
  ## 0x8    next segment in the template unmapped                      ---                                    ---
  ## 0x10   SEQ being reverse complemented                                                                    value: 2**4 ( 16)
  ## 0x20   SEQ of the next segment in the template being reversed                                            value: 2**5 ( 32)
  ## 0x40   the first segment in the template                          read 1                                 value: 2**6 ( 64)
  ## 0x80   the last segment in the template                           read 2                                 value: 2**7 (128)
  ## 0x100  secondary alignment                                        ---                                    ---
  ## 0x200  not passing quality controls                               ---                                    ---
  ## 0x400  PCR or optical duplicate                                   ---                                    ---

  #####

  ### FLAG value for every valid combination of alignment strand => read conversion => genome conversion
  my %sam_flag = (
    '+' => { CT => { CT => 0  },     # 0 for "+" strand (OT)
             GA => { GA => 16 } },   # 16 for "-" strand (CTOB, yields information for the original bottom strand)
    '-' => { CT => { GA => 16 },     # 16 for "-" strand (OB)
             GA => { CT => 0  } },   # 0 for "+" strand (CTOT, yields information for the original top strand)
  );

  unless (exists $sam_flag{$strand}){
    die "Unexpected strand information: $strand\n\n";
  }
  my $flag = $sam_flag{$strand}{$read_conversion}{$genome_conversion}; # FLAG variable used for SAM format
  unless (defined $flag){
    die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
  }

  #####

  my $mapq = 255; # Assume mapping quality is unavailable

  #####

  my $cigar;
  if ($bowtie2){
    $cigar = $params->{CIGAR};            # Actual CIGAR string reported by Bowtie 2
  }
  else{
    $cigar = length($actual_seq) . "M";   # Bowtie 1 output does not contain indels (only matches and mismatches)
  }

  #####

  # Paired-end fields, unused for single-end alignments
  my $rnext = "*";
  my $pnext = 0;
  my $tlen  = 0;

  #####

  ### the genomic sequence carries 2 additional nucleotides which have to be removed again
  if ($read_conversion eq 'CT'){
    $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
  }
  else{
    $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
  }

  my $on_reverse_strand = ($strand eq '-');
  if ($on_reverse_strand){
    $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand
    $ref_seq = revcomp($ref_seq);       # Required for comparison with actual sequence
    $qual = reverse $qual;              # if the sequence was reverse-complemented the quality string needs to be reversed as well
  }

  #####

  # Number of positions at which read and reference differ, as counted by hemming_dist()
  my $hemming_dist = hemming_dist($actual_seq,$ref_seq);
  if ($bowtie2){
    $hemming_dist += $params->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
  }

  my $NM_tag = "NM:i:$hemming_dist";    # Optional tag NM: edit distance based on nucleotide differences

  #####

  my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!)

  #####

  # Optional tag XM: Methylation Call String
  # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
  my $XM_tag = 'XM:Z:' . ($on_reverse_strand ? scalar reverse($methcall) : $methcall);

  #####

  my $XR_tag = "XR:Z:$read_conversion";   # Optional tag XR: Read Conversion

  #####

  my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion

  #####

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n";
}
5838
5839
5840 sub paired_end_SAM_output{
5841 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
5842 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
5843 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
5844 my $chr = $methylation_call_params->{$id}->{chromosome};
5845 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
5846 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
5847 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
5848 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
5849 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
5850 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
5851 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
5852 my $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
5853 my $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
5854
5855 my $id_1 = $id.'/1';
5856 my $id_2 = $id.'/2';
5857
5858 # Allows all degenerate nucleotide sequences in reference genome
5859 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
5860 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
5861
5862 my $index; # used to store the srand origin of the alignment in a less convoluted way
5863
5864 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
5865 $index = 0; ## this is OT (original top strand)
5866 }
5867 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
5868 $index = 1; ## this is CTOB (complementary to OB)
5869 }
5870 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
5871 $index = 2; ## this is CTOT (complementary to OT)
5872 }
5873 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
5874 $index = 3; ## this is OB (original bottom)
5875 }
5876 else {
5877 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
5878 }
5879
5880 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
5881 ### first or last position.
5882
5883 if ($index == 0 or $index == 3){ # OT or OB
5884 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
5885 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
5886 }
5887 else{ # CTOT or CTOB
5888 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
5889 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
5890 }
5891
5892 #####
5893
5894 my $start_read_1;
5895 my $start_read_2;
5896 # adjusting end positions
5897
5898 if ($bowtie2){
5899 $start_read_1 = $methylation_call_params->{$id}->{position_1};
5900 $start_read_2 = $methylation_call_params->{$id}->{position_2};
5901 }
5902 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
5903 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
5904 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
5905 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
5906 }
5907 else{ # read 1 is on the - strand
5908 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
5909 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
5910 }
5911 }
5912
5913 #####
5914
5915 my $end_read_1;
5916 my $end_read_2;
5917 # adjusting end positions
5918
5919 if ($bowtie2){
5920 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
5921 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
5922 }
5923 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
5924 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
5925 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
5926 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
5927 }
5928 else{
5929 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
5930 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
5931 }
5932 }
5933
5934 #####
5935
5936 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
5937 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
5938 ## Bit Description Comment Value
5939 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
5940 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
5941 ## 0x4 segment unmapped --- ---
5942 ## 0x8 next segment in the template unmapped --- ---
5943 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
5944 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
5945 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
5946 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
5947 ## 0x100 secondary alignment --- ---
5948 ## 0x200 not passing quality controls --- ---
5949 ## 0x400 PCR or optical duplicate --- ---
5950
5951 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
5952
5953 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
5954 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
5955
5956 my $flag_1; # FLAG variable used for SAM format
5957 my $flag_2;
5958
5959 if ($index == 0){ # OT
5960 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
5961 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
5962 }
5963 elsif ($index == 1){ # CTOB
5964 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
5965 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
5966 }
5967 elsif ($index == 2){ # CTOT
5968 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
5969 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
5970 }
5971 elsif ($index == 3){ # OB
5972 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
5973 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
5974 }
5975
5976 #####
5977
5978 my $mapq = 255; # Mapping quality is unavailable
5979
5980 #####
5981
5982 my $cigar_1;
5983 my $cigar_2;
5984
5985 if ($bowtie2){
5986 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
5987 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
5988 }
5989 else{
5990 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
5991 $cigar_2 = length($actual_seq_2) . "M";
5992 }
5993
5994 #####
5995
5996 my $rnext = '='; # Chromosome of mate; applies to both reads
5997
5998 #####
5999
6000 my $pnext_1 = $start_read_2; # Leftmost position of mate
6001 my $pnext_2 = $start_read_1;
6002
6003 #####
6004
6005 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
6006 my $tlen_2;
6007
6008 if ($bowtie2){
6009
6010 if ($start_read_1 <= $start_read_2){
6011
6012 # Read 1 alignment is leftmost
6013
6014 if ($end_read_2 >= $end_read_1){
6015
6016 # -------------------------> read 1 reads overlapping
6017 # <------------------------- read 2
6018 #
6019 # or
6020 #
6021 # -------------------------> read 1
6022 # <----------------------- read 2 read 2 contained within read 1
6023 #
6024 # or
6025 #
6026 # -------------------------> read 1 reads 1 and 2 exactly overlapping
6027 # <------------------------- read 2
6028 #
6029
6030 # dovetailing of reads is not enabled for Bowtie 2 alignments
6031
6032 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
6033 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
6034 }
6035 elsif ($end_read_2 < $end_read_1){
6036
6037 # -------------------------> read 1
6038 # <----------- read 2 read 2 contained within read 1
6039 #
6040 # or
6041 #
6042 # -------------------------> read 1
6043 # <----------- read 2 read 2 contained within read 1
6044
6045 # start and end of read 2 are fully contained within read 1
6046 $tlen_1 = 0; # Set as 0 when the information is unavailable
6047 $tlen_2 = 0; # Set as 0 when the information is unavailable
6048 }
6049
6050 }
6051
6052 elsif ($start_read_2 < $start_read_1){
6053
6054 if ($end_read_1 >= $end_read_2){
6055
6056 # Read 2 alignment is leftmost
6057
6058 # -------------------------> read 2 reads overlapping
6059 # <------------------------- read 1
6060 #
6061 # or
6062 #
6063 # -------------------------> read 2
6064 # <----------------------- read 1 read 1 contained within read 2
6065 #
6066 #
6067
6068 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
6069 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
6070 }
6071 elsif ($end_read_1 < $end_read_2){
6072
6073 # -------------------------> read 2
6074 # <----------- read 1 read 1 contained within read 2
6075 #
6076 # or
6077 #
6078 # -------------------------> read 2
6079 # <----------- read 1 read 1 contained within read 2
6080
6081 # start and end of read 1 are fully contained within read 2
6082 $tlen_1 = 0; # Set as 0 when the information is unavailable
6083 $tlen_2 = 0; # Set as 0 when the information is unavailable
6084 }
6085 }
6086 }
6087
6088 else{ # Bowtie 1
6089
6090 if ($end_read_2 >= $end_read_1){
6091 # Read 1 alignment is leftmost
6092 # -------------------------> read 1
6093 # <------------------------- read 2
6094 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
6095
6096 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
6097 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
6098 }
6099 else{
6100 # Read 2 alignment is leftmost
6101 # -------------------------> read 2
6102 # <------------------------- read 1
6103 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
6104
6105 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
6106 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
6107 }
6108 }
6109
6110 #####
6111
6112 # adjusting the strand of the sequence before we use them to generate mismatch strings
6113 if ($strand_1 eq '-'){
6114 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
6115 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
6116 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
6117 }
6118 if ($strand_2 eq '-'){
6119 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
6120 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
6121 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
6122 }
6123
6124 # print "$actual_seq_1\n$ref_seq_1\n\n";
6125 # print "$actual_seq_2\n$ref_seq_2\n\n";
6126
6127 #####
6128
6129 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
6130 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
6131 if ($bowtie2){
6132 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
6133 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
6134 }
6135 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
6136 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
6137
6138 #####
6139
6140 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
6141 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
6142
6143 #####
6144
6145 my $XM_tag_1; # Optional tag XM: Methylation call string
6146 my $XM_tag_2;
6147
6148 if ($strand_1 eq '-'){
6149 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
6150 }
6151 else{
6152 $XM_tag_1 = "XM:Z:$methcall_1";
6153 }
6154
6155 if ($strand_2 eq '-'){
6156 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
6157 }
6158 else{
6159 $XM_tag_2 = "XM:Z:$methcall_2";
6160 }
6161
6162 #####
6163
6164 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
6165 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
6166
6167 #####
6168
6169 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
6170
6171 #####
6172
6173 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
6174 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
6175 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
6176 }
6177
# Returns the reverse complement of a nucleotide sequence.
#
# Argument:
#   the sequence to convert (must be a true value; an empty string, "0" or a
#   missing argument aborts with "Missing seq to reverse complement").
#
# The bases A, C, G and T are complemented; lower-case input bases are emitted
# as upper-case complements. Every other character (e.g. N) is kept as is.
sub revcomp {
    my ($sequence) = @_;
    die "Missing seq to reverse complement\n" unless $sequence;

    # Reverse the string first, then swap each base for its partner
    my $complemented = scalar reverse $sequence;
    $complemented =~ tr/ACTGactg/TGACTGAC/;

    return $complemented;
}
6184
# Counts the positions at which a read differs from its reference sequence.
#
# Arguments (positional):
#   1. the read sequence
#   2. the reference sequence the read is compared against
#
# Returns the number of read positions whose character is not identical to the
# reference character at the same offset. Only the first length(read)
# characters of the reference are considered; read positions that have no
# reference counterpart (reference shorter than the read) count as differences.
# Missing (undef) arguments are treated as empty strings.
#
# NOTE(review): the comparison uses the bitwise string XOR operator, the same
# technique make_mismatch_string relies on, so both sequences are assumed to be
# plain single-byte strings (which holds for nucleotide data) - confirm if
# other input can ever reach this sub.
sub hemming_dist {
    my ($actual_seq, $ref_seq) = @_;

    # Force both operands to be strings so that '^' performs a string XOR
    my $read      = defined $actual_seq ? "$actual_seq" : '';
    my $reference = defined $ref_seq    ? "$ref_seq"    : '';

    # Reference bases beyond the end of the read are not part of the comparison
    my $ref_window = substr($reference, 0, length($read));

    # Identical characters XOR to "\0"; differing characters, and read characters
    # without a reference partner, leave a non-null byte behind
    my $difference = $read ^ $ref_window;

    # tr/// with /c and an empty replacement list merely counts the non-null bytes
    my $mismatches = ($difference =~ tr/\0//c);

    return $mismatches;
}
6194
# Builds the optional XX:Z: SAM tag describing base mismatches between a read
# and its reference sequence (indels are not represented).
#
# Arguments (positional):
#   1. the read sequence      (dies with "Missing actual sequence" if false)
#   2. the reference sequence (dies with "Missing reference sequence" if false)
#
# Returns a string starting with "XX:Z:" followed by alternating run lengths of
# matching positions and the reference base at each mismatching position, e.g.
# "XX:Z:2C1" for read ACGT against reference ACCT. Runs of length 0 are not
# written, so adjacent mismatches simply appear as consecutive reference bases.
# Positions beyond the end of the reference contribute no reference base.
sub make_mismatch_string {
    my $actual_seq = shift or die "Missing actual sequence";
    my $ref_seq    = shift or die "Missing reference sequence";

    my $XX_tag     = "XX:Z:";
    my $ref_length = length($ref_seq);

    # Bitwise string XOR: identical characters yield "\0", differences do not
    my $difference  = ($actual_seq ^ $ref_seq);
    my $prev_mm_pos = 0;                                  # 1-based position of the previous mismatch

    while ($difference =~ /[^\0]/g) {                     # visit every differing position
        my $mm_pos = pos($difference);                    # 1-based position of this mismatch

        # Number of matching positions since the previous mismatch
        my $nuc_match = $mm_pos - $prev_mm_pos - 1;
        $XX_tag .= "$nuc_match" if $nuc_match > 0;        # nothing to report for adjacent mismatches

        # Reference base at the mismatch. The variable is declared unconditionally
        # and assigned in a plain if-block: a 'my' combined with a statement
        # modifier has undefined behaviour in Perl.
        my $nuc_mm;
        if ($mm_pos <= $ref_length) {
            $nuc_mm = substr($ref_seq, $mm_pos - 1, 1);
        }
        $XX_tag .= "$nuc_mm" if defined $nuc_mm;          # read extends past the reference: no base to add

        $prev_mm_pos = $mm_pos;
    }

    # Matching positions between the last mismatch and the end of the reference
    my $end_matches = $ref_length - $prev_mm_pos;
    $XX_tag .= "$end_matches" if $end_matches > 0;        # nothing to add if the last position mismatched

    return $XX_tag;
}
6212
6213
6214
# Prints the Bismark help text (licence notice, description, usage, all command
# line options and the description of both output formats) to the currently
# selected output filehandle (STDOUT by default).
#
# Takes no arguments. The return value is that of print.
#
# The text is a non-interpolating here-document: it contains a literal '@' and
# no variables, so nothing in it is meant to be interpolated.
sub print_helpfile{
    print <<'HOW_TO';


This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.



DESCRIPTION


The following is a brief description of command line options and arguments to control the Bismark
bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
sequence from the genome and determine if there were any protected C's present or not.

As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
re-enabled by using --non_directional.

The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.


USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}


ARGUMENTS:

<genome_folder> The path to the folder containing the unmodified reference genome
as well as the subfolders created by the Bismark_Genome_Preparation
script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
Bismark expects one or more fastA files in this folder (file extension: .fa
or .fasta). The path can be relative or absolute.

-1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
"_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
correspond file-for-file and read-for-read with those specified in <mates2>.
Reads may be a mix of different lengths. Bismark will produce one mapping result
and one report file per paired-end input file pair.

-2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
"_2"), e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
correspond file-for-file and read-for-read with those specified in <mates1>.
Reads may be a mix of different lengths.

<singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
produce one mapping result and one report file per input file.


OPTIONS:


Input:

-q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
files (usually having extension .fq or .fastq). This is the default. See also
--solexa-quals.

-f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
files (usually having extension .fa, .mfa, .fna or similar). All quality values
are assumed to be 40 on the Phred scale.

-s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.

-u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.

--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.

--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.

--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
(which can't). The formula for conversion is:
phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
is usually the right option for use with (unconverted) reads emitted by the GA
Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.

--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
reads emitted by GA Pipeline version 1.3 or later. Default: off.

--path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
specified it is assumed that Bowtie (1 or 2) is in the PATH.


Alignment:

-n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).

-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).

-e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
quality values to the nearest 10 and saturates at 30. This value is not relevant for
Bowtie 2.

--chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
--best mode. Best-first search must keep track of many paths at once to ensure it is
always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
memory impact of the descriptors, but they can still grow very large in some cases. If
you receive an error message saying that chunk memory has been exhausted in --best mode,
try adjusting this parameter up to dedicate more memory to the descriptors. This value
is not relevant for Bowtie 2. Default: 512.

-I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
satisfied). A 19-bp gap would not be valid in that case. Default: 0.

-X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
A 61-bp gap would not be valid in that case. Default: 500.


Bowtie 1 Reporting:

-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
will be used by default.

--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
(i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
have Phred quality 10. When --best is not specified, Bowtie may report alignments that
are sub-optimal in terms of stratum and/or quality (though an effort is made to report
the best alignment). --best mode also removes all strand bias. Note that --best does not
affect which alignments are considered "valid" by Bowtie, only which valid alignments
are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
Default: on.

--no_best Disables the --best option which is on by default. This can speed up the alignment process,
e.g. for testing purposes, but for credible results it is not recommended to disable --best.


Output:

--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
bisulfite strands will be reported. Default: OFF.

(The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
to the original strands are merely theoretical and should not exist in reality. Specifying directional
alignments (which is the default) will only run 2 alignment threads to the original top (OT)
or bottom (OB) strands in parallel and report these alignments. This is the recommended option
for strand-specific libraries).

--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
split up into several smaller files to run concurrently and the output files are to be merged.

--quiet Print nothing besides alignments.

--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
of SAM format output.

-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
appear as they did in the input, without any translation of quality values that may have
taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.

--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
mismatches or other reads that fail to align uniquely to a file in the output directory.
Written reads will appear as they did in the input, without any of the translation of quality
values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
_ambiguous_reads_2.txt. These reads are not written to the file specified with --un.

-o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
to create it first. The path to the output folder can be either relative or absolute.

--temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
the specified folder does not exist, Bismark will attempt to create it first. The path to the
temporary folder can be either relative or absolute.



Other:

-h/--help Displays this help file.

-v/--version Displays version information.


BOWTIE 2 SPECIFIC OPTIONS

--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
alignments, i.e. searches for alignments involving all read characters (also called
untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
and/or quality trimmed where appropriate. Default: off.

Bowtie 2 alignment options:

-N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
Bowtie 1 see -n).

-L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
Bowtie 1 see -l).

--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
position to be the highest possible, regardless of the actual value. I.e. input is treated
as though all quality values are high. This is also the default behavior when the input
doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.


Bowtie 2 paired-end options:

--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
it cannot find a concordant or discordant alignment for a pair. This option is invariable
and on by default.

--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
A discordant alignment is an alignment where both mates align uniquely, but that does not
satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
and it is on by default.


Bowtie 2 effort options:

-D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
the alignments found so far. A seed extension "fails" if it does not yield a new best or a
new second-best alignment. Default: 15.

-R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
mismatches allowed) at different offsets and searches for more alignments. A read is considered
to have repetitive seeds if the total number of seed hits divided by the number of seeds
that aligned at least once is greater than 300. Default: 2.

Bowtie 2 parallelization options:


-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
automatically use the option '--reorder', which guarantees that output SAM records are printed in
an order corresponding to the order of the reads in the original input file, even when -p is set
greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
correspond to input order in that case.

Bowtie 2 Scoring options:

--score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
"valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
L,0,-0.2.


Bowtie 2 Reporting options:

-most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
effort expended to find valid alignments.

For reference, this used to be the old (now deprecated) description of -M:
Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
happens first. Only the best alignment is reported. Information from the other alignments is used to
estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
guarantee that the alignment reported is the best possible in terms of alignment score. -M is
always used and its default value is set to 10.


'VANILLA' Bismark OUTPUT:

Single-end output format (tab-separated):

(1) <seq-ID>
(2) <read alignment strand>
(3) <chromosome>
(4) <start position>
(5) <end position>
(6) <observed bisulfite sequence>
(7) <equivalent genomic sequence>
(8) <methylation call>
(9) <read conversion>
(10) <genome conversion>
(11) <read quality score (Phred33)>


Paired-end output format (tab-separated):
(1) <seq-ID>
(2) <read 1 alignment strand>
(3) <chromosome>
(4) <start position>
(5) <end position>
(6) <observed bisulfite sequence 1>
(7) <equivalent genomic sequence 1>
(8) <methylation call 1>
(9) <observed bisulfite sequence 2>
(10) <equivalent genomic sequence 2>
(11) <methylation call 2>
(12) <read 1 conversion>
(13) <genome conversion>
(14) <read 1 quality score (Phred33)>
(15) <read 2 quality score (Phred33)>


Bismark SAM OUTPUT (default):

(1) QNAME (seq-ID)
(2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
(3) RNAME (chromosome)
(4) POS (start position)
(5) MAPQ (always 255)
(6) CIGAR
(7) RNEXT
(8) PNEXT
(9) TLEN
(10) SEQ
(11) QUAL (Phred33 scale)
(12) NM-tag (edit distance to the reference)
(13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
(14) XM-tag (methylation call string)
(15) XR-tag (read conversion state for the alignment)
(16) XG-tag (genome conversion state for the alignment)

Each read of paired-end alignments is written out in a separate line in the above format.


This script was last edited on 31 July 2012.

HOW_TO
}