comparison bismark @ 0:62c6da72dd4a draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 09:57:36 -0400
parents
children 91f07ff056ca
comparison
equal deleted inserted replaced
-1:000000000000 0:62c6da72dd4a
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use IO::Handle;
5 use Cwd;
6 $|++;
7 use Getopt::Long;
8
9
10 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
11
12 ## This program is free software: you can redistribute it and/or modify
13 ## it under the terms of the GNU General Public License as published by
14 ## the Free Software Foundation, either version 3 of the License, or
15 ## (at your option) any later version.
16
17 ## This program is distributed in the hope that it will be useful,
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ## GNU General Public License for more details.
21
22 ## You should have received a copy of the GNU General Public License
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24
25
### Remember the directory Bismark was started from and the command line exactly
### as the user typed it (captured before any option is rewritten below)
my $parent_dir      = getcwd();
my $bismark_version = 'v0.7.12';
my $command_line    = join (" ",@ARGV);

### Getopt::Long fails on the '.' in the option name --solexa1.3-quals, so the
### option is renamed to --phred64-quals before the command line is processed
for (@ARGV){
  $_ = '--phred64-quals' if ($_ eq '--solexa1.3-quals');
}

my @filenames; # will be populated by processing the command line

### All run-time settings are returned by process_command_line() as one long list
my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
    $non_bs_mm,
    $insertion_open,
    $insertion_extend,
    $deletion_open,
    $deletion_extend,
    $gzip,
    $bam,
    $samtools_path,
    $pbat,
   ) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the reference genome
my %counting;    # counting various events

my $seqID_contains_tabs; # reset to 0 for every input file in the main loop
45
### Stores the bisulfite converted temporary files for the four paired-end
### entries of @fhs. Expects four [mate 1 file, mate 2 file] array references
### in @fhs order; entries that are undef are stored as undef.
my $store_paired_end_inputfiles = sub {
  my @mate_files = @_;
  foreach my $index (0..3){
    $fhs[$index]->{inputfile_1} = $mate_files[$index]->[0];
    $fhs[$index]->{inputfile_2} = $mate_files[$index]->[1];
  }
};

### Main loop: every entry of @filenames is either a single file (single-end) or
### two comma-separated files (paired-end). For each entry the reads are bisulfite
### converted, aligned, and then handed to the methylation call procedure.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){
    ### converted temporary files; anything that is not produced in the current mode stays undef
    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';

    warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    ### Bowtie 1 with --gzip writes read 1 and read 2 into one tab delimited temp
    ### file per conversion, so no separate mate 2 files exist in that case
    my $combined_gzip_temp_files = ($gzip && !$bowtie2);

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input files are in FastA format\n";

      if ($directional){
        ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }
      else{
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }

      ### entries 0 and 3 use C->T mate 1 with G->A mate 2, entries 1 and 2 use G->A mate 1
      ### with C->T mate 2; files that were not made above are undef and are stored as undef
      $store_paired_end_inputfiles->(
        [$C_to_T_infile_1,$G_to_A_infile_2],
        [$G_to_A_infile_1,$C_to_T_infile_2],
        [$G_to_A_infile_1,$C_to_T_infile_2],
        [$C_to_T_infile_1,$G_to_A_infile_2],
      );

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    ### FastQ format
    else{
      warn "Input files are in FastQ format\n";

      if ($directional){
        if ($combined_gzip_temp_files){
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }

        ### same layout as for FastA; only entries 0 and 3 receive files in directional mode
        $store_paired_end_inputfiles->(
          [$C_to_T_infile_1,$G_to_A_infile_2],
          [$G_to_A_infile_1,$C_to_T_infile_2],
          [$G_to_A_infile_1,$C_to_T_infile_2],
          [$C_to_T_infile_1,$G_to_A_infile_2],
        );
      }
      elsif ($pbat){ # PBAT-Seq
        ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

        ### only entries 1 and 2 are used for PBAT; entries 0 and 3 are explicitly left without files
        $store_paired_end_inputfiles->(
          [undef,undef],
          [$G_to_A_infile_1,$C_to_T_infile_2],
          [$G_to_A_infile_1,$C_to_T_infile_2],
          [undef,undef],
        );
      }
      else{
        if ($combined_gzip_temp_files){
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }

        ### all four entries receive files; inputfile_2 stays undef for combined gzip temp files
        $store_paired_end_inputfiles->(
          [$C_to_T_infile_1,$G_to_A_infile_2],
          [$G_to_A_infile_1,$C_to_T_infile_2],
          [$G_to_A_infile_1,$C_to_T_infile_2],
          [$C_to_T_infile_1,$G_to_A_infile_2],
        );
      }

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input file is in FastA format\n"; # message used to read 'Inut file'
      if ($directional){
        ($C_to_T_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
      }
    }

    ### FastQ format
    else{
      warn "Input file is in FastQ format\n";
      if ($directional){
        ($C_to_T_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      elsif ($pbat){
        ($G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating up to 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      elsif ($pbat){
        single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
      }
    }

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
  }
}
282
### Sets up all output for one single-end input file and then starts the methylation caller.
###
### Arguments: the original sequence file (optionally with a leading path) and the
### C->T and G->A converted temporary files, which are passed on unchanged.
###
### Opens the global filehandles OUT (alignment results), REPORT (run summary) and,
### if requested, UNMAPPED and AMBIG. All files are written to $output_dir and are
### named after the input file without its path. Dies if a file cannot be opened.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  my $filename;

  ### only the file name is used for naming output files, a leading path is dropped
  if ($sequence_file =~ /\//){
    (undef,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $sequence_file;
  }

  ### printing all alignments to a results file
  my $outfile = $filename;

  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile =~ s/$/_bt2_bismark.sam/;
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile =~ s/$/_bismark.txt/;
  }
  else{ # SAM is the default output
    $outfile =~ s/$/_bismark.sam/;
  }

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### anchored to the end of the name so that only the file extension is changed;
    ### an unanchored s/sam/bam/ would turn 'sample.fq_bismark.sam' into 'bample.fq_bismark.sam'
    $outfile =~ s/sam$/bam/;
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename;
  if ($bowtie2){
    $reportfile =~ s/$/_bt2_bismark_SE_report.txt/;
  }
  else{
    $reportfile =~ s/$/_bismark_SE_report.txt/;
  }

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  ### optional output files for reads without an alignment or with ambiguous alignments
  if ($unmapped){
    my $unmapped_file = $filename;
    $unmapped_file =~ s/$/_unmapped_reads.txt/;
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }
  if ($ambiguous){
    my $ambiguous_file = $filename;
    $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";


  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### a SAM header is written for every output format except vanilla, unless it was switched off
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
379
### Sets up all output for one pair of input files and then starts the paired-end methylation caller.
###
### Arguments: the two original sequence files (optionally with a leading path) followed
### by the C->T and G->A converted temporary files of mate 1 and of mate 2, which are
### passed on unchanged.
###
### Opens the global filehandles OUT (alignment results), REPORT (run summary) and,
### if requested, UNMAPPED_1/UNMAPPED_2 and AMBIG_1/AMBIG_2. All files are written to
### $output_dir; results and report are named after the mate 1 file. Dies if a file
### cannot be opened.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### only the file names are used for naming output files, leading paths are dropped
  my $filename_1;
  if ($sequence_file_1 =~ /\//){
    (undef,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename_1 = $sequence_file_1;
  }

  my $filename_2;
  if ($sequence_file_2 =~ /\//){
    (undef,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename_2 = $sequence_file_2;
  }

  ### printing all alignments to a results file
  my $outfile = $filename_1;
  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile =~ s/$/_bismark_bt2_pe.sam/;
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile =~ s/$/_bismark_pe.txt/;
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile =~ s/$/_bismark_pe.sam/;
  }

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### anchored to the end of the name so that only the file extension is changed;
    ### an unanchored s/sam/bam/ would turn 'sample_1.fq_bismark_pe.sam' into 'bample_1.fq_bismark_pe.sam'
    $outfile =~ s/sam$/bam/;
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename_1;
  if ($bowtie2){
    $reportfile =~ s/$/_bismark_bt2_PE_report.txt/;
  }
  else{
    $reportfile =~ s/$/_bismark_PE_report.txt/;
  }

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";


  ### Unmapped read output, one file per mate
  if ($unmapped){
    my $unmapped_1 = $filename_1;
    my $unmapped_2 = $filename_2;
    $unmapped_1 =~ s/$/_unmapped_reads_1.txt/;
    $unmapped_2 =~ s/$/_unmapped_reads_2.txt/;
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguous read output, one file per mate
  if ($ambiguous){
    my $amb_1 = $filename_1;
    my $amb_2 = $filename_2;
    $amb_1 =~ s/$/_ambiguous_reads_1.txt/;
    $amb_2 =~ s/$/_ambiguous_reads_2.txt/;
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### a SAM header is written for every output format except vanilla, unless it was switched off
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
493
### Deletes the temporary converted files of a single-end run and prints the final
### alignment and cytosine methylation summary.
###
### Arguments: the C->T and G->A converted temporary files (file names relative to $temp_dir).
###
### The summary is taken from the global %counting hash. It is written to the REPORT
### filehandle and echoed to the screen (partly via warn, partly via print, as before).
### A failed deletion of temporary files only produces a warning.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  ### All sequences from the original sequence file have been analysed now
  ### deleting temporary C->T or G->A infiles

  if ($directional){ # only the C->T file is deleted in this mode
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $C_to_T_infile properly $!\n";
    }
  }
  elsif ($pbat){ # only the G->A file is deleted in this mode
    my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $G_to_A_infile properly $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary files properly $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  print REPORT "Final Alignment report\n",'='x22,"\n";
  warn "Final Alignment report\n",'='x22,"\n";
  #  foreach my $index (0..$#fhs){
  #    print "$fhs[$index]->{name}\n";
  #    print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
  #    print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
  #  }

  ### printing a final report for the methylation call procedure
  warn "Sequences analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";

  ### mapping efficiency is reported as 0 if no sequences were analysed (avoids a division by zero)
  my $percent_alignable_sequences;
  if ($counting{sequences_count} == 0){
    $percent_alignable_sequences = 0;
  }
  else{
    $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
  print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";

  ### The percentage of overruled low complexity alignments used to be calculated here, but the
  ### line reporting it was commented out. The unused calculation (which divided by the
  ### sequence count without a zero check) has been removed.

  print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
  print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";

  print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
  print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";

  if ($directional){
    print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
  }

  ### detailed information about Cs analysed
  warn "Final Cytosine Methylation Report\n",'='x33,"\n";
  my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
  warn "Total number of C's analysed:\t$total_number_of_C\n\n";
  warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
  warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";

  print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
  print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
  ### NOTE: the space after the tab in the next line is part of the existing report format and is kept as it is
  print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n";
  print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
  print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";

  ### printing the methylation percentage for each context if applicable. A percentage can only
  ### be calculated if at least one C was seen in that context. This is tested on the counts
  ### themselves so that a result of 0.0% is reported as a percentage on purpose, and not merely
  ### because the string '0.0' happens to be true in Perl.
  foreach my $context ('CpG','CHG','CHH'){
    my $methylated   = $counting{"total_me${context}_count"};
    my $unmethylated = $counting{"total_unmethylated_${context}_count"};
    my $terminator   = ($context eq 'CHH') ? "\n\n\n" : "\n"; # the last context closes the section

    my $message;
    if (($methylated+$unmethylated) > 0){
      my $percent_methylated = sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
      $message = "C methylated in $context context:\t${percent_methylated}%${terminator}";
    }
    else{
      $message = "Can't determine percentage of methylated Cs in $context context if value was 0${terminator}";
    }

    warn $message;
    print REPORT $message;
  }

  if ($seqID_contains_tabs){
    warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
    print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
  }
}
645
646 sub print_final_analysis_report_paired_ends{
647 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
648 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
649 if ($directional){
650 if ($G_to_A_infile_2){
651 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
652 if ($deletion_successful == 2){
653 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
654 }
655 else{
656 warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
657 }
658 }
659 else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
660 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
661 if ($deletion_successful == 1){
662 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
663 }
664 else{
665 warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
666 }
667 }
668 }
669 else{
670 if ($G_to_A_infile_2 and $C_to_T_infile_2){
671 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
672 if ($deletion_successful == 4){
673 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
674 }
675 else{
676 warn "Could not delete temporary files properly: $!\n";
677 }
678 }
679 else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
680 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
681 if ($deletion_successful == 2){
682 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
683 }
684 else{
685 warn "Could not delete temporary files properly: $!\n";
686 }
687 }
688 }
689
690 ### printing a final report for the alignment procedure
691 warn "Final Alignment report\n",'='x22,"\n";
692 print REPORT "Final Alignment report\n",'='x22,"\n";
693 # foreach my $index (0..$#fhs){
694 # print "$fhs[$index]->{name}\n";
695 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
696 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
697 # }
698
699 ### printing a final report for the methylation call procedure
700 warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
701 print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
702
703 my $percent_alignable_sequence_pairs;
704 if ($counting{sequences_count} == 0){
705 $percent_alignable_sequence_pairs = 0;
706 }
707 else{
708 $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
709 }
710 print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
711 print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
712
713 print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
714 print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
715 print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
716 print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
717 print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
718
719
720 print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
721 print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
722 print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
723 print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
724 print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
725 ### detailed information about Cs analysed
726
727 if ($directional){
728 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
729 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
730 }
731
732 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
733 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
734
735 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
736 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
737 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
738 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
739 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
740 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
741 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
742 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
743
744 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
745 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
746 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
747 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
748 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
749 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
750 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
751
752 my $percent_meCHG;
753 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
754 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
755 }
756
757 my $percent_meCHH;
758 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
759 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
760 }
761
762 my $percent_meCpG;
763 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
764 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
765 }
766
767 ### printing methylated CpG percentage if applicable
768 if ($percent_meCpG){
769 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
770 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
771 }
772 else{
773 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
774 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
775 }
776
777 ### printing methylated C percentage in CHG context if applicable
778 if ($percent_meCHG){
779 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
780 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
781 }
782 else{
783 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
784 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
785 }
786
787 ### printing methylated C percentage in CHH context if applicable
788 if ($percent_meCHH){
789 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
790 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
791 }
792 else{
793 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
794 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
795 }
796
797 }
798
### Reads a single-end FastA file record by record (2 lines per record: header and sequence) and hands every
### sequence to the Bowtie 1 or Bowtie 2 result checker, which decides whether the read produced an alignment
### to one (or both) of the converted genomes in either the C->T or G->A version.
###
### Arguments:
###   $sequence_file - FastA file to read; a file name ending in .gz is decompressed through zcat
###   $C_to_T_infile - passed on unchanged to print_final_analysis_report_single_end
###   $G_to_A_infile - passed on unchanged to print_final_analysis_report_single_end
###
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} for every processed record and writes to the AMBIG / UNMAPPED filehandles.
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### gzipped version of the infile. The list form of the pipe open does not go through the shell, so file names
  ### containing spaces or shell metacharacters reach zcat verbatim
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' without a line ending is not mistaken for the end of the file
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers every record in the file, $counting{sequences_count} only the ones actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### the checkers receive the upper-cased sequence; no quality string is passed for FastA input
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }

    ### an empty/undefined return value is normalised to 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified (return value 2)
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified (return value 1)
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  ### The return value of close is ignored on purpose: if reading stopped early because of $upto, zcat is
  ### terminated by a broken pipe and close reports a failure although nothing went wrong
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
871
### Reads a single-end FastQ file record by record (4 lines per record: header, sequence, second header and
### quality string) and hands every sequence to the Bowtie 1 or Bowtie 2 result checker, which decides whether
### the read produced an alignment to one (or both) of the converted genomes in either the C->T or G->A version.
###
### Arguments:
###   $sequence_file - FastQ file to read; a file name ending in .gz is decompressed through zcat
###   $C_to_T_infile - passed on unchanged to print_final_analysis_report_single_end
###   $G_to_A_infile - passed on unchanged to print_final_analysis_report_single_end
###
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} for every processed record and writes to the AMBIG / UNMAPPED filehandles.
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### gzipped version of the infile. The list form of the pipe open does not go through the shell, so file names
  ### containing spaces or shell metacharacters reach zcat verbatim
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' without a line ending is not mistaken for the end of the file
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers every record in the file, $counting{sequences_count} only the ones actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;
    ### $identifier_2 is not chomped; it is written back out with its original line ending below

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers

    ### the checkers receive the upper-cased sequence together with the read ID and its quality string
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }

    ### an empty/undefined return value is normalised to 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified (return value 2)
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified (return value 1)
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }

  ### The return value of close is ignored on purpose: if reading stopped early because of $upto, zcat is
  ### terminated by a broken pipe and close reports a failure although nothing went wrong
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
951
### Processes the two FastA sequence files of a paired-end experiment jointly, one record (header and sequence
### line) from each file per iteration, and hands both reads of a pair to the Bowtie 1 or Bowtie 2 paired-end
### result checker, which decides whether the pair produced an alignment to one (or both) of the converted genomes
### (either the C->T or G->A version).
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - FastA files holding read 1 and read 2; a file name ending in .gz is
###                                        decompressed through zcat
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2 - passed on unchanged to print_final_analysis_report_paired_ends
###
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} for every processed pair, writes to the AMBIG_1/2 and UNMAPPED_1/2 filehandles and
### closes the OUT filehandle once all pairs have been processed.
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### gzipped version of the infiles. Compression is detected for each file individually so that a pair made of one
  ### gzipped and one plain file is read correctly. The list form of the pipe open does not go through the shell, so
  ### file names containing spaces or shell metacharacters reach zcat verbatim
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' without a line ending is not mistaken for the end of the file
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count numbers every pair in the files, $counting{sequences_count} only the ones actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    ### copies taken before chomping and before the leading '>' is removed; these are the header lines which get
    ### written verbatim to the ambiguous/unmapped files below
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### only the ID of read 1 is handed to the checkers; both sequences are passed upper-cased
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1);
    }

    ### an empty/undefined return value is normalised to 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified (return value 2)
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified (return value 1)
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  ### The return values of these closes are ignored on purpose: if reading stopped early because of $upto, zcat is
  ### terminated by a broken pipe and close reports a failure although nothing went wrong
  close IN1;
  close IN2;

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
1045
### Processes the two FastQ sequence files of a paired-end experiment jointly, one record (header, sequence,
### second header and quality string) from each file per iteration, and hands both reads of a pair to the
### Bowtie 1 or Bowtie 2 paired-end result checker, which decides whether the pair produced a paired-end
### alignment to one (or both) of the converted genomes (either C->T or G->A version).
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - FastQ files holding read 1 and read 2; a file name ending in .gz is
###                                        decompressed through zcat
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2 - passed on unchanged to print_final_analysis_report_paired_ends
###
### Reads the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} for every processed pair, writes to the AMBIG_1/2 and UNMAPPED_1/2 filehandles and
### closes the OUT filehandle once all pairs have been processed.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### gzipped version of the infiles. Compression is detected for each file individually so that a pair made of one
  ### gzipped and one plain file is read correctly. The list form of the pipe open does not go through the shell, so
  ### file names containing spaces or shell metacharacters reach zcat verbatim
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # second header line, only needed for the ambiguous/unmapped output
    my $quality_value_1 = <IN1>; # passed on to the alignment checker and written to the ambiguous/unmapped output
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # second header line, only needed for the ambiguous/unmapped output
    my $quality_value_2 = <IN2>; # passed on to the alignment checker and written to the ambiguous/unmapped output
    ### readline returns undef at the end of the input; testing for definedness (and not for truth) makes sure
    ### that a final line consisting of a single '0' without a line ending is not mistaken for the end of the file.
    ### The lines of a record are read in order, so a defined quality line implies a defined second header line
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count numbers every pair in the files, $counting{sequences_count} only the ones actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    ### copies taken before chomping and before the leading '@' is removed; these are the header lines which get
    ### written verbatim to the ambiguous/unmapped files below
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    ### only the ID of read 1 is handed to the checkers; both sequences are passed upper-cased
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }

    ### an empty/undefined return value is normalised to 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified (return value 2)
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified (return value 1)
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  ### The return values of these closes are ignored on purpose: if reading stopped early because of $upto, zcat is
  ### terminated by a broken pipe and close reports a failure although nothing went wrong
  close IN1;
  close IN2;

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
1157
1158 sub check_bowtie_results_single_end{
1159 my ($sequence,$identifier,$quality_value) = @_;
1160
1161 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
1162 $quality_value = 'I'x(length$sequence);
1163 }
1164
1165 my %mismatches = ();
1166 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
1167 foreach my $index (0..$#fhs){
1168
1169 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
1170 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
1171 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
1172 if ($fhs[$index]->{last_seq_id} eq $identifier) {
1173 ###############################################################
1174 ### STEP I Now processing the alignment stored in last_line ###
1175 ###############################################################
1176 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
1177 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
1178 ### we only continue to extract useful information about this alignment if 1 was returned
1179 if ($valid_alignment_found_1 == 1){
1180 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
1181 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
1182 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
1183
1184 unless($mismatch_info){
1185 $mismatch_info = '';
1186 }
1187
1188 chomp $mismatch_info;
1189 my $chromosome;
1190 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
1191 $chromosome = $mapped_chromosome;
1192 }
1193 else{
1194 die "Chromosome number extraction failed for $mapped_chromosome\n";
1195 }
1196 ### Now extracting the number of mismatches to the converted genome
1197 my $number_of_mismatches;
1198 if ($mismatch_info eq ''){
1199 $number_of_mismatches = 0;
1200 }
1201 elsif ($mismatch_info =~ /^\d/){
1202 my @mismatches = split (/,/,$mismatch_info);
1203 $number_of_mismatches = scalar @mismatches;
1204 }
1205 else{
1206 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
1207 }
1208 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1209 my $alignment_location = join (":",$chromosome,$position);
1210 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
1211 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
1212 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
1213 ### number for the found alignment)
1214 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
1215 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
1216 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
1217 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
1218 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
1219 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
1220 }
1221 $number_of_mismatches = undef;
1222 ##################################################################################################################################################
1223 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
1224 ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
1225 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
1226 ##################################################################################################################################################
1227 my $newline = $fhs[$index]->{fh}-> getline();
1228 if ($newline){
1229 my ($seq_id) = split (/\t/,$newline);
1230 $fhs[$index]->{last_seq_id} = $seq_id;
1231 $fhs[$index]->{last_line} = $newline;
1232 }
1233 else {
1234 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
1235 $fhs[$index]->{last_seq_id} = undef;
1236 $fhs[$index]->{last_line} = undef;
1237 next;
1238 }
1239 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
1240 ### we only continue to extract useful information about this second alignment if 1 was returned
1241 if ($valid_alignment_found_2 == 1){
1242 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
1243 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
1244 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
1245 unless($mismatch_info){
1246 $mismatch_info = '';
1247 }
1248 chomp $mismatch_info;
1249
1250 my $chromosome;
1251 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
1252 $chromosome = $mapped_chromosome;
1253 }
1254 else{
1255 die "Chromosome number extraction failed for $mapped_chromosome\n";
1256 }
1257
1258 ### Now extracting the number of mismatches to the converted genome
1259 my $number_of_mismatches;
1260 if ($mismatch_info eq ''){
1261 $number_of_mismatches = 0;
1262 }
1263 elsif ($mismatch_info =~ /^\d/){
1264 my @mismatches = split (/,/,$mismatch_info);
1265 $number_of_mismatches = scalar @mismatches;
1266 }
1267 else{
1268 die "Something weird is going on with the mismatch field\n";
1269 }
1270 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1271 ### extracting the chromosome number from the bowtie output (see above)
1272 my $alignment_location = join (":",$chromosome,$position);
1273 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
1274 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
1275 ### case we are not writing the same entry out a second time.
1276 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
1277 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
1278 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
1279 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
1280 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
1281 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
1282 }
1283 ####################################################################################################################################
1284 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
1285 ####################################################################################################################################
1286 $newline = $fhs[$index]->{fh}-> getline();
1287 if ($newline){
1288 my ($seq_id) = split (/\t/,$newline);
1289 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
1290 $fhs[$index]->{last_seq_id} = $seq_id;
1291 $fhs[$index]->{last_line} = $newline;
1292 next;
1293 }
1294 else {
1295 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
1296 $fhs[$index]->{last_seq_id} = undef;
1297 $fhs[$index]->{last_line} = undef;
1298 next;
1299 }
1300 ### still within the 2nd sequence in correct orientation found
1301 }
1302 ### still withing the 1st sequence in correct orientation found
1303 }
1304 ### still within the if (last_seq_id eq identifier) condition
1305 }
1306 ### still within foreach index loop
1307 }
1308 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
1309 unless(%mismatches){
1310 $counting{no_single_alignment_found}++;
1311 if ($unmapped){
1312 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
1313 }
1314 else{
1315 return;
1316 }
1317 }
1318 #######################################################################################################################################################
1319 #######################################################################################################################################################
1320 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
1321 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
1322 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
1323 #######################################################################################################################################################
1324 #######################################################################################################################################################
1325 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
1326 my $sequence_fails = 0;
1327 ### Declaring an empty hash reference which will store all information we need for the methylation call
1328 my $methylation_call_params; # hash reference!
1329 ### sorting in ascending order
1330 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
1331
1332 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
1333 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
1334 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
1335 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
1336 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
1337 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
1338 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
1339 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
1340 }
1341 }
1342 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
1343 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
1344 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
1345 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
1346 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
1347 ### reaction. E.g.
1348 ### CAGTCACGCGCGCGCG will become
1349 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
1350 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
1351 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
1352 ### G->A conversion:
1353 ### highly methylated: CAATCACACACACACA
1354 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
1355 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the
1356 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
1357 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
1358 ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
1359 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
1360 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
1361 ### In the above example the number of transliterations required to transform the actual sequence
1362 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
1363 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
1364 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
1365 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
1366 ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be
1367 ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded.
1368 my @three_candidate_seqs;
1369 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
1370 my $transliterations_performed;
1371 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
1372 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
1373 }
1374 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
1375 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
1376 }
1377 else{
1378 die "unexpected index number range $!\n";
1379 }
1380 push @three_candidate_seqs,{
1381 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
1382 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
1383 mismatch_number => $mismatch_number,
1384 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
1385 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
1386 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
1387 transliterations_performed => $transliterations_performed,
1388 };
1389 }
1390 ### sorting in ascending order for the lowest number of transliterations performed
1391 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
1392 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
1393 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
1394 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
1395 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
1396 if (($first_array_element*2) < $second_array_element){
1397 $counting{low_complexity_alignments_overruled_count}++;
1398 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
1399 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
1400 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
1401 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
1402 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
1403 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
1404 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
1405 }
1406 else{
1407 $sequence_fails = 1;
1408 }
1409 }
1410 else{
1411 $sequence_fails = 1;
1412 }
1413 ### after processing the alignment with the lowest number of mismatches we exit
1414 last;
1415 }
1416 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
1417 if ($sequence_fails == 1){
1418 $counting{unsuitable_sequence_count}++;
1419 if ($ambiguous){
1420 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
1421 }
1422 if ($unmapped){
1423 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
1424 }
1425 else{
1426 return 0; # => exits to next sequence (default)
1427 }
1428 }
1429
1430 ### --DIRECTIONAL
1431 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
1432 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
1433 if ($directional){
1434 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
1435 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
1436 $counting{alignments_rejected_count}++;
1437 return 0;
1438 }
1439 }
1440
1441 ### If the sequence has not been rejected so far it will have a unique best alignment
1442 $counting{unique_best_alignment_count}++;
1443 if ($pbat){
1444 extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params);
1445 }
1446 else{
1447 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
1448 }
1449
1450 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
1451 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
1452 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
1453 $counting{genomic_sequence_could_not_be_extracted_count}++;
1454 return 0;
1455 }
1456
1457 ### otherwise we are set to perform the actual methylation call
1458 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
1459
1460 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
1461 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
1462 }
1463
### Evaluates the Bowtie 2 (SAM) output of all alignment instances stored in @fhs for one single-end read, decides
### whether the read has a unique best alignment and, if it does, extracts the genomic sequence, performs the
### methylation call and prints the mapping result.
###
### Arguments:
###   $sequence       read sequence as it was read from the sequence file
###   $identifier     read ID; it is compared literally against the first column of the Bowtie 2 output
###   $quality_value  quality string of the read; if it is empty (FastA input) Phred 40 ('I') is used throughout
###
### Return value (tells the caller whether the read still needs to be written to an extra output file):
###   0  nothing left to do (read was reported or rejected, or no extra output was requested)
###   1  read did not align (or aligned ambiguously) and $unmapped is set
###   2  read aligned ambiguously and $ambiguous is set
###
### Dies if the Bowtie 2 output is inconsistent: an unaligned read or a read without a second best alignment is
### followed by another line for the same read, the reference name does not end in _CT_converted/_GA_converted,
### the AS:i or MD:Z field is missing, or the number of candidate locations is not between 1 and 4.
sub check_bowtie_results_single_end_bowtie2{
  my ($sequence,$identifier,$quality_value) = @_;

  unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
    $quality_value = 'I'x(length$sequence);
  }

  # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2
  # from the read name, so the identifier is used exactly as it was passed in

  my $alignment_ambiguous = 0; # set to 1 as soon as one instance reports two equally good alignments (AS == XS)
  my %alignments = ();         # candidate alignments of this read, keyed by "chromosome:position"

  ### reading from the Bowtie 2 output filehandles
  foreach my $index (0..$#fhs){

    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});

    ### the line stored for this instance belongs to a later read, i.e. this instance has nothing for the current read
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    # SAM format specifications for Bowtie 2 (tab-separated)
    # (1)  Name of read that aligned
    # (2)  Sum of all applicable flags. Flags relevant to Bowtie are:
    #        1 The read is one of a pair
    #        2 The alignment is one end of a proper paired-end alignment
    #        4 The read has no reported alignments
    #        8 The read is one of a pair and has no reported alignments
    #       16 The alignment is to the reverse reference strand
    #       32 The other mate in the paired-end alignment is aligned to the reverse reference strand
    #       64 The read is mate 1 in a pair
    #      128 The read is mate 2 in a pair
    #      256 The read has multiple mapping states
    # (3)  Name of reference sequence where alignment occurs (unmapped reads have a *)
    # (4)  1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
    # (5)  Mapping quality (255 means MAPQ is not available)
    # (6)  CIGAR string representation of alignment (* if unavailable)
    # (7)  Name of reference sequence where mate's alignment occurs (= if same as this alignment's, * if there is no mate)
    # (8)  1-based offset of the mate's alignment (0 if there is no mate)
    # (9)  Inferred fragment size (0 if there is no mate)
    # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
    # (11) ASCII-encoded read qualities (Phred scale, ASCII offset 33; reversed if the read aligned to the reverse strand)
    # (12) Optional fields, of which we use:
    #        AS:i:<N> Alignment score. Only present if SAM record is for an aligned read.
    #        XS:i:<N> Alignment score for second-best alignment. Only present if more than one alignment was found for the read.
    #        MD:Z:<S> A string representation of the mismatched reference bases in the alignment.
    #      (others: YS:i, XN:i, XM:i, XO:i, XG:i, NM:i, YF:Z)
    my @fields = split (/\t/,$fhs[$index]->{last_line});
    my ($id,$flag,$mapped_chromosome,$position,$cigar,$bowtie_sequence) = @fields[0,1,2,3,5,9];

    ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store
    ### the next alignment and move on to the next Bowtie 2 instance
    if ($flag == 4){
      ## reading in the next alignment, which must be the next sequence
      my $next_seq_id = _read_next_bowtie2_alignment_line($index);
      if (defined $next_seq_id and $next_seq_id eq $identifier){
	die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
      }
      next; # next instance
    }

    # if there are one or more proper alignments we can extract the chromosome name by removing the conversion suffix
    my $chromosome = $mapped_chromosome;
    unless ($chromosome =~ s/_(CT|GA)_converted$//){
      die "Chromosome number extraction failed for $mapped_chromosome\n";
    }

    ### We will use the optional fields to determine the best alignment. Every optional field is a complete TAG:TYPE:VALUE
    ### entry, so the tag is matched at the start of the field only
    my ($alignment_score,$second_best,$MD_tag);
    foreach my $optional_field (@fields[11..$#fields]){
      if ($optional_field =~ /^AS:i:(.*)/){
	$alignment_score = $1;
      }
      elsif ($optional_field =~ /^XS:i:(.*)/){
	$second_best = $1;
      }
      elsif ($optional_field =~ /^MD:Z:(.*)/){
	$MD_tag = $1;
      }
    }

    unless (defined $alignment_score and defined $MD_tag){
      ### reporting missing values as 'undefined' so that the error is not accompanied by 'uninitialized value' warnings
      my $reported_score  = defined $alignment_score ? $alignment_score : 'undefined';
      my $reported_MD_tag = defined $MD_tag          ? $MD_tag          : 'undefined';
      die "Failed to extract alignment score ($reported_score) and MD tag ($reported_MD_tag)!\n";
    }

    ### If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
    if (defined $second_best and $alignment_score == $second_best){
      $alignment_ambiguous = 1;
    }
    ### otherwise there is either no second best hit at all or its alignment score differs from the one of the current alignment, so we
    ### can store the current alignment
    else{
      my $alignment_location = join (":",$chromosome,$position);

      ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all
      ### the Cs (or Gs on the reverse strand) were methylated and therefore protected. Alternatively it will align better in one condition than
      ### in the other. In any case, it is not needed to overwrite the same positional entry with a second entry for the same location, as the
      ### genomic sequence extraction and methylation call would not be affected by this. The only thing which would change is the index number
      ### for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
      unless (exists $alignments{$alignment_location}){
	$alignments{$alignment_location} = {
					    seq_id          => $id,
					    alignment_score => $alignment_score,
					    bowtie_sequence => $bowtie_sequence,
					    index           => $index,
					    chromosome      => $chromosome,
					    position        => $position,
					    CIGAR           => $cigar,
					    MD_tag          => $MD_tag,
					   };
      }
    }

    if (defined $second_best){
      ### reading and discarding all additional alignments of this sequencing read until we hit the next sequence
      _skip_remaining_bowtie2_alignments($index,$identifier);
    }
    else{
      ### there was no second best hit, so the next line has to belong to the next sequence
      my $next_seq_id = _read_next_bowtie2_alignment_line($index);
      if (defined $next_seq_id and $next_seq_id eq $identifier){
	die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
      }
    }
  }

  ### if the read produced several ambiguous alignments within one instance we can return already now. If --ambiguous or --unmapped was
  ### specified the read sequence will be printed out by the caller.
  if ($alignment_ambiguous == 1){
    $counting{unsuitable_sequence_count}++;
    if ($ambiguous){
      return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
    }
    elsif ($unmapped){
      return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
    }
    else{
      return 0;
    }
  }

  ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
  unless (%alignments){
    $counting{no_single_alignment_found}++;
    if ($unmapped){
      return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
    }
    else{
      return 0; # default
    }
  }

  #######################################################################################################################################################

  ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
  ### single best position we are going to store the alignment information in $methylation_call_params. If there are multiple hits with the same (highest)
  ### alignment score we are discarding the sequence altogether.
  ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
  ### opening (5) and extending (3 per bp) the gap.

  #######################################################################################################################################################

  my $methylation_call_params; # hash reference which will store all information we need for the methylation call
  my $sequence_fails = 0;      # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)

  my $number_of_alignments = scalar keys %alignments;
  unless ($number_of_alignments >= 1 and $number_of_alignments <= 4){
    die "There are too many potential hits for this sequence (1-4 expected, but found: $number_of_alignments)\n";
  }

  ### sorting the candidate locations by descending alignment score; only the best two are needed for the decision. If there is only a single
  ### candidate $second_best_alignment_location stays undefined and the candidate is accepted as it is
  my ($best_alignment_location,$second_best_alignment_location) = sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments;

  ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
  if (defined $second_best_alignment_location and $alignments{$second_best_alignment_location}->{alignment_score} == $alignments{$best_alignment_location}->{alignment_score}){
    $sequence_fails = 1;
  }
  ### else we are going to store the best alignment for further processing
  else{
    foreach my $parameter (qw(bowtie_sequence chromosome position index alignment_score MD_tag CIGAR)){
      $methylation_call_params->{$identifier}->{$parameter} = $alignments{$best_alignment_location}->{$parameter};
    }
  }

  ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
  if ($sequence_fails == 1){
    $counting{unsuitable_sequence_count}++;
    if ($ambiguous){
      return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
    }
    elsif ($unmapped){
      return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
    }
    else{
      return 0; # => exits to next sequence (default)
    }
  }

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
  ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
  if ($directional){
    if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it has a unique best alignment
  $counting{unique_best_alignment_count}++;

  ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
  extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);

  ### check to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
  print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
  return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
}

### Reads the next line from the Bowtie 2 output of alignment instance $index and stores the chomped line and its read ID
### (first tab-separated column) as last_line and last_seq_id in @fhs. If no further line could be read (end of the Bowtie 2
### output) both entries are set to undef. Returns the read ID that was stored, or undef at the end of the output.
sub _read_next_bowtie2_alignment_line{
  my ($index) = @_;

  my $newline = $fhs[$index]->{fh}->getline();
  if ($newline){
    chomp $newline;
    my ($seq_id) = split (/\t/,$newline);
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $newline;
    return $seq_id;
  }

  # assigning undef to last_seq_id and last_line (end of Bowtie 2 output)
  $fhs[$index]->{last_seq_id} = undef;
  $fhs[$index]->{last_line}   = undef;
  return;
}

### Reads and discards lines from the Bowtie 2 output of alignment instance $index for as long as they belong to the read
### $identifier. Afterwards @fhs holds the first line of the next read, or undef entries if the output ended.
sub _skip_remaining_bowtie2_alignments{
  my ($index,$identifier) = @_;

  while (defined $fhs[$index]->{last_seq_id} and $fhs[$index]->{last_seq_id} eq $identifier){
    _read_next_bowtie2_alignment_line($index);
  }
  return;
}
1840
1841
### Returns the number of bases of $sequence that are affected by an in-silico bisulfite conversion of the read:
### the number of Cs for $read_conversion 'CT' (C->T), or the number of Gs for $read_conversion 'GA' (G->A).
### Only upper case bases are counted. Dies if $read_conversion is neither 'CT' nor 'GA'.
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  if (defined $read_conversion){
    ### tr/// with an empty replacement list merely counts the matching characters and leaves the sequence untouched
    if ($read_conversion eq 'CT'){
      return ($sequence =~ tr/C//);
    }
    elsif ($read_conversion eq 'GA'){
      return ($sequence =~ tr/G//);
    }
  }

  ### no system call is involved here, so the message names the offending value instead of reporting $!
  my $requested_conversion = defined $read_conversion ? "'$read_conversion'" : 'undef';
  die "Read conversion mode of the read was not specified (expected 'CT' or 'GA', got $requested_conversion)\n";
}
1856
### Decides whether alignment instance $index holds a usable Bowtie 1 alignment for the read $identifier.
###
### The line stored as last_line in @fhs is examined first. If it belongs to $identifier but was reported in an
### orientation that ensure_sensical_alignment_orientation_single_end() rejects, up to two further lines are read
### from the instance's filehandle: a second alignment of the same read is examined as well, and the first line
### that belongs to a different read is stored in @fhs for a later round.
###
### Returns 1 if @fhs now holds an alignment of $identifier in an accepted orientation, and 0 otherwise (the stored
### line belongs to a different read, or last_seq_id/last_line were set to undef because the output ended).
### Dies if the orientation check returns anything but 0 or 1, or if the same read ID shows up three times in a row.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # extracting from Bowtie 1 format
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
  unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){
    return 0;
  }

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are
  ### theoretically sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    return 1; ### 1st possibility for a sequence to pass
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation, so we need to read in a new line
  my $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }
  ($id,$strand) = (split (/\t/,$newline))[0,1];

  ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs
  unless ($id eq $identifier){
    $fhs[$index]->{last_seq_id} = $id;
    $fhs[$index]->{last_line} = $newline;
    return 0; # processing the new alignment result only in the next round
  }

  ### the next entry is still the correct sequence, so we are checking the orientation again
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $fhs[$index]->{last_seq_id} = $id;
    $fhs[$index]->{last_line} = $newline;
    return 1; ### 2nd possibility for a sequence to pass
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation again, so we need to read in yet another new line and store it in @fhs
  $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }

  ### the next line must not have the same seq ID again; otherwise the current seq-ID and bowtie output are overwritten with
  ### the same fields of the entry we just read. No system call failed here, so $! is not part of the error message
  my ($seq_id) = split (/\t/,$newline);
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seq_id eq $identifier);
  $fhs[$index]->{last_seq_id} = $seq_id;
  $fhs[$index]->{last_line} = $newline;
  return 0; # not processing anything this round as the alignment of the current read was in the wrong orientation
}
1934 #########################
1935 ### BOWTIE 1 | PAIRED-END
1936 #########################
1937
### Collects the Bowtie 1 alignments of one read pair from the four alignment filehandles in @fhs, decides whether the
### pair has a unique best alignment (lowest sum of mismatches of both reads) and, if so, extracts the genomic
### sequence, performs the methylation call and prints the result.
###
### Arguments: $sequence_1, $sequence_2           - sequences of read 1 and read 2
###            $identifier                        - ID of the read pair
###            $quality_value_1, $quality_value_2 - quality strings (optional, not available for FastA input)
### Returns:   0 - the pair was either printed or rejected without further output
###            1 - the pair is to be written out as unmapped (if --un has been specified)
###            2 - the pair is to be written out as ambiguous (if --ambiguous has been specified)
### Dies if an alignment line fails one of the sanity checks carried out while parsing.
sub check_bowtie_results_paired_ends{
    my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;

    ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
    unless ($quality_value_1){
        $quality_value_1 = 'I'x(length$sequence_1);
    }
    unless ($quality_value_2){
        $quality_value_2 = 'I'x(length$sequence_2);
    }

    ### alignments found for this read pair: sum of mismatches => "chromosome:start_1:start_2" => alignment details
    my %mismatches = ();

    ### For paired-end reads we look at alignments to the OT strand first (index 0), then the OB strand (index 3!!),
    ### similar to the single-end way. Alignments to the complementary strands come afterwards (CTOT got index 1 and
    ### CTOB got index 2). Because an already stored location is never overwritten, read pairs which contain no C or G
    ### at all, or only protected Cs, are attributed to the original strands. This makes no difference for the
    ### methylation calls, but it matters if alignments to the complementary strands are rejected by --directional
    INDEX: foreach my $index (0,3,1,2){
        ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
        next INDEX unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
        ### the buffered entry belongs to a different read pair, i.e. this pair produced no alignment here
        next INDEX unless ($fhs[$index]->{last_seq_id} eq $identifier);

        ### At most two alignments per read pair and index are taken into account. In each pass the buffered entry
        ### is validated (0 is returned for alignments in the wrong orientation and for entries which already belong
        ### to the next read pair), stored in %mismatches and then replaced by the following two lines of output
        PASS: foreach my $pass (1,2){
            last PASS unless (decide_whether_paired_end_alignment_is_valid($index,$identifier) == 1);
            _bowtie1_pe_store_alignment($index,\%mismatches);
            last PASS unless (_bowtie1_pe_load_next_pair($index)); # end of bowtie output
        }
    }

    ### if there was no single alignment found for this read pair we continue with the next one
    unless (%mismatches){
        $counting{no_single_alignment_found}++;
        return 1; ### the pair will be printed out as unmapped if --un has been specified
    }

    ### Only the alignments with the lowest sum of mismatches are of interest. If there is exactly one such location
    ### it is the unique best alignment, several different locations make the read pair ambiguous
    my ($lowest_mismatch_number) = sort {$a<=>$b} keys %mismatches;
    my @best_locations = keys %{$mismatches{$lowest_mismatch_number}};

    unless (scalar @best_locations == 1){
        $counting{unsuitable_sequence_count}++;
        if ($ambiguous){
            return 2; # => both reads are printed to the multiple alignments files if --ambiguous has been specified
        }
        if ($unmapped){
            return 1; # => both reads are printed to the unmapped files if --un has been specified
        }
        return 0; # => moving on to the next read pair (default)
    }

    ### hash reference holding all information needed for the methylation call
    my $best_alignment = $mismatches{$lowest_mismatch_number}->{$best_locations[0]};
    my $methylation_call_params = {
        $identifier => {
            seq_id                 => $identifier,
            bowtie_sequence_1      => $best_alignment->{bowtie_sequence_1},
            bowtie_sequence_2      => $best_alignment->{bowtie_sequence_2},
            chromosome             => $best_alignment->{chromosome},
            start_seq_1            => $best_alignment->{start_seq_1},
            start_seq_2            => $best_alignment->{start_seq_2},
            alignment_end          => ($best_alignment->{start_seq_2}+length($best_alignment->{bowtie_sequence_2})),
            index                  => $best_alignment->{index},
            number_of_mismatches_1 => $best_alignment->{number_of_mismatches_1},
            number_of_mismatches_2 => $best_alignment->{number_of_mismatches_2},
        },
    };

    ### --DIRECTIONAL
    ### With --directional only alignments to the original top strand or the original bottom strand are considered.
    ### Alignments to the strands complementary to the original strands (index 1 and 2) are discarded, as they should
    ### not exist in reality due to the library preparation protocol
    if ($directional){
        if ( ($best_alignment->{index} == 1) or ($best_alignment->{index} == 2) ){
            $counting{alignments_rejected_count}++;
            return 0;
        }
    }

    ### If the read pair has not been rejected so far it does have a unique best alignment
    $counting{unique_best_alignment_count}++;
    extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);

    ### the methylation call is only performed if both extracted genomic sequences have the length of the read +2
    if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
        warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
        $counting{genomic_sequence_could_not_be_extracted_count}++;
        return 0;
    }
    if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
        warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
        $counting{genomic_sequence_could_not_be_extracted_count}++;
        return 0;
    }

    ### otherwise we are set to perform the actual methylation call
    $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
    $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});

    print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
    return 0; ## otherwise 1 would be returned by default, which would print the read pair to the unmapped files
}

### Parses the alignment pair buffered in last_line_1 and last_line_2 of $fhs[$index] and files it in the hash
### referenced by $mismatches under {sum of mismatches}->{"chromosome:start_1:start_2"}.
sub _bowtie1_pe_store_alignment{
    my ($index,$mismatches) = @_;

    ### a limit of -1 keeps trailing empty fields, so that an empty mismatch field is never lost
    my ($id_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,2,3,4,7];
    my ($id_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,2,3,4,7];

    my $chromosome_1 = _bowtie1_pe_strip_conversion_suffix($mapped_chromosome_1);
    my $chromosome_2 = _bowtie1_pe_strip_conversion_suffix($mapped_chromosome_2);

    my $number_of_mismatches_1 = _bowtie1_pe_count_mismatches($mismatch_info_1);
    my $number_of_mismatches_2 = _bowtie1_pe_count_mismatches($mismatch_info_2);

    ### To decide whether a read pair has a unique best alignment we look at the sum of mismatches of both alignments
    my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;

    die "Position 1 is higher than position 2" if ($position_1 > $position_2);
    die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
    my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);

    ### If a read pair aligns to exactly the same location twice it does either not contain any C or G, or all Cs (or
    ### Gs on the reverse strand) were methylated and therefore protected. The first entry is kept, since a second
    ### entry for the same location would only change the index number of the found alignment
    return if (exists $mismatches->{$sum_of_mismatches}->{$alignment_location});

    $mismatches->{$sum_of_mismatches}->{$alignment_location} = {
        seq_id                 => $id_1,         # either is fine
        bowtie_sequence_1      => $bowtie_sequence_1,
        bowtie_sequence_2      => $bowtie_sequence_2,
        index                  => $index,
        chromosome             => $chromosome_1, # either is fine
        start_seq_1            => $position_1,
        start_seq_2            => $position_2,
        number_of_mismatches_1 => $number_of_mismatches_1,
        number_of_mismatches_2 => $number_of_mismatches_2,
    };
    return;
}

### Returns the chromosome name with the trailing _CT_converted or _GA_converted removed; dies if there is none.
sub _bowtie1_pe_strip_conversion_suffix{
    my ($mapped_chromosome) = @_;
    die "Chromosome number extraction failed for $mapped_chromosome\n" unless ($mapped_chromosome =~ s/_(CT|GA)_converted$//);
    return $mapped_chromosome;
}

### Returns the number of comma-separated entries in the mismatch field of a Bowtie 1 line (0 for an empty field).
sub _bowtie1_pe_count_mismatches{
    my ($mismatch_info) = @_;
    chomp $mismatch_info; # the mismatch field is the last one read from the line and may still carry the line ending
    return 0 if ($mismatch_info eq '');
    die "Something weird is going on with the mismatch field\n" unless ($mismatch_info =~ /^\d/);
    my @mismatches = split (/,/,$mismatch_info);
    return scalar @mismatches;
}

### Reads the next two lines from the filehandle of $fhs[$index] and stores them as last_line_1 and last_line_2,
### together with the read ID (without the /1 tag) as last_seq_id. Returns 1 on success. If fewer than two lines
### could be read all three fields are set to undef and 0 is returned. Dies if neither ID ends on '/1', since
### last_seq_id would otherwise no longer belong to the buffered lines.
sub _bowtie1_pe_load_next_pair{
    my ($index) = @_;

    my $newline_1 = $fhs[$index]->{fh}->getline();
    my $newline_2 = $fhs[$index]->{fh}->getline();

    unless ($newline_1 and $newline_2){
        # end of bowtie output
        $fhs[$index]->{last_seq_id} = undef;
        $fhs[$index]->{last_line_1} = undef;
        $fhs[$index]->{last_line_2} = undef;
        return 0;
    }

    my ($seq_id_1) = split (/\t/,$newline_1);
    my ($seq_id_2) = split (/\t/,$newline_2);

    if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
        $fhs[$index]->{last_seq_id} = $seq_id_1;
    }
    elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
        $fhs[$index]->{last_seq_id} = $seq_id_2;
    }
    else{
        die "Either read 1 or read 2 needs to end on '/1'\n";
    }

    $fhs[$index]->{last_line_1} = $newline_1;
    $fhs[$index]->{last_line_2} = $newline_2;
    return 1;
}
2263
2264 #########################
2265 ### BOWTIE 2 | PAIRED-END
2266 #########################
2267
2268 sub check_bowtie_results_paired_ends_bowtie2{
2269 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
2270
2271 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
2272 unless ($quality_value_1){
2273 $quality_value_1 = 'I'x(length$sequence_1);
2274 }
2275
2276 unless ($quality_value_2){
2277 $quality_value_2 = 'I'x(length$sequence_2);
2278 }
2279
2280
2281 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
2282
2283
2284 my %alignments;
2285 my $alignment_ambiguous = 0;
2286
2287 ### reading from the Bowtie 2 output filehandles
2288
2289 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
2290 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
2291 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
2292 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
2293 ### strands are not being reported when '--directional' is specified
2294
2295 foreach my $index (0,3,1,2){
2296 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2297 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
2298
2299 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
2300 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2301
2302 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
2303 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
2304 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
2305 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
2306 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
2307 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
2308 $id_1 =~ s/\/1$//;
2309 $id_2 =~ s/\/2$//;
2310
2311 # SAM format specifications for Bowtie 2
2312 # (1) Name of read that aligned
2313 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
2314 # 1 The read is one of a pair
2315 # 2 The alignment is one end of a proper paired-end alignment
2316 # 4 The read has no reported alignments
2317 # 8 The read is one of a pair and has no reported alignments
2318 # 16 The alignment is to the reverse reference strand
2319 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
2320 # 64 The read is mate 1 in a pair
2321 # 128 The read is mate 2 in a pair
2322 # 256 The read has multiple mapping states
2323 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
2324 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
2325 # (5) Mapping quality (255 means MAPQ is not available)
2326 # (6) CIGAR string representation of alignment (* if unavailable)
2327 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
2328 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
2329 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
2330 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
2331 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
2332 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
2333 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
2334 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
2335 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
2336 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
2337 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
2338 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2339 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2340 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
2341 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
2342 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
2343
2344 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
2345 ### We can store the next alignment and move on to the next Bowtie 2 instance
2346 if ($flag_1 == 77 and $flag_2 == 141){
2347 ## reading in the next alignment, which must be the next sequence
2348 my $newline_1 = $fhs[$index]->{fh}-> getline();
2349 my $newline_2 = $fhs[$index]->{fh}-> getline();
2350
2351 if ($newline_1 and $newline_2){
2352 chomp $newline_1;
2353 chomp $newline_2;
2354 my ($seq_id_1) = split (/\t/,$newline_1);
2355 my ($seq_id_2) = split (/\t/,$newline_2);
2356 $seq_id_1 =~ s/\/1$//;
2357 $seq_id_2 =~ s/\/2$//;
2358 $fhs[$index]->{last_seq_id} = $seq_id_1;
2359 $fhs[$index]->{last_line_1} = $newline_1;
2360 $fhs[$index]->{last_line_2} = $newline_2;
2361
2362 # print "current sequence ($identifier) did not map, reading in next sequence\n";
2363 # print "$index\t$fhs[$index]->{last_seq_id}\n";
2364 # print "$index\t$fhs[$index]->{last_line_1}\n";
2365 # print "$index\t$fhs[$index]->{last_line_2}\n";
2366 next; # next instance
2367 }
2368 else{
2369 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2370 $fhs[$index]->{last_seq_id} = undef;
2371 $fhs[$index]->{last_line_1} = undef;
2372 $fhs[$index]->{last_line_2} = undef;
2373 next;
2374 }
2375 }
2376
2377 ### If there are one or more proper alignments we can extract the chromosome number
2378 my ($chromosome_1,$chromosome_2);
2379 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
2380 $chromosome_1 = $mapped_chromosome_1;
2381 }
2382 else{
2383 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
2384 }
2385 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
2386 $chromosome_2 = $mapped_chromosome_2;
2387 }
2388 else{
2389 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
2390 }
2391
2392 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
2393
2394 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
2395 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
2396
2397 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
2398 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
2399
2400 foreach (11..$#fields_1){
2401 if ($fields_1[$_] =~ /AS:i:(.*)/){
2402 $alignment_score_1 = $1;
2403 }
2404 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
2405 $second_best_1 = $1;
2406 }
2407 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
2408 $MD_tag_1 = $1;
2409 }
2410 }
2411
2412 foreach (11..$#fields_2){
2413 if ($fields_2[$_] =~ /AS:i:(.*)/){
2414 $alignment_score_2 = $1;
2415 }
2416 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
2417 $second_best_2 = $1;
2418 }
2419 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
2420 $MD_tag_2 = $1;
2421 }
2422 }
2423
2424 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
2425 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
2426
2427 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
2428 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
2429 # warn "MD tag 1 is: '$MD_tag_1'\n";
2430 # warn "MD tag 2 is: '$MD_tag_2'\n";
2431
2432 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
2433 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
2434 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
2435
2436 if (defined $second_best_1 and defined $second_best_2){
2437 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
2438 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
2439 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
2440 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
2441
2442 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
2443 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
2444 $alignment_ambiguous = 1;
2445 # print "This read will be chucked (AS==XS detected)!\n";
2446
2447 ## need to read and discard all additional ambiguous reads until we reach the next sequence
2448 until ($fhs[$index]->{last_seq_id} ne $identifier){
2449 my $newline_1 = $fhs[$index]->{fh}-> getline();
2450 my $newline_2 = $fhs[$index]->{fh}-> getline();
2451 if ($newline_1 and $newline_2){
2452 chomp $newline_1;
2453 chomp $newline_2;
2454 my ($seq_id_1) = split (/\t/,$newline_1);
2455 my ($seq_id_2) = split (/\t/,$newline_2);
2456 $seq_id_1 =~ s/\/1$//;
2457 $seq_id_2 =~ s/\/2$//;
2458 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2459
2460 $fhs[$index]->{last_seq_id} = $seq_id_1;
2461 $fhs[$index]->{last_line_1} = $newline_1;
2462 $fhs[$index]->{last_line_2} = $newline_2;
2463 }
2464 else{
2465 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2466 $fhs[$index]->{last_seq_id} = undef;
2467 $fhs[$index]->{last_line_1} = undef;
2468 $fhs[$index]->{last_line_2} = undef;
2469 last; # break free if the end of the alignment output was reached
2470 }
2471 }
2472 # if ($fhs[$index]->{last_seq_id}){
2473 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2474 # }
2475 }
2476 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
2477
2478 my $alignment_location;
2479 if ($position_1 <= $position_2){
2480 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2481 }
2482 elsif($position_2 < $position_1){
2483 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
2484 }
2485
2486 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2487 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2488 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2489 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2490
2491 unless (exists $alignments{$alignment_location}){
2492 $alignments{$alignment_location}->{seq_id} = $id_1;
2493 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2494 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2495 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2496 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2497 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2498 $alignments{$alignment_location}->{index} = $index;
2499 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2500 $alignments{$alignment_location}->{position_1} = $position_1;
2501 $alignments{$alignment_location}->{position_2} = $position_2;
2502 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2503 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2504 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2505 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2506 $alignments{$alignment_location}->{flag_1} = $flag_1;
2507 $alignments{$alignment_location}->{flag_2} = $flag_2;
2508 }
2509 # warn "added best of several alignments to \%alignments hash\n";
2510
2511 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
2512 until ($fhs[$index]->{last_seq_id} ne $identifier){
2513 my $newline_1 = $fhs[$index]->{fh}-> getline();
2514 my $newline_2 = $fhs[$index]->{fh}-> getline();
2515 if ($newline_1 and $newline_2){
2516 chomp $newline_1;
2517 chomp $newline_2;
2518 my ($seq_id_1) = split (/\t/,$newline_1);
2519 my ($seq_id_2) = split (/\t/,$newline_2);
2520 $seq_id_1 =~ s/\/1$//;
2521 $seq_id_2 =~ s/\/2$//;
2522 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2523
2524 $fhs[$index]->{last_seq_id} = $seq_id_1;
2525 $fhs[$index]->{last_line_1} = $newline_1;
2526 $fhs[$index]->{last_line_2} = $newline_2;
2527 }
2528 else{
2529 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2530 $fhs[$index]->{last_seq_id} = undef;
2531 $fhs[$index]->{last_line_1} = undef;
2532 $fhs[$index]->{last_line_2} = undef;
2533 last; # break free if the end of the alignment output was reached
2534 }
2535 }
2536 # if($fhs[$index]->{last_seq_id}){
2537 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
2538 # }
2539 }
2540 }
2541 else{ # there is no second best hit, so we can just store this one and read in the next sequence
2542
2543 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2544 # print "$alignment_location\n";
2545 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2546 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2547 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2548 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2549
2550 unless (exists $alignments{$alignment_location}){
2551 $alignments{$alignment_location}->{seq_id} = $id_1;
2552 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2553 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2554 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2555 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2556 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2557 $alignments{$alignment_location}->{index} = $index;
2558 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2559 $alignments{$alignment_location}->{position_1} = $position_1;
2560 $alignments{$alignment_location}->{position_2} = $position_2;
2561 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2562 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2563 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2564 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2565 $alignments{$alignment_location}->{flag_1} = $flag_1;
2566 $alignments{$alignment_location}->{flag_2} = $flag_2;
2567 }
2568
2569 # warn "added unique alignment to \%alignments hash\n";
2570
2571 # Now reading and storing the next read pair
2572 my $newline_1 = $fhs[$index]->{fh}-> getline();
2573 my $newline_2 = $fhs[$index]->{fh}-> getline();
2574 if ($newline_1 and $newline_2){
2575 chomp $newline_1;
2576 chomp $newline_2;
2577 # print "$newline_1\n";
2578 # print "$newline_2\n";
2579 my ($seq_id_1) = split (/\t/,$newline_1);
2580 my ($seq_id_2) = split (/\t/,$newline_2);
2581 $seq_id_1 =~ s/\/1$//;
2582 $seq_id_2 =~ s/\/2$//;
2583 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2584
2585 $fhs[$index]->{last_seq_id} = $seq_id_1;
2586 $fhs[$index]->{last_line_1} = $newline_1;
2587 $fhs[$index]->{last_line_2} = $newline_2;
2588
2589 if ($seq_id_1 eq $identifier){
2590 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2591 }
2592 }
2593 else{
2594 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2595 $fhs[$index]->{last_seq_id} = undef;
2596 $fhs[$index]->{last_line_1} = undef;
2597 $fhs[$index]->{last_line_2} = undef;
2598 }
2599 }
2600 }
2601 }
2602
2603 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
2604 if ($alignment_ambiguous == 1){
2605 $counting{unsuitable_sequence_count}++;
2606 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2607 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2608 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2609 # print "$ambiguous_read_1\n";
2610 # print "$ambiguous_read_2\n";
2611
2612 if ($ambiguous){
2613 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2614 }
2615 elsif ($unmapped){
2616 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2617 }
2618 else{
2619 return 0;
2620 }
2621 }
2622
2623 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
2624 unless (%alignments){
2625 $counting{no_single_alignment_found}++;
2626
2627 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2628 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2629 # print "$unmapped_read_1\n";
2630 # print "$unmapped_read_2\n";
2631 if ($unmapped){
2632 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
2633 }
2634 else{
2635 return 0;
2636 }
2637 }
2638
2639 #######################################################################################################################################################
2640
2641 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
2642 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
2643 ### alignment score we are discarding the sequence pair altogether.
2644 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
2645 ### and extending (3 per bp) the gap.
2646
2647 #######################################################################################################################################################
2648
2649 ### Declaring an empty hash reference which will store all information we need for the methylation call
2650 my $methylation_call_params; # hash reference
2651 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2652
2653 ### print contents of %alignments for debugging
2654 ## if (scalar keys %alignments >= 1){
2655 # print "\n******\n";
2656 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
2657 # print "Loc: $alignment_location\n";
2658 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
2659 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
2660 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
2661 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
2662 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
2663 # print "Index $alignments{$alignment_location}->{index}\n";
2664 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
2665 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
2666 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
2667 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
2668 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
2669 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
2670 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
2671 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
2672 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
2673 # }
2674 # print "\n******\n";
2675 # }
2676
2677 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
2678 if (scalar keys %alignments == 1){
2679 for my $unique_best_alignment (keys %alignments){
2680 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
2681 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
2682 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
2683 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
2684 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
2685 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
2686 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
2687 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
2688 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
2689 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
2690 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
2691 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
2692 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
2693 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
2694 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
2695 }
2696 }
2697
2698 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
2699 ### we boot the sequence pair altogether)
2700 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
2701 my $best_sum_of_alignment_scores;
2702 my $best_alignment_location;
2703 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
2704 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
2705 unless (defined $best_sum_of_alignment_scores){
2706 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
2707 $best_alignment_location = $alignment_location;
2708 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
2709 }
2710 else{
2711 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
2712 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
2713 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
2714 $sequence_pair_fails = 1;
2715 last; # exiting since we know that the sequence has ambiguous alignments
2716 }
2717 ### else we are going to store the best alignment for further processing
2718 else{
2719 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
2720 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
2721 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
2722 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
2723 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
2724 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
2725 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
2726 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
2727 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
2728 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
2729 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
2730 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
2731 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
2732 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
2733 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
2734 last; # exiting since the sequence produced a unique best alignment
2735 }
2736 }
2737 }
2738 }
2739 else{
2740 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
2741 }
2742
2743 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
2744 if ($sequence_pair_fails == 1){
2745 $counting{unsuitable_sequence_count}++;
2746
2747 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2748 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2749 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2750 # print "$ambiguous_read_1\n";
2751 # print "$ambiguous_read_2\n";
2752
2753 if ($ambiguous){
2754 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2755 }
2756 elsif ($unmapped){
2757 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2758 }
2759 else{
2760 return 0; # => exits to next sequence pair (default)
2761 }
2762 }
2763
2764 ### --DIRECTIONAL
2765 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2766 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2767 if ($directional){
2768 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
2769 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2770 $counting{alignments_rejected_count}++;
2771 return 0;
2772 }
2773 }
2774
2775 ### If the sequence pair has not been rejected so far it does have a unique best alignment
2776 $counting{unique_best_alignment_count}++;
2777 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
2778
2779 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
2780 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2781 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
2782 $counting{genomic_sequence_could_not_be_extracted_count}++;
2783 return 0;
2784 }
2785 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2786 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
2787 $counting{genomic_sequence_could_not_be_extracted_count}++;
2788 return 0;
2789 }
2790
2791 ### now we are set to perform the actual methylation call
2792 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
2793 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
2794 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
2795 # print " $sequence_2\n";
2796 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
2797 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
2798
2799 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
2800 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
2801 }
2802
2803 ###
2804
### Decides whether the paired-end alignment currently buffered for alignment instance $index
### (in $fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2}) belongs to the sequence pair
### $identifier and was reported in a sensible orientation.
###
### Arguments:
###   $index      - position of the alignment instance in @fhs
###   $identifier - ID of the sequence pair currently being processed; it is compared against the
###                 read IDs of the buffered lines after their trailing /1 tag has been removed
###
### Returns 1 if a correctly oriented alignment of $identifier is stored in last_line_1 and last_line_2
### once the function is done (either the pair that was buffered already, or the pair of lines which
### directly followed it in the alignment output). Returns 0 in all other cases.
###
### Side effects: whenever the buffered alignment has to be skipped, up to two further pairs of lines
### are read from $fhs[$index]->{fh}, and last_seq_id, last_line_1 and last_line_2 are overwritten with
### the last pair read (or set to undef once the end of the alignment output was reached).
###
### Dies if the orientation check returns anything but 0 or 1, if neither read of a newly read pair
### ends on /1, or if the same sequence ID is seen a third time in a row.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### overwrites the buffered entry of this alignment instance; when called without arguments all
  ### three fields are set to undef (end of bowtie output)
  my $buffer_alignment = sub {
    my ($seqid,$line_1,$line_2) = @_;
    $fhs[$index]->{last_seq_id} = $seqid;
    $fhs[$index]->{last_line_1} = $line_1;
    $fhs[$index]->{last_line_2} = $line_2;
  };

  ### we need to capture the first read (ending on /1). Returns the ID of that read without the /1 tag,
  ### followed by both IDs as they look after the tag removal (only the read that matched is altered)
  my $strip_read_1_tag = sub {
    my ($first_id,$second_id) = @_;
    if ($first_id =~ s/\/1$//){
      return ($first_id,$first_id,$second_id);
    }
    elsif ($second_id =~ s/\/1$//){
      return ($second_id,$first_id,$second_id);
    }
    die "One of the two reads needs to end on /1!!";
  };

  ### only the read ID and the strand (the first two columns) of the buffered lines are needed here
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  ### either of the two lines may hold read 1, so the /1 tag is removed from both IDs
  my $seq_id_1 = $id_1;
  my $seq_id_2 = $id_2;
  $seq_id_1 =~ s/\/1$//;
  $seq_id_2 =~ s/\/1$//;

  ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair
  ### to be analysed -> analyse next round
  return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions,
  ### however only 4 of them are theoretically sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  return 1 if ($orientation == 1);   ### 1st possibility for A SEQUENCE-PAIR TO PASS
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    $buffer_alignment->();   # end of bowtie output
    return 0;                # the alignment that was stored in last_line_1 and _2 was in the wrong orientation
  }

  ### extract the IDs and strands again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  (my $seqid,$seq_id_1,$seq_id_2) = $strip_read_1_tag->($id_1,$id_2);

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it
  ### in @fhs and process it only in the next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $buffer_alignment->($seqid,$newline_1,$newline_2);
    return 0;
  }

  ### the new lines are a second alignment of the same sequence pair -> checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    ### Writing the current sequence to last_line_1 and last_line_2
    $buffer_alignment->($seqid,$newline_1,$newline_2);
    return 1;   ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation again, so we need to read in yet another 2 new lines and
  ### store them in @fhs (this must be the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    $buffer_alignment->();   # end of bowtie output
    return 0;                # both alignments of this sequence pair were in the wrong orientation
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);
  ($seqid) = $strip_read_1_tag->($seq_id_1,$seq_id_2);

  ### the next 2 lines must not have the same seq ID again; if they don't we overwrite the current seq-ID
  ### and bowtie output with the same fields of the entry we just read
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $buffer_alignment->($seqid,$newline_1,$newline_2);
  return 0;   # not processing anything this round as the alignments of $identifier were in the wrong orientation
}
2925
2926 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
2927
2928 sub extract_corresponding_genomic_sequence_paired_ends {
2929 my ($sequence_identifier,$methylation_call_params) = @_;
2930 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
2931 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
2932 my $alignment_read_1;
2933 my $alignment_read_2;
2934 my $read_conversion_info_1;
2935 my $read_conversion_info_2;
2936 my $genome_conversion;
2937
2938 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call
2939 ### if the C happens to be at the first or last position of the actually observed sequence
2940 my $non_bisulfite_sequence_1;
2941 my $non_bisulfite_sequence_2;
2942
2943 ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
2944 ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
2945 ### sequences around!
2946 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
2947 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
2948 ### [Index 0, sequence originated from (converted) forward strand]
2949 $counting{CT_GA_CT_count}++;
2950 $alignment_read_1 = '+';
2951 $alignment_read_2 = '-';
2952 $read_conversion_info_1 = 'CT';
2953 $read_conversion_info_2 = 'GA';
2954 $genome_conversion = 'CT';
2955 ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
2956 ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
2957
2958 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change
2959
2960 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
2961 ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation
2962 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1
2963
2964 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2);
2965 ### the reverse strand sequence needs to be reverse complemented
2966 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
2967 }
2968 else{
2969 $non_bisulfite_sequence_2 = '';
2970 }
2971 }
2972
2973 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
2974 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
2975 ### [Index 1, sequence originated from complementary to (converted) reverse strand]
2976 $counting{GA_CT_GA_count}++;
2977 $alignment_read_1 = '+';
2978 $alignment_read_2 = '-';
2979 $read_conversion_info_1 = 'GA';
2980 $read_conversion_info_2 = 'CT';
2981 $genome_conversion = 'GA';
2982
2983 ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
2984 ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
2985 if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1
2986 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2
2987 }
2988 else{
2989 $non_bisulfite_sequence_1 = '';
2990 }
2991
2992 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
2993 ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation
2994 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2
2995 ### the reverse strand sequence needs to be reverse complemented
2996 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
2997 }
2998
2999 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
3000 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
3001 ### [Index 2, sequence originated from the complementary to (converted) forward strand]
3002 $counting{GA_CT_CT_count}++;
3003 $alignment_read_1 = '-';
3004 $alignment_read_2 = '+';
3005 $read_conversion_info_1 = 'GA';
3006 $read_conversion_info_2 = 'CT';
3007 $genome_conversion = 'CT';
3008
3009 ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
3010 ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
3011 ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation
3012 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2
3013 ### the reverse strand sequence needs to be reverse complemented
3014 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
3015
3016 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
3017 ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
3018 ### Read 2 is CT converted so we need to capture 2 extra 3' bases
3019 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1
3020 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2
3021 }
3022 else{
3023 $non_bisulfite_sequence_2 = '';
3024 }
3025 }
3026
3027 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
3028 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
3029 ### [Index 3, sequence originated from the (converted) reverse strand]
3030 $counting{CT_GA_GA_count}++;
3031 $alignment_read_1 = '-';
3032 $alignment_read_2 = '+';
3033 $read_conversion_info_1 = 'CT';
3034 $read_conversion_info_2 = 'GA';
3035 $genome_conversion = 'GA';
3036
3037 ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
3038 ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
3039 ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation
3040 if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1
3041 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2
3042 ### the reverse strand sequence needs to be reverse complemented
3043 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
3044 }
3045 else{
3046 $non_bisulfite_sequence_1 = '';
3047 }
3048
3049 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
3050 ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
3051 ### Read 2 is GA converted so we need to capture 2 extra 5' bases
3052 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2
3053 }
3054 else{
3055 die "Too many bowtie result filehandles\n";
3056 }
3057 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
3058 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
3059
3060 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
3061 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
3062 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
3063 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
3064 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
3065 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
3066 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
3067 }
3068
3069 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
3070
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
  ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
  ###
  ### Rebuilds, for both reads of a Bowtie 2 paired-end alignment, the genomic sequence the read was aligned to
  ### (following its CIGAR string) with 2 extra bases added at one end, so that a CpG, CHG or CHH methylation
  ### call can also be made for a C at the last (or first) position of the observed sequence. It also gives the
  ### pair a 'memory' of the strand and conversion combination, which is needed later for the methylation call.
  ###
  ### Arguments:
  ###   $sequence_identifier     - key of the read pair in $methylation_call_params
  ###   $methylation_call_params - hash reference; the entry of the pair is read for index (0-3), chromosome,
  ###                              position_1/position_2 (1-based SAM positions) and CIGAR_1/CIGAR_2
  ###
  ### Written back into the entry of the pair: alignment_read_1/2, read_conversion_1/2, genome_conversion,
  ### unmodified_genomic_sequence_1/2, end_position_1/2 and indels_1/2. One of the strand counters in %counting
  ### is incremented as well.
  ###
  ### If the 2 extra bases of a read would lie outside of the chromosome, only the unmodified_genomic_sequence
  ### of that read is stored (in the state it has at that point, i.e. without the extra bases) and the subroutine
  ### returns immediately without storing anything else.
  ### NOTE(review): presumably the caller recognises this by the length of the stored sequence - verify against caller.
  ###
  ### Dies if a CIGAR string is malformed or contains operations other than M, I and D, or if index is not 0-3.
  my ($sequence_identifier,$methylation_call_params) = @_;

  my $pair       = ($methylation_call_params->{$sequence_identifier} ||= {});
  my $index      = $pair->{index};
  my $chromosome = $pair->{chromosome};
  my $cigar_1    = $pair->{CIGAR_1};
  my $cigar_2    = $pair->{CIGAR_2};

  ### For index 1 and 3 the 2 extra bases are taken directly in front of the alignment start, for index 0 and 2
  ### directly after the alignment end (both in forward strand coordinates, i.e. before reverse complementation)
  my $extra_bases_at_start = ($index == 1 or $index == 3);
  my $extra_bases_at_end   = ($index == 0 or $index == 2);

  ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
  my $pos_1 = $pair->{position_1}-1;
  my $pos_2 = $pair->{position_2}-1;

  ### both CIGAR strings are parsed (and validated) before any sequence is extracted
  my ($len_1,$ops_1) = _bowtie2_split_cigar($cigar_1,1);
  my ($len_2,$ops_2) = _bowtie2_split_cigar($cigar_2,2);

  my $non_bisulfite_sequence_1 = '';
  my $non_bisulfite_sequence_2 = '';

  ### Extracting read 1 genomic sequence ###

  if ($extra_bases_at_start){
    ### Offset 0 is a valid substring start, so the extra bases only fall off the chromosome if $pos_1-2 is negative.
    ### This check used to be '> 0', which needlessly rejected alignments starting at the third base of a
    ### chromosome and disagreed with the '>= 0' check used for read 2 below.
    unless ( ($pos_1-2) >= 0){ # right at the edge of a chromosome: exiting with an empty genomic sequence
      $pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      return;
    }
    $non_bisulfite_sequence_1 .= substr ($chromosomes{$chromosome},$pos_1-2,2);
  }

  ### $indels_1/2 are added to the Hamming distance value later on (needed for the NM field in the final SAM output)
  my ($aligned_1,$indels_1);
  ($aligned_1,$pos_1,$indels_1) = _bowtie2_genomic_sequence_along_cigar($chromosome,$pos_1,1,$cigar_1,$len_1,$ops_1);
  $non_bisulfite_sequence_1 .= $aligned_1;

  if ($extra_bases_at_end){
    unless (length($chromosomes{$chromosome}) >= $pos_1+2){ # right at the edge of a chromosome: exiting with the sequence lacking the 2 extra bases
      $pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      return;
    }
    $non_bisulfite_sequence_1 .= substr ($chromosomes{$chromosome},$pos_1,2);
  }

  ### Extracting read 2 genomic sequence ###

  if ($extra_bases_at_start){
    unless ( ($pos_2-2) >= 0){ # right at the edge of a chromosome: exiting with an empty genomic sequence
      $pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_2 .= substr ($chromosomes{$chromosome},$pos_2-2,2);
  }

  my ($aligned_2,$indels_2);
  ($aligned_2,$pos_2,$indels_2) = _bowtie2_genomic_sequence_along_cigar($chromosome,$pos_2,2,$cigar_2,$len_2,$ops_2);
  $non_bisulfite_sequence_2 .= $aligned_2;

  if ($extra_bases_at_end){
    unless (length($chromosomes{$chromosome}) >= $pos_2+2){ # right at the edge of a chromosome: exiting with the sequence lacking the 2 extra bases
      $pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_2 .= substr ($chromosomes{$chromosome},$pos_2,2);
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my ($alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion);

  ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one
  ### irrespective of whether read 1 or read 2 was the + alignment, so the two sequences never need to be swapped around here

  if ($index == 0){
    ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    ($alignment_read_1,$alignment_read_2) = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('CT','GA','CT');
    ### Read 2 is on the reverse strand, so its genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }
  elsif ($index == 1){
    ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
    ### [Index 1, sequence originated from complementary to (converted) bottom strand]
    $counting{GA_CT_GA_count}++;
    ($alignment_read_1,$alignment_read_2) = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('GA','CT','GA');
    ### Read 2 is on the reverse strand, so its genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }
  elsif ($index == 2){
    ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
    ### [Index 2, sequence originated from the complementary to (converted) top strand]
    $counting{GA_CT_CT_count}++;
    ($alignment_read_1,$alignment_read_2) = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('GA','CT','CT');
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }
  elsif ($index == 3){
    ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    ($alignment_read_1,$alignment_read_2) = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('CT','GA','GA');
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  $pair->{alignment_read_1}  = $alignment_read_1;
  $pair->{alignment_read_2}  = $alignment_read_2;
  $pair->{genome_conversion} = $genome_conversion;
  $pair->{read_conversion_1} = $read_conversion_info_1;
  $pair->{read_conversion_2} = $read_conversion_info_2;
  $pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
  ## after walking along the CIGAR string, $pos holds the (0-based) index just behind the last aligned genomic base
  $pair->{end_position_1} = $pos_1;
  $pair->{end_position_2} = $pos_2;
  $pair->{indels_1} = $indels_1;
  $pair->{indels_2} = $indels_2;
}

### Splits a CIGAR string into its operation lengths and its operations.
### Returns two array references (lengths, operations); dies if their numbers of elements differ.
### $read_number (1 or 2) is only used for the error message.
sub _bowtie2_split_cigar{
  my ($cigar,$read_number) = @_;

  my @lengths    = split (/\D+/,$cigar); # storing the length per operation
  my @operations = split (/\d+/,$cigar); # storing the operation
  shift @operations;                     # remove the empty first element

  die "CIGAR $read_number string contained a non-matching number of lengths and operations\n" unless (scalar @lengths == scalar @operations);
  return (\@lengths,\@operations);
}

### Walks along a parsed CIGAR string, starting at the 0-based index $pos of chromosome $chromosome.
### Returns a list of (genomic sequence, position behind the last aligned base, number of inserted or deleted bases).
### Dies on any operation other than M, I or D; $cigar and $read_number are only used for the error messages.
sub _bowtie2_genomic_sequence_along_cigar{
  my ($chromosome,$pos,$read_number,$cigar,$lengths,$operations) = @_;

  my $genomic_sequence = '';
  my $indels = 0;

  foreach my $element (0..$#{$lengths}){
    my $length    = $lengths->[$element];
    my $operation = $operations->[$element];

    if ($operation eq 'M'){
      # extracting genomic sequence and adjusting the position
      $genomic_sequence .= substr ($chromosomes{$chromosome},$pos,$length);
      $pos += $length;
    }
    elsif ($operation eq 'I'){ # insertion in the read sequence
      # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls.
      # The position doesn't need adjusting
      $genomic_sequence .= 'N' x $length;
      $indels += $length;
    }
    elsif ($operation eq 'D'){ # deletion in the read sequence
      # we do not add any genomic sequence but only adjust the position
      $pos += $length;
      $indels += $length;
    }
    elsif ($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
      die "The CIGAR $read_number string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR $read_number string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  return ($genomic_sequence,$pos,$indels);
}
3294
3295 ##########################################
3296 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
3297 ##########################################
3298
sub print_bisulfite_mapping_result_single_end{
  ### Writes one single-end Bowtie 1 alignment and its methylation call to the output.
  ###   $identifier              - read ID, also the key of the read in $methylation_call_params
  ###   $sequence                - the read sequence
  ###   $methylation_call_params - hash reference holding the alignment details of the read
  ###   $quality_value           - quality string of the read in its original encoding
  ### Side effect: the stored position of the read is incremented by 1.
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position of single-end reads is shifted by +1 bp
  $methylation_call_params->{$identifier}->{position} += 1;

  ### unless the old custom (vanilla) format was requested the read is written out in SAM format
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value) unless ($vanilla);

  ### vanilla output: one tab separated line per uniquely mapped read
  my $hit = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    (map { $hit->{$_} } qw(alignment_strand chromosome position end_position)),
    $sequence,
    (map { $hit->{$_} } qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)),
    $quality_value,
  );
  print OUT join("\t",@columns)."\n";
}
3322
3323 ##########################################
3324 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
3325 ##########################################
3326
sub print_bisulfite_mapping_result_single_end_bowtie2{
  ### Writes one single-end Bowtie 2 alignment and its methylation call to the SAM output
  ### (unmapped and ambiguous reads were already printed).
  ### Takes the read ID, the read sequence, the hash reference with the alignment details and the quality string.
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  my $sanger_quality = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                     : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                     :            $quality_value;

  single_end_SAM_output($identifier,$sequence,$methylation_call_params,$sanger_quality);
}
3341
3342 ##########################################
3343 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
3344 ##########################################
3345
sub print_bisulfite_mapping_results_paired_ends{
  ### Writes one paired-end Bowtie 1 alignment and the methylation calls of both reads to the output.
  ###   $identifier                         - ID of the read pair, also its key in $methylation_call_params
  ###   $sequence_1, $sequence_2            - the sequences of read 1 and read 2
  ###   $methylation_call_params            - hash reference holding the alignment details of the pair
  ###   $quality_value_1, $quality_value_2  - quality strings of both reads in their original encoding
  ### Side effect: the stored start_seq_1 of the pair is incremented by 1.
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp (the end position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;

  ### unless the old custom (vanilla) format was requested the read pair is written out in SAM format
  return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) unless ($vanilla);

  ### vanilla output: one tab separated line per aligned read pair
  my $hit = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    (map { $hit->{$_} } qw(alignment_read_1 chromosome start_seq_1 alignment_end)),
    $sequence_1,
    (map { $hit->{$_} } qw(unmodified_genomic_sequence_1 methylation_call_1)),
    $sequence_2,
    (map { $hit->{$_} } qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)),
    $quality_value_1,
    $quality_value_2,
  );
  print OUT join("\t",@columns)."\n";
}
3372
3373 ##########################################
3374 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
3375 ##########################################
3376
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  ### Writes one paired-end Bowtie 2 alignment and the methylation calls of both reads to the SAM output
  ### (unmapped and ambiguous reads were already printed).
  ### Takes the ID of the pair, both read sequences, the hash reference with the alignment details and both quality strings.
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are always written in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
}
3394
3395
sub convert_phred64_quals_to_phred33{
  ### Re-encodes a whole quality string from the Phred 64 scale to the Phred 33 (Sanger) scale.
  ### Takes the quality string and returns the converted string of the same length.
  my ($phred64_quality) = @_;

  ### every character is first turned into its Phred score and then back into a Phred 33 character
  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_phred64_quality_string_into_phred_score($_) )
  } split (//,$phred64_quality);

  return join ("",@phred33_characters);
}
3411
sub convert_solexa_quals_to_phred33{
  ### Re-encodes a whole quality string from the Solexa (pre 1.3) scale to the Phred 33 (Sanger) scale.
  ### Takes the quality string and returns the converted string of the same length.
  my ($solexa_quality) = @_;

  ### every character is first turned into its Phred score and then back into a Phred 33 character
  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_solexa_pre1_3_quality_string_into_phred_score($_) )
  } split (//,$solexa_quality);

  return join ("",@phred33_characters);
}
3427
sub convert_phred_score_into_phred33_quality_string{
  ### Turns a numeric Phred score into its quality character on the Phred 33 scale (character code = score + 33)
  my ($phred_score) = @_;
  return chr(33 + $phred_score);
}
3433
sub convert_phred64_quality_string_into_phred_score{
  ### Turns a quality character on the Phred 64 scale into its numeric Phred score (score = character code - 64).
  ### ord() only looks at the first character of the string it is given.
  my ($quality_character) = @_;
  return ord($quality_character) - 64;
}
3439
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  ### Turns a Solexa (pre 1.3) quality character into an approximate numeric Phred score.
  ### We simply use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same,
  ### there is only a minute difference for values between 0 and 10.
  ### ord() only looks at the first character of the string it is given.
  my ($quality_character) = @_;
  return ord($quality_character) - 59;
}
3446
3447
3448 sub extract_corresponding_genomic_sequence_single_end {
3449 my ($sequence_identifier,$methylation_call_params) = @_;
3450 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
3451 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
3452
3453 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
3454 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
3455 my $alignment_strand;
3456 my $read_conversion_info;
3457 my $genome_conversion;
3458 ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and
3459 ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation,
3460 ### if the C happens to be at the last position of the actually observed sequence
3461 my $non_bisulfite_sequence;
3462 ### depending on the conversion we want to make need to capture 1 extra base at the 3' end
3463
3464 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
3465 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
3466 ### [Index 0, sequence originated from (converted) forward strand]
3467 $counting{CT_CT_count}++;
3468 $alignment_strand = '+';
3469 $read_conversion_info = 'CT';
3470 $genome_conversion = 'CT';
3471
3472 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3473 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1
3474 ### + 2 extra base at the 3' end
3475 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
3476 }
3477 else{
3478 $non_bisulfite_sequence = '';
3479 }
3480 }
3481
3482 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
3483 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
3484 ### [Index 1, sequence originated from (converted) reverse strand]
3485 $counting{CT_GA_count}++;
3486 $alignment_strand = '-';
3487 $read_conversion_info = 'CT';
3488 $genome_conversion = 'GA';
3489
3490 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3491 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from >
3492 ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation
3493 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
3494 ## reverse complement!
3495 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
3496 }
3497 else{
3498 $non_bisulfite_sequence = '';
3499 }
3500 }
3501
3502 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
3503 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
3504 ### [Index 2, sequence originated from complementary to (converted) forward strand]
3505 $counting{GA_CT_count}++;
3506 $alignment_strand = '-';
3507 $read_conversion_info = 'GA';
3508 $genome_conversion = 'CT';
3509
3510 ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
3511 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3512 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012
3513 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
3514 ## reverse complement!
3515 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
3516 }
3517 else{
3518 $non_bisulfite_sequence = '';
3519 }
3520 }
3521
3522 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
3523 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
3524 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
3525 $counting{GA_GA_count}++;
3526 $alignment_strand = '+';
3527 $read_conversion_info = 'GA';
3528 $genome_conversion = 'GA';
3529
3530 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3531 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from >
3532 ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand
3533 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
3534 }
3535 else{
3536 $non_bisulfite_sequence = '';
3537 }
3538 }
3539 else{
3540 die "Too many bowtie result filehandles\n";
3541 }
3542
3543 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
3544 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
3545 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
3546 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
3547
3548 ### at this point we can also determine the end position of a read
3549 $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence});
3550 }
3551
sub extract_corresponding_genomic_sequence_single_end_pbat {
  ### Extracts the stretch of the unconverted genome a single-end read (PBAT variant) aligned to, plus 2 extra
  ### bases so that a C at the very edge of the read can still be placed in CpG, CHG or CHH context.
  ###
  ### Arguments: the read identifier and a reference to the hash holding the per-read alignment parameters.
  ###            The fields index, chromosome, position and bowtie_sequence of the read's entry are read.
  ### Stores:    alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence (an empty
  ###            string if the 2 extra bases would fall off the chromosome) and end_position in the same entry.
  ### Globals:   reads %chromosomes and increments one of the four strand counters in %counting.
  ### Dies if the shifted index is not one of 0, 1, 2 or 3.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### all values of this read live in one hash; it is created in place should it not exist yet
  $methylation_call_params->{$sequence_identifier} = {} unless (defined $methylation_call_params->{$sequence_identifier});
  my $read = $methylation_call_params->{$sequence_identifier};

  ### the stored index is shifted by 2 before it is looked up (we are simply not running indexes 0 or 1)
  my $pbat_index = $read->{index} + 2;

  ### One row per strand:
  ###   [ index, key in %counting, alignment strand, read conversion, genome conversion,
  ###     2 extra bases taken upstream of the position (1) or downstream of the read (0), reverse complement (1/0) ]
  ### 0: CT converted read vs. CT converted genome, sequence originated from the (converted) forward strand
  ### 1: CT converted read vs. GA converted genome, sequence originated from the (converted) reverse strand
  ### 2: GA converted read vs. CT converted genome, complementary to the (converted) forward strand
  ### 3: GA converted read vs. GA converted genome, complementary to the (converted) reverse strand
  my @strand_table = (
    [ 0, 'CT_CT_count', '+', 'CT', 'CT', 0, 0 ],
    [ 1, 'CT_GA_count', '-', 'CT', 'GA', 1, 1 ],
    [ 2, 'GA_CT_count', '-', 'GA', 'CT', 0, 1 ],
    [ 3, 'GA_GA_count', '+', 'GA', 'GA', 1, 0 ],
  );

  ### numeric comparison, just like an if/elsif chain over the index would do
  my ($strand) = grep { $pbat_index == $_->[0] } @strand_table;
  die "Too many bowtie result filehandles\n" unless ($strand);

  my (undef,$counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_revcomp) = @{$strand};
  $counting{$counter}++;

  my $chromosome  = $read->{chromosome};
  my $start       = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
  my ($from,$fits);
  if ($extra_bases_upstream){
    ### 2 extra bases in front of the alignment position
    $from = $start - 2;
    $fits = ($from >= 0);
  }
  else{
    ### 2 extra bases behind the last aligned base
    $from = $start;
    $fits = (length($chromosomes{$chromosome}) > $start + $read_length + 1);
  }

  my $non_bisulfite_sequence = '';
  if ($fits){
    $non_bisulfite_sequence = substr ($chromosomes{$chromosome},$from,$read_length+2);
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($needs_revcomp);
  }

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $start + $read_length;
}
3657
3658
sub extract_corresponding_genomic_sequence_single_end_bowtie2{
  ### Rebuilds the unconverted genomic sequence underneath a single-end alignment by walking its CIGAR string,
  ### adding 2 extra bases at the start (index 1 and 3) or at the end (index 0 and 2) so that a C at the edge of
  ### the read can still be placed in CpG, CHG or CHH context.
  ###
  ### Arguments: the read identifier and a reference to the hash holding the per-read alignment parameters.
  ###            The fields CIGAR, index, chromosome and position of the read's entry are read.
  ### Stores:    unmodified_genomic_sequence, alignment_strand, read_conversion, genome_conversion, end_position
  ###            and indels in the same entry. If the 2 extra bases would fall off the chromosome only
  ###            unmodified_genomic_sequence is stored (without the extra bases) and the sub returns early.
  ### Globals:   reads %chromosomes and increments one of the four strand counters in %counting.
  ### Dies on an inconsistent CIGAR string, on CIGAR operations other than M, I and D, and on an index outside 0-3.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### all values of this read live in one hash; it is created in place should it not exist yet
  $methylation_call_params->{$sequence_identifier} = {} unless (defined $methylation_call_params->{$sequence_identifier});
  my $read = $methylation_call_params->{$sequence_identifier};

  my $cigar      = $read->{CIGAR};
  my $index      = $read->{index};
  my $chromosome = $read->{chromosome};

  my $non_bisulfite_sequence = '';

  ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
  my $pos = $read->{position}-1;

  ### splitting the CIGAR string into the lengths and the operations they belong to
  my @len = split (/\D+/,$cigar);
  my @ops = split (/\d+/,$cigar);
  shift @ops; # the text in front of the first number is an empty string
  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

  ### OB (index 1) and CTOB (index 3) alignments get their 2 extra bases in front of the alignment,
  ### OT (index 0) and CTOT (index 2) alignments get them behind the last aligned base
  my $extra_bases_in_front = ( ($index == 1) or ($index == 3) );
  my $extra_bases_behind   = ( ($index == 0) or ($index == 2) );

  if ($extra_bases_in_front){
    ## right at the start of a chromosome the 2 extra bases do not exist: storing the (still empty) sequence and leaving
    if ($pos-2 < 0){
      $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
  }

  ### number of inserted plus deleted bases, kept for the SAM output
  my $indels = 0;

  foreach my $i (0..$#len){
    my ($length,$operation) = ($len[$i],$ops[$i]);

    if ($operation eq 'M'){
      ### aligned stretch: copy the genomic bases and move on
      $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,$length);
      $pos += $length;
    }
    elsif ($operation eq 'I'){
      ### insertion in the read: there is no genomic counterpart, so Ns are used as padding (no methylation call
      ### is made from them); the genomic position stays where it is
      $non_bisulfite_sequence .= 'N' x $length;
      $indels += $length;
    }
    elsif ($operation eq 'D'){
      ### deletion in the read: the genomic bases are skipped without adding any sequence
      $pos += $length;
      $indels += $length;
    }
    else{
      ### anything else is fatal, the message depends on whether one of these characters occurs in the string
      die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n" if ($cigar =~ tr/[NSHPX=]//);
      die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  if ($extra_bases_behind){
    ## right at the end of a chromosome the 2 extra bases do not exist: storing what was collected so far and leaving
    if (length($chromosomes{$chromosome}) < $pos+2){
      $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  ### One row per strand: [ index, key in %counting, alignment strand, read conversion, genome conversion, reverse complement (1/0) ]
  ### 0: CT converted read vs. CT converted genome, sequence originated from the (converted) forward strand
  ### 1: CT converted read vs. GA converted genome, sequence originated from the (converted) reverse strand
  ### 2: GA converted read vs. CT converted genome, complementary to the (converted) forward strand
  ### 3: GA converted read vs. GA converted genome, complementary to the (converted) reverse strand
  my @strand_table = (
    [ 0, 'CT_CT_count', '+', 'CT', 'CT', 0 ],
    [ 1, 'CT_GA_count', '-', 'CT', 'GA', 1 ],
    [ 2, 'GA_CT_count', '-', 'GA', 'CT', 1 ],
    [ 3, 'GA_GA_count', '+', 'GA', 'GA', 0 ],
  );

  ### numeric comparison, just like an if/elsif chain over the index would do
  my ($strand) = grep { $index == $_->[0] } @strand_table;
  die "Too many Bowtie 2 result filehandles\n" unless ($strand);

  my (undef,$counter,$alignment_strand,$read_conversion_info,$genome_conversion,$needs_revcomp) = @{$strand};
  $counting{$counter}++;
  $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($needs_revcomp);

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### after walking the CIGAR string $pos holds the end position of the read
  $read->{end_position} = $pos;
  $read->{indels}       = $indels;
}
3792
3793 ### METHYLATION CALL
3794
sub methylation_call{
  ### Compares a read base by base with the genomic sequence it aligned to and returns the methylation call
  ### string, one character per read base:
  ###
  ###   .  base not involved in a call (no cytosine, or a mismatch that is not a conversion)
  ###   Z / z  methylated (protected) / unmethylated (converted) C in CpG context
  ###   X / x  methylated (protected) / unmethylated (converted) C in CHG context
  ###   H / h  methylated (protected) / unmethylated (converted) C in CHH context
  ###
  ### Arguments: read identifier (not used for the call itself), the sequence actually observed, the genomic
  ###            sequence (expected to be 2 bases longer than the read, a warning is issued otherwise) and the
  ###            read conversion ('CT' or 'GA').
  ### Globals:   adds the per-read totals to the six total_* counters in %counting.
  ### Dies if the read conversion is neither 'CT' nor 'GA'.
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both sequences up into single bases so we can compare them one by one
  my @read_bases    = split(//,$sequence_actually_observed);
  my @genomic_bases = split(//,$genomic_sequence);

  warn "length of \@seq: ",scalar(@read_bases),"\tlength of \@genomic: ",scalar(@genomic_bases),"\n" unless (scalar(@read_bases) == (scalar(@genomic_bases) - 2));

  ### Both conversions are handled by the same loop, they only differ in:
  ###   offset         where read base 0 sits in the genomic sequence (for GA the 2 extra bases come first)
  ###   direction      where the context bases are found: behind the C (+1) or in front of the G (-1)
  ###   target base    the genomic base a call is made for (C, or G for a C on the opposing strand)
  ###   converted base what an unmethylated target base looks like in the read
  ###   context base   the neighbouring base that makes it a CpG (1 step away) or CHG (2 steps away)
  my ($offset,$direction,$target_base,$converted_base,$context_base);
  if ($read_conversion eq 'CT'){
    ($offset,$direction,$target_base,$converted_base,$context_base) = (0,1,'C','T','G');
  }
  elsif ($read_conversion eq 'GA'){
    ($offset,$direction,$target_base,$converted_base,$context_base) = (2,-1,'G','A','C');
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  ### number of calls made for this read, per call character
  my %calls_made = map { $_ => 0 } qw(Z z X x H h);
  my $methylation_call = '';

  for my $index (0..$#read_bases){
    my $genomic_position = $index + $offset;
    my $observed  = $read_bases[$index];
    my $reference = $genomic_bases[$genomic_position];

    ### 1 = the cytosine was protected (methylated), 0 = it was converted (unmethylated), undefined = no call
    my $protected;
    if ($observed eq $reference){
      ### a matching target base can only mean that it was not converted
      $protected = 1 if ($reference eq $target_base);
    }
    elsif ($observed ne $reference){
      ### the only mismatch of interest is a target base showing up as its converted form
      $protected = 0 if ($reference eq $target_base and $observed eq $converted_base);
    }
    else{
      die "There can be only 2 possibilities\n";
    }

    unless (defined $protected){
      $methylation_call .= '.';
      next;
    }

    ### the context is decided by the first, and failing that the second, neighbouring base
    my $symbol;
    if ($genomic_bases[$genomic_position + $direction] eq $context_base){
      $symbol = 'Z';
    }
    elsif ($genomic_bases[$genomic_position + 2*$direction] eq $context_base){
      $symbol = 'X';
    }
    else{
      $symbol = 'H';
    }
    $symbol = lc($symbol) unless ($protected); # lower case = converted, i.e. not methylated

    $calls_made{$symbol}++;
    $methylation_call .= $symbol;
  }

  $counting{total_meCHH_count}           += $calls_made{H};
  $counting{total_meCHG_count}           += $calls_made{X};
  $counting{total_meCpG_count}           += $calls_made{Z};
  $counting{total_unmethylated_CHH_count} += $calls_made{h};
  $counting{total_unmethylated_CHG_count} += $calls_made{x};
  $counting{total_unmethylated_CpG_count} += $calls_made{z};

  return $methylation_call;
}
3979
sub read_genome_into_memory{
  ### Reads all FastA files of the genome folder into the %chromosomes hash (chromosome name => upper-cased
  ### sequence). Files ending in .fa are used; only if there are none, files ending in .fasta are used instead.
  ### Multi-FastA files are supported, every header line starts a new chromosome.
  ###
  ### Argument: the directory to change back to once the genome has been read.
  ### Globals:  changes into $genome_folder while reading, fills %chromosomes.
  ### Output:   prints the name and length of every chromosome; warns about chromosomes without sequence.
  ### Dies if a directory cannot be entered, no sequence files are found, a file cannot be opened or is empty,
  ### a header is rejected by extract_chromosome_name, or a chromosome name occurs more than once.
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### three-argument open on a lexical handle: the file name can never be mistaken for an open mode
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    die "The sequence file $chromosome_filename is empty\n" unless (defined $first_line);
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting from an empty string so that a record without any sequence lines has a defined length of 0
    my $sequence = '';
    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;
      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }
    ### releasing the handle right away instead of keeping one open per loop iteration
    close ($chr_in) or warn "Failed to close sequence file $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
4051
sub extract_chromosome_name {
  ### Returns the chromosome name of a FastA header line: the text between the leading '>' and the first
  ### stretch of white space. Bowtie seems to use the first string after the initial > as well.
  ### Dies if the line does not start with '>'.
  my ($fasta_header) = @_;

  ### the substitution doubles as the format check: it only succeeds if there was a leading '>' to remove
  die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n" unless ($fasta_header =~ s/^>//);

  ### assigning to a one-element list keeps just the first field of the split
  my ($first_field) = split (/\s+/,$fasta_header);
  return $first_field;
}
4063
sub reverse_complement{
  ### Returns the reverse complement of a DNA sequence. Only the upper-case bases A, C, G and T are
  ### complemented, every other character (e.g. N) is kept as it is and merely changes its place.
  my ($dna) = @_;

  ### complementing a copy base by base ...
  (my $complement = $dna) =~ tr/CATG/GTAC/;

  ### ... and handing it back in reverse order (scalar forces a string reversal)
  return scalar reverse($complement);
}
4070
sub biTransformFastAFiles {
  ### Writes bisulfite converted copies of a single-end FastA read file into $temp_dir: always a C -> T
  ### converted version and, unless $directional is set, a G -> A converted version as well. The input is read
  ### as pairs of lines (header, sequence) and may be gzip compressed (file name ending in .gz).
  ###
  ### Argument: path of the FastA file.
  ### Returns:  the names of the C -> T and the G -> A file (without $temp_dir); the second name is returned
  ###           even if that file was not written.
  ### Globals:  honours $skip, $upto, $gzip, $directional and $temp_dir, counts headers containing tab stops in
  ###           $seqID_contains_tabs and passes every header through fix_IDs.
  ### Dies if a file cannot be opened or closed, or if a header line does not start with '>'.
  my $file = shift;

  ### the output files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### Lexical handle, opened without going through a shell: file names containing blanks or shell
  ### metacharacters are read correctly and can never be taken for an open mode or a command
  my $in;
  if ($file =~ /\.gz$/){
    ### gzipped version of the infile; zcat receives the file name as a single, unparsed argument
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($gzip){
    $C_to_T_infile =~ s/$/_C_to_T.fa.gz/;
    $G_to_A_infile =~ s/$/_G_to_A.fa.gz/;
  }
  else{
    $C_to_T_infile =~ s/$/_C_to_T.fa/;
    $G_to_A_infile =~ s/$/_G_to_A.fa/;
  }

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

  ### NOTE(review): the gzip output pipes below are still started through the shell with the file name
  ### interpolated into the command, so unusual characters in $temp_dir or the file name remain a problem there
  my ($ctot,$gtoa);
  if ($gzip){
    open ($ctot,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
  }
  else{
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }

  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    if ($gzip){
      open ($gtoa,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;

  while (1){
    my $header = <$in>;
    my $sequence= <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### Releasing the input. The status is deliberately not checked: when $upto cuts the loop short a
  ### decompressor that is still writing is expected to be stopped, which is not an error here
  close ($in);

  close ($ctot) or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close ($gtoa) or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
4177
### Writes the bisulfite converted version(s) of one mate file of a paired-end FastA data set.
###
### Arguments:
###   $file        - FastA input file (read through zcat if the name ends in .gz)
###   $read_number - 1 or 2, i.e. which mate of the pair $file contains
###
### Files written to $temp_dir (always uncompressed, --gzip is not supported here):
###   directional library, read 1 : <filename>_C_to_T.fa
###   directional library, read 2 : <filename>_G_to_A.fa
###   non-directional library     : both of the above
###
### Every read ID is passed through fix_IDs() and gets /1 or /2 appended (/1/1 or /2/2 for Bowtie 2).
### The global settings $skip and $upto are honoured.
###
### Returns the name(s) of the converted file(s) without the $temp_dir prefix: ($C_to_T_infile) or
### ($G_to_A_infile) for directional libraries, ($C_to_T_infile,$G_to_A_infile) otherwise.
### Dies if a file can't be opened or closed, if $read_number is not 1 or 2, or if a header line
### does not start with '>'.
sub biTransformFastAFiles_paired_end {
  my ($file,$read_number) = @_;

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  ### the output files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never parsed by a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  ### validating the mate number for both library types before any output file is created
  unless ($read_number == 1 or $read_number == 2){
    die "Read number needs to be 1 or 2, but was: $read_number\n\n";
  }

  ### directional libraries only need read 1 C -> T converted and read 2 G -> A converted,
  ### all four strand output needs both conversions of either read
  my $write_C_to_T = (!$directional or $read_number == 1);
  my $write_G_to_A = (!$directional or $read_number == 2);

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T.fa/;
  $G_to_A_infile =~ s/$/_G_to_A.fa/;

  if ($write_C_to_T){
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }
  if ($write_G_to_A){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  ### Bowtie 2 clips a trailing /1 or /2 off the read ID itself, so the tag is added twice in that case
  my $mate     = ($read_number == 1) ? 1 : 2;
  my $read_tag = $bowtie2 ? "/$mate/$mate" : "/$mate";

  my $count = 0;

  while (1){
    my $header   = <IN>;
    my $sequence = <IN>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has replaced tabs by underscores at this point, so this looks like it can no longer trigger
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count\n" unless ($header =~ /^>/);

    $header =~ s/$/$read_tag/; # the tag is inserted in front of the line ending

    if ($write_C_to_T){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print CTOT "$header$sequence_C_to_T";
    }
    if ($write_G_to_A){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA "$header$sequence_G_to_A";
    }
  }

  ### the exit status of the input is deliberately ignored: zcat fails with a broken pipe whenever --upto stopped reading early
  close IN;

  ### closing the output files flushes all buffered sequences to disk and reports write errors before
  ### the file names are handed back to the caller
  if ($write_C_to_T){
    close CTOT or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close GTOA or die "Failed to close filehandle $!\n";
  }

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($C_to_T_infile);
    }
    else{
      warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($G_to_A_infile);
    }
  }

  warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
4326
4327
### Writes the bisulfite converted version(s) of a single-end FastQ file.
###
### Argument:
###   $file - FastQ input file (read through zcat if the name ends in .gz)
###
### Files written to $temp_dir (gzip compressed and with an additional .gz ending if $gzip is set):
###   PBAT library            : <filename>_G_to_A.fastq
###   directional library     : <filename>_C_to_T.fastq
###   non-directional library : both of the above
###
### Every read ID is passed through fix_IDs(). The global settings $skip and $upto are honoured.
###
### Returns ($G_to_A_infile) for PBAT libraries and ($C_to_T_infile,$G_to_A_infile) otherwise (names
### without the $temp_dir prefix). For directional libraries no G -> A file is written and the second
### element is just the plain input file name.
### Dies if a file can't be opened or closed, or if the first processed record doesn't look like FastQ.
sub biTransformFastQFiles {
  my $file = shift;

  ### the output files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never parsed by a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  ### These two flags are the single source of truth for opening, writing and closing the output files.
  ### PBAT-Seq takes precedence and only needs the G -> A version, everything else needs the C -> T
  ### version, and non-directional libraries need both
  my $write_C_to_T = !$pbat;
  my $write_G_to_A = ($pbat or !$directional);

  my $suffix = $gzip ? '.fastq.gz' : '.fastq';

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($write_C_to_T){
    $C_to_T_infile =~ s/$/_C_to_T$suffix/;
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

    if ($gzip){
      open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option
    }
  }

  if ($write_G_to_A){
    $G_to_A_infile =~ s/$/_G_to_A$suffix/;
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";

    if ($gzip){
      open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;
  while (1){
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier2   = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has replaced tabs by underscores at this point, so this looks like it can no longer trigger
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count\n";
      }
    }

    if ($write_C_to_T){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_G_to_A){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### the exit status of the input is deliberately ignored: zcat fails with a broken pipe whenever --upto stopped reading early
  close IN;

  ### only closing what has actually been opened above
  if ($write_C_to_T){
    close CTOT or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close GTOA or die "Failed to close filehandle $!\n";
  }

  ### testing $pbat first, in the same order as when the files were opened
  if ($pbat){
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    return ($G_to_A_infile);
  }

  if ($directional){
    warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
4478
### Writes the bisulfite converted version(s) of one mate file of a paired-end FastQ data set.
###
### Arguments:
###   $file        - FastQ input file (read through zcat if the name ends in .gz)
###   $read_number - 1 or 2, i.e. which mate of the pair $file contains
###
### Files written to $temp_dir (gzip compressed and with an additional .gz ending if $gzip is set):
###   directional library, read 1 : <filename>_C_to_T.fastq
###   directional library, read 2 : <filename>_G_to_A.fastq
###   non-directional library     : both of the above
###
### Every read ID is passed through fix_IDs() and gets /1 or /2 appended (/1/1 or /2/2 for Bowtie 2).
### The global settings $skip and $upto are honoured.
###
### Returns the name(s) of the converted file(s) without the $temp_dir prefix: ($C_to_T_infile) or
### ($G_to_A_infile) for directional libraries, ($C_to_T_infile,$G_to_A_infile) otherwise.
### Dies if a file can't be opened or closed, if $read_number is not 1 or 2, or if the first processed
### record doesn't look like FastQ.
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  ### the output files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never parsed by a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  ### validating the mate number for both library types before any output file is created
  unless ($read_number == 1 or $read_number == 2){
    die "Read number needs to be 1 or 2, but was $read_number!\n\n";
  }

  ### directional libraries only need read 1 C -> T converted and read 2 G -> A converted,
  ### non-directional libraries need both conversions of either read
  my $write_C_to_T = (!$directional or $read_number == 1);
  my $write_G_to_A = (!$directional or $read_number == 2);

  my $suffix = $gzip ? '.fastq.gz' : '.fastq';

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T$suffix/;
  $G_to_A_infile =~ s/$/_G_to_A$suffix/;

  if ($write_C_to_T){
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    if ($gzip){
      open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
  }
  if ($write_G_to_A){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    if ($gzip){
      open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  ### Bowtie 2 clips a trailing /1 or /2 off the read ID itself, so the tag is added twice in that case
  my $mate     = ($read_number == 1) ? 1 : 2;
  my $read_tag = $bowtie2 ? "/$mate/$mate" : "/$mate";

  my $count = 0;
  while (1){
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier2   = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count\n";
      }
    }

    $identifier =~ s/$/$read_tag/; # the tag is inserted in front of the line ending

    if ($write_C_to_T){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_G_to_A){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### the exit status of the input is deliberately ignored: zcat fails with a broken pipe whenever --upto stopped reading early
  close IN;

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close CTOT or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    else{
      warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close GTOA or die "Failed to close filehandle $!\n";
      return ($G_to_A_infile);
    }
  }

  warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  close CTOT or die "Failed to close filehandle $!\n";
  close GTOA or die "Failed to close filehandle $!\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
4646
4647
4648 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES
4649
### Writes bisulfite converted, gzip compressed versions of a pair of FastQ files in the one-line-per-pair
### format described further down.
###
### Arguments:
###   $file_1 - FastQ file with the first mates  (read through zcat if the name ends in .gz)
###   $file_2 - FastQ file with the second mates (read through zcat if the name ends in .gz)
###
### Files written to $temp_dir, both named after $file_1:
###   <filename>.CT_plus_GA.fastq.gz : read 1 C -> T converted, read 2 G -> A converted
###   <filename>.GA_plus_CT.fastq.gz : read 1 G -> A converted, read 2 C -> T converted (non-directional libraries only)
###
### The ID written for a pair is the ID of read 1 after fix_IDs(), without the leading '@' and with /1
### appended. The global settings $skip and $upto are honoured.
###
### Returns ($CT_plus_GA_infile) for directional libraries and ($CT_plus_GA_infile,$GA_plus_CT_infile)
### otherwise (names without the $temp_dir prefix).
### Dies if a file can't be opened or closed, or if the first record of either file doesn't look like FastQ.
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  my ($file_1,$file_2) = @_;

  ### the output files are named after input file 1 without any leading directories
  my $filename = $file_1;
  if ($file_1 =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of infile 1
  ### (list form of the pipe open: the file name is handed to zcat directly and never parsed by a shell)
  if ($file_1 =~ /\.gz$/){
    open (IN_1,'-|','zcat',$file_1) or die "Couldn't read from file $file_1: $!\n";
  }
  else{
    open (IN_1,'<',$file_1) or die "Couldn't read from file $file_1: $!\n";
  }
  ### gzipped version of infile 2
  if ($file_2 =~ /\.gz$/){
    open (IN_2,'-|','zcat',$file_2) or die "Couldn't read from file $file_2: $!\n";
  }
  else{
    open (IN_2,'<',$file_2) or die "Couldn't read from file $file_2: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;

  $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/;
  $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/;

  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";

  unless ($directional){
    ### reported on STDERR like all the other progress messages
    warn "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
  }

  ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following format:
  ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>

  my $count = 0;
  while (1){
    my $identifier_1    = <IN_1>;
    my $sequence_1      = <IN_1>;
    my $identifier2_1   = <IN_1>;
    my $quality_score_1 = <IN_1>;

    my $identifier_2    = <IN_2>;
    my $sequence_2      = <IN_2>;
    my $identifier2_2   = <IN_2>;
    my $quality_score_2 = <IN_2>;

    ### stopping as soon as either file runs out of complete records
    last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
        die "Input file 1 doesn't seem to be in FastQ format at sequence $count\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
        die "Input file 2 doesn't seem to be in FastQ format at sequence $count\n";
      }
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    chomp $identifier_1;
    chomp $sequence_1;
    chomp $sequence_2;
    chomp $quality_score_1;
    chomp $quality_score_2;

    $identifier_1 =~ s/^\@//;
    $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence_1 = uc$sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc$sequence_2; # make input file 2 case insensitive

    my $sequence_1_C_to_T = $sequence_1;
    my $sequence_2_G_to_A = $sequence_2;
    $sequence_1_C_to_T =~ tr/C/T/;
    $sequence_2_G_to_A =~ tr/G/A/;

    print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      my $sequence_1_G_to_A = $sequence_1;
      my $sequence_2_C_to_T = $sequence_2;
      $sequence_1_G_to_A =~ tr/G/A/;
      $sequence_2_C_to_T =~ tr/C/T/;
      print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  ### the exit status of the inputs is deliberately ignored: zcat fails with a broken pipe whenever --upto stopped reading early
  close IN_1;
  close IN_2;

  close CTPLUSGA or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }
  else{
    close GAPLUSCT or die "Couldn't close filehandle\n";
    warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
    return ($CT_plus_GA_infile,$GA_plus_CT_infile);
  }
}
4779
4780
### Returns a copy of the read ID in which every run of spaces and/or tabs has been replaced by a
### single underscore. All other characters (including the line ending) are left untouched.
sub fix_IDs{
  my ($read_id) = @_;

  # a whole stretch of white space collapses into just one underscore
  $read_id =~ s/[ \t]+/_/g;

  return $read_id;
}
4786
### Checks whether a single-end alignment was reported on the only strand that makes sense for the
### alignment process which produced it, and keeps count of the outcome in @fhs.
###
### Arguments:
###   $index  - index into @fhs of the alignment process which produced the alignment
###   $strand - strand of the alignment, '+' or '-'
###
### Returns 1 and increments the process' {seen} counter for a sensible orientation, returns 0 and
### increments its {wrong_strand} counter for the opposite one. Any other $strand value falls through
### without touching a counter (and without an explicit return value).
### Dies if the {name} of the process is not one of the four known ones.
sub ensure_sensical_alignment_orientation_single_end{
  my ($index,$strand) = @_;

  ### the only strand an alignment may be reported on, per alignment process:
  ###   C->T read against C->T genome: (+)      C->T read against G->A genome: (-)
  ###   G->A read against C->T genome: (-)      G->A read against G->A genome: (+)
  my %sensible_strand = (
    'CTreadCTgenome' => '+',
    'CTreadGAgenome' => '-',
    'GAreadCTgenome' => '-',
    'GAreadGAgenome' => '+',
  );

  my $process_name = $fhs[$index]->{name};
  unless (exists $sensible_strand{$process_name}){
    die "One of the above conditions must be true\n";
  }

  my $wanted   = $sensible_strand{$process_name};
  my $unwanted = ($wanted eq '+') ? '-' : '+';

  ### correct orientation, the alignment is counted
  if ($strand eq $wanted) {
    $fhs[$index]->{seen}++;
    return 1;
  }
  ### the opposite orientation is nonsensical
  elsif ($strand eq $unwanted) {
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }
}
4859
### Checks whether a paired-end alignment was reported in the only mate order that makes sense for
### the alignment process which produced it, and keeps count of the outcome in @fhs.
###
### Arguments:
###   $index     - index into @fhs of the alignment process which produced the alignment
###   $id_1      - read ID of the first reported alignment line (ending in 1 or 2 depending on the mate)
###   $strand_1  - strand of the first reported alignment line
###   $id_2      - read ID of the second reported alignment line
###   $strand_2  - strand of the second reported alignment line
###
### The first line always has to be on the (+) and the second one on the (-) strand. What differs
### between the alignment processes is which mate has to be the one on the first line.
###
### Returns 1 and increments the process' {seen} counter if the mates come in the sensible order,
### returns 0 and increments its {wrong_strand} counter if they come in the swapped order.
### Dies for any other combination of IDs and strands, and if the {name} of the process is not one
### of the four known ones.
sub ensure_sensical_alignment_orientation_paired_ends{
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  ### true if read 1 has to be on the first line, false if it has to be read 2
  my %read_1_comes_first = (
    'CTread1GAread2CTgenome' => 1,  # index 0: C->T read 1, G->A read 2, C->T genome
    'GAread1CTread2GAgenome' => 1,  # index 1: G->A read 1, C->T read 2, G->A genome
    'GAread1CTread2CTgenome' => 0,  # index 2: G->A read 1, C->T read 2, C->T genome
    'CTread1GAread2GAgenome' => 0,  # index 3: C->T read 1, G->A read 2, G->A genome
  );

  my $process_name = $fhs[$index]->{name};
  unless (exists $read_1_comes_first{$process_name}){
    die "One of the above conditions must be true\n";
  }

  my $strands_fit  = ($strand_1 eq '+' and $strand_2 eq '-');
  my $read_1_first = ($strands_fit and $id_1 =~ /1$/ and $id_2 =~ /2$/);
  my $read_2_first = ($strands_fit and $id_1 =~ /2$/ and $id_2 =~ /1$/);

  ### anything but one of the two mate orders on (+)/(-) is not supposed to occur at all
  unless ($read_1_first or $read_2_first){
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  my $sensible = $read_1_comes_first{$process_name} ? $read_1_first : $read_2_first;

  if ($sensible){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### the mates came in the swapped order, the alignment is nonsensical
  $fhs[$index]->{wrong_strand}++;
  return 0;
}
4956
4957 #####################################################################################################################################################
4958
4959 ### Bowtie 1 (default) | PAIRED-END | FASTA
4960
### Starts one Bowtie paired-end alignment per entry of @fhs and primes each entry with the first
### alignment (= the first two output lines) that instance reports.
###
### Arguments: the names of the converted FastA files ($C_to_T_infile_1,$G_to_A_infile_1,
### $C_to_T_infile_2,$G_to_A_infile_2). They are only used for the log messages in here, the files
### which actually get aligned are taken from {inputfile_1} and {inputfile_2} of each @fhs entry.
###
### For every entry of @fhs this sets
###   {fh}          - pipe the Bowtie output is read from
###   {last_seq_id} - ID of the first aligned pair, without the trailing /1 of read 1
###   {last_line_1} - first output line  (either read 1 or read 2)
###   {last_line_2} - second output line (either read 1 or read 2)
### The last three are set to undef if Bowtie reported no alignment. For directional libraries,
### entries without an {inputfile_1} are not aligned at all and only get these three set to undef.
###
### Dies if the pipe can't be opened or if neither ID of the first pair ends in /1.
sub paired_end_align_fragments_to_bisulfite_genome_fastA {

  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  my @first_alignment_keys = qw(last_seq_id last_line_1 last_line_2);

  foreach my $process (@fhs) {

    ### directional libraries: processes without input files are not run
    if ($directional and not $process->{inputfile_1}){
      @{$process}{@first_alignment_keys} = (undef) x 3;
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $forward_only = ($process->{name} eq 'CTread1GAread2CTgenome' or $process->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $process->{name} (reading in sequences from $temp_dir$process->{inputfile_1} and $temp_dir$process->{inputfile_2}, with the options: $bt_options)\n";
    open ($process->{fh},"$path_to_bowtie $bt_options $process->{bisulfiteIndex} -1 $temp_dir$process->{inputfile_1} -2 $temp_dir$process->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### the two lines of the first reported alignment (if there is one)
    my $line_1 = $process->{fh}->getline();
    my $line_2 = $process->{fh}->getline();

    unless ($line_1 and $line_2) {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$process}{@first_alignment_keys} = (undef) x 3;
      next;
    }

    chomp $line_1;
    chomp $line_2;

    ### the sequence identifier is the first tab separated element of a Bowtie output line
    my $id_1 = (split(/\t/,$line_1))[0];
    my $id_2 = (split(/\t/,$line_2))[0];

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### Whichever of the two IDs carries the read 1 tag is stored (without the tag) as last_seq_id
    if ($id_1 =~ s/\/1$//){
      $process->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $process->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $process->{last_line_1} = $line_1; # this contains either read 1 or read 2
    $process->{last_line_2} = $line_2; # this contains either read 1 or read 2
    warn "Found first alignment:\n$process->{last_line_1}\n$process->{last_line_2}\n";
  }
}
5039
5040 ### Bowtie 2 | PAIRED-END | FASTA
5041
### Starts one Bowtie 2 paired-end alignment (FastA input) for every entry of @fhs and stores the first
### read pair that Bowtie 2 reports on that entry.
###
### Arguments: the C->T and G->A converted files of read 1 and read 2. They are only used for the log
###            messages; the files that are aligned are taken from $fh->{inputfile_1} and $fh->{inputfile_2}.
### Effects:   sets $fh->{fh} (the pipe reading from Bowtie 2), $fh->{last_seq_id}, $fh->{last_line_1}
###            and $fh->{last_line_2} for the entries of @fhs.
### Dies:      if the pipe to Bowtie 2 cannot be opened, or if neither ID of the first reported pair
###            carries the read 1 tag (/1).
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    if ($directional){
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional mode the entries without an input file are not aligned at all
        if ($directional and not $fh->{inputfile_1}){
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
            next;
        }

        ### ensuring the alignments are only reported in a sensible manner
        my $bt2_options = $bowtie_options;
        if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
            $bt2_options .= ' --norc';
        }
        else{
            $bt2_options .= ' --nofw';
        }

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence
        ### (SAM header lines start with @). A lexical is used so that the global $_ is left untouched.
        my $line_1;
        do {
            $line_1 = $fh->{fh}->getline();
        } while ($line_1 and $line_1 =~ /^\@/);

        my $line_2 = $fh->{fh}->getline();

        # without a complete first pair there is nothing to store for this alignment process
        unless ($line_1 and $line_2){
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
            next;
        }

        chomp $line_1;
        chomp $line_2;
        my $id_1 = (split(/\t/,$line_1))[0]; # the first field of an output line is the sequence identifier
        my $id_2 = (split(/\t/,$line_2))[0];

        ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
        ### We will thus identify which sequence was read 1 and store this ID (without its /1 tag) as last_seq_id.
        ### (remember that Bowtie 2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with)
        if ($id_1 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_2;
        }
        else{
            # Same handling as the FastQ paired-end routines: carrying on here would store the two lines
            # without ever setting last_seq_id for them
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
        $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5129
5130 ### Bowtie 1 (default) | PAIRED-END | FASTQ
5131
### Starts one Bowtie (1) paired-end alignment (FastQ input) for every entry of @fhs and stores the first
### read pair that Bowtie reports on that entry.
###
### Arguments: the C->T and G->A converted files of read 1 and read 2. Only the two read 1 names are
###            used, and only for the log messages; the files that are aligned are taken from
###            $fh->{inputfile_1} and $fh->{inputfile_2}.
### Effects:   sets $fh->{fh}, $fh->{last_seq_id}, $fh->{last_line_1} and $fh->{last_line_2}.
### Dies:      if the pipe to Bowtie cannot be opened, or if neither ID of the first pair ends in /1.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    # directional and PBAT libraries only use two of the four alignment processes
    my $two_instances_only = ($directional or $pbat);

    if ($directional){
        warn "Input file is $C_to_T_infile_1 (FastQ)\n";
    }
    elsif ($pbat){
        warn "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n";
    }
    else{
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n";
    }

    my $instances_message = $two_instances_only
        ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    warn $instances_message;

    foreach my $fh (@fhs) {

        # skipping unwanted filehandles
        if ($two_instances_only and not $fh->{inputfile_1}){
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        ### ensuring the alignments are only reported in a sensible manner
        my $wanted_strand = ($fh->{name} eq 'CTread1GAread2CTgenome' || $fh->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';
        my $bt_options = $bowtie_options;
        $bt_options .= $wanted_strand;

        if ($gzip){
            # gzipped input: a single file is decompressed and fed to Bowtie on STDIN (--12 -)
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
            open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!";
        }
        else{
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
            sleep(5);
            open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
        }

        my $first_line  = $fh->{fh}->getline();
        my $second_line = $fh->{fh}->getline();

        # no complete pair in the output: initialise last_seq_id and last_lines as undefined
        unless ($first_line and $second_line){
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        chomp $first_line;
        chomp $second_line;

        ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
        ### either sequence 1 or sequence 2. The ID of read 1 (minus its /1 tag) is stored as last_seq_id.
        my ($id_1) = split(/\t/,$first_line);  # the sequence identifier is the first field of a Bowtie output line
        my ($id_2) = split(/\t/,$second_line);

        if ($id_1 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_2;
        }
        else{
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $first_line;  # this contains read 1 or read 2
        $fh->{last_line_2} = $second_line; # this contains read 1 or read 2
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5220
5221 ### Bowtie 2 | PAIRED-END | FASTQ
5222
### Starts one Bowtie 2 paired-end alignment (FastQ input) for every entry of @fhs and stores the first
### read pair that Bowtie 2 reports on that entry.
###
### Arguments: the C->T and G->A converted files of read 1 and read 2 (only used for the log messages;
###            the files that are aligned are taken from $fh->{inputfile_1} and $fh->{inputfile_2}).
### Effects:   sets $fh->{fh}, $fh->{last_seq_id}, $fh->{last_line_1} and $fh->{last_line_2}.
### Dies:      if the pipe to Bowtie 2 cannot be opened, or if neither ID of the first pair ends in /1.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    my ($input_message,$instances_message);
    if ($directional){
        $input_message     = "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
        $instances_message = "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        $input_message     = "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
        $instances_message = "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    warn $input_message;
    warn $instances_message;

    foreach my $fh (@fhs) {

        # in directional mode the entries without an input file are not aligned at all
        if ($directional and not $fh->{inputfile_1}){
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        ### ensuring the alignments are only reported in a sensible manner
        my $wanted_strand = ($fh->{name} eq 'CTread1GAread2CTgenome' || $fh->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';
        my $bt2_options = $bowtie_options;
        $bt2_options .= $wanted_strand;

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 writes SAM, so header lines (starting with @) are read and discarded until the
        ### first sequence line, or the end of the output, is reached
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        my $first_line  = $_;
        my $second_line = $fh->{fh}->getline();

        # no complete pair in the output: initialise last_seq_id and last_lines as undefined
        unless ($first_line and $second_line){
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        chomp $first_line;
        chomp $second_line;

        ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
        ### either sequence 1 or sequence 2. The ID of read 1 (minus its /1 tag) is stored as last_seq_id.
        ### (Bowtie 2 clips off /1 or /2 line endings itself, so /1/1 or /2/2 were added to start with)
        my ($id_1) = split(/\t/,$first_line);  # the sequence identifier is the first field of an output line
        my ($id_2) = split(/\t/,$second_line);

        if ($id_1 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_2;
        }
        else{
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $first_line;  # this contains read 1 or read 2
        $fh->{last_line_2} = $second_line; # this contains read 1 or read 2
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5311
5312 #####################################################################################################################################################
5313
5314 ### Bowtie 1 (default) | SINGLE-END | FASTA
### Starts one Bowtie (1) single-end alignment (FastA input) for every entry of @fhs and stores the first
### output line on that entry.
###
### Arguments: the C->T and G->A converted sequence files (only used for the log messages; the file that
###            is aligned is taken from $fh->{inputfile}).
### Effects:   sets $fh->{fh} (the pipe reading from Bowtie), $fh->{last_seq_id} and $fh->{last_line}.
### Dies:      if the pipe to Bowtie cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA {
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    my ($input_message,$instances_message);
    if ($directional){
        $input_message     = "Input file is $C_to_T_infile (FastA)\n";
        $instances_message = "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        $input_message     = "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
        $instances_message = "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    warn $input_message;
    warn $instances_message;

    foreach my $fh (@fhs) {

        ### ensuring the alignments are only reported in a sensible manner
        my $wanted_strand = ($fh->{name} eq 'CTreadCTgenome' || $fh->{name} eq 'GAreadGAgenome') ? ' --norc' : ' --nofw';
        my $bt_options = $bowtie_options;
        $bt_options .= $wanted_strand;

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

        # gzipped input is decompressed and fed to Bowtie on STDIN, uncompressed input is read by Bowtie directly
        my $bowtie_pipe = $gzip
            ? "zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |"
            : "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |";
        open ($fh->{fh},$bowtie_pipe) or die "Can't open pipe to bowtie: $!";

        $_ = $fh->{fh}->getline();

        # no output at all: initialise last_seq_id and last_line as undefined
        unless ($_){
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
            next;
        }

        # Bowtie produced an alignment, so its first line is stored
        chomp;
        $fh->{last_seq_id} = (split(/\t/,$_))[0]; # the sequence identifier is the first field of the Bowtie output
        $fh->{last_line}   = $_;
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5368
5369 ### Bowtie 2 | SINGLE-END | FASTA
### Starts one Bowtie 2 single-end alignment (FastA input) for every entry of @fhs and stores the first
### non-header output line on that entry.
###
### Arguments: the C->T and G->A converted sequence files (only used for the log messages; the file that
###            is aligned is taken from $fh->{inputfile}).
### Effects:   sets $fh->{fh} (the pipe reading from Bowtie 2), $fh->{last_seq_id} and $fh->{last_line}.
### Dies:      if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    my ($input_message,$instances_message);
    if ($directional){
        $input_message     = "Input file is $C_to_T_infile (FastA)\n";
        $instances_message = "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        $input_message     = "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
        $instances_message = "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    warn $input_message;
    warn $instances_message;

    foreach my $fh (@fhs) {

        ### ensuring the alignments are only reported in a sensible manner
        my $wanted_strand = ($fh->{name} eq 'CTreadCTgenome' || $fh->{name} eq 'GAreadGAgenome') ? ' --norc' : ' --nofw';
        my $bt2_options = $bowtie_options;
        $bt2_options .= $wanted_strand;

        warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 writes SAM, so header lines (starting with @) are read and discarded until the
        ### first sequence line, or the end of the output, is reached
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        # nothing left to read: initialise last_seq_id and last_line as undefined.
        # This should only happen at the end of a file for Bowtie 2 output
        unless ($_){
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
            next;
        }

        # Bowtie 2 outputs a result line even for sequences without any alignments, so the first line is stored
        chomp;
        $fh->{last_seq_id} = (split(/\t/,$_))[0]; # the sequence identifier is the first field of the Bowtie 2 output
        $fh->{last_line}   = $_;
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5428
5429
5430 ### Bowtie 1 (default) | SINGLE-END | FASTQ
### Starts one Bowtie (1) single-end alignment (FastQ input) for every entry of @fhs and stores the first
### output line on that entry.
###
### Arguments: the C->T and G->A converted sequence files (only used for the log messages; the file that
###            is aligned is taken from $fh->{inputfile}).
### Effects:   sets $fh->{fh} (the pipe reading from Bowtie), $fh->{last_seq_id} and $fh->{last_line}.
### Dies:      if the pipe to Bowtie cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    if ($directional){
        warn "Input file is $C_to_T_infile (FastQ)\n";
    }
    elsif ($pbat){
        warn "Input file is $G_to_A_infile (FastQ)\n";
    }
    else{
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
    }

    if ($directional or $pbat){
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        ### ensuring the alignments are only reported in a sensible manner
        my $bt_options = $bowtie_options;
        if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
            $bt_options .= ' --norc';
        }
        else{
            $bt_options .= ' --nofw';
        }

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
        sleep (5);

        # Bowtie has to be started with $bt_options (not the plain $bowtie_options), otherwise the
        # --norc/--nofw restriction announced in the message above is never applied
        if ($gzip){
            # gzipped input is decompressed and fed to Bowtie on STDIN
            open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
        }
        else{
            # command for uncompressed data
            open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
        }

        # a lexical is used for the first output line so that the global $_ is left untouched
        my $first_line = $fh->{fh}->getline();

        # no output at all: initialise last_seq_id and last_line as undefined
        unless ($first_line){
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
            next;
        }

        # Bowtie produced an alignment, so its first line is stored
        chomp $first_line;
        $fh->{last_seq_id} = (split(/\t/,$first_line))[0]; # the sequence identifier is the first field of the Bowtie output
        $fh->{last_line}   = $first_line;
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5489
5490 ### Bowtie 2 | SINGLE-END | FASTQ
### Starts one Bowtie 2 single-end alignment (FastQ input) for every entry of @fhs and stores the first
### non-header output line on that entry.
###
### Arguments: the C->T and G->A converted sequence files (only used for the log messages; the file that
###            is aligned is taken from $fh->{inputfile}).
### Effects:   sets $fh->{fh} (the pipe reading from Bowtie 2), $fh->{last_seq_id} and $fh->{last_line}.
### Dies:      if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    my ($input_message,$instances_message);
    if ($directional){
        $input_message     = "Input file is $C_to_T_infile (FastQ)\n\n";
        $instances_message = "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        $input_message     = "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
        $instances_message = "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    warn $input_message;
    warn $instances_message;

    foreach my $fh (@fhs) {

        ### ensuring the alignments are only reported in a sensible manner
        my $wanted_strand = ($fh->{name} eq 'CTreadCTgenome' || $fh->{name} eq 'GAreadGAgenome') ? ' --norc' : ' --nofw';
        my $bt2_options = $bowtie_options;
        $bt2_options .= $wanted_strand;

        warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
        warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 writes SAM, so header lines (starting with @) are read and discarded until the
        ### first sequence line, or the end of the output, is reached
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        # nothing left to read: initialise last_seq_id and last_line as undefined.
        # This should only happen at the end of a file for Bowtie 2 output
        unless ($_){
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
            next;
        }

        # Bowtie 2 outputs a result line even for sequences without any alignments, so the first line is stored
        chomp;
        $fh->{last_seq_id} = (split(/\t/,$_))[0]; # the sequence identifier is the first field of the Bowtie 2 output
        $fh->{last_line}   = $_;
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5551
5552 ###########################################################################################################################################
5553
### Resets the file-level %counting hash (every counter back to 0) and rebuilds the file-level @fhs
### list of alignment processes for a new input file.
###
### Argument: the input filename; a comma in it marks paired-end input.
### @fhs afterwards holds:
###   - directional, single-end: the two C->T read processes
###   - PBAT, single-end:        the two G->A read processes
###   - everything else:         all four processes
sub reset_counters_and_fhs{
    my $filename = shift;

    my @counter_names = (
        'total_meCHH_count',
        'total_meCHG_count',
        'total_meCpG_count',
        'total_unmethylated_CHH_count',
        'total_unmethylated_CHG_count',
        'total_unmethylated_CpG_count',
        'sequences_count',
        'no_single_alignment_found',
        'unsuitable_sequence_count',
        'genomic_sequence_could_not_be_extracted_count',
        'unique_best_alignment_count',
        'low_complexity_alignments_overruled_count',
        'CT_CT_count',               # (CT read/CT genome, original top strand)
        'CT_GA_count',               # (CT read/GA genome, original bottom strand)
        'GA_CT_count',               # (GA read/CT genome, complementary to original top strand)
        'GA_GA_count',               # (GA read/GA genome, complementary to original bottom strand)
        'CT_GA_CT_count',            # (CT read1/GA read2/CT genome, original top strand)
        'GA_CT_GA_count',            # (GA read1/CT read2/GA genome, complementary to original bottom strand)
        'GA_CT_CT_count',            # (GA read1/CT read2/CT genome, complementary to original top strand)
        'CT_GA_GA_count',            # (CT read1/GA read2/GA genome, original bottom strand)
        'alignments_rejected_count', # only relevant if --directional was specified
    );
    %counting = map { ($_ => 0) } @counter_names;

    # builds one fresh alignment process record with its counters at 0
    my $new_process = sub {
        my ($name,$strand_identity,$index_basename) = @_;
        return {
            name            => $name,
            strand_identity => $strand_identity,
            bisulfiteIndex  => $index_basename,
            seen            => 0,
            wrong_strand    => 0,
        };
    };

    my @CT_read_processes = (
        $new_process->('CTreadCTgenome','con ori forward',$CT_index_basename),
        $new_process->('CTreadGAgenome','con ori reverse',$GA_index_basename),
    );
    my @GA_read_processes = (
        $new_process->('GAreadCTgenome','compl ori con forward',$CT_index_basename),
        $new_process->('GAreadGAgenome','compl ori con reverse',$GA_index_basename),
    );

    # the filename is only inspected in directional or PBAT mode, --directional being tested first
    if ($directional and $filename !~ /,/){                # directional, single-end files
        @fhs = @CT_read_processes;
    }
    elsif (!$directional and $pbat and $filename !~ /,/){  # PBAT, single-end files
        @fhs = @GA_read_processes;
    }
    else{                                                  # paired-end files, or a non-directional library
        @fhs = (@CT_read_processes,@GA_read_processes);
    }
}
5701
5702
5703 sub process_command_line{
5704 my @bowtie_options;
5705 my $help;
5706 my $mates1;
5707 my $mates2;
5708 my $path_to_bowtie;
5709 my $fastq;
5710 my $fasta;
5711 my $skip;
5712 my $qupto;
5713 my $phred64;
5714 my $phred33;
5715 my $solexa;
5716 my $mismatches;
5717 my $seed_length;
5718 my $best;
5719 my $sequence_format;
5720 my $version;
5721 my $quiet;
5722 my $chunk;
5723 my $non_directional;
5724 my $ceiling;
5725 my $maxins;
5726 my $minins;
5727 my $unmapped;
5728 my $multi_map;
5729 my $output_dir;
5730 my $bowtie2;
5731 my $vanilla;
5732 my $sam_no_hd;
5733 my $seed_extension_fails;
5734 my $reseed_repetitive_seeds;
5735 my $most_valid_alignments;
5736 my $score_min;
5737 my $parallel;
5738 my $temp_dir;
5739 my $rdg;
5740 my $rfg;
5741 my $non_bs_mm;
5742 my $samtools_path;
5743 my $bam;
5744 my $gzip;
5745 my $pbat;
5746
5747 my $command_line = GetOptions ('help|man' => \$help,
5748 '1=s' => \$mates1,
5749 '2=s' => \$mates2,
5750 'path_to_bowtie=s' => \$path_to_bowtie,
5751 'f|fasta' => \$fasta,
5752 'q|fastq' => \$fastq,
5753 's|skip=i' => \$skip,
5754 'u|upto=i' => \$qupto,
5755 'phred33-quals' => \$phred33,
5756 'phred64-quals|solexa1' => \$phred64,
5757 'solexa-quals' => \$solexa,
5758 'n|seedmms=i' => \$mismatches,
5759 'l|seedlen=i' => \$seed_length,
5760 'no_best' => \$best,
5761 'version' => \$version,
5762 'quiet' => \$quiet,
5763 'chunkmbs=i' => \$chunk,
5764 'non_directional' => \$non_directional,
5765 'I|minins=i' => \$minins,
5766 'X|maxins=i' => \$maxins,
5767 'e|maqerr=i' => \$ceiling,
5768 'un|unmapped' => \$unmapped,
5769 'ambiguous' => \$multi_map,
5770 'o|output_dir=s' => \$output_dir,
5771 'bowtie2' => \$bowtie2,
5772 'vanilla' => \$vanilla,
5773 'sam-no-hd' => \$sam_no_hd,
5774 'D=i' => \$seed_extension_fails,
5775 'R=i' => \$reseed_repetitive_seeds,
5776 'score_min=s' => \$score_min,
5777 'most_valid_alignments=i' => \$most_valid_alignments,
5778 'p=i' => \$parallel,
5779 'temp_dir=s' => \$temp_dir,
5780 'rdg=s' => \$rdg,
5781 'rfg=s' => \$rfg,
5782 'non_bs_mm' => \$non_bs_mm,
5783 'samtools_path=s' => \$samtools_path,
5784 'bam' => \$bam,
5785 'gzip' => \$gzip,
5786 'pbat' => \$pbat,
5787 );
5788
5789
5790 ### EXIT ON ERROR if there were errors with any of the supplied options
5791 unless ($command_line){
5792 die "Please respecify command line options\n";
5793 }
5794 ### HELPFILE
5795 if ($help){
5796 print_helpfile();
5797 exit;
5798 }
5799 if ($version){
5800 print << "VERSION";
5801
5802
5803 Bismark - Bisulfite Mapper and Methylation Caller.
5804
5805 Bismark Version: $bismark_version
5806 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
5807 www.bioinformatics.babraham.ac.uk/projects/
5808
5809
5810 VERSION
5811 exit;
5812 }
5813
5814
5815 ##########################
5816 ### PROCESSING OPTIONS ###
5817 ##########################
5818
5819 unless ($bowtie2){
5820 $bowtie2 = 0;
5821 }
5822 unless ($sam_no_hd){
5823 $sam_no_hd =0;
5824 }
5825
5826 ### PATH TO BOWTIE
5827 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
5828 if ($path_to_bowtie){
5829 unless ($path_to_bowtie =~ /\/$/){
5830 $path_to_bowtie =~ s/$/\//;
5831 }
5832 if (-d $path_to_bowtie){
5833 if ($bowtie2){
5834 $path_to_bowtie = "${path_to_bowtie}bowtie2";
5835 }
5836 else{
5837 $path_to_bowtie = "${path_to_bowtie}bowtie";
5838 }
5839 }
5840 else{
5841 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
5842 }
5843 }
5844 else{
5845 if ($bowtie2){
5846 $path_to_bowtie = 'bowtie2';
5847 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
5848 else{
5849 $path_to_bowtie = 'bowtie';
5850 warn "Path to Bowtie specified as: $path_to_bowtie\n";
5851 }
5852 }
5853
5854 ### OUTPUT REQUESTED AS BAM FILE
5855 if ($bam){
5856 if ($vanilla){
5857 die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
5858 }
5859
5860 ### PATH TO SAMTOOLS
5861 if (defined $samtools_path){
5862 # if Samtools was specified as full command
5863 if ($samtools_path =~ /samtools$/){
5864 if (-e $samtools_path){
5865 # Samtools executable found
5866 }
5867 else{
5868 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
5869 }
5870 }
5871 else{
5872 unless ($samtools_path =~ /\/$/){
5873 $samtools_path =~ s/$/\//;
5874 }
5875 $samtools_path .= 'samtools';
5876 if (-e $samtools_path){
5877 # Samtools executable found
5878 }
5879 else{
5880 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
5881 }
5882 }
5883
5884 warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
5885 $bam = 1;
5886 }
5887 # Check whether Samtools is in the PATH if no path was supplied by the user
5888 else{
5889 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH
5890 $samtools_path = `which samtools`;
5891 chomp $samtools_path;
5892 warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
5893 $bam = 1;
5894 }
5895 }
5896
5897 unless (defined $samtools_path){
5898 $bam = 2;
5899 warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
5900 }
5901 sleep (1);
5902 }
5903
5904
5905 ####################################
5906 ### PROCESSING ARGUMENTS
5907
5908 ### GENOME FOLDER
5909 my $genome_folder = shift @ARGV; # mandatory
5910 unless ($genome_folder){
5911 warn "Genome folder was not specified!\n";
5912 print_helpfile();
5913 exit;
5914 }
5915
5916 ### checking that the genome folder, all subfolders and the required bowtie index files exist
5917 unless ($genome_folder =~/\/$/){
5918 $genome_folder =~ s/$/\//;
5919 }
5920
5921 if (chdir $genome_folder){
5922 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
5923 unless ($absolute_genome_folder =~/\/$/){
5924 $absolute_genome_folder =~ s/$/\//;
5925 }
5926 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
5927 $genome_folder = $absolute_genome_folder;
5928 }
5929 else{
5930 die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
5931 }
5932
5933 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
5934 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
5935
5936 if ($bowtie2){ ### Bowtie 2 (new)
5937 ### checking the integrity of $CT_dir
5938 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
5939 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
5940 foreach my $file(@CT_bowtie_index){
5941 unless (-f $file){
5942 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n";
5943 }
5944 }
5945 ### checking the integrity of $GA_dir
5946 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
5947 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
5948 foreach my $file(@GA_bowtie_index){
5949 unless (-f $file){
5950 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
5951 }
5952 }
5953 }
5954
5955 else{ ### Bowtie 1 (default)
5956 ### checking the integrity of $CT_dir
5957 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
5958 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
5959 foreach my $file(@CT_bowtie_index){
5960 unless (-f $file){
5961 die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
5962 }
5963 }
5964 ### checking the integrity of $GA_dir
5965 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
5966 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
5967 foreach my $file(@GA_bowtie_index){
5968 unless (-f $file){
5969 die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
5970 }
5971 }
5972 }
5973
5974 my $CT_index_basename = "${CT_dir}BS_CT";
5975 my $GA_index_basename = "${GA_dir}BS_GA";
5976
5977 ### INPUT OPTIONS
5978
5979 ### SEQUENCE FILE FORMAT
5980 ### exits if both fastA and FastQ were specified
5981 if ($fasta and $fastq){
5982 die "Only one sequence filetype can be specified (fastA or fastQ)\n";
5983 }
5984
5985 ### unless fastA is specified explicitly, fastQ sequence format is expected by default
5986 if ($fasta){
5987 print "FastA format specified\n";
5988 $sequence_format = 'FASTA';
5989 push @bowtie_options, '-f';
5990 }
5991 elsif ($fastq){
5992 print "FastQ format specified\n";
5993 $sequence_format = 'FASTQ';
5994 push @bowtie_options, '-q';
5995 }
5996 else{
5997 $fastq = 1;
5998 print "FastQ format assumed (by default)\n";
5999 $sequence_format = 'FASTQ';
6000 push @bowtie_options, '-q';
6001 }
6002
6003 ### SKIP
6004 if ($skip){
6005 warn "Skipping the first $skip reads from the input file\n";
6006 # push @bowtie_options,"-s $skip";
6007 }
6008
6009 ### UPTO
6010 if ($qupto){
6011 warn "Processing sequences up to read no. $qupto from the input file\n";
6012 if ($bowtie2){
6013 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
6014 }
6015 else{
6016 # push @bowtie_options,"--qupto $qupto";
6017 }
6018 }
6019
6020 ### QUALITY VALUES
6021 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
6022 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
6023 }
6024 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
6025 # Phred quality values work only when -q is specified
6026 unless ($fastq){
6027 die "Phred quality values works only when -q (FASTQ) is specified\n";
6028 }
6029 if ($bowtie2){
6030 push @bowtie_options,"--phred33";
6031 }
6032 else{
6033 push @bowtie_options,"--phred33-quals";
6034 }
6035 }
6036 if ($phred64){
6037 # Phred quality values work only when -q is specified
6038 unless ($fastq){
6039 die "Phred quality values work only when -q (FASTQ) is specified\n";
6040 }
6041 if ($bowtie2){
6042 push @bowtie_options,"--phred64";
6043 }
6044 else{
6045 push @bowtie_options,"--phred64-quals";
6046 }
6047 }
6048 else{
6049 $phred64 = 0;
6050 }
6051
6052 if ($solexa){
6053 if ($bowtie2){
6054 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
6055 }
6056 # Solexa to Phred value conversion works only when -q is specified
6057 unless ($fastq){
6058 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
6059 }
6060 push @bowtie_options,"--solexa-quals";
6061 }
6062 else{
6063 $solexa = 0;
6064 }
6065
6066 ### ALIGNMENT OPTIONS
6067
6068 ### MISMATCHES
6069 if (defined $mismatches){
6070 if ($bowtie2){
6071 if ($mismatches == 0 or $mismatches == 1){
6072 push @bowtie_options,"-N $mismatches";
6073 }
6074 else{
6075 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
6076 }
6077 }
6078 else{
6079 if ($mismatches >= 0 and $mismatches <= 3){
6080 push @bowtie_options,"-n $mismatches";
6081 }
6082 else{
6083 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
6084 }
6085 }
6086 }
6087 else{
6088 unless ($bowtie2){
6089 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
6090 }
6091 }
6092
6093 ### SEED LENGTH
6094 if (defined $seed_length){
6095 if ($bowtie2){
6096 push @bowtie_options,"-L $seed_length";
6097 }
6098 else{
6099 push @bowtie_options,"-l $seed_length";
6100 }
6101 }
6102
6103 ### MISMATCH CEILING
6104 if (defined $ceiling){
6105 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
6106 push @bowtie_options,"-e $ceiling";
6107 }
6108
6109
6110 ### BOWTIE 2 EFFORT OPTIONS
6111
6112 ### CONSECUTIVE SEED EXTENSION FAILS
6113 if (defined $seed_extension_fails){
6114 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
6115 push @bowtie_options,"-D $seed_extension_fails";
6116 }
6117
6118 ### RE-SEEDING REPETITIVE SEEDS
6119 if (defined $reseed_repetitive_seeds){
6120 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
6121 push @bowtie_options,"-R $reseed_repetitive_seeds";
6122 }
6123
6124
6125 ### BOWTIE 2 SCORING OPTIONS
6126 if ($score_min){
6127 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
6128 unless ($score_min =~ /^L,.+,.+$/){
6129 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
6130 }
6131 push @bowtie_options,"--score-min $score_min";
6132 }
6133 else{
6134 if ($bowtie2){
6135 push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
6136 }
6137 }
6138
6139 ### BOWTIE 2 READ GAP OPTIONS
6140 my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend);
6141
6142 if ($rdg){
6143 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
6144 if ($rdg =~ /^(\d+),(\d+)$/){
6145 $deletion_open = $1;
6146 $deletion_extend = $2;
6147 }
6148 else{
6149 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
6150 }
6151 push @bowtie_options,"--rdg $rdg";
6152 }
6153 else{
6154 $deletion_open = 5;
6155 $deletion_extend = 3;
6156 }
6157
6158 ### BOWTIE 2 REFERENCE GAP OPTIONS
6159 if ($rfg){
6160 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
6161 if ($rfg =~ /^(\d+),(\d+)$/){
6162 $insertion_open = $1;
6163 $insertion_extend = $2;
6164 }
6165 else{
6166 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
6167 }
6168 push @bowtie_options,"--rfg $rfg";
6169 }
6170 else{
6171 $insertion_open = 5;
6172 $insertion_extend = 3;
6173 }
6174
6175
6176 ### BOWTIE 2 PARALLELIZATION OPTIONS
6177 if (defined $parallel){
6178 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
6179 }
6180 if ($bowtie2){
6181 if ($parallel){
6182 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
6183 push @bowtie_options,"-p $parallel";
6184 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
6185 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
6186 sleep (2);
6187 }
6188 }
6189
6190 ### REPORTING OPTIONS
6191
6192 if ($bowtie2){
6193 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
6194
6195 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
6196 if(defined $most_valid_alignments){
6197
6198 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
6199 # push @bowtie_options,"-M $most_valid_alignments";sleep (5);
6200 }
6201 # else{
6202 # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence
6203 # }
6204 }
6205 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
6206 push @bowtie_options,'-k 2';
6207 }
6208
6209 ### --BEST
6210 if ($bowtie2){
6211 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
6212 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
6213 }
6214 }
6215 else{
6216 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
6217 unless ($best){
6218 push @bowtie_options,'--best';
6219 }
6220 }
6221
6222 ### VANILLA BISMARK (BOWTIE 1) OUTPUT
6223 if ($vanilla){
6224 if ($bowtie2){
6225 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
6226 }
6227 }
6228 else{
6229 $vanilla = 0;
6230 }
6231
6232 ### PAIRED-END MAPPING
6233 if ($mates1){
6234 my @mates1 = (split (/,/,$mates1));
6235 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
6236 my @mates2 = (split(/,/,$mates2));
6237 unless (scalar @mates1 == scalar @mates2){
6238 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
6239 }
6240 while (1){
6241 my $mate1 = shift @mates1;
6242 my $mate2 = shift @mates2;
6243 last unless ($mate1 and $mate2);
6244 push @filenames,"$mate1,$mate2";
6245 }
6246 if ($bowtie2){
6247 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
6248 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
6249 }
6250 }
6251 elsif ($mates2){
6252 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
6253 }
6254
6255 ### SINGLE-END MAPPING
6256 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
6257 my $singles;
6258 unless ($mates1 and $mates2){
6259 $singles = join (',',@ARGV);
6260 unless ($singles){
6261 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
6262 }
6263 $singles =~ s/\s/,/g;
6264 @filenames = (split(/,/,$singles));
6265 warn "\nFiles to be analysed:\n";
6266 warn "@filenames\n\n";
6267 sleep (3);
6268 }
6269
6270 ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
6271 if (defined $minins){
6272 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
6273 push @bowtie_options,"--minins $minins";
6274 }
6275
6276 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
6277 if (defined $maxins){
6278 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
6279 push @bowtie_options,"--maxins $maxins";
6280 }
6281 else{
6282 unless ($singles){
6283 push @bowtie_options,'--maxins 500';
6284 }
6285 }
6286
6287 ### QUIET prints nothing besides alignments (suppresses warnings)
6288 if ($quiet){
6289 push @bowtie_options,'--quiet';
6290 }
6291
6292 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
6293 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
6294 if (defined $chunk){
6295 push @bowtie_options,"--chunkmbs $chunk";
6296 }
6297 else{
6298 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
6299 }
6300 }
6301
6302
6303 ### SUMMARY OF ALL BOWTIE OPTIONS
6304 my $bowtie_options = join (' ',@bowtie_options);
6305
6306
6307 ### STRAND-SPECIFIC LIBRARIES
6308 my $directional;
6309 if ($non_directional){
6310 die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat);
6311 warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n";
6312 sleep (3);
6313 $directional = 0;
6314 }
6315 elsif($pbat){
6316 die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip);
6317 die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2);
6318 die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta);
6319
6320 warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n";
6321 sleep (3);
6322 $directional = 0;
6323 }
6324 else{
6325 warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n";
6326 sleep (3);
6327 $directional = 1; # default behaviour
6328 }
6329
6330 ### UNMAPPED SEQUENCE OUTPUT
6331 $unmapped = 0 unless ($unmapped);
6332
6333 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
6334 $multi_map = 0 unless ($multi_map);
6335
6336
6337 ### OUTPUT DIRECTORY
6338
6339 chdir $parent_dir or die "Failed to move back to current working directory\n";
6340 if ($output_dir){
6341 unless ($output_dir =~ /\/$/){
6342 $output_dir =~ s/$/\//;
6343 }
6344
6345 if (chdir $output_dir){
6346 $output_dir = getcwd; # making the path absolute
6347 unless ($output_dir =~ /\/$/){
6348 $output_dir =~ s/$/\//;
6349 }
6350 }
6351 else{
6352 mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
6353 warn "Created output directory $output_dir!\n\n";
6354 chdir $output_dir or die "Failed to move to $output_dir\n";
6355 $output_dir = getcwd; # making the path absolute
6356 unless ($output_dir =~ /\/$/){
6357 $output_dir =~ s/$/\//;
6358 }
6359 }
6360 warn "Output will be written into the directory: $output_dir\n";
6361 }
6362 else{
6363 $output_dir = '';
6364 }
6365
6366 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
6367
6368 chdir $parent_dir or die "Failed to move back to current working directory\n";
6369 if ($temp_dir){
6370 warn "\nUsing temp directory: $temp_dir\n";
6371 unless ($temp_dir =~ /\/$/){
6372 $temp_dir =~ s/$/\//;
6373 }
6374
6375 if (chdir $temp_dir){
6376 $temp_dir = getcwd; # making the path absolute
6377 unless ($temp_dir =~ /\/$/){
6378 $temp_dir =~ s/$/\//;
6379 }
6380 }
6381 else{
6382 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
6383 warn "Created temporary directory $temp_dir!\n\n";
6384 chdir $temp_dir or die "Failed to move to $temp_dir\n";
6385 $temp_dir = getcwd; # making the path absolute
6386 unless ($temp_dir =~ /\/$/){
6387 $temp_dir =~ s/$/\//;
6388 }
6389 }
6390 warn "Temporary files will be written into the directory: $temp_dir\n";
6391 }
6392 else{
6393 $temp_dir = '';
6394 }
6395
6396 ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE
6397 if ($non_bs_mm){
6398 if ($vanilla){
6399 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
6400 }
6401 }
6402
6403 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat);
6404 }
6405
6406
6407
### Prints the SAM header to the OUT filehandle:
###   - one @HD line (format version 1.0, sort order 'unsorted')
###   - one @SQ line per entry of the file-level %chromosomes hash; the hash key is
###     written as the sequence name (SN) and the length of the stored value as LN
###   - one @PG line carrying $bismark_version and the original $command_line
### Takes no arguments. The @SQ lines appear in whatever order 'keys' yields.
sub generate_SAM_header{
  # @HD = header line, VN = format version, SO = sort order
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";

  # @SQ = reference sequence, SN = sequence name, LN = sequence length
  print OUT $_ for map { my $length = length $chromosomes{$_}; "\@SQ\tSN:$_\tLN:$length\n" } keys %chromosomes;

  # @PG = program, ID = unique identifier, VN = program version, CL = command line
  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n";
}
6416
6417 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
6418 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
6419
### Writes one single-end alignment as a tab-separated SAM record to the OUT filehandle.
###
### Arguments (in this order):
###   $id                       - read identifier (QNAME); also the key into the parameter hash
###   $actual_seq               - the read sequence
###   $methylation_call_params  - hash reference; the entry stored under $id holds the alignment
###                               and methylation call details read below
###   $qual                     - quality string of the read
###
### Uses the file-level settings $bowtie2 and $non_bs_mm, the gap penalties
### $insertion_open/$insertion_extend/$deletion_open/$deletion_extend, and the helper
### subroutines revcomp(), hemming_dist() and make_mismatch_string().
### Dies on an unexpected strand / conversion combination or on a CIGAR string that cannot be handled.
sub single_end_SAM_output{
  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;

  # Everything about this read is stored under its ID. Taking the reference through \%{ ... }
  # creates the entry if it does not exist yet, just like a direct nested lookup would.
  my $read_info = \%{ $methylation_call_params->{$id} };

  my $strand            = $read_info->{alignment_strand};
  my $chr               = $read_info->{chromosome};
  my $start             = $read_info->{position};
  my $stop              = $read_info->{end_position};       # fetched, but not used anywhere below
  my $ref_seq           = $read_info->{unmodified_genomic_sequence};
  my $methcall          = $read_info->{methylation_call};
  my $read_conversion   = $read_info->{read_conversion};
  my $genome_conversion = $read_info->{genome_conversion};

  # Bowtie 2 alignments carry an alignment score, Bowtie 1 alignments a mismatch count
  my $number_of_mismatches = $bowtie2 ? $read_info->{alignment_score}
                                      : $read_info->{number_of_mismatches};

  ### FLAG
  ## Only two values of the bitwise SAM FLAG are ever written for single-end reads:
  ##    0  read is reported for the top strand      (OT or CTOT)
  ##   16  (0x10, SEQ being reverse complemented)   (OB or CTOB)
  ## The value depends on the alignment strand together with the read and genome conversion.
  my $flag;
  if ($strand eq "+"){
    $flag = ($read_conversion eq 'CT' and $genome_conversion eq 'CT') ? 0     # OT
          : ($read_conversion eq 'GA' and $genome_conversion eq 'GA') ? 16    # CTOB, informative for the original bottom strand
          : die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
  }
  elsif ($strand eq "-"){
    $flag = ($read_conversion eq 'CT' and $genome_conversion eq 'GA') ? 16    # OB
          : ($read_conversion eq 'GA' and $genome_conversion eq 'CT') ? 0     # CTOT, informative for the original top strand
          : die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
  }
  else{
    die "Unexpected strand information: $strand\n\n";
  }

  ### MAPQ: 255 is written for every read (mapping quality treated as unavailable)
  my $mapq = 255;

  ### CIGAR: taken over from Bowtie 2; for Bowtie 1 the whole read length is written as one 'M' operation
  my $cigar = $bowtie2 ? $read_info->{CIGAR} : length($actual_seq) . "M";

  ### RNEXT, PNEXT and TLEN are paired-end fields and stay at their placeholder values
  my ($rnext,$pnext,$tlen) = ("*",0,0);

  ### The genomic sequence is 2 characters longer than needed here: for CT converted reads the
  ### last two characters are dropped, for all other reads the first two
  my $trim_offset = ($read_conversion eq 'CT') ? 0 : 2;
  $ref_seq = substr($ref_seq, $trim_offset, length($ref_seq) - 2);

  ### Reverse strand alignments: read and reference are reverse-complemented, and the quality
  ### string is reversed so that it still lines up with the sequence
  if ($strand eq '-'){
    $actual_seq = revcomp($actual_seq);
    $ref_seq    = revcomp($ref_seq);
    $qual       = scalar reverse $qual;
  }

  ### NM tag: result of hemming_dist() for read vs. reference; for Bowtie 2 the stored indel count is added on top
  my $edit_distance = hemming_dist($actual_seq,$ref_seq);
  $edit_distance += $read_info->{indels} if ($bowtie2);
  my $NM_tag = "NM:i:$edit_distance";

  ### XX tag: whatever make_mismatch_string() returns for read vs. reference
  my $XX_tag = make_mismatch_string($actual_seq, $ref_seq);

  ### XM tag: methylation call string, reversed for reverse strand alignments
  ### ($strand can only be '+' or '-' at this point, anything else died above)
  my $XM_tag = 'XM:Z:' . (($strand eq '+') ? $methcall : scalar reverse $methcall);

  ### XR / XG tags: read conversion and genome conversion
  my $XR_tag = "XR:Z:$read_conversion";
  my $XG_tag = "XG:Z:$genome_conversion";

  ### With --non_bs_mm and Bowtie 2 the alignment score is converted into a mismatch count
  if ($non_bs_mm and $bowtie2){

    $number_of_mismatches =~ s/-//;     # dropping the minus sign of the score

    ### for reads with indels the gap penalties are taken out of the score first
    if ($cigar =~ /(D|I)/){
      my @run_lengths = split (/\D+/,$cigar);     # length of every CIGAR operation
      my @operations  = split (/\d+/,$cigar);     # the operations themselves
      shift @operations;                          # the first element is empty
      die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @run_lengths == scalar @operations);

      for my $i (0..$#run_lengths){
        my $op = $operations[$i];
        next if ($op eq 'M');                     # matches/mismatches need no correction

        if ($op eq 'I' or $op eq 'D'){
          # each gap costs its open penalty once plus the extend penalty per base
          my ($gap_open,$gap_extend) = ($op eq 'I') ? ($insertion_open,$insertion_extend)
                                                    : ($deletion_open,$deletion_extend);
          $number_of_mismatches -= $gap_open;
          $number_of_mismatches -= $run_lengths[$i] * $gap_extend;
        }
        elsif ($cigar =~ tr/[NSHPX=]//){          # other known operations are not supported here
          die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
        else{
          die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
      }
    }

    ### The remaining score is split into whole 6s plus a remainder and the two are added up.
    ### (Original author's note: Ns are penalised with 1 and ordinary mismatches with 6, so the
    ### remainder is taken as the number of Ns; this may break for reads with more than 5 Ns.)
    my $n_count = $number_of_mismatches % 6;
    $number_of_mismatches = int ($number_of_mismatches / 6) + $n_count;
  }

  ### XA tag: always assembled, but only written out if --non_bs_mm was specified
  my $XA_tag = "XA:Z:$number_of_mismatches";

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @sam_fields = ($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag);
  push @sam_fields, $XA_tag if ($non_bs_mm);

  print OUT join("\t",@sam_fields),"\n";
}
6627
6628 sub paired_end_SAM_output{
6629 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
6630 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
6631 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
6632 my $chr = $methylation_call_params->{$id}->{chromosome};
6633 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
6634 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
6635 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
6636 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
6637 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
6638 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
6639 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
6640
6641 my $id_1 = $id.'/1';
6642 my $id_2 = $id.'/2';
6643
6644 # Allows all degenerate nucleotide sequences in reference genome
6645 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
6646 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
6647
6648 my $index; # used to store the srand origin of the alignment in a less convoluted way
6649
6650 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
6651 $index = 0; ## this is OT (original top strand)
6652 }
6653 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
6654 $index = 1; ## this is CTOB (complementary to OB)
6655 }
6656 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
6657 $index = 2; ## this is CTOT (complementary to OT)
6658 }
6659 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
6660 $index = 3; ## this is OB (original bottom)
6661 }
6662 else {
6663 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
6664 }
6665
6666 my $number_of_mismatches_1;
6667 my $number_of_mismatches_2;
6668
6669 if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine
6670 $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default!
6671 $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2};
6672 }
6673 else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation
6674 if ($index == 2 or $index == 3){ # CTOT or OB
6675 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default!
6676 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1};
6677 }
6678 else{ # if the first read aligned in forward direction it is like for Bowtie 2
6679 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
6680 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
6681 }
6682 }
6683
6684
6685
6686 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
6687 ### first or last position.
6688
6689 if ($index == 0 or $index == 3){ # OT or OB
6690 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
6691 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
6692 }
6693 else{ # CTOT or CTOB
6694 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
6695 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
6696 }
6697
6698 #####
6699
6700 my $start_read_1;
6701 my $start_read_2;
6702 # adjusting end positions
6703
6704 if ($bowtie2){
6705 $start_read_1 = $methylation_call_params->{$id}->{position_1};
6706 $start_read_2 = $methylation_call_params->{$id}->{position_2};
6707 }
6708 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
6709 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
6710 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
6711 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
6712 }
6713 else{ # read 1 is on the - strand
6714 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
6715 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
6716 }
6717 }
6718
6719 #####
6720
6721 my $end_read_1;
6722 my $end_read_2;
6723 # adjusting end positions
6724
6725 if ($bowtie2){
6726 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
6727 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
6728 }
6729 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
6730 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
6731 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
6732 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
6733 }
6734 else{
6735 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
6736 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
6737 }
6738 }
6739
6740 #####
6741
6742 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
6743 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
6744 ## Bit Description Comment Value
6745 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
6746 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
6747 ## 0x4 segment unmapped --- ---
6748 ## 0x8 next segment in the template unmapped --- ---
6749 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
6750 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
6751 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
6752 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
6753 ## 0x100 secondary alignment --- ---
6754 ## 0x200 not passing quality controls --- ---
6755 ## 0x400 PCR or optical duplicate --- ---
6756
6757 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
6758
6759 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
6760 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
6761
6762 my $flag_1; # FLAG variable used for SAM format
6763 my $flag_2;
6764
6765 if ($index == 0){ # OT
6766 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
6767 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
6768 }
6769 elsif ($index == 1){ # CTOB
6770 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
6771 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
6772 }
6773 elsif ($index == 2){ # CTOT
6774 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
6775 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
6776 }
6777 elsif ($index == 3){ # OB
6778 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
6779 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
6780 }
6781
6782 #####
6783
6784 my $mapq = 255; # Mapping quality is unavailable
6785
6786 #####
6787
6788 my $cigar_1;
6789 my $cigar_2;
6790
6791 if ($bowtie2){
6792 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
6793 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
6794 }
6795 else{
6796 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
6797 $cigar_2 = length($actual_seq_2) . "M";
6798 }
6799
6800 #####
6801
6802 my $rnext = '='; # Chromosome of mate; applies to both reads
6803
6804 #####
6805
6806 my $pnext_1 = $start_read_2; # Leftmost position of mate
6807 my $pnext_2 = $start_read_1;
6808
6809 #####
6810
6811 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
6812 my $tlen_2;
6813
6814 if ($bowtie2){
6815
6816 if ($start_read_1 <= $start_read_2){
6817
6818 # Read 1 alignment is leftmost
6819
6820 if ($end_read_2 >= $end_read_1){
6821
6822 # -------------------------> read 1 reads overlapping
6823 # <------------------------- read 2
6824 #
6825 # or
6826 #
6827 # -------------------------> read 1
6828 # <----------------------- read 2 read 2 contained within read 1
6829 #
6830 # or
6831 #
6832 # -------------------------> read 1 reads 1 and 2 exactly overlapping
6833 # <------------------------- read 2
6834 #
6835
6836 # dovetailing of reads is not enabled for Bowtie 2 alignments
6837
6838 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
6839 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
6840 }
6841 elsif ($end_read_2 < $end_read_1){
6842
6843 # -------------------------> read 1
6844 # <----------- read 2 read 2 contained within read 1
6845 #
6846 # or
6847 #
6848 # -------------------------> read 1
6849 # <----------- read 2 read 2 contained within read 1
6850
6851 # start and end of read 2 are fully contained within read 1
6852 $tlen_1 = 0; # Set as 0 when the information is unavailable
6853 $tlen_2 = 0; # Set as 0 when the information is unavailable
6854 }
6855
6856 }
6857
6858 elsif ($start_read_2 < $start_read_1){
6859
6860 if ($end_read_1 >= $end_read_2){
6861
6862 # Read 2 alignment is leftmost
6863
6864 # -------------------------> read 2 reads overlapping
6865 # <------------------------- read 1
6866 #
6867 # or
6868 #
6869 # -------------------------> read 2
6870 # <----------------------- read 1 read 1 contained within read 2
6871 #
6872 #
6873
6874 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
6875 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
6876 }
6877 elsif ($end_read_1 < $end_read_2){
6878
6879 # -------------------------> read 2
6880 # <----------- read 1 read 1 contained within read 2
6881 #
6882 # or
6883 #
6884 # -------------------------> read 2
6885 # <----------- read 1 read 1 contained within read 2
6886
6887 # start and end of read 1 are fully contained within read 2
6888 $tlen_1 = 0; # Set as 0 when the information is unavailable
6889 $tlen_2 = 0; # Set as 0 when the information is unavailable
6890 }
6891 }
6892 }
6893
6894 else{ # Bowtie 1
6895
6896 if ($end_read_2 >= $end_read_1){
6897 # Read 1 alignment is leftmost
6898 # -------------------------> read 1
6899 # <------------------------- read 2
6900 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
6901
6902 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
6903 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
6904 }
6905 else{
6906 # Read 2 alignment is leftmost
6907 # -------------------------> read 2
6908 # <------------------------- read 1
6909 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
6910
6911 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
6912 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
6913 }
6914 }
6915
6916 #####
6917
6918 # adjusting the strand of the sequence before we use them to generate mismatch strings
6919 if ($strand_1 eq '-'){
6920 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
6921 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
6922 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
6923 }
6924 if ($strand_2 eq '-'){
6925 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
6926 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
6927 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
6928 }
6929
6930 # print "$actual_seq_1\n$ref_seq_1\n\n";
6931 # print "$actual_seq_2\n$ref_seq_2\n\n";
6932
6933 #####
6934
6935 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
6936 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
6937 if ($bowtie2){
6938 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
6939 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
6940 }
6941 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
6942 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
6943
6944 #####
6945
6946 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
6947 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
6948
6949 #####
6950
6951 my $XM_tag_1; # Optional tag XM: Methylation call string
6952 my $XM_tag_2;
6953
6954 if ($strand_1 eq '-'){
6955 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
6956 }
6957 else{
6958 $XM_tag_1 = "XM:Z:$methcall_1";
6959 }
6960
6961 if ($strand_2 eq '-'){
6962 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
6963 }
6964 else{
6965 $XM_tag_2 = "XM:Z:$methcall_2";
6966 }
6967
6968 #####
6969
6970 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
6971 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
6972
6973 #####
6974
6975 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
6976
6977 #####
6978
6979 # Optionally calculating number of mismatches for Bowtie 2 alignments
6980
6981 if ($non_bs_mm) {
6982 if ($bowtie2) {
6983
6984 $number_of_mismatches_1 =~ s/-//; # removing the minus sign
6985 $number_of_mismatches_2 =~ s/-//;
6986
6987 ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches
6988
6989 ### CIGAR 1
6990 if ($cigar_1 =~ /(D|I)/) {
6991 # warn "$cigar_1\n";
6992
6993 # parsing CIGAR string
6994 my @len = split (/\D+/,$cigar_1); # storing the length per operation
6995 my @ops = split (/\d+/,$cigar_1); # storing the operation
6996 shift @ops; # remove the empty first element
6997 die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
6998
6999 foreach (0..$#len) {
7000 if ($ops[$_] eq 'M') {
7001 # warn "skipping\n";
7002 next; # irrelevant
7003 }
7004 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
7005 $number_of_mismatches_1 -= $insertion_open;
7006 $number_of_mismatches_1 -= $len[$_] * $insertion_extend;
7007 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
7008 }
7009 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
7010 $number_of_mismatches_1 -= $deletion_open;
7011 $number_of_mismatches_1 -= $len[$_] * $deletion_extend;
7012 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
7013 }
7014 elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
7015 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
7016 }
7017 else {
7018 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
7019 }
7020 }
7021
7022 # warn "Alignment score $number_of_mismatches_1\n";
7023 # print "Mismatches $number_of_mismatches_1\n\n";
7024 }
7025
7026 ### CIGAR 2
7027 if ($cigar_2 =~ /(D|I)/) {
7028 # warn "$cigar_2\n";
7029
7030 # parsing CIGAR string
7031 my @len = split (/\D+/,$cigar_2); # storing the length per operation
7032 my @ops = split (/\d+/,$cigar_2); # storing the operation
7033 shift @ops; # remove the empty first element
7034 die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
7035
7036 foreach (0..$#len) {
7037 if ($ops[$_] eq 'M') {
7038 # warn "skipping\n";
7039 next; #irrelevant
7040 }
7041 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
7042 $number_of_mismatches_2 -= $insertion_open;
7043 $number_of_mismatches_2 -= $len[$_] * $insertion_extend;
7044 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
7045 }
7046 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
7047 $number_of_mismatches_2 -= $deletion_open;
7048 $number_of_mismatches_2 -= $len[$_] * $deletion_extend;
7049 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
7050 }
7051 elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
7052 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
7053 }
7054 else {
7055 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
7056 }
7057 }
7058 }
7059
7060 ### Now we have InDel corrected Alignment scores
7061
7062 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
7063 ### sequence contained more than 5 Ns, but this should occur close to never
7064
7065 my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division
7066 my $seq_2_N_count = $number_of_mismatches_2 % 6;
7067 # warn "N count 1: $seq_1_N_count\n";
7068 # warn "N count 2: $seq_2_N_count\n";
7069
7070 $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count;
7071 $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count;
7072
7073 # warn "MM1 $number_of_mismatches_1 \n";
7074 # warn "MM2 $number_of_mismatches_2 \n";
7075 }
7076 }
7077
7078 ####
7079
7080 my $XA_tag = "XA:Z:$number_of_mismatches_1";
7081 my $XB_tag = "XB:Z:$number_of_mismatches_2";
7082
7083
7084 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
7085 ### optionally print number of non-bisulfite mismatches
7086 if ($non_bs_mm){
7087 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n";
7088 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n";
7089 }
7090 else{ # default
7091 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
7092 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
7093 }
7094 }
7095
# Returns the reverse complement of the DNA sequence passed as first argument.
# A/C/G/T are complemented; lower-case a/c/g/t are complemented AND upper-cased.
# Any other character (e.g. N) is kept as it is, only its position is reversed.
# Dies if the argument is missing or false (undef, empty string or "0").
sub revcomp {
    my ($dna) = @_;
    die "Missing seq to reverse complement\n" unless $dna;

    # reverse() in scalar context reverses the characters of the string
    my $reverse_complement = scalar reverse $dna;
    $reverse_complement =~ tr/ACTGactg/TGACTGAC/;

    return $reverse_complement;
}
7102
# Returns the number of positions at which the read differs from the reference.
#
# Arguments: (1) the read sequence, (2) the reference sequence.
# Only positions covered by the read are compared:
#   - reference bases beyond the end of the read are ignored
#   - read bases without a reference counterpart count as one difference each
# The comparison is case-sensitive. Insertions/deletions are not handled here.
sub hemming_dist {
    my ($actual_seq, $ref_seq) = @_;

    # Restrict the reference to the length of the read so that surplus reference
    # bases do not contribute to the distance
    my $ref_window = substr($ref_seq, 0, length $actual_seq);

    # String XOR produces "\0" wherever both strings hold the same character. A
    # shorter operand is treated as if padded with zero bits, so read bases past
    # the end of the reference remain non-zero (= difference) without triggering
    # 'uninitialized value' warnings. Quoting forces string (not numeric) XOR.
    my $diff = "$actual_seq" ^ "$ref_window";

    # tr with the /c flag counts every character that is NOT "\0"
    return ($diff =~ tr/\0//c);
}
7112
# Builds the optional SAM tag XX ("XX:Z:...") describing where the read differs
# from the reference (mismatches only, NO indel information).
#
# Arguments: (1) the read sequence, (2) the reference sequence.
# Returns:   "XX:Z:" followed by alternating run lengths of identical positions
#            and the REFERENCE base at each differing position, e.g. "XX:Z:1G2".
#            Runs of length 0 are omitted. Positions of the read that lie beyond
#            the end of the reference contribute no base to the string.
# Dies if either argument is missing or false.
sub make_mismatch_string {
    my $actual_seq = shift or die "Missing actual sequence";
    my $ref_seq    = shift or die "Missing reference sequence";

    my $ref_length = length $ref_seq;
    my $XX_tag     = "XX:Z:";

    # String XOR: "\0" marks identical characters, anything else a difference.
    # Quoting forces string (not numeric) XOR.
    my $diff = "$actual_seq" ^ "$ref_seq";

    my $prev_mm_pos = 0;                                  # 1-based position of the previous difference
    while ($diff =~ /[^\0]/g) {
        my $mm_pos    = pos($diff);                       # 1-based position of the current difference
        my $nuc_match = $mm_pos - $prev_mm_pos - 1;       # identical positions since the previous difference

        $XX_tag .= $nuc_match if $nuc_match > 0;          # nothing to report for adjacent differences

        # Report the reference base, but only while we are still inside the reference.
        # (Appending directly avoids a 'my' declaration guarded by a statement
        # modifier, whose behaviour is undefined in Perl.)
        $XX_tag .= substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

        $prev_mm_pos = $mm_pos;
    }

    # Identical positions between the last difference and the end of the reference
    my $end_matches = $ref_length - $prev_mm_pos;
    $XX_tag .= $end_matches if $end_matches > 0;

    return $XX_tag;
}
7130
7131
7132
7133 sub print_helpfile{
7134 print << "HOW_TO";
7135
7136
7137 This program is free software: you can redistribute it and/or modify
7138 it under the terms of the GNU General Public License as published by
7139 the Free Software Foundation, either version 3 of the License, or
7140 (at your option) any later version.
7141
7142 This program is distributed in the hope that it will be useful,
7143 but WITHOUT ANY WARRANTY; without even the implied warranty of
7144 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7145 GNU General Public License for more details.
7146 You should have received a copy of the GNU General Public License
7147 along with this program. If not, see <http://www.gnu.org/licenses/>.
7148
7149
7150
7151 DESCRIPTION
7152
7153
7154 The following is a brief description of command line options and arguments to control the Bismark
7155 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
7156 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
7157 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
7158 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
7159 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
7160 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
7161 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
7162 sequence from the genome and determine if there were any protected C's present or not.
7163
7164 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
7165 re-enabled by using --non_directional.
7166
7167 The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old
7168 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
7169 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
7170
7171
7172 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
7173
7174
7175 ARGUMENTS:
7176
7177 <genome_folder> The path to the folder containing the unmodified reference genome
7178 as well as the subfolders created by the Bismark_Genome_Preparation
7179 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
7180 Bismark expects one or more fastA files in this folder (file extension: .fa
7181 or .fasta). The path can be relative or absolute.
7182
7183 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
7184 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
7185 correspond file-for-file and read-for-read with those specified in <mates2>.
7186 Reads may be a mix of different lengths. Bismark will produce one mapping result
7187 and one report file per paired-end input file pair.
7188
7189 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
7190 "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
7191 correspond file-for-file and read-for-read with those specified in <mates1>.
7192 Reads may be a mix of different lengths.
7193
7194 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
7195 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
7196 produce one mapping result and one report file per input file.
7197
7198
7199 OPTIONS:
7200
7201
7202 Input:
7203
7204 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles> are FASTQ
7205 files (usually having extension .fg or .fastq). This is the default. See also
7206 --solexa-quals.
7207
7208 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles> are FASTA
7209 files (usually havin extension .fa, .mfa, .fna or similar). All quality values
7210 are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
7211 the read name and the sequence on a single line (and not spread over several lines).
7212
7213 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
7214
7215 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
7216
7217 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
7218
7219 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
7220
7221 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
7222 (which can't). The formula for conversion is:
7223 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
7224 is usually the right option for use with (unconverted) reads emitted by the GA
7225 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
7226
7227 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
7228 reads emitted by GA Pipeline version 1.3 or later. Default: off.
7229
7230 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
7231 specified it is assumed that Bowtie (1 or 2) is in the PATH.
7232
7233
7234 Alignment:
7235
7236 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
7237 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
7238 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
7239
7240 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
7241 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
7242 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
7243
7244 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
7245 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
7246 quality values to the nearest 10 and saturates at 30. This value is not relevant for
7247 Bowtie 2.
7248
7249 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
7250 --best mode. Best-first search must keep track of many paths at once to ensure it is
7251 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
7252 memory impact of the descriptors, but they can still grow very large in some cases. If
7253 you receive an error message saying that chunk memory has been exhausted in --best mode,
7254 try adjusting this parameter up to dedicate more memory to the descriptors. This value
7255 is not relevant for Bowtie 2. Default: 512.
7256
7257 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
7258 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
7259 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
7260 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
7261
7262 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
7263 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
7264 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
7265 A 61-bp gap would not be valid in that case. Default: 500.
7266
7267
7268 Bowtie 1 Reporting:
7269
7270 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
7271 will be used by default.
7272
7273 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
7274 (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in
7275 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
7276 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
7277 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
7278 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
7279 the best alignment). --best mode also removes all strand bias. Note that --best does not
7280 affect which alignments are considered "valid" by Bowtie, only which valid alignments
7281 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
7282 Default: on.
7283
7284 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
7285 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
7286
7287
7288 Output:
7289
7290 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
7291 bisulfite strands will be reported. Default: OFF.
7292
7293 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
7294 to the original strands are merely theoretical and should not exist in reality. Specifying directional
7295 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
7296 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
7297 for sprand-specific libraries).
7298
7299 --pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
7300 PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
7301 as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
7302 and OB ones. Use this option only if you are certain that your libraries were constructed following
7303 a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
7304 --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed
7305 temporary files only).
7306
7307 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
7308 split up into several smaller files to run concurrently and the output files are to be merged.
7309
7310 --quiet Print nothing besides alignments.
7311
7312 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
7313 of SAM format output.
7314
7315 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
7316 appear as they did in the input, without any translation of quality values that may have
7317 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
7318 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads
7319 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
7320 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
7321
7322 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
7323 mismatches or other reads that fail to align uniquely to a file in the output directory.
7324 Written reads will appear as they did in the input, without any of the translation of quality
7325 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
7326 parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and
7327 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
7328
7329 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
7330 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
7331 to create it first. The path to the output folder can be either relative or absolute.
7332
7333 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
7334 the specified folder does not exist, Bismark will attempt to create it first. The path to the
7335 temporary folder can be either relative or absolute.
7336
7337 --non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the
7338 alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
7339 just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
7340 The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
7341 and 'XB:Z:number of mismatches' for read 2 of paired-end reads.
7342
7343 --gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
7344 space. This option is available for most alignment modes but is not available for paired-end FastA
7345 files. This option might be somewhat slower than writing out uncompressed files, but this awaits
7346 further testing.
7347
7348 --bam The output will be written out in BAM format instead of the default SAM format. Bismark will
7349 attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
7350 been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
7351 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).
7352
7353 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
7354 explicitly if Samtools is in the PATH already.
7355
7356
7357
7358 Other:
7359
7360 -h/--help Displays this help file.
7361
7362 -v/--version Displays version information.
7363
7364
7365 BOWTIE 2 SPECIFIC OPTIONS
7366
7367 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
7368 alignments, i.e. searches for alignments involving all read characters (also called
7369 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
7370 and/or quality trimmed where appropriate. Default: off.
7371
7372 Bowtie 2 alignment options:
7373
7374 -N <int> Sets the number of mismatches to allowed in a seed alignment during multiseed alignment.
7375 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
7376 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
7377 Bowtie 1 see -n).
7378
7379 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
7380 make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is
7381 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
7382 Bowtie 1 see -l).
7383
7384 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
7385 position to be the highest possible, regardless of the actual value. I.e. input is treated
7386 as though all quality values are high. This is also the default behavior when the input
7387 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
7388
7389
7390 Bowtie 2 paired-end options:
7391
7392 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
7393 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
7394 and on by default.
7395
7396 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
7397 A discordant alignment is an alignment where both mates align uniquely, but that does not
7398 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
7399 and it is on by default.
7400
7401
7402 Bowtie 2 effort options:
7403
7404 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
7405 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
7406 new second-best alignment. Default: 15.
7407
7408 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
7409 When "re-seeding," Bowtie 2 simply chooses a new set of seeds (same length, same number of
7410 mismatches allowed) at different offsets and searches for more alignments. A read is considered
7411 to have repetitive seeds if the total number of seed hits divided by the number of seeds
7412 that aligned at least once is greater than 300. Default: 2.
7413
7414 Bowtie 2 parallelization options:
7415
7416
7417 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
7418 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
7419 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
7420 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
7421 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
7422 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
7423 automatically use the option '--reorder', which guarantees that output SAM records are printed in
7424 an order corresponding to the order of the reads in the original input file, even when -p is set
7425 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
7426 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
7427 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
7428 correspond to input order in that case.
7429
7430 Bowtie 2 Scoring options:
7431
7432 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
7433 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
7434 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
7435 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
7436 L,0,-0.2.
7437
7438 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
7439 of <int1> + N * <int2>. Default: 5, 3.
7440
7441 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
7442 a penalty of <int1> + N * <int2>. Default: 5, 3.
7443
7444
7445 Bowtie 2 Reporting options:
7446
7447 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
7448 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
7449 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
7450 effort expended to find valid alignments.
7451
7452 For reference, this used to be the old (now deprecated) description of -M:
7453 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
7454 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
7455 happens first. Only the best alignment is reported. Information from the other alignments is used to
7456 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
7457 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
7458 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
7459 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
7460 always used and its default value is set to 10.
7461
7462
7463 'VANILLA' Bismark OUTPUT:
7464
7465 Single-end output format (tab-separated):
7466
7467 (1) <seq-ID>
7468 (2) <read alignment strand>
7469 (3) <chromosome>
7470 (4) <start position>
7471 (5) <end position>
7472 (6) <observed bisulfite sequence>
7473 (7) <equivalent genomic sequence>
7474 (8) <methylation call>
7475 (9) <read conversion>
7476 (10) <genome conversion>
7477 (11) <read quality score (Phred33)>
7478
7479
7480 Paired-end output format (tab-separated):
7481 (1) <seq-ID>
7482 (2) <read 1 alignment strand>
7483 (3) <chromosome>
7484 (4) <start position>
7485 (5) <end position>
7486 (6) <observed bisulfite sequence 1>
7487 (7) <equivalent genomic sequence 1>
7488 (8) <methylation call 1>
7489 (9) <observed bisulfite sequence 2>
7490 (10) <equivalent genomic sequence 2>
7491 (11) <methylation call 2>
7492 (12) <read 1 conversion>
7493 (13) <genome conversion>
7494 (14) <read 1 quality score (Phred33)>
7495 (15) <read 2 quality score (Phred33)>
7496
7497
7498 Bismark SAM OUTPUT (default):
7499
7500 (1) QNAME (seq-ID)
7501 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
7502 (3) RNAME (chromosome)
7503 (4) POS (start position)
7504 (5) MAPQ (always 255)
7505 (6) CIGAR
7506 (7) RNEXT
7507 (8) PNEXT
7508 (9) TLEN
7509 (10) SEQ
7510 (11) QUAL (Phred33 scale)
7511 (12) NM-tag (edit distance to the reference)
7512 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
7513 (14) XM-tag (methylation call string)
7514 (15) XR-tag (read conversion state for the alignment)
7515 (16) XG-tag (genome conversion state for the alignment)
7516 (17) XA/XB-tag (non-bisulfite mismatches) (optional!)
7517
7518 Each read of paired-end alignments is written out in a separate line in the above format.
7519
7520
7521 Last edited on 10 May 2013.
7522
7523 HOW_TO
7524 }