comparison methylation_analysis_bismark/methylation_analysis/methylation_extractor @ 11:6479112a673b draft

Uploaded
author fcaramia
date Wed, 12 Dec 2012 19:46:06 -0500
parents
children
comparison
equal deleted inserted replaced
10:2432df265dad 11:6479112a673b
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7
8 my @filenames;
9 my %counting;
10 my $parent_dir = getcwd();
11
12 my %fhs;
13
14 my $version = 'v0.7.6';
15 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla) = process_commandline();
16
17 process_Bismark_results_file();
18
19 sub process_commandline{
20 my $help;
21 my $single_end;
22 my $paired_end;
23 my $ignore;
24 my $genomic_fasta;
25 my $full;
26 my $report;
27 my $extractor_version;
28 my $no_overlap;
29 my $merge_non_CpG;
30 my $vanilla;
31
32 my $command_line = GetOptions ('help|man' => \$help,
33 'p|paired-end' => \$paired_end,
34 's|single-end' => \$single_end,
35 'fasta' => \$genomic_fasta,
36 'ignore=i' => \$ignore,
37 'comprehensive' => \$full,
38 'report' => \$report,
39 'version' => \$extractor_version,
40 'no_overlap' => \$no_overlap,
41 'merge_non_CpG' => \$merge_non_CpG,
42 'vanilla' => \$vanilla,
43 );
44
45 ### EXIT ON ERROR if there were errors with any of the supplied options
46 unless ($command_line){
47 die "Please respecify command line options\n";
48 }
49
50 ### HELPFILE
51 if ($help){
52 print_helpfile();
53 exit;
54 }
55
56 if ($extractor_version){
57 print << "VERSION";
58
59
60 Bismark Methylation Extractor
61
62 Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
63 www.bioinformatics.babraham.ac.uk/projects/bismark/
64
65
66 VERSION
67 exit;
68 }
69
70
71 ### no files provided
72 unless (@ARGV){
73 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
74 }
75 @filenames = @ARGV;
76
77 warn "\n *** Bismark methylation extractor version $version ***\n\n";
78
79 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
80 unless ($ignore){
81 $ignore = 0;
82 }
83 ### PRINT A REPORT
84 unless ($report){
85 $report = 0;
86 }
87
88 ### OLD (VANILLA) OUTPUT FORMAT
89 unless ($vanilla){
90 $vanilla = 0;
91 }
92
93 if ($single_end){
94 $paired_end = 0; ### SINGLE END ALIGNMENTS
95 }
96 elsif ($paired_end){
97 $single_end = 0; ### PAIRED-END ALIGNMENTS
98 }
99 else{
100 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
101 }
102
103 ### NO OVERLAP
104 if ($no_overlap){
105 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
106 }
107 else{
108 $no_overlap = 0;
109 }
110
111 ### COMPREHENSIVE OUTPUT
112 unless ($full){
113 $full = 0;
114 }
115
116 ### MERGE NON-CpG context
117 unless ($merge_non_CpG){
118 $merge_non_CpG = 0;
119 }
120
121 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla);
122 }
123
124
125 sub process_Bismark_results_file{
126
127 if ($single){
128 if ($vanilla){
129 warn "Bismark Single-end vanilla format specified\n";
130 }
131 else{
132 warn "Bismark Single-end SAM format specified (default)\n"; # default
133 }
134 }
135 elsif ($paired){
136 if ($vanilla){
137 warn "Bismark Paired-end vanilla format specified\n";
138 }
139 else{
140 warn "Bismark Paired-end SAM format specified (default)\n"; # default
141 }
142 }
143
144 if ($ignore){
145 warn "First $ignore bases will be disregarded when processing the methylation call string\n";
146 }
147
148 if ($full){
149 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
150 }
151 if ($merge_non_CpG){
152 warn "Merge CHG and CHH context to non-CpG context specified\n";
153 }
154 warn "\n";
155
156 sleep (3);
157
158 foreach my $filename (@filenames){
159 %fhs = ();
160 %counting =(
161 total_meCHG_count => 0,
162 total_meCHH_count => 0,
163 total_meCpG_count => 0,
164 total_unmethylated_CHG_count => 0,
165 total_unmethylated_CHH_count => 0,
166 total_unmethylated_CpG_count => 0,
167 sequences_count => 0,
168 );
169 warn "\nNow reading in Bismark result file $filename\n\n";
170
171 if ($filename =~ /\.gz$/){
172 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
173 }
174 else{
175 open (IN,$filename) or die "Can't open file $filename: $!\n";
176 }
177
178 ### Vanilla and SAM output need to read different numbers of header lines
179 if ($vanilla){
180 my $bismark_version = <IN>; ## discarding the Bismark version info
181 chomp $bismark_version;
182 $bismark_version =~ s/\r//; # replaces \r line feed
183 $bismark_version =~ s/Bismark version: //;
184 if ($bismark_version =~ /^\@/){
185 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
186 sleep (2);
187 }
188
189 unless ($version eq $bismark_version){
190 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
191 }
192 }
193 else{
194 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
195 # We are reading from it further down
196 }
197
198 my $output_filename = (split (/\//,$filename))[-1];
199
200 ### OPENING OUTPUT-FILEHANDLES
201 if ($report){
202 my $report_filename = $output_filename;
203 $report_filename =~ s/$/_splitting_report.txt/;
204 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
205 }
206
207 if ($report){
208 print REPORT "$output_filename\n\n";
209 print REPORT "Parameters used to extract methylation information:\n";
210 if ($paired){
211 if ($vanilla){
212 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
213 }
214 else{
215 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
216 }
217 }
218
219 if ($single){
220 if ($vanilla){
221 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
222 }
223 else{
224 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
225 }
226 }
227
228 if ($ignore){
229 print REPORT "Ignoring first $ignore bases\n";
230 }
231
232 if ($full){
233 print REPORT "Output specified: comprehensive\n";
234 }
235 else{
236 print REPORT "Output specified: strand-specific (default)\n";
237 }
238
239 if ($no_overlap){
240 print REPORT "No overlapping methylation calls specified\n";
241 }
242 if ($genomic_fasta){
243 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
244 }
245 if ($merge_non_CpG){
246 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
247 }
248
249 print REPORT "\n";
250 }
251
252 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
253
254 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
255 if ($full and $merge_non_CpG){
256 my $cpg_output = my $other_c_output = $output_filename;
257 ### C in CpG context
258 $cpg_output =~ s/^/CpG_context_/;
259 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
260 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
261 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
262 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
263
264 ### C in any other context than CpG
265 $other_c_output =~ s/^/Non_CpG_context_/;
266 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
267 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n";
268 print "Writing result file containing methylation information for C in any other context to $other_c_output\n";
269 print {$fhs{other_context}} "Bismark methylation extractor version $version\n";
270 }
271
272 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
273 elsif ($merge_non_CpG){
274
275 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
276
277 ### For cytosines in CpG context
278 $cpg_ot =~ s/^/CpG_OT_/;
279 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
280 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
281 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
282 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
283
284 $cpg_ctot =~ s/^/CpG_CTOT_/;
285 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
286 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
287 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
288 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
289
290 $cpg_ctob =~ s/^/CpG_CTOB_/;
291 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
292 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
293 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
294 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
295
296 $cpg_ob =~ s/^/CpG_OB_/;
297 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
298 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
299 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
300 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
301
302 ### For cytosines in Non-CpG (CC, CT or CA) context
303 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
304
305 $other_c_ot =~ s/^/Non_CpG_OT_/;
306 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
307 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n";
308 print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n";
309 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n";
310
311 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
312 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
313 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n";
314 print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n";
315 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n";
316
317 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
318 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
319 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n";
320 print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n";
321 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n";
322
323 $other_c_ob =~ s/^/Non_CpG_OB_/;
324 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
325 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n";
326 print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n";
327 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n";
328 }
329
330 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
331
332 ### if --comprehensive was specified we are only writing one file per context
333 elsif ($full){
334 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
335 ### C in CpG context
336 $cpg_output =~ s/^/CpG_context_/;
337 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
338 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
339 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
340 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
341
342 ### C in CHG context
343 $chg_output =~ s/^/CHG_context_/;
344 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
345 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n";
346 print "Writing result file containing methylation information for C in CHG context to $chg_output\n";
347 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n";
348
349 ### C in CHH context
350 $chh_output =~ s/^/CHH_context_/;
351 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
352 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n";
353 print "Writing result file containing methylation information for C in CHH context to $chh_output\n";
354 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; }
355
356 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
357 else{
358 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
359
360 ### For cytosines in CpG context
361 $cpg_ot =~ s/^/CpG_OT_/;
362 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
363 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
364 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
365 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
366
367 $cpg_ctot =~ s/^/CpG_CTOT_/;
368 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
369 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
370 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
371 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
372
373 $cpg_ctob =~ s/^/CpG_CTOB_/;
374 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
375 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
376 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
377 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
378
379 $cpg_ob =~ s/^/CpG_OB_/;
380 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
381 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
382 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
383 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
384
385 ### For cytosines in CHG context
386 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
387
388 $chg_ot =~ s/^/CHG_OT_/;
389 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
390 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n";
391 print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n";
392 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n";
393
394 $chg_ctot =~ s/^/CHG_CTOT_/;
395 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
396 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n";
397 print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n";
398 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n";
399
400 $chg_ctob =~ s/^/CHG_CTOB_/;
401 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
402 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n";
403 print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n";
404 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n";
405
406 $chg_ob =~ s/^/CHG_OB_/;
407 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
408 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n";
409 print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n";
410 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n";
411
412 ### For cytosines in CHH context
413 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
414
415 $chh_ot =~ s/^/CHH_OT_/;
416 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
417 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n";
418 print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n";
419 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n";
420
421 $chh_ctot =~ s/^/CHH_CTOT_/;
422 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
423 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n";
424 print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n";
425 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n";
426
427 $chh_ctob =~ s/^/CHH_CTOB_/;
428 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
429 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n";
430 print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n";
431 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n";
432
433 $chh_ob =~ s/^/CHH_OB_/;
434 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
435 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n";
436 print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n";
437 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n";
438 }
439
440 my $methylation_call_strings_processed = 0;
441 my $line_count = 0;
442
443 ### proceeding differently now for single-end or paired-end Bismark files
444
445 ### PROCESSING SINGLE-END RESULT FILES
446 if ($single){
447
448 ### also proceeding differently now for SAM format or vanilla Bismark format files
449 if ($vanilla){ # old vanilla Bismark output format
450 while (<IN>){
451 ++$line_count;
452 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
453
454 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
455 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
456
457 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
458 ### last position
459 chomp $genome_conversion;
460
461 my $index;
462 if ($meth_call){
463
464 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand
465 $index = 0;
466 }
467 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand
468 $index = 1;
469 }
470 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand
471 $index = 3;
472 }
473 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand
474 $index = 2;
475 }
476 else {
477 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
478 }
479
480 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
481 if ($ignore){
482 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
483
484 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
485 if ($strand eq '+'){
486 $start += $ignore;
487 }
488 elsif ($strand eq '-'){
489 $start += length($meth_call)-1; ## $meth_call is already shortened!
490 }
491 else {
492 die "Alignment did not have proper strand information: $strand\n";
493 }
494 }
495 ### printing out the methylation state of every C in the read
496 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
497
498 ++$methylation_call_strings_processed; # 1 per single-end result
499 }
500 }
501 }
502 else{ # processing single-end SAM format (default)
503 while (<IN>){
504 ### SAM format can either start with header lines (starting with @) or start with alignments directly
505 if (/^\@/){ # skipping header lines (starting with @)
506 warn "skipping SAM header line:\t$_";
507 next;
508 }
509
510 ++$line_count;
511 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
512
513 # example read in SAM format
514 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
515 ###
516
517 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
518 # < 0.7.6 $meth_call =~ s/^XM:Z://;
519 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
520 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
521
522 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
523
524 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
525 my $meth_call; ### Thanks to Zachary Zeno for this solution
526 my $read_conversion;
527 my $genome_conversion;
528
529 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
530 my $tag = $1;
531 my $value = $2;
532
533 if ($tag eq "XM"){
534 $meth_call = $value;
535 $meth_call =~ s/\r//;
536 }
537 elsif ($tag eq "XR") {
538 $read_conversion = $value;
539 $read_conversion =~ s/\r//;
540 }
541 elsif ($tag eq "XG") {
542 $genome_conversion = $value;
543 $genome_conversion =~ s/\r//;
544 }
545 }
546
547 my $strand;
548 chomp $genome_conversion;
549 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
550
551 my $index;
552 if ($meth_call){
553 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand
554 $index = 0;
555 $strand = '+';
556 }
557 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand
558 $index = 1;
559 $strand = '-';
560 }
561 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand
562 $index = 2;
563 $strand = '+';
564 }
565 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand
566 $index = 3;
567 $strand = '-';
568 }
569 else {
570 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
571 }
572
573 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
574 if ($strand eq '-'){
575 $meth_call = reverse $meth_call;
576 }
577
578 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
579 if ($ignore){
580 # print "\n\n$meth_call\n";
581 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
582 # print "$meth_call\n";
583 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
584
585 my @len = split (/\D+/,$cigar); # storing the length per operation
586 my @ops = split (/\d+/,$cigar); # storing the operation
587 shift @ops; # remove the empty first element
588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
589
590 my @comp_cigar; # building an array with all CIGAR operations
591 foreach my $index (0..$#len){
592 foreach (1..$len[$index]){
593 # print "$ops[$index]";
594 push @comp_cigar, $ops[$index];
595 }
596 }
597 # print "original CIGAR: $cigar\n";
598 # print "original CIGAR: @comp_cigar\n";
599
600 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
601 if ($strand eq '+'){
602
603 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
604 my $I_count = 0;
605
606 for (1..$ignore){
607 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
608 # print "$_ deleted $op\n";
609
610 while ($op eq 'D'){ # repeating this for deletions (D)
611 $D_count++;
612 $op = shift @comp_cigar;
613 # print "$_ deleted $op\n";
614 }
615 if ($op eq 'I'){ # adjusting the genomic position for insertions (I)
616 $I_count++;
617 }
618 }
619 $start += $ignore + $D_count - $I_count;
620 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
621 }
622 elsif ($strand eq '-'){
623
624 for (1..$ignore){
625 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
626 while ($op eq 'D'){ # repeating this for deletions (D)
627 $op = pop @comp_cigar;
628 }
629 }
630
631 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
632 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
633 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
634 foreach (@comp_cigar){
635 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
636 }
637 $start += $MD_count - 1;
638 }
639
640 ### reconstituting shortened CIGAR string
641 my $new_cigar;
642 my $count = 0;
643 my $last_op;
644 # print "ignore adjusted: @comp_cigar\n";
645 foreach my $op (@comp_cigar){
646 unless (defined $last_op){
647 $last_op = $op;
648 ++$count;
649 next;
650 }
651 if ($last_op eq $op){
652 ++$count;
653 }
654 else{
655 $new_cigar .= "$count$last_op";
656 $last_op = $op;
657 $count = 1;
658 }
659 }
660 $new_cigar .= "$count$last_op"; # appending the last operation and count
661 $cigar = $new_cigar;
662 # print "ignore adjusted scalar: $cigar\n";
663 }
664 }
665 ### printing out the methylation state of every C in the read
666 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
667
668 ++$methylation_call_strings_processed; # 1 per single-end result
669 }
670 }
671 }
672
673 ### PROCESSING PAIRED-END RESULT FILES
674 elsif ($paired){
675
676 ### proceeding differently now for SAM format or vanilla Bismark format files
677 if ($vanilla){ # old vanilla Bismark paired-end output format
678 while (<IN>){
679 ++$line_count;
680 warn "processed line: $line_count\n" if ($line_count%500000==0);
681
682 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
683 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
684
685 my $index;
686 chomp $genome_conversion;
687
688 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){
689 $index = 0; ## this is OT
690 }
691 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){
692 $index = 2; ## this is CTOB!!!
693 }
694 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){
695 $index = 1; ## this is CTOT!!!
696 }
697 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){
698 $index = 3; ## this is OB
699 }
700 else {
701 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
702 }
703
704 if ($meth_call_1 and $meth_call_2){
705 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
706 if ($ignore){
707 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
708 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
709
710 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
711 $start_read_1 += $ignore;
712 $end_read_2 -= $ignore;
713 }
714 my $end_read_1;
715 my $start_read_2;
716
717 if ($strand eq '+'){
718
719 $end_read_1 = $start_read_1+length($meth_call_1)-1;
720 $start_read_2 = $end_read_2-length($meth_call_2)+1;
721
722 ## we first pass the first read which is in + orientation on the forward strand
723 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0);
724
725 # we next pass the second read which is in - orientation on the reverse strand
726 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
727 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1);
728 }
729
730 else{
731
732 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
733 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
734
735 ## we first pass the first read which is in - orientation on the reverse strand
736 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0);
737
738 # we next pass the second read which is in + orientation on the forward strand
739 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
740 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2);
741 }
742
743 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
744 }
745 }
746 }
747 else{ # Bismark paired-end SAM output format (default)
748 while (<IN>){
749 ### SAM format can either start with header lines (starting with @) or start with alignments directly
750 if (/^\@/){ # skipping header lines (starting with @)
751 warn "skipping SAM header line:\t$_";
752 next;
753 }
754
755 ++$line_count;
756 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
757
758 # example paired-end reads in SAM format (2 consecutive lines)
759 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
760 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
761
762 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
763
764 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
765 my $meth_call_1;
766 my $first_read_conversion;
767 my $genome_conversion;
768
769 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
770 my $tag = $1;
771 my $value = $2;
772
773 if ($tag eq "XM"){
774 $meth_call_1 = $value;
775 $meth_call_1 =~ s/\r//;
776 }
777 elsif ($tag eq "XR") {
778 $first_read_conversion = $value;
779 $first_read_conversion =~ s/\r//;
780 }
781 elsif ($tag eq "XG") {
782 $genome_conversion = $value;
783 $genome_conversion =~ s/\r//;
784 }
785 }
786
787 $_ = <IN>; # reading in the paired read
788
789 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
790 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
791 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
792 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
793 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
794
795 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
796
797 my $meth_call_2;
798 my $second_read_conversion;
799
800 while ( /(XM|XR):Z:([^\t]+)/g ) {
801 my $tag = $1;
802 my $value = $2;
803
804 if ($tag eq "XM"){
805 $meth_call_2 = $value;
806 $meth_call_2 =~ s/\r//;
807 }
808 elsif ($tag eq "XR") {
809 $second_read_conversion = $value;
810 $second_read_conversion = s/\r//;
811 }
812 }
813
814 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
815 chomp $genome_conversion; # in case it captured a new line character
816
817 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
818
819 my $index;
820 my $strand;
821
822 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){
823 $index = 0; ## this is OT
824 $strand = '+';
825 }
826 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){
827 $index = 1; ## this is CTOT
828 $strand = '-';
829 }
830 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){
831 $index = 2; ## this is CTOB
832 $strand = '+';
833 }
834 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){
835 $index = 3; ## this is OB
836 $strand = '-';
837 }
838 else {
839 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
840 }
841
842 ### reversing the methylation call of the read that was reverse-complemented
843 if ($strand eq '+'){
844 $meth_call_2 = reverse $meth_call_2;
845 }
846 else{
847 $meth_call_1 = reverse $meth_call_1;
848 }
849
850 if ($meth_call_1 and $meth_call_2){
851
852 my $end_read_1;
853
854 ### READ 1
855 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
856 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
857 shift @ops_1; # remove the empty first element
858 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
859
860 my @comp_cigar_1; # building an array with all CIGAR operations
861 foreach my $index (0..$#len_1){
862 foreach (1..$len_1[$index]){
863 # print "$ops_1[$index]";
864 push @comp_cigar_1, $ops_1[$index];
865 }
866 }
867 # print "original CIGAR read 1: $cigar_1\n";
868 # print "original CIGAR read 1: @comp_cigar_1\n";
869
870 ### READ 2
871 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
872 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
873 shift @ops_2; # remove the empty first element
874 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
875 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
876 foreach my $index (0..$#len_2){
877 foreach (1..$len_2[$index]){
878 # print "$ops_2[$index]";
879 push @comp_cigar_2, $ops_2[$index];
880 }
881 }
882 # print "original CIGAR read 2: $cigar_2\n";
883 # print "original CIGAR read 2: @comp_cigar_2\n";
884
885 if ($ignore){
886 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
887 ### the methylation calls have already been reversed where necessary
888 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
889 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
890
891 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
892
893 if ($strand eq '+'){
894
895 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
896 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
897 my $I_count_1 = 0;
898
899 for (1..$ignore){
900 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
901 # print "$_ deleted $op\n";
902
903 while ($op eq 'D'){ # repeating this for deletions (D)
904 $D_count_1++;
905 $op = shift @comp_cigar_1;
906 # print "$_ deleted $op\n";
907 }
908 if ($op eq 'I'){ # adjusting the genomic position for insertions (I)
909 $I_count_1++;
910 }
911 }
912
913 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
914 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
915
916 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
917
918 for (1..$ignore){
919 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
920 while ($op eq 'D'){ # repeating this for deletions (D)
921 $op = pop @comp_cigar_2;
922 }
923 }
924 # the start position of reads mapping to the reverse strand is being adjusted further below
925 }
926
927 elsif ($strand eq '-'){
928
929 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
930 for (1..$ignore){
931 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
932 while ($op eq 'D'){ # repeating this for deletions (D)
933 $op = pop @comp_cigar_1;
934 }
935 }
936 # the start position of reads mapping to the reverse strand is being adjusted further below
937
938 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
939 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
940 my $I_count_2 = 0;
941
942 for (1..$ignore){
943 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
944 # print "$_ deleted $op\n";
945
946 while ($op eq 'D'){ # repeating this for deletions (D)
947 $D_count_2++;
948 $op = shift @comp_cigar_2;
949 # print "$_ deleted $op\n";
950 }
951 if ($op eq 'I'){ # adjusting the genomic position for insertions (I)
952 $I_count_2++;
953 }
954 }
955
956 $start_read_2 += $ignore + $D_count_2 - $I_count_2;
957 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
958
959 }
960
961 ### reconstituting shortened CIGAR string 1
962 my $new_cigar_1;
963 my $count_1 = 0;
964 my $last_op_1;
965 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
966 foreach my $op (@comp_cigar_1){
967 unless (defined $last_op_1){
968 $last_op_1 = $op;
969 ++$count_1;
970 next;
971 }
972 if ($last_op_1 eq $op){
973 ++$count_1;
974 }
975 else{
976 $new_cigar_1 .= "$count_1$last_op_1";
977 $last_op_1 = $op;
978 $count_1 = 1;
979 }
980 }
981 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
982 $cigar_1 = $new_cigar_1;
983 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
984
985 ### reconstituting shortened CIGAR string 2
986 my $new_cigar_2;
987 my $count_2 = 0;
988 my $last_op_2;
989 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
990 foreach my $op (@comp_cigar_2){
991 unless (defined $last_op_2){
992 $last_op_2 = $op;
993 ++$count_2;
994 next;
995 }
996 if ($last_op_2 eq $op){
997 ++$count_2;
998 }
999 else{
1000 $new_cigar_2 .= "$count_2$last_op_2";
1001 $last_op_2 = $op;
1002 $count_2 = 1;
1003 }
1004 }
1005 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
1006 $cigar_2 = $new_cigar_2;
1007 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n";
1008
1009 }
1010
1011 if ($strand eq '+'){
1012 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
1013 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1014 # print "reverse: @comp_cigar_2\n";
1015
1016 my $MD_count_1 = 0;
1017 foreach (@comp_cigar_1){
1018 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1019 }
1020
1021 my $MD_count_2 = 0;
1022 foreach (@comp_cigar_2){
1023 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1024 }
1025
1026 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
1027 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
1028 }
1029 else{
1030 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
1031
1032 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1033 # print "reverse: @comp_cigar_1\n";
1034
1035 my $MD_count_1 = 0;
1036 foreach (@comp_cigar_1){
1037 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1038 }
1039
1040 $end_read_1 = $start_read_1;
1041 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
1042
1043 }
1044
1045 if ($strand eq '+'){
1046 ## we first pass the first read which is in + orientation on the forward strand
1047 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1);
1048
1049 # we next pass the second read which is in - orientation on the reverse strand
1050 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1051 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2);
1052 }
1053
1054 else{
1055 ## we first pass the first read which is in - orientation on the reverse strand
1056 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1);
1057
1058 # we next pass the second read which is in + orientation on the forward strand
1059 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1060 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2);
1061 }
1062
1063 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1064 }
1065 }
1066 }
1067 }
1068 else{
1069 die "Single-end or paired-end reads not specified properly\n";
1070 }
1071
1072 print "\n\nProcessed $line_count lines from $filename in total\n";
1073 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1074 if ($report){
1075 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1076 }
1077 print_splitting_report ();
1078 }
1079 }
1080
1081
1082 sub print_splitting_report{
1083
1084 ### Calculating methylation percentages if applicable
1085
1086 my $percent_meCpG;
1087 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
1088 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
1089 }
1090
1091 my $percent_meCHG;
1092 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
1093 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
1094 }
1095
1096 my $percent_meCHH;
1097 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
1098 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
1099 }
1100
1101 my $percent_non_CpG_methylation;
1102 if ($merge_non_CpG){
1103 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
1104 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
1105 }
1106 }
1107
1108 if ($report){
1109 ### detailed information about Cs analysed
1110 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
1111
1112 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
1113 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
1114
1115 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1116 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1117 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
1118
1119 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1120 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1121 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
1122
1123 ### calculating methylated CpG percentage if applicable
1124 if ($percent_meCpG){
1125 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
1126 }
1127 else{
1128 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1129 }
1130
1131 ### 2-Context Output
1132 if ($merge_non_CpG){
1133 if ($percent_non_CpG_methylation){
1134 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
1135 }
1136 else{
1137 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
1138 }
1139 }
1140
1141 ### 3 Context Output
1142 else{
1143 ### calculating methylated CHG percentage if applicable
1144 if ($percent_meCHG){
1145 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
1146 }
1147 else{
1148 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1149 }
1150
1151 ### calculating methylated CHH percentage if applicable
1152 if ($percent_meCHH){
1153 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
1154 }
1155 else{
1156 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
1157 }
1158 }
1159 }
1160
1161 ### detailed information about Cs analysed for on-screen report
1162 print "Final Cytosine Methylation Report\n",'='x33,"\n";
1163
1164 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
1165 print "Total number of C's analysed:\t$total_number_of_C\n\n";
1166
1167 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1168 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1169 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
1170
1171 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1172 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1173 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
1174
1175 ### printing methylated CpG percentage if applicable
1176 if ($percent_meCpG){
1177 print "C methylated in CpG context:\t${percent_meCpG}%\n";
1178 }
1179 else{
1180 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1181 }
1182
1183 ### 2-Context Output
1184 if ($merge_non_CpG){
1185 if ($percent_non_CpG_methylation){
1186 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
1187 }
1188 else{
1189 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
1190 }
1191 }
1192
1193 ### 3-Context Output
1194 else{
1195 ### printing methylated CHG percentage if applicable
1196 if ($percent_meCHG){
1197 print "C methylated in CHG context:\t${percent_meCHG}%\n";
1198 }
1199 else{
1200 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1201 }
1202
1203 ### printing methylated CHH percentage if applicable
1204 if ($percent_meCHH){
1205 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
1206 }
1207 else{
1208 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
1209 }
1210 }
1211 }
1212
1213
1214
1215
1216
1217 sub print_individual_C_methylation_states_paired_end_files{
1218
1219 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_;
1220 my @methylation_calls = split(//,$meth_call);
1221
1222 #################################################################
1223 ### . for bases not involving cytosines ###
1224 ### X for methylated C in CHG context (was protected) ###
1225 ### x for not methylated C in CHG context (was converted) ###
1226 ### H for methylated C in CHH context (was protected) ###
1227 ### h for not methylated C in CHH context (was converted) ###
1228 ### Z for methylated C in CpG context (was protected) ###
1229 ### z for not methylated C in CpG context (was converted) ###
1230 #################################################################
1231
1232 my $methyl_CHG_count = 0;
1233 my $methyl_CHH_count = 0;
1234 my $methyl_CpG_count = 0;
1235 my $unmethylated_CHG_count = 0;
1236 my $unmethylated_CHH_count = 0;
1237 my $unmethylated_CpG_count = 0;
1238
1239 my @len;
1240 my @ops;
1241 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
1242 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
1243 my @comp_cigar;
1244
1245 if ($cigar){ # parsing CIGAR string
1246 @len = split (/\D+/,$cigar); # storing the length per operation
1247 @ops = split (/\d+/,$cigar); # storing the operation
1248 shift @ops; # remove the empty first element
1249 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1250
1251 foreach my $index (0..$#len){
1252 foreach (1..$len[$index]){
1253 # print "$ops[$index]";
1254 push @comp_cigar, $ops[$index];
1255 }
1256 }
1257 # warn "\nDetected CIGAR string: $cigar\n";
1258 # warn "Length of methylation call: ",length $meth_call,"\n";
1259 # warn "number of operations: ",scalar @ops,"\n";
1260 # warn "number of length digits: ",scalar @len,"\n\n";
1261 # print @comp_cigar,"\n";
1262 # print "$meth_call\n\n";
1263 # sleep (1);
1264 }
1265
1266
1267 if ($strand eq '-') {
1268
1269 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
1270 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1271 # print "reverse CIGAR string: @comp_cigar\n";
1272
1273 ### the start position of paired-end files has already been corrected, see above
1274 }
1275
1276 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
1277
1278 if ($merge_non_CpG) {
1279
1280 if ($no_overlap) {
1281
1282 ### single-file CpG and non-CpG context output
1283 if ($full) {
1284 if ($strand eq '+') {
1285 for my $index (0..$#methylation_calls) {
1286
1287 if ($cigar){ # only needed for SAM files
1288 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1289 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1290 $cigar_offset += $cigar_mod;
1291 $pos_offset += $pos_mod;
1292 }
1293
1294 ### Returning as soon as the methylation calls start overlapping
1295 if ($start+$index+$pos_offset >= $end_read_1) {
1296 return;
1297 }
1298
1299 if ($methylation_calls[$index] eq 'X') {
1300 $counting{total_meCHG_count}++;
1301 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1302 } elsif ($methylation_calls[$index] eq 'x') {
1303 $counting{total_unmethylated_CHG_count}++;
1304 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1305 } elsif ($methylation_calls[$index] eq 'Z') {
1306 $counting{total_meCpG_count}++;
1307 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1308 } elsif ($methylation_calls[$index] eq 'z') {
1309 $counting{total_unmethylated_CpG_count}++;
1310 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1311 } elsif ($methylation_calls[$index] eq 'H') {
1312 $counting{total_meCHH_count}++;
1313 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1314 } elsif ($methylation_calls[$index] eq 'h') {
1315 $counting{total_unmethylated_CHH_count}++;
1316 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1317 }
1318 elsif ($methylation_calls[$index] eq '.'){}
1319 else{
1320 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1321 }
1322 }
1323 } elsif ($strand eq '-') {
1324 for my $index (0..$#methylation_calls) {
1325
1326 if ($cigar){ # only needed for SAM files
1327 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1328 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1329 $cigar_offset += $cigar_mod;
1330 $pos_offset += $pos_mod;
1331 }
1332
1333 ### Returning as soon as the methylation calls start overlapping
1334 if ($start-$index+$pos_offset <= $end_read_1) {
1335 return;
1336 }
1337
1338 if ($methylation_calls[$index] eq 'X') {
1339 $counting{total_meCHG_count}++;
1340 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1341 } elsif ($methylation_calls[$index] eq 'x') {
1342 $counting{total_unmethylated_CHG_count}++;
1343 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1344 } elsif ($methylation_calls[$index] eq 'Z') {
1345 $counting{total_meCpG_count}++;
1346 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1347 } elsif ($methylation_calls[$index] eq 'z') {
1348 $counting{total_unmethylated_CpG_count}++;
1349 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1350 } elsif ($methylation_calls[$index] eq 'H') {
1351 $counting{total_meCHH_count}++;
1352 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1353 } elsif ($methylation_calls[$index] eq 'h') {
1354 $counting{total_unmethylated_CHH_count}++;
1355 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1356 }
1357 elsif ($methylation_calls[$index] eq '.') {}
1358 else{
1359 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1360 }
1361 }
1362 } else {
1363 die "The read orientation was neither + nor -: '$strand'\n";
1364 }
1365 }
1366
1367 ### strand-specific methylation output
1368 else {
1369 if ($strand eq '+') {
1370 for my $index (0..$#methylation_calls) {
1371
1372 if ($cigar){ # only needed for SAM files
1373 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1374 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1375 $cigar_offset += $cigar_mod;
1376 $pos_offset += $pos_mod;
1377 }
1378
1379 ### Returning as soon as the methylation calls start overlapping
1380 if ($start+$index+$pos_offset >= $end_read_1) {
1381 return;
1382 }
1383
1384 if ($methylation_calls[$index] eq 'X') {
1385 $counting{total_meCHG_count}++;
1386 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1387 } elsif ($methylation_calls[$index] eq 'x') {
1388 $counting{total_unmethylated_CHG_count}++;
1389 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1390 } elsif ($methylation_calls[$index] eq 'Z') {
1391 $counting{total_meCpG_count}++;
1392 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1393 } elsif ($methylation_calls[$index] eq 'z') {
1394 $counting{total_unmethylated_CpG_count}++;
1395 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1396 } elsif ($methylation_calls[$index] eq 'H') {
1397 $counting{total_meCHH_count}++;
1398 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1399 } elsif ($methylation_calls[$index] eq 'h') {
1400 $counting{total_unmethylated_CHH_count}++;
1401 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1402 }
1403 elsif ($methylation_calls[$index] eq '.') {}
1404 else{
1405 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1406 }
1407 }
1408 } elsif ($strand eq '-') {
1409 for my $index (0..$#methylation_calls) {
1410
1411 if ($cigar){ # only needed for SAM files
1412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1413 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1414 $cigar_offset += $cigar_mod;
1415 $pos_offset += $pos_mod;
1416 }
1417
1418 ### Returning as soon as the methylation calls start overlapping
1419 if ($start-$index+$pos_offset <= $end_read_1) {
1420 return;
1421 }
1422
1423 if ($methylation_calls[$index] eq 'X') {
1424 $counting{total_meCHG_count}++;
1425 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1426 } elsif ($methylation_calls[$index] eq 'x') {
1427 $counting{total_unmethylated_CHG_count}++;
1428 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1429 } elsif ($methylation_calls[$index] eq 'Z') {
1430 $counting{total_meCpG_count}++;
1431 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1432 } elsif ($methylation_calls[$index] eq 'z') {
1433 $counting{total_unmethylated_CpG_count}++;
1434 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1435 } elsif ($methylation_calls[$index] eq 'H') {
1436 $counting{total_meCHH_count}++;
1437 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1438 } elsif ($methylation_calls[$index] eq 'h') {
1439 $counting{total_unmethylated_CHH_count}++;
1440 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1441 }
1442 elsif ($methylation_calls[$index] eq '.') {}
1443 else{
1444 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1445 }
1446 }
1447 } else {
1448 die "The strand orientation was neither + nor -: '$strand'/n";
1449 }
1450 }
1451 }
1452
1453 ### this is the default paired-end procedure allowing overlaps and using every single C position
1454 ### Still within the 2-CONTEXT ONLY optional section
1455 else {
1456 ### single-file CpG and non-CpG context output
1457 if ($full) {
1458 if ($strand eq '+') {
1459 for my $index (0..$#methylation_calls) {
1460
1461 if ($cigar){ # only needed for SAM files
1462 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1463 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1464 $cigar_offset += $cigar_mod;
1465 $pos_offset += $pos_mod;
1466 }
1467
1468 if ($methylation_calls[$index] eq 'X') {
1469 $counting{total_meCHG_count}++;
1470 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1471 } elsif ($methylation_calls[$index] eq 'x') {
1472 $counting{total_unmethylated_CHG_count}++;
1473 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1474 } elsif ($methylation_calls[$index] eq 'Z') {
1475 $counting{total_meCpG_count}++;
1476 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1477 } elsif ($methylation_calls[$index] eq 'z') {
1478 $counting{total_unmethylated_CpG_count}++;
1479 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1480 } elsif ($methylation_calls[$index] eq 'H') {
1481 $counting{total_meCHH_count}++;
1482 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1483 } elsif ($methylation_calls[$index] eq 'h') {
1484 $counting{total_unmethylated_CHH_count}++;
1485 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1486 }
1487 elsif ($methylation_calls[$index] eq '.') {}
1488 else{
1489 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1490 }
1491 }
1492 } elsif ($strand eq '-') {
1493 for my $index (0..$#methylation_calls) {
1494
1495 if ($cigar){ # only needed for SAM files
1496 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1497 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1498 $cigar_offset += $cigar_mod;
1499 $pos_offset += $pos_mod;
1500 }
1501
1502 if ($methylation_calls[$index] eq 'X') {
1503 $counting{total_meCHG_count}++;
1504 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1505 } elsif ($methylation_calls[$index] eq 'x') {
1506 $counting{total_unmethylated_CHG_count}++;
1507 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1508 } elsif ($methylation_calls[$index] eq 'Z') {
1509 $counting{total_meCpG_count}++;
1510 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1511 } elsif ($methylation_calls[$index] eq 'z') {
1512 $counting{total_unmethylated_CpG_count}++;
1513 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1514 } elsif ($methylation_calls[$index] eq 'H') {
1515 $counting{total_meCHH_count}++;
1516 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1517 } elsif ($methylation_calls[$index] eq 'h') {
1518 $counting{total_unmethylated_CHH_count}++;
1519 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1520 }
1521 elsif ($methylation_calls[$index] eq '.') {}
1522 else{
1523 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1524 }
1525 }
1526 } else {
1527 die "The strand orientation as neither + nor -: '$strand'\n";
1528 }
1529 }
1530
1531 ### strand-specific methylation output
1532 ### still within the 2-CONTEXT optional section
1533 else {
1534 if ($strand eq '+') {
1535 for my $index (0..$#methylation_calls) {
1536
1537 if ($cigar){ # only needed for SAM files
1538 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1539 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1540 $cigar_offset += $cigar_mod;
1541 $pos_offset += $pos_mod;
1542 }
1543
1544 if ($methylation_calls[$index] eq 'X') {
1545 $counting{total_meCHG_count}++;
1546 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1547 } elsif ($methylation_calls[$index] eq 'x') {
1548 $counting{total_unmethylated_CHG_count}++;
1549 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1550 } elsif ($methylation_calls[$index] eq 'Z') {
1551 $counting{total_meCpG_count}++;
1552 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1553 } elsif ($methylation_calls[$index] eq 'z') {
1554 $counting{total_unmethylated_CpG_count}++;
1555 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1556 } elsif ($methylation_calls[$index] eq 'H') {
1557 $counting{total_meCHH_count}++;
1558 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1559 } elsif ($methylation_calls[$index] eq 'h') {
1560 $counting{total_unmethylated_CHH_count}++;
1561 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1562 }
1563 elsif ($methylation_calls[$index] eq '.') {}
1564 else{
1565 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1566 }
1567 }
1568 } elsif ($strand eq '-') {
1569 for my $index (0..$#methylation_calls) {
1570
1571 if ($cigar){ # only needed for SAM files
1572 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1574 $cigar_offset += $cigar_mod;
1575 $pos_offset += $pos_mod;
1576 }
1577
1578 if ($methylation_calls[$index] eq 'X') {
1579 $counting{total_meCHG_count}++;
1580 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1581 } elsif ($methylation_calls[$index] eq 'x') {
1582 $counting{total_unmethylated_CHG_count}++;
1583 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1584 } elsif ($methylation_calls[$index] eq 'Z') {
1585 $counting{total_meCpG_count}++;
1586 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1587 } elsif ($methylation_calls[$index] eq 'z') {
1588 $counting{total_unmethylated_CpG_count}++;
1589 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1590 } elsif ($methylation_calls[$index] eq 'H') {
1591 $counting{total_meCHH_count}++;
1592 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1593 } elsif ($methylation_calls[$index] eq 'h') {
1594 $counting{total_unmethylated_CHH_count}++;
1595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1596 }
1597 elsif ($methylation_calls[$index] eq '.') {}
1598 else{
1599 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1600 }
1601 }
1602 } else {
1603 die "The strand orientation as neither + nor -: '$strand'\n";
1604 }
1605 }
1606 }
1607 }
1608
1609 ############################################
1610 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
1611 ############################################
1612
1613 elsif ($no_overlap) {
1614 ### single-file CpG, CHG and CHH context output
1615 if ($full) {
1616 if ($strand eq '+') {
1617 for my $index (0..$#methylation_calls) {
1618
1619 if ($cigar){ # only needed for SAM files
1620 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1621 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1622 $cigar_offset += $cigar_mod;
1623 $pos_offset += $pos_mod;
1624 }
1625
1626 ### Returning as soon as the methylation calls start overlapping
1627 if ($start+$index+$pos_offset >= $end_read_1) {
1628 return;
1629 }
1630
1631 if ($methylation_calls[$index] eq 'X') {
1632 $counting{total_meCHG_count}++;
1633 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1634 } elsif ($methylation_calls[$index] eq 'x') {
1635 $counting{total_unmethylated_CHG_count}++;
1636 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1637 } elsif ($methylation_calls[$index] eq 'Z') {
1638 $counting{total_meCpG_count}++;
1639 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1640 } elsif ($methylation_calls[$index] eq 'z') {
1641 $counting{total_unmethylated_CpG_count}++;
1642 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1643 } elsif ($methylation_calls[$index] eq 'H') {
1644 $counting{total_meCHH_count}++;
1645 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1646 } elsif ($methylation_calls[$index] eq 'h') {
1647 $counting{total_unmethylated_CHH_count}++;
1648 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1649 }
1650 elsif ($methylation_calls[$index] eq '.') {}
1651 else{
1652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1653 }
1654 }
1655 } elsif ($strand eq '-') {
1656 for my $index (0..$#methylation_calls) {
1657
1658 if ($cigar){ # only needed for SAM files
1659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1661 $cigar_offset += $cigar_mod;
1662 $pos_offset += $pos_mod;
1663 }
1664
1665 ### Returning as soon as the methylation calls start overlapping
1666 if ($start-$index+$pos_offset <= $end_read_1) {
1667 return;
1668 }
1669
1670 if ($methylation_calls[$index] eq 'X') {
1671 $counting{total_meCHG_count}++;
1672 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1673 } elsif ($methylation_calls[$index] eq 'x') {
1674 $counting{total_unmethylated_CHG_count}++;
1675 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1676 } elsif ($methylation_calls[$index] eq 'Z') {
1677 $counting{total_meCpG_count}++;
1678 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1679 } elsif ($methylation_calls[$index] eq 'z') {
1680 $counting{total_unmethylated_CpG_count}++;
1681 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1682 } elsif ($methylation_calls[$index] eq 'H') {
1683 $counting{total_meCHH_count}++;
1684 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1685 } elsif ($methylation_calls[$index] eq 'h') {
1686 $counting{total_unmethylated_CHH_count}++;
1687 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1688 }
1689 elsif ($methylation_calls[$index] eq '.') {}
1690 else{
1691 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1692 }
1693 }
1694 } else {
1695 die "The strand orientation as neither + nor -: '$strand'\n";
1696 }
1697 }
1698
1699 ### strand-specific methylation output
1700 else {
1701 if ($strand eq '+') {
1702 for my $index (0..$#methylation_calls) {
1703
1704 if ($cigar){ # only needed for SAM files
1705 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1706 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1707 $cigar_offset += $cigar_mod;
1708 $pos_offset += $pos_mod;
1709 }
1710
1711 ### Returning as soon as the methylation calls start overlapping
1712 if ($start+$index+$pos_offset >= $end_read_1) {
1713 return;
1714 }
1715
1716 if ($methylation_calls[$index] eq 'X') {
1717 $counting{total_meCHG_count}++;
1718 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1719 } elsif ($methylation_calls[$index] eq 'x') {
1720 $counting{total_unmethylated_CHG_count}++;
1721 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1722 } elsif ($methylation_calls[$index] eq 'Z') {
1723 $counting{total_meCpG_count}++;
1724 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1725 } elsif ($methylation_calls[$index] eq 'z') {
1726 $counting{total_unmethylated_CpG_count}++;
1727 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1728 } elsif ($methylation_calls[$index] eq 'H') {
1729 $counting{total_meCHH_count}++;
1730 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1731 } elsif ($methylation_calls[$index] eq 'h') {
1732 $counting{total_unmethylated_CHH_count}++;
1733 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1734 }
1735 elsif ($methylation_calls[$index] eq '.') {}
1736 else{
1737 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1738 }
1739 }
1740 } elsif ($strand eq '-') {
1741 for my $index (0..$#methylation_calls) {
1742
1743 if ($cigar){ # only needed for SAM files
1744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1745 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1746 $cigar_offset += $cigar_mod;
1747 $pos_offset += $pos_mod;
1748 }
1749
1750 ### Returning as soon as the methylation calls start overlapping
1751 if ($start-$index+$pos_offset <= $end_read_1) {
1752 return;
1753 }
1754
1755 if ($methylation_calls[$index] eq 'X') {
1756 $counting{total_meCHG_count}++;
1757 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1758 } elsif ($methylation_calls[$index] eq 'x') {
1759 $counting{total_unmethylated_CHG_count}++;
1760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1761 } elsif ($methylation_calls[$index] eq 'Z') {
1762 $counting{total_meCpG_count}++;
1763 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1764 } elsif ($methylation_calls[$index] eq 'z') {
1765 $counting{total_unmethylated_CpG_count}++;
1766 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1767 } elsif ($methylation_calls[$index] eq 'H') {
1768 $counting{total_meCHH_count}++;
1769 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1770 } elsif ($methylation_calls[$index] eq 'h') {
1771 $counting{total_unmethylated_CHH_count}++;
1772 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1773 }
1774 elsif ($methylation_calls[$index] eq '.') {}
1775 else{
1776 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1777 }
1778 }
1779 } else {
1780 die "The strand orientation as neither + nor -: '$strand'\n";
1781 }
1782 }
1783 }
1784
1785 ### this is the default paired-end procedure allowing overlaps and using every single C position
1786 else {
1787 ### single-file CpG, CHG and CHH context output
1788 if ($full) {
1789 if ($strand eq '+') {
1790 for my $index (0..$#methylation_calls) {
1791
1792 if ($cigar){ # only needed for SAM files
1793 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1794 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1795 $cigar_offset += $cigar_mod;
1796 $pos_offset += $pos_mod;
1797 }
1798
1799 if ($methylation_calls[$index] eq 'X') {
1800 $counting{total_meCHG_count}++;
1801 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1802 } elsif ($methylation_calls[$index] eq 'x') {
1803 $counting{total_unmethylated_CHG_count}++;
1804 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1805 } elsif ($methylation_calls[$index] eq 'Z') {
1806 $counting{total_meCpG_count}++;
1807 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1808 } elsif ($methylation_calls[$index] eq 'z') {
1809 $counting{total_unmethylated_CpG_count}++;
1810 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1811 } elsif ($methylation_calls[$index] eq 'H') {
1812 $counting{total_meCHH_count}++;
1813 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1814 } elsif ($methylation_calls[$index] eq 'h') {
1815 $counting{total_unmethylated_CHH_count}++;
1816 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1817 }
1818 elsif ($methylation_calls[$index] eq '.') {}
1819 else{
1820 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1821 }
1822 }
1823 } elsif ($strand eq '-') {
1824 for my $index (0..$#methylation_calls) {
1825
1826 if ($cigar){ # only needed for SAM files
1827 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1828 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1829 $cigar_offset += $cigar_mod;
1830 $pos_offset += $pos_mod;
1831 }
1832
1833 if ($methylation_calls[$index] eq 'X') {
1834 $counting{total_meCHG_count}++;
1835 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1836 } elsif ($methylation_calls[$index] eq 'x') {
1837 $counting{total_unmethylated_CHG_count}++;
1838 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1839 } elsif ($methylation_calls[$index] eq 'Z') {
1840 $counting{total_meCpG_count}++;
1841 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1842 } elsif ($methylation_calls[$index] eq 'z') {
1843 $counting{total_unmethylated_CpG_count}++;
1844 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1845 } elsif ($methylation_calls[$index] eq 'H') {
1846 $counting{total_meCHH_count}++;
1847 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1848 } elsif ($methylation_calls[$index] eq 'h') {
1849 $counting{total_unmethylated_CHH_count}++;
1850 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1851 }
1852 elsif ($methylation_calls[$index] eq '.') {}
1853 else{
1854 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1855 }
1856 }
1857 } else {
1858 die "The strand orientation as neither + nor -: '$strand'\n";
1859 }
1860 }
1861
1862 ### strand-specific methylation output
1863 else {
1864 if ($strand eq '+') {
1865 for my $index (0..$#methylation_calls) {
1866
1867 if ($cigar){ # only needed for SAM files
1868 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1869 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1870 $cigar_offset += $cigar_mod;
1871 $pos_offset += $pos_mod;
1872 }
1873
1874 if ($methylation_calls[$index] eq 'X') {
1875 $counting{total_meCHG_count}++;
1876 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1877 } elsif ($methylation_calls[$index] eq 'x') {
1878 $counting{total_unmethylated_CHG_count}++;
1879 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1880 } elsif ($methylation_calls[$index] eq 'Z') {
1881 $counting{total_meCpG_count}++;
1882 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1883 } elsif ($methylation_calls[$index] eq 'z') {
1884 $counting{total_unmethylated_CpG_count}++;
1885 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1886 } elsif ($methylation_calls[$index] eq 'H') {
1887 $counting{total_meCHH_count}++;
1888 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1889 } elsif ($methylation_calls[$index] eq 'h') {
1890 $counting{total_unmethylated_CHH_count}++;
1891 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1892 }
1893 elsif ($methylation_calls[$index] eq '.') {}
1894 else{
1895 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1896 }
1897 }
1898 } elsif ($strand eq '-') {
1899 for my $index (0..$#methylation_calls) {
1900
1901 if ($cigar){ # only needed for SAM files
1902 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1903 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1904 $cigar_offset += $cigar_mod;
1905 $pos_offset += $pos_mod;
1906 }
1907
1908 if ($methylation_calls[$index] eq 'X') {
1909 $counting{total_meCHG_count}++;
1910 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1911 } elsif ($methylation_calls[$index] eq 'x') {
1912 $counting{total_unmethylated_CHG_count}++;
1913 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1914 } elsif ($methylation_calls[$index] eq 'Z') {
1915 $counting{total_meCpG_count}++;
1916 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1917 } elsif ($methylation_calls[$index] eq 'z') {
1918 $counting{total_unmethylated_CpG_count}++;
1919 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1920 } elsif ($methylation_calls[$index] eq 'H') {
1921 $counting{total_meCHH_count}++;
1922 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1923 } elsif ($methylation_calls[$index] eq 'h') {
1924 $counting{total_unmethylated_CHH_count}++;
1925 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1926 }
1927 elsif ($methylation_calls[$index] eq '.') {}
1928 else{
1929 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1930 }
1931 }
1932 } else {
1933 die "The strand orientation as neither + nor -: '$strand'\n";
1934 }
1935 }
1936 }
1937 }
1938
1939 sub check_cigar_string {
1940 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
1941 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
1942 my ($new_cigar_offset,$new_pos_offset) = (0,0);
1943
1944 if ($strand eq '+') {
1945 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
1946
1947 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
1948 # warn "position needs no adjustment\n";
1949 }
1950
1951 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
1952 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
1953 # warn "adjusted genomic position by -1 bp (insertion)\n";
1954 }
1955
1956 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
1957 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
1958 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
1959 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
1960
1961 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
1962 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
1963 # warn "position needs no adjustment\n";
1964 last;
1965 }
1966 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
1967 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
1968 # warn "adjusted genomic position by another -1 bp (insertion)\n";
1969 last;
1970 }
1971 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
1972 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
1973 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
1974 # warn "adjusted genomic position by another +1 bp (deletion)\n";
1975 }
1976 else{
1977 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
1978 }
1979 }
1980 }
1981 else{
1982 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
1983 }
1984 }
1985
1986 elsif ($strand eq '-') {
1987 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
1988
1989 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
1990 # warn "position needs no adjustment\n";
1991 }
1992
1993 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
1994 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
1995 # warn "adjusted genomic position by +1 bp (insertion)\n";
1996 }
1997
1998 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
1999 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2000 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
2001 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
2002
2003 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
2004 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2005 # warn "Found new 'M' operation; position needs no adjustment\n";
2006 last;
2007 }
2008 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
2009 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
2010 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
2011 last;
2012 }
2013 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2014 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2015 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
2016 # warn "adjusted genomic position by another -1 bp (deletion)\n";
2017 }
2018 else{
2019 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2020 }
2021 }
2022 }
2023 else{
2024 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2025 }
2026 }
2027 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
2028 return ($new_cigar_offset,$new_pos_offset);
2029 }
2030
2031 sub print_individual_C_methylation_states_single_end{
2032
2033 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
2034 my @methylation_calls = split(//,$meth_call);
2035
2036 #################################################################
2037 ### . for bases not involving cytosines ###
2038 ### X for methylated C in CHG context (was protected) ###
2039 ### x for not methylated C in CHG context (was converted) ###
2040 ### H for methylated C in CHH context (was protected) ###
2041 ### h for not methylated C in CHH context (was converted) ###
2042 ### Z for methylated C in CpG context (was protected) ###
2043 ### z for not methylated C in CpG context (was converted) ###
2044 #################################################################
2045
2046 my $methyl_CHG_count = 0;
2047 my $methyl_CHH_count = 0;
2048 my $methyl_CpG_count = 0;
2049 my $unmethylated_CHG_count = 0;
2050 my $unmethylated_CHH_count = 0;
2051 my $unmethylated_CpG_count = 0;
2052
2053 my @len;
2054 my @ops;
2055 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2056 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2057
2058 my @comp_cigar;
2059
2060 if ($cigar){ # parsing CIGAR string
2061 @len = split (/\D+/,$cigar); # storing the length per operation
2062 @ops = split (/\d+/,$cigar); # storing the operation
2063 shift @ops; # remove the empty first element
2064 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2065
2066 foreach my $index (0..$#len){
2067 foreach (1..$len[$index]){
2068 # print "$ops[$index]";
2069 push @comp_cigar, $ops[$index];
2070 }
2071 }
2072 # warn "\nDetected CIGAR string: $cigar\n";
2073 # warn "Length of methylation call: ",length $meth_call,"\n";
2074 # warn "number of operations: ",scalar @ops,"\n";
2075 # warn "number of length digits: ",scalar @len,"\n\n";
2076 # print @comp_cigar,"\n";
2077 # print "$meth_call\n\n";
2078 # sleep (1);
2079 }
2080
2081 ### adjusting the start position for all reads mapping to the reverse strand
2082 if ($strand eq '-') {
2083
2084 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2085 # print @comp_cigar,"\n";
2086
2087 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
2088
2089 if ($cigar){ ### SAM format
2090 my $MD_count = 0;
2091 foreach (@comp_cigar){
2092 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2093 }
2094 $start += $MD_count - 1;
2095 }
2096 else{ ### vanilla format
2097 $start += length($meth_call)-1;
2098 }
2099 }
2100 }
2101
2102 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
2103
2104 ### single-file CpG and other-context output
2105 if ($full and $merge_non_CpG) {
2106 if ($strand eq '+') {
2107 for my $index (0..$#methylation_calls) {
2108
2109 if ($cigar){ # only needed for SAM files
2110 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2111 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2112 $cigar_offset += $cigar_mod;
2113 $pos_offset += $pos_mod;
2114 }
2115
2116 ### methylated Cs (any context) will receive a forward (+) orientation
2117 ### not methylated Cs (any context) will receive a reverse (-) orientation
2118 if ($methylation_calls[$index] eq 'X') {
2119 $counting{total_meCHG_count}++;
2120 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2121 }
2122 elsif ($methylation_calls[$index] eq 'x') {
2123 $counting{total_unmethylated_CHG_count}++;
2124 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2125 }
2126 elsif ($methylation_calls[$index] eq 'Z') {
2127 $counting{total_meCpG_count}++;
2128 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2129 }
2130 elsif ($methylation_calls[$index] eq 'z') {
2131 $counting{total_unmethylated_CpG_count}++;
2132 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2133 }
2134 elsif ($methylation_calls[$index] eq 'H') {
2135 $counting{total_meCHH_count}++;
2136 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2137 }
2138 elsif ($methylation_calls[$index] eq 'h') {
2139 $counting{total_unmethylated_CHH_count}++;
2140 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2141 }
2142 elsif ($methylation_calls[$index] eq '.') {
2143 }
2144 else{
2145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2146 }
2147 }
2148 }
2149 elsif ($strand eq '-') {
2150
2151 for my $index (0..$#methylation_calls) {
2152 ### methylated Cs (any context) will receive a forward (+) orientation
2153 ### not methylated Cs (any context) will receive a reverse (-) orientation
2154
2155 if ($cigar){ # only needed for SAM files
2156 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2157 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2158 $cigar_offset += $cigar_mod;
2159 $pos_offset += $pos_mod;
2160 }
2161
2162 if ($methylation_calls[$index] eq 'X') {
2163 $counting{total_meCHG_count}++;
2164 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2165 }
2166 elsif ($methylation_calls[$index] eq 'x') {
2167 $counting{total_unmethylated_CHG_count}++;
2168 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2169 }
2170 elsif ($methylation_calls[$index] eq 'Z') {
2171 $counting{total_meCpG_count}++;
2172 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2173 }
2174 elsif ($methylation_calls[$index] eq 'z') {
2175 $counting{total_unmethylated_CpG_count}++;
2176 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2177 }
2178 elsif ($methylation_calls[$index] eq 'H') {
2179 $counting{total_meCHH_count}++;
2180 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2181 }
2182 elsif ($methylation_calls[$index] eq 'h') {
2183 $counting{total_unmethylated_CHH_count}++;
2184 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2185 }
2186 elsif ($methylation_calls[$index] eq '.'){
2187 }
2188 else{
2189 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2190 }
2191 }
2192 }
2193 else {
2194 die "The strand information was neither + nor -: $strand\n";
2195 }
2196 }
2197
2198 ### strand-specific methylation output
2199 elsif ($merge_non_CpG) {
2200 if ($strand eq '+') {
2201 for my $index (0..$#methylation_calls) {
2202 ### methylated Cs (any context) will receive a forward (+) orientation
2203 ### not methylated Cs (any context) will receive a reverse (-) orientation
2204
2205 if ($cigar){ # only needed for SAM files
2206 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2207 $cigar_offset += $cigar_mod;
2208 $pos_offset += $pos_mod;
2209 }
2210
2211 if ($methylation_calls[$index] eq 'X') {
2212 $counting{total_meCHG_count}++;
2213 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2214 }
2215 elsif ($methylation_calls[$index] eq 'x') {
2216 $counting{total_unmethylated_CHG_count}++;
2217 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2218 }
2219 elsif ($methylation_calls[$index] eq 'Z') {
2220 $counting{total_meCpG_count}++;
2221 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2222 }
2223 elsif ($methylation_calls[$index] eq 'z') {
2224 $counting{total_unmethylated_CpG_count}++;
2225 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2226 }
2227 elsif ($methylation_calls[$index] eq 'H') {
2228 $counting{total_meCHH_count}++;
2229 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2230 }
2231 elsif ($methylation_calls[$index] eq 'h') {
2232 $counting{total_unmethylated_CHH_count}++;
2233 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2234 }
2235 elsif ($methylation_calls[$index] eq '.') {
2236 }
2237 else{
2238 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2239 }
2240 }
2241 }
2242 elsif ($strand eq '-') {
2243
2244 for my $index (0..$#methylation_calls) {
2245 ### methylated Cs (any context) will receive a forward (+) orientation
2246 ### not methylated Cs (any context) will receive a reverse (-) orientation
2247
2248 if ($cigar){ # only needed for SAM files
2249 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2250 $cigar_offset += $cigar_mod;
2251 $pos_offset += $pos_mod;
2252 }
2253
2254 if ($methylation_calls[$index] eq 'X') {
2255 $counting{total_meCHG_count}++;
2256 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2257 }
2258 elsif ($methylation_calls[$index] eq 'x') {
2259 $counting{total_unmethylated_CHG_count}++;
2260 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2261 }
2262 elsif ($methylation_calls[$index] eq 'Z') {
2263 $counting{total_meCpG_count}++;
2264 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2265 }
2266 elsif ($methylation_calls[$index] eq 'z') {
2267 $counting{total_unmethylated_CpG_count}++;
2268 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2269 }
2270 elsif ($methylation_calls[$index] eq 'H') {
2271 $counting{total_meCHH_count}++;
2272 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2273 }
2274 elsif ($methylation_calls[$index] eq 'h') {
2275 $counting{total_unmethylated_CHH_count}++;
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2277 }
2278 elsif ($methylation_calls[$index] eq '.') {
2279 }
2280 else{
2281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2282 }
2283 }
2284 }
2285 else {
2286 die "The strand information was neither + nor -: $strand\n";
2287 }
2288 }
2289
2290 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
2291
2292 elsif ($full) {
2293 if ($strand eq '+') {
2294 for my $index (0..$#methylation_calls) {
2295 ### methylated Cs (any context) will receive a forward (+) orientation
2296 ### not methylated Cs (any context) will receive a reverse (-) orientation
2297
2298 if ($cigar){ # only needed for SAM files
2299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2300 $cigar_offset += $cigar_mod;
2301 $pos_offset += $pos_mod;
2302 }
2303
2304 if ($methylation_calls[$index] eq 'X') {
2305 $counting{total_meCHG_count}++;
2306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2307 } elsif ($methylation_calls[$index] eq 'x') {
2308 $counting{total_unmethylated_CHG_count}++;
2309 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2310 } elsif ($methylation_calls[$index] eq 'Z') {
2311 $counting{total_meCpG_count}++;
2312 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2313 } elsif ($methylation_calls[$index] eq 'z') {
2314 $counting{total_unmethylated_CpG_count}++;
2315 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2316 } elsif ($methylation_calls[$index] eq 'H') {
2317 $counting{total_meCHH_count}++;
2318 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2319 } elsif ($methylation_calls[$index] eq 'h') {
2320 $counting{total_unmethylated_CHH_count}++;
2321 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2322 }
2323 elsif ($methylation_calls[$index] eq '.') {}
2324 else{
2325 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2326 }
2327 }
2328 }
2329 elsif ($strand eq '-') {
2330
2331 for my $index (0..$#methylation_calls) {
2332 ### methylated Cs (any context) will receive a forward (+) orientation
2333 ### not methylated Cs (any context) will receive a reverse (-) orientation
2334
2335 if ($cigar){ # only needed for SAM files
2336 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2337 $cigar_offset += $cigar_mod;
2338 $pos_offset += $pos_mod;
2339 }
2340
2341 if ($methylation_calls[$index] eq 'X') {
2342 $counting{total_meCHG_count}++;
2343 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2344 } elsif ($methylation_calls[$index] eq 'x') {
2345 $counting{total_unmethylated_CHG_count}++;
2346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2347 } elsif ($methylation_calls[$index] eq 'Z') {
2348 $counting{total_meCpG_count}++;
2349 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2350 } elsif ($methylation_calls[$index] eq 'z') {
2351 $counting{total_unmethylated_CpG_count}++;
2352 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2353 } elsif ($methylation_calls[$index] eq 'H') {
2354 $counting{total_meCHH_count}++;
2355 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2356 } elsif ($methylation_calls[$index] eq 'h') {
2357 $counting{total_unmethylated_CHH_count}++;
2358 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2359 }
2360 elsif ($methylation_calls[$index] eq '.') {}
2361 else{
2362 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2363 }
2364 }
2365 }
2366 else {
2367 die "The read had a strand orientation which was neither + nor -: $strand\n";
2368 }
2369 }
2370
2371 ### strand-specific methylation output
2372 else {
2373 if ($strand eq '+') {
2374 for my $index (0..$#methylation_calls) {
2375 ### methylated Cs (any context) will receive a forward (+) orientation
2376 ### not methylated Cs (any context) will receive a reverse (-) orientation
2377
2378 if ($cigar){ # only needed for SAM files
2379 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2380 $cigar_offset += $cigar_mod;
2381 $pos_offset += $pos_mod;
2382 }
2383
2384 if ($methylation_calls[$index] eq 'X') {
2385 $counting{total_meCHG_count}++;
2386 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2387 } elsif ($methylation_calls[$index] eq 'x') {
2388 $counting{total_unmethylated_CHG_count}++;
2389 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2390 } elsif ($methylation_calls[$index] eq 'Z') {
2391 $counting{total_meCpG_count}++;
2392 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2393 } elsif ($methylation_calls[$index] eq 'z') {
2394 $counting{total_unmethylated_CpG_count}++;
2395 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2396 } elsif ($methylation_calls[$index] eq 'H') {
2397 $counting{total_meCHH_count}++;
2398 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2399 } elsif ($methylation_calls[$index] eq 'h') {
2400 $counting{total_unmethylated_CHH_count}++;
2401 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2402 }
2403 elsif ($methylation_calls[$index] eq '.') {}
2404 else{
2405 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2406 }
2407 }
2408 }
2409 elsif ($strand eq '-') {
2410
2411 for my $index (0..$#methylation_calls) {
2412 ### methylated Cs (any context) will receive a forward (+) orientation
2413 ### not methylated Cs (any context) will receive a reverse (-) orientation
2414
2415 if ($cigar){ # only needed for SAM files
2416 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2417 $cigar_offset += $cigar_mod;
2418 $pos_offset += $pos_mod;
2419 }
2420
2421 if ($methylation_calls[$index] eq 'X') {
2422 $counting{total_meCHG_count}++;
2423 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2424 } elsif ($methylation_calls[$index] eq 'x') {
2425 $counting{total_unmethylated_CHG_count}++;
2426 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2427 } elsif ($methylation_calls[$index] eq 'Z') {
2428 $counting{total_meCpG_count}++;
2429 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2430 } elsif ($methylation_calls[$index] eq 'z') {
2431 $counting{total_unmethylated_CpG_count}++;
2432 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2433 } elsif ($methylation_calls[$index] eq 'H') {
2434 $counting{total_meCHH_count}++;
2435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2436 } elsif ($methylation_calls[$index] eq 'h') {
2437 $counting{total_unmethylated_CHH_count}++;
2438 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2439 }
2440 elsif ($methylation_calls[$index] eq '.') {}
2441 else{
2442 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2443 }
2444 }
2445 }
2446 else {
2447 die "The strand information was neither + nor -: $strand\n";
2448 }
2449 }
2450 }
2451
2452 sub print_helpfile{
2453
2454 print << 'HOW_TO';
2455
2456
2457 DESCRIPTION
2458
2459 The following is a brief description of all options to control the Bismark
2460 methylation extractor. The script reads in a bisulfite read alignment results file
2461 produced by the Bismark bisulfite mapper and extracts the methylation information
2462 for individual cytosines. This information is found in the methylation call field
2463 which can contain the following characters:
2464
2465 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2466 ~~~ X for methylated C in CHG context (was protected) ~~~
2467 ~~~ x for not methylated C CHG (was converted) ~~~
2468 ~~~ H for methylated C in CHH context (was protected) ~~~
2469 ~~~ h for not methylated C in CHH context (was converted) ~~~
2470 ~~~ Z for methylated C in CpG context (was protected) ~~~
2471 ~~~ z for not methylated C in CpG context (was converted) ~~~
2472 ~~~ . for any bases not involving cytosines ~~~
2473 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2474
2475 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
2476 context (this distinction is actually already made in Bismark itself). As the methylation
2477 information for every C analysed can produce files which easily have tens or even hundreds of
2478 millions of lines, file sizes can become very large and more difficult to handle. The C
2479 methylation info additionally splits cytosine methylation calls up into one of the four possible
2480 strands a given bisulfite read aligned against:
2481
2482 OT original top strand
2483 CTOT complementary to original top strand
2484
2485 OB original bottom strand
2486 CTOB complementary to original bottom strand
2487
2488 Thus, by default twelve individual output files are being generated per input file (unless
2489 --comprehensive is specified, see below). The output files can be imported into a genome
2490 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
2491 unless the bisulfite reads were generated preserving directionality it doesn't make any
2492 sense to look at the data in a strand-specific manner). Strand-specific output files can
2493 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
2494 will be generated. For both the strand-specific and comprehensive outputs there is also
2495 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
2496
2497
2498 The output files are in the following format (tab delimited):
2499
2500 <sequence_id> <strand> <chromosome> <position> <methylation call>
2501
2502
2503 USAGE: methylation_extractor [options] <filenames>
2504
2505
2506 ARGUMENTS:
2507
2508 <filenames> A space-separated list of Bismark result files in SAM format from
2509 which methylation information is extracted for every cytosine in
2510 the reads. For alignment files in the older custom Bismark output
2511 see option '--vanilla'.
2512
2513 OPTIONS:
2514
2515 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
2516 read data. Specifying either --single-end or --paired-end is
2517 mandatory.
2518
2519 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
2520 read data. Specifying either --paired-end or --single-end is
2521 mandatory.
2522
2523 --vanilla The Bismark result input file(s) are in the old custom Bismark format
2524 (up to version 0.5.x) and not in SAM format which is the default as
2525 of Bismark version 0.6.x or higher. Default: OFF.
2526
2527 --no_overlap For paired-end reads it is theoretically possible that read_1 and
2528 read_2 overlap. This option avoids scoring overlapping methylation
2529 calls twice. Whilst this removes a bias towards more methylation calls
2530 towards the center of sequenced fragments it can de facto remove
2531 a good proportion of the data.
2532
2533 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the
2534 methylation call string. This can remove e.g. a restriction enzyme site
2535 at the start of each read.
2536
2537 --comprehensive Specifying this option will merge all four possible strand-specific
2538 methylation info into context-dependent output files. The default
2539 contexts are:
2540 - CpG context
2541 - CHG context
2542 - CHH context
2543
2544 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
2545 strand-specific output files (default) for Cs in
2546 - CpG context
2547 - non-CpG context
2548
2549 --report Prints out a short methylation summary as well as the paramaters used to run
2550 this script.
2551
2552 --version Displays version information.
2553
2554 -h/--help Displays this help file and exits.
2555
2556
2557 OUTPUT:
2558
2559 The output is in the form:
2560
2561 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
2562
2563 * Methylated cytosines receive a '+' orientation,
2564 * Unmethylated cytosines receive a '-' orientation.
2565
2566
2567 This script was last modified on 31 July 2012.
2568
2569 HOW_TO
2570 }