comparison bismark_methylation_extractor @ 0:36d124f44c0a draft

inital commit
author bjoern-gruening
date Tue, 25 Dec 2012 05:45:46 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:36d124f44c0a
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7 use Carp;
8
9 my @filenames; # input files
10 my %counting;
11 my $parent_dir = getcwd();
12
13 my %fhs;
14
15 my $version = 'v0.7.7';
16 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome) = process_commandline();
17
18
19 ### only needed for bedGraph output
20 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
21 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
22 my @bedfiles;
23
24 ### only needed for genome-wide cytosine methylation report
25 my %chromosomes;
26
27 ##############################################################################################
28 ### Summarising Run Parameters
29 ##############################################################################################
30
31 ### METHYLATION EXTRACTOR
32
33 warn "Summarising Bismark methylation extractor parameters:\n";
34 warn '='x63,"\n";
35
36 if ($single){
37 if ($vanilla){
38 warn "Bismark single-end vanilla format specified\n";
39 }
40 else{
41 warn "Bismark single-end SAM format specified (default)\n"; # default
42 }
43 }
44 elsif ($paired){
45 if ($vanilla){
46 warn "Bismark paired-end vanilla format specified\n";
47 }
48 else{
49 warn "Bismark paired-end SAM format specified (default)\n"; # default
50 }
51 }
52
53 if ($ignore){
54 warn "First $ignore bases will be disregarded when processing the methylation call string\n";
55 }
56
57 if ($full){
58 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
59 }
60 if ($merge_non_CpG){
61 warn "Merge CHG and CHH context to non-CpG context specified\n";
62 }
63 ### output directory
64 if ($output_dir eq ''){
65 warn "Output will be written to the current directory ('$parent_dir')\n";
66 }
67 else{
68 warn "Output path specified as: $output_dir\n";
69 }
70
71
72 sleep (1);
73
74 ### BEDGRAPH
75
76 if ($bedGraph){
77 warn "\n\nSummarising bedGraph parameters:\n";
78 warn '='x63,"\n";
79
80 if ($counts){
81 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n";
82 }
83 else{
84 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
85 }
86
87 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
88
89 if ($CX_context){
90 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
91 }
92 else{ # default
93 $CpG_only = 1;
94 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
95 }
96
97 if ($remove){
98 warn "White spaces in read ID names will be removed prior to sorting\n";
99 }
100
101 sleep (1);
102
103 if ($cytosine_report){
104 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
105 warn '='x63,"\n";
106 warn "Generating comprehensive genome-wide cytosine report (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> )\n";
107
108
109 if ($CX_context){
110 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
111 }
112 else{ # default
113 $CpG_only = 1;
114 warn "Reporting cytosine methylation in CpG context only (default)\n";
115 }
116
117 if ($split_by_chromosome){
118 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
119 }
120
121 ### Zero-based coordinates
122 if ($zero){
123 warn "Using zero-based genomic coordinates (user-defined)\n";
124 }
125 else{ # default, 1-based coords
126 warn "Using 1-based genomic coordinates (default)\n";
127 }
128
129 ### GENOME folder
130 if ($genome_folder){
131 unless ($genome_folder =~/\/$/){
132 $genome_folder =~ s/$/\//;
133 }
134 warn "Genome folder was specified as $genome_folder\n";
135 }
136 else{
137 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
138 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
139 }
140 sleep (1);
141 }
142 }
143
144 warn "\n";
145 sleep (5);
146
147 ######################################################
148 ### PROCESSING FILES
149 ######################################################
150
151 foreach my $filename (@filenames){
152 # resetting counters and filehandles
153 %fhs = ();
154 %counting =(
155 total_meCHG_count => 0,
156 total_meCHH_count => 0,
157 total_meCpG_count => 0,
158 total_unmethylated_CHG_count => 0,
159 total_unmethylated_CHH_count => 0,
160 total_unmethylated_CpG_count => 0,
161 sequences_count => 0,
162 );
163 @sorting_files = ();
164 @bedfiles = ();
165
166 process_Bismark_results_file($filename);
167
168 if ($bedGraph){
169 my $out = $filename;
170 $out =~ s/sam$//;
171 $out =~ s/txt$//;
172 $out =~ s/$/bedGraph/;
173
174 my $bedGraph_output = $out;
175 open (OUT,'>',$output_dir.$out) or die $!;
176 # warn "Writing bedGraph to file: $out\n";
177
178 process_bedGraph_output();
179 close OUT or die $!;
180
181 ### genome-wide cytosine methylation report requires bedGraph processing anyway
182 if ($cytosine_report){
183 my $cytosine_out = $out;
184 $cytosine_out =~ s/bedGraph$//;
185
186 read_genome_into_memory();
187 generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
188 }
189 }
190 }
191
192
193 sub process_commandline{
194 my $help;
195 my $single_end;
196 my $paired_end;
197 my $ignore;
198 my $genomic_fasta;
199 my $full;
200 my $report;
201 my $extractor_version;
202 my $no_overlap;
203 my $merge_non_CpG;
204 my $vanilla;
205 my $output_dir;
206 my $no_header;
207 my $bedGraph;
208 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
209 my $remove;
210 my $counts;
211 my $cytosine_report;
212 my $genome_folder;
213 my $zero;
214 my $CpG_only;
215 my $CX_context;
216 my $split_by_chromosome;
217
218
219 my $command_line = GetOptions ('help|man' => \$help,
220 'p|paired-end' => \$paired_end,
221 's|single-end' => \$single_end,
222 'fasta' => \$genomic_fasta,
223 'ignore=i' => \$ignore,
224 'comprehensive' => \$full,
225 'report' => \$report,
226 'version' => \$extractor_version,
227 'no_overlap' => \$no_overlap,
228 'merge_non_CpG' => \$merge_non_CpG,
229 'vanilla' => \$vanilla,
230 'o|output=s' => \$output_dir,
231 'no_header' => \$no_header,
232 'bedGraph' => \$bedGraph,
233 "cutoff=i" => \$coverage_threshold,
234 "remove_spaces" => \$remove,
235 "counts" => \$counts,
236 "cytosine_report" => \$cytosine_report,
237 'g|genome_folder=s' => \$genome_folder,
238 "zero_based" => \$zero,
239 "CX|CX_context" => \$CX_context,
240 "split_by_chromosome" => \$split_by_chromosome,
241 );
242
243 ### EXIT ON ERROR if there were errors with any of the supplied options
244 unless ($command_line){
245 die "Please respecify command line options\n";
246 }
247
248 ### HELPFILE
249 if ($help){
250 print_helpfile();
251 exit;
252 }
253
254 if ($extractor_version){
255 print << "VERSION";
256
257
258 Bismark Methylation Extractor
259
260 Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
261 www.bioinformatics.babraham.ac.uk/projects/bismark/
262
263
264 VERSION
265 exit;
266 }
267
268
269 ### no files provided
270 unless (@ARGV){
271 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
272 }
273 @filenames = @ARGV;
274
275 warn "\n *** Bismark methylation extractor version $version ***\n\n";
276
277 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
278 unless ($ignore){
279 $ignore = 0;
280 }
281 ### PRINT A REPORT
282 unless ($report){
283 $report = 0;
284 }
285
286 ### OUTPUT DIR PATH
287 if ($output_dir){
288 unless ($output_dir =~ /\/$/){
289 $output_dir =~ s/$/\//;
290 }
291 }
292 else{
293 $output_dir = '';
294 }
295
296 ### NO HEADER
297 unless ($no_header){
298 $no_header = 0;
299 }
300
301 ### OLD (VANILLA) OUTPUT FORMAT
302 unless ($vanilla){
303 $vanilla = 0;
304 }
305
306 if ($single_end){
307 $paired_end = 0; ### SINGLE END ALIGNMENTS
308 }
309 elsif ($paired_end){
310 $single_end = 0; ### PAIRED-END ALIGNMENTS
311 }
312 else{
313 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
314 }
315
316 ### NO OVERLAP
317 if ($no_overlap){
318 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
319 }
320 else{
321 $no_overlap = 0;
322 }
323
324 ### COMPREHENSIVE OUTPUT
325 unless ($full){
326 $full = 0;
327 }
328
329 ### MERGE NON-CpG context
330 unless ($merge_non_CpG){
331 $merge_non_CpG = 0;
332 }
333
334 ### remove white spaces in read ID (needed for sorting using the sort command
335 unless ($remove){
336 $remove = 0;
337 }
338
339 ### COVERAGE THRESHOLD FOR gedGraph OUTPUT
340 unless (defined $coverage_threshold){
341 unless ($coverage_threshold > 0){
342 die "Please select a coverage greater than 0 (positive integers only)\n";
343 }
344 $coverage_threshold = 1;
345 }
346
347 if ($zero){
348 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
349 }
350
351 if ($CX_context){
352 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
353 }
354
355 if ($cytosine_report){
356
357 ### GENOME folder
358 if ($genome_folder){
359 unless ($genome_folder =~/\/$/){
360 $genome_folder =~ s/$/\//;
361 }
362 }
363 else{
364 die "Please specify a genome folder to proceed (full path only)\n";
365 }
366
367 unless ($bedGraph){
368 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
369 $bedGraph = 1;
370 }
371 unless ($counts){
372 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
373 $counts = 1;
374 }
375 warn "\n";
376 }
377
378 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome);
379 }
380
381
382 sub process_Bismark_results_file{
383 my $filename = shift;
384
385 warn "\nNow reading in Bismark result file $filename\n\n";
386
387 if ($filename =~ /\.gz$/) {
388 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
389 } else {
390 open (IN,$filename) or die "Can't open file $filename: $!\n";
391 }
392
393 ### Vanilla and SAM output need to read different numbers of header lines
394 if ($vanilla) {
395 my $bismark_version = <IN>; ## discarding the Bismark version info
396 chomp $bismark_version;
397 $bismark_version =~ s/\r//; # replaces \r line feed
398 $bismark_version =~ s/Bismark version: //;
399 if ($bismark_version =~ /^\@/) {
400 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
401 sleep (2);
402 }
403
404 unless ($version eq $bismark_version){
405 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
406 }
407 } else {
408 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
409 # We are reading from it further down
410 }
411
412 my $output_filename = (split (/\//,$filename))[-1];
413
414 ### OPENING OUTPUT-FILEHANDLES
415 if ($report) {
416 my $report_filename = $output_filename;
417 $report_filename =~ s/[\.sam|\.txt]$//;
418 $report_filename =~ s/$/_splitting_report.txt/;
419 $report_filename = $output_dir . $report_filename;
420 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
421 }
422
423 if ($report) {
424 print REPORT "$output_filename\n\n";
425 print REPORT "Parameters used to extract methylation information:\n";
426 if ($paired) {
427 if ($vanilla) {
428 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
429 } else {
430 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
431 }
432 }
433
434 if ($single) {
435 if ($vanilla) {
436 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
437 } else {
438 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
439 }
440 }
441
442 if ($ignore) {
443 print REPORT "Ignoring first $ignore bases\n";
444 }
445
446 if ($full) {
447 print REPORT "Output specified: comprehensive\n";
448 } else {
449 print REPORT "Output specified: strand-specific (default)\n";
450 }
451
452 if ($no_overlap) {
453 print REPORT "No overlapping methylation calls specified\n";
454 }
455 if ($genomic_fasta) {
456 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
457 }
458 if ($merge_non_CpG) {
459 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
460 }
461
462 print REPORT "\n";
463 }
464
465 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
466 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
467 if ($full and $merge_non_CpG) {
468 my $cpg_output = my $other_c_output = $output_filename;
469 ### C in CpG context
470 $cpg_output =~ s/^/CpG_context_/;
471 $cpg_output =~ s/sam$/txt/;
472 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
473 $cpg_output = $output_dir . $cpg_output;
474 push @sorting_files,$cpg_output;
475 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
476 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
477
478 unless ($no_header) {
479 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
480 }
481
482 ### C in any other context than CpG
483 $other_c_output =~ s/^/Non_CpG_context_/;
484 $other_c_output =~ s/sam$/txt/;
485 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
486 $other_c_output = $output_dir . $other_c_output;
487 push @sorting_files,$other_c_output;
488 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n";
489 print "Writing result file containing methylation information for C in any other context to $other_c_output\n";
490
491 unless ($no_header) {
492 print {$fhs{other_context}} "Bismark methylation extractor version $version\n";
493 }
494 }
495
496 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
497 elsif ($merge_non_CpG) {
498
499 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
500
501 ### For cytosines in CpG context
502 $cpg_ot =~ s/^/CpG_OT_/;
503 $cpg_ot =~ s/sam$/txt/;
504 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
505 $cpg_ot = $output_dir . $cpg_ot;
506 push @sorting_files,$cpg_ot;
507 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
508 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
509
510 unless($no_header){
511 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
512 }
513
514 $cpg_ctot =~ s/^/CpG_CTOT_/;
515 $cpg_ctot =~ s/sam$/txt/;
516 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
517 $cpg_ctot = $output_dir . $cpg_ctot;
518 push @sorting_files,$cpg_ctot;
519 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
520 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
521
522 unless($no_header){
523 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
524 }
525
526 $cpg_ctob =~ s/^/CpG_CTOB_/;
527 $cpg_ctob =~ s/sam$/txt/;
528 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
529 $cpg_ctob = $output_dir . $cpg_ctob;
530 push @sorting_files,$cpg_ctob;
531 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
532 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
533
534 unless($no_header){
535 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
536 }
537
538 $cpg_ob =~ s/^/CpG_OB_/;
539 $cpg_ob =~ s/sam$/txt/;
540 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
541 $cpg_ob = $output_dir . $cpg_ob;
542 push @sorting_files,$cpg_ob;
543 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
544 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
545
546 unless($no_header){
547 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
548 }
549
550 ### For cytosines in Non-CpG (CC, CT or CA) context
551 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
552
553 $other_c_ot =~ s/^/Non_CpG_OT_/;
554 $other_c_ot =~ s/sam$/txt/;
555 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
556 $other_c_ot = $output_dir . $other_c_ot;
557 push @sorting_files,$other_c_ot;
558 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n";
559 print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n";
560
561 unless($no_header){
562 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n";
563 }
564
565 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
566 $other_c_ctot =~ s/sam$/txt/;
567 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
568 $other_c_ctot = $output_dir . $other_c_ctot;
569 push @sorting_files,$other_c_ctot;
570 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n";
571 print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n";
572
573 unless($no_header){
574 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n";
575 }
576
577 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
578 $other_c_ctob =~ s/sam$/txt/;
579 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
580 $other_c_ctob = $output_dir . $other_c_ctob;
581 push @sorting_files,$other_c_ctob;
582 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n";
583 print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n";
584
585 unless($no_header){
586 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n";
587 }
588
589 $other_c_ob =~ s/^/Non_CpG_OB_/;
590 $other_c_ob =~ s/sam$/txt/;
591 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
592 $other_c_ob = $output_dir . $other_c_ob;
593 push @sorting_files,$other_c_ob;
594 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n";
595 print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n";
596
597 unless($no_header){
598 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n";
599 }
600 }
601 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
602
603 ### if --comprehensive was specified we are only writing one file per context
604 elsif ($full) {
605 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
606 ### C in CpG context
607 $cpg_output =~ s/^/CpG_context_/;
608 $cpg_output =~ s/sam$/txt/;
609 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
610 $cpg_output = $output_dir . $cpg_output;
611 push @sorting_files,$cpg_output;
612 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
613 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
614
615 unless($no_header){
616 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
617 }
618
619 ### C in CHG context
620 $chg_output =~ s/^/CHG_context_/;
621 $chg_output =~ s/sam$/txt/;
622 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
623 $chg_output = $output_dir . $chg_output;
624 push @sorting_files,$chg_output;
625 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n";
626 print "Writing result file containing methylation information for C in CHG context to $chg_output\n";
627
628 unless($no_header){
629 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n";
630 }
631
632 ### C in CHH context
633 $chh_output =~ s/^/CHH_context_/;
634 $chh_output =~ s/sam$/txt/;
635 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
636 $chh_output = $output_dir . $chh_output;
637 push @sorting_files, $chh_output;
638 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n";
639 print "Writing result file containing methylation information for C in CHH context to $chh_output\n";
640
641 unless($no_header){
642 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n";
643 }
644 }
645 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
646 else {
647 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
648
649 ### For cytosines in CpG context
650 $cpg_ot =~ s/^/CpG_OT_/;
651 $cpg_ot =~ s/sam$/txt/;
652 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
653 $cpg_ot = $output_dir . $cpg_ot;
654 push @sorting_files,$cpg_ot;
655 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
656 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
657
658 unless($no_header){
659 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
660 }
661
662 $cpg_ctot =~ s/^/CpG_CTOT_/;
663 $cpg_ctot =~ s/sam$/txt/;
664 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
665 $cpg_ctot = $output_dir . $cpg_ctot;
666 push @sorting_files,$cpg_ctot;
667 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
668 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
669
670 unless($no_header){
671 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
672 }
673
674 $cpg_ctob =~ s/^/CpG_CTOB_/;
675 $cpg_ctob =~ s/sam$/txt/;
676 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
677 $cpg_ctob = $output_dir . $cpg_ctob;
678 push @sorting_files,$cpg_ctob;
679 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
680 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
681
682 unless($no_header){
683 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
684 }
685
686 $cpg_ob =~ s/^/CpG_OB_/;
687 $cpg_ob =~ s/sam$/txt/;
688 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
689 $cpg_ob = $output_dir . $cpg_ob;
690 push @sorting_files,$cpg_ob;
691 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
692 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
693
694 unless($no_header){
695 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
696 }
697
698 ### For cytosines in CHG context
699 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
700
701 $chg_ot =~ s/^/CHG_OT_/;
702 $chg_ot =~ s/sam$/txt/;
703 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
704 $chg_ot = $output_dir . $chg_ot;
705 push @sorting_files,$chg_ot;
706 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n";
707 print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n";
708
709 unless($no_header){
710 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n";
711 }
712
713 $chg_ctot =~ s/^/CHG_CTOT_/;
714 $chg_ctot =~ s/sam$/txt/;
715 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
716 $chg_ctot = $output_dir . $chg_ctot;
717 push @sorting_files,$chg_ctot;
718 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n";
719 print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n";
720
721 unless($no_header){
722 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n";
723 }
724
725 $chg_ctob =~ s/^/CHG_CTOB_/;
726 $chg_ctob =~ s/sam$/txt/;
727 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
728 $chg_ctob = $output_dir . $chg_ctob;
729 push @sorting_files,$chg_ctob;
730 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n";
731 print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n";
732
733 unless($no_header){
734 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n";
735 }
736
737 $chg_ob =~ s/^/CHG_OB_/;
738 $chg_ob =~ s/sam$/txt/;
739 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
740 $chg_ob = $output_dir . $chg_ob;
741 push @sorting_files,$chg_ob;
742 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n";
743 print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n";
744
745 unless($no_header){
746 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n";
747 }
748
749 ### For cytosines in CHH context
750 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
751
752 $chh_ot =~ s/^/CHH_OT_/;
753 $chh_ot =~ s/sam$/txt/;
754 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
755 $chh_ot = $output_dir . $chh_ot;
756 push @sorting_files,$chh_ot;
757 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n";
758 print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n";
759
760 unless($no_header){
761 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n";
762 }
763
764 $chh_ctot =~ s/^/CHH_CTOT_/;
765 $chh_ctot =~ s/sam$/txt/;
766 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
767 $chh_ctot = $output_dir . $chh_ctot;
768 push @sorting_files,$chh_ctot;
769 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n";
770 print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n";
771
772 unless($no_header){
773 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n";
774 }
775
776 $chh_ctob =~ s/^/CHH_CTOB_/;
777 $chh_ctob =~ s/sam$/txt/;
778 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
779 $chh_ctob = $output_dir . $chh_ctob;
780 push @sorting_files,$chh_ctob;
781 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n";
782 print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n";
783
784 unless($no_header){
785 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n";
786 }
787
788 $chh_ob =~ s/^/CHH_OB_/;
789 $chh_ob =~ s/sam$/txt/;
790 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
791 $chh_ob = $output_dir . $chh_ob;
792 push @sorting_files,$chh_ob;
793 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n";
794 print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n";
795
796 unless($no_header){
797 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n";
798 }
799 }
800
801 my $methylation_call_strings_processed = 0;
802 my $line_count = 0;
803
804 ### proceeding differently now for single-end or paired-end Bismark files
805
806 ### PROCESSING SINGLE-END RESULT FILES
807 if ($single) {
808
809 ### also proceeding differently now for SAM format or vanilla Bismark format files
810 if ($vanilla) { # old vanilla Bismark output format
811 while (<IN>) {
812 ++$line_count;
813 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
814
815 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
816 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
817
818 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
819 ### last position
820 chomp $genome_conversion;
821
822 my $index;
823 if ($meth_call) {
824
825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
826 $index = 0;
827 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
828 $index = 1;
829 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
830 $index = 3;
831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
832 $index = 2;
833 } else {
834 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
835 }
836
837 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
838 if ($ignore) {
839 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
840
841 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
842 if ($strand eq '+') {
843 $start += $ignore;
844 } elsif ($strand eq '-') {
845 $start += length($meth_call)-1; ## $meth_call is already shortened!
846 } else {
847 die "Alignment did not have proper strand information: $strand\n";
848 }
849 }
850 ### printing out the methylation state of every C in the read
851 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
852
853 ++$methylation_call_strings_processed; # 1 per single-end result
854 }
855 }
856 } else { # processing single-end SAM format (default)
857 while (<IN>) {
858 ### SAM format can either start with header lines (starting with @) or start with alignments directly
859 if (/^\@/) { # skipping header lines (starting with @)
860 warn "skipping SAM header line:\t$_";
861 next;
862 }
863
864 ++$line_count;
865 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
866
867 # example read in SAM format
868 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
869 ###
870
871 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
872 # < 0.7.6 $meth_call =~ s/^XM:Z://;
873 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
874 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
875
876 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
877
878 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
879 my $meth_call; ### Thanks to Zachary Zeno for this solution
880 my $read_conversion;
881 my $genome_conversion;
882
883 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
884 my $tag = $1;
885 my $value = $2;
886
887 if ($tag eq "XM") {
888 $meth_call = $value;
889 $meth_call =~ s/\r//;
890 } elsif ($tag eq "XR") {
891 $read_conversion = $value;
892 $read_conversion =~ s/\r//;
893 } elsif ($tag eq "XG") {
894 $genome_conversion = $value;
895 $genome_conversion =~ s/\r//;
896 }
897 }
898
899 my $strand;
900 chomp $genome_conversion;
901 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
902
903 my $index;
904 if ($meth_call) {
905 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
906 $index = 0;
907 $strand = '+';
908 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
909 $index = 1;
910 $strand = '-';
911 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
912 $index = 2;
913 $strand = '+';
914 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
915 $index = 3;
916 $strand = '-';
917 } else {
918 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
919 }
920
921 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
922 if ($strand eq '-') {
923 $meth_call = reverse $meth_call;
924 }
925
926 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
927 if ($ignore) {
928 # print "\n\n$meth_call\n";
929 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
930 # print "$meth_call\n";
931 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
932
933 my @len = split (/\D+/,$cigar); # storing the length per operation
934 my @ops = split (/\d+/,$cigar); # storing the operation
935 shift @ops; # remove the empty first element
936 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
937
938 my @comp_cigar; # building an array with all CIGAR operations
939 foreach my $index (0..$#len) {
940 foreach (1..$len[$index]) {
941 # print "$ops[$index]";
942 push @comp_cigar, $ops[$index];
943 }
944 }
945 # print "original CIGAR: $cigar\n";
946 # print "original CIGAR: @comp_cigar\n";
947
948 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
949 if ($strand eq '+') {
950
951 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
952 my $I_count = 0;
953
954 for (1..$ignore) {
955 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
956 # print "$_ deleted $op\n";
957
958 while ($op eq 'D') { # repeating this for deletions (D)
959 $D_count++;
960 $op = shift @comp_cigar;
961 # print "$_ deleted $op\n";
962 }
963 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
964 $I_count++;
965 }
966 }
967 $start += $ignore + $D_count - $I_count;
968 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
969 } elsif ($strand eq '-') {
970
971 for (1..$ignore) {
972 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
973 while ($op eq 'D') { # repeating this for deletions (D)
974 $op = pop @comp_cigar;
975 }
976 }
977
978 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
979 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
980 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
981 foreach (@comp_cigar) {
982 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
983 }
984 $start += $MD_count - 1;
985 }
986
987 ### reconstituting shortened CIGAR string
988 my $new_cigar;
989 my $count = 0;
990 my $last_op;
991 # print "ignore adjusted: @comp_cigar\n";
992 foreach my $op (@comp_cigar) {
993 unless (defined $last_op){
994 $last_op = $op;
995 ++$count;
996 next;
997 }
998 if ($last_op eq $op) {
999 ++$count;
1000 } else {
1001 $new_cigar .= "$count$last_op";
1002 $last_op = $op;
1003 $count = 1;
1004 }
1005 }
1006 $new_cigar .= "$count$last_op"; # appending the last operation and count
1007 $cigar = $new_cigar;
1008 # print "ignore adjusted scalar: $cigar\n";
1009 }
1010 }
1011 ### printing out the methylation state of every C in the read
1012 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
1013
1014 ++$methylation_call_strings_processed; # 1 per single-end result
1015 }
1016 }
1017 }
1018
1019 ### PROCESSING PAIRED-END RESULT FILES
1020 elsif ($paired) {
1021
1022 ### proceeding differently now for SAM format or vanilla Bismark format files
1023 if ($vanilla) { # old vanilla Bismark paired-end output format
1024 while (<IN>) {
1025 ++$line_count;
1026 warn "processed line: $line_count\n" if ($line_count%500000==0);
1027
1028 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1029 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
1030
1031 my $index;
1032 chomp $genome_conversion;
1033
1034 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
1035 $index = 0; ## this is OT
1036 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
1037 $index = 2; ## this is CTOB!!!
1038 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
1039 $index = 1; ## this is CTOT!!!
1040 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
1041 $index = 3; ## this is OB
1042 } else {
1043 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1044 }
1045
1046 if ($meth_call_1 and $meth_call_2) {
1047 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1048 if ($ignore) {
1049 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1050 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
1051
1052 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
1053 $start_read_1 += $ignore;
1054 $end_read_2 -= $ignore;
1055 }
1056 my $end_read_1;
1057 my $start_read_2;
1058
1059 if ($strand eq '+') {
1060
1061 $end_read_1 = $start_read_1+length($meth_call_1)-1;
1062 $start_read_2 = $end_read_2-length($meth_call_2)+1;
1063
1064 ## we first pass the first read which is in + orientation on the forward strand
1065 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0);
1066
1067 # we next pass the second read which is in - orientation on the reverse strand
1068 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1069 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1);
1070 } else {
1071
1072 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
1073 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
1074
1075 ## we first pass the first read which is in - orientation on the reverse strand
1076 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0);
1077
1078 # we next pass the second read which is in + orientation on the forward strand
1079 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1080 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2);
1081 }
1082
1083 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1084 }
1085 }
1086 } else { # Bismark paired-end SAM output format (default)
1087 while (<IN>) {
1088 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1089 if (/^\@/) { # skipping header lines (starting with @)
1090 warn "skipping SAM header line:\t$_";
1091 next;
1092 }
1093
1094 ++$line_count;
1095 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1096
1097 # example paired-end reads in SAM format (2 consecutive lines)
1098 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1099 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
1100
1101 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
1102
1103 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1104 my $meth_call_1;
1105 my $first_read_conversion;
1106 my $genome_conversion;
1107
1108 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1109 my $tag = $1;
1110 my $value = $2;
1111
1112 if ($tag eq "XM") {
1113 $meth_call_1 = $value;
1114 $meth_call_1 =~ s/\r//;
1115 } elsif ($tag eq "XR") {
1116 $first_read_conversion = $value;
1117 $first_read_conversion =~ s/\r//;
1118 } elsif ($tag eq "XG") {
1119 $genome_conversion = $value;
1120 $genome_conversion =~ s/\r//;
1121 }
1122 }
1123
1124 $_ = <IN>; # reading in the paired read
1125
1126 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
1127 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
1128 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
1129 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
1130 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
1131
1132 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1133
1134 my $meth_call_2;
1135 my $second_read_conversion;
1136
1137 while ( /(XM|XR):Z:([^\t]+)/g ) {
1138 my $tag = $1;
1139 my $value = $2;
1140
1141 if ($tag eq "XM") {
1142 $meth_call_2 = $value;
1143 $meth_call_2 =~ s/\r//;
1144 } elsif ($tag eq "XR") {
1145 $second_read_conversion = $value;
1146 $second_read_conversion = s/\r//;
1147 }
1148 }
1149
1150 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
1151 chomp $genome_conversion; # in case it captured a new line character
1152
1153 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
1154
1155 my $index;
1156 my $strand;
1157
1158 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
1159 $index = 0; ## this is OT
1160 $strand = '+';
1161 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
1162 $index = 1; ## this is CTOT
1163 $strand = '-';
1164 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
1165 $index = 2; ## this is CTOB
1166 $strand = '+';
1167 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
1168 $index = 3; ## this is OB
1169 $strand = '-';
1170 } else {
1171 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1172 }
1173
1174 ### reversing the methylation call of the read that was reverse-complemented
1175 if ($strand eq '+') {
1176 $meth_call_2 = reverse $meth_call_2;
1177 } else {
1178 $meth_call_1 = reverse $meth_call_1;
1179 }
1180
1181 if ($meth_call_1 and $meth_call_2) {
1182
1183 my $end_read_1;
1184
1185 ### READ 1
1186 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
1187 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
1188 shift @ops_1; # remove the empty first element
1189 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
1190
1191 my @comp_cigar_1; # building an array with all CIGAR operations
1192 foreach my $index (0..$#len_1) {
1193 foreach (1..$len_1[$index]) {
1194 # print "$ops_1[$index]";
1195 push @comp_cigar_1, $ops_1[$index];
1196 }
1197 }
1198 # print "original CIGAR read 1: $cigar_1\n";
1199 # print "original CIGAR read 1: @comp_cigar_1\n";
1200
1201 ### READ 2
1202 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
1203 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
1204 shift @ops_2; # remove the empty first element
1205 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
1206 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
1207 foreach my $index (0..$#len_2) {
1208 foreach (1..$len_2[$index]) {
1209 # print "$ops_2[$index]";
1210 push @comp_cigar_2, $ops_2[$index];
1211 }
1212 }
1213 # print "original CIGAR read 2: $cigar_2\n";
1214 # print "original CIGAR read 2: @comp_cigar_2\n";
1215
1216 if ($ignore) {
1217 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1218 ### the methylation calls have already been reversed where necessary
1219 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1220 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
1221
1222 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1223
1224 if ($strand eq '+') {
1225
1226 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
1227 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
1228 my $I_count_1 = 0;
1229
1230 for (1..$ignore) {
1231 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
1232 # print "$_ deleted $op\n";
1233
1234 while ($op eq 'D') { # repeating this for deletions (D)
1235 $D_count_1++;
1236 $op = shift @comp_cigar_1;
1237 # print "$_ deleted $op\n";
1238 }
1239 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1240 $I_count_1++;
1241 }
1242 }
1243
1244 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
1245 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
1246
1247 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
1248
1249 for (1..$ignore) {
1250 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1251 while ($op eq 'D') { # repeating this for deletions (D)
1252 $op = pop @comp_cigar_2;
1253 }
1254 }
1255 # the start position of reads mapping to the reverse strand is being adjusted further below
1256 } elsif ($strand eq '-') {
1257
1258 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
1259 for (1..$ignore) {
1260 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1261 while ($op eq 'D') { # repeating this for deletions (D)
1262 $op = pop @comp_cigar_1;
1263 }
1264 }
1265 # the start position of reads mapping to the reverse strand is being adjusted further below
1266
1267 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
1268 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
1269 my $I_count_2 = 0;
1270
1271 for (1..$ignore) {
1272 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
1273 # print "$_ deleted $op\n";
1274
1275 while ($op eq 'D') { # repeating this for deletions (D)
1276 $D_count_2++;
1277 $op = shift @comp_cigar_2;
1278 # print "$_ deleted $op\n";
1279 }
1280 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1281 $I_count_2++;
1282 }
1283 }
1284
1285 $start_read_2 += $ignore + $D_count_2 - $I_count_2;
1286 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
1287
1288 }
1289
1290 ### reconstituting shortened CIGAR string 1
1291 my $new_cigar_1;
1292 my $count_1 = 0;
1293 my $last_op_1;
1294 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
1295 foreach my $op (@comp_cigar_1) {
1296 unless (defined $last_op_1){
1297 $last_op_1 = $op;
1298 ++$count_1;
1299 next;
1300 }
1301 if ($last_op_1 eq $op) {
1302 ++$count_1;
1303 } else {
1304 $new_cigar_1 .= "$count_1$last_op_1";
1305 $last_op_1 = $op;
1306 $count_1 = 1;
1307 }
1308 }
1309 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
1310 $cigar_1 = $new_cigar_1;
1311 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
1312
1313 ### reconstituting shortened CIGAR string 2
1314 my $new_cigar_2;
1315 my $count_2 = 0;
1316 my $last_op_2;
1317 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
1318 foreach my $op (@comp_cigar_2) {
1319 unless (defined $last_op_2){
1320 $last_op_2 = $op;
1321 ++$count_2;
1322 next;
1323 }
1324 if ($last_op_2 eq $op) {
1325 ++$count_2;
1326 } else {
1327 $new_cigar_2 .= "$count_2$last_op_2";
1328 $last_op_2 = $op;
1329 $count_2 = 1;
1330 }
1331 }
1332 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
1333 $cigar_2 = $new_cigar_2;
1334 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n";
1335
1336 }
1337
1338 if ($strand eq '+') {
1339 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
1340 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1341 # print "reverse: @comp_cigar_2\n";
1342
1343 my $MD_count_1 = 0;
1344 foreach (@comp_cigar_1) {
1345 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1346 }
1347
1348 my $MD_count_2 = 0;
1349 foreach (@comp_cigar_2) {
1350 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1351 }
1352
1353 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
1354 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
1355 } else {
1356 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
1357
1358 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1359 # print "reverse: @comp_cigar_1\n";
1360
1361 my $MD_count_1 = 0;
1362 foreach (@comp_cigar_1) {
1363 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1364 }
1365
1366 $end_read_1 = $start_read_1;
1367 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
1368
1369 }
1370
1371 if ($strand eq '+') {
1372 ## we first pass the first read which is in + orientation on the forward strand
1373 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1);
1374
1375 # we next pass the second read which is in - orientation on the reverse strand
1376 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1377 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2);
1378 } else {
1379 ## we first pass the first read which is in - orientation on the reverse strand
1380 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1);
1381
1382 # we next pass the second read which is in + orientation on the forward strand
1383 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1384 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2);
1385 }
1386
1387 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1388 }
1389 }
1390 }
1391 } else {
1392 die "Single-end or paired-end reads not specified properly\n";
1393 }
1394
1395 print "\n\nProcessed $line_count lines from $filename in total\n";
1396 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1397 if ($report) {
1398 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1399 }
1400 print_splitting_report ();
1401 }
1402
1403
1404
1405 sub print_splitting_report{
1406
1407 ### Calculating methylation percentages if applicable
1408
1409 my $percent_meCpG;
1410 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
1411 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
1412 }
1413
1414 my $percent_meCHG;
1415 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
1416 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
1417 }
1418
1419 my $percent_meCHH;
1420 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
1421 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
1422 }
1423
1424 my $percent_non_CpG_methylation;
1425 if ($merge_non_CpG){
1426 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
1427 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
1428 }
1429 }
1430
1431 if ($report){
1432 ### detailed information about Cs analysed
1433 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
1434
1435 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
1436 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
1437
1438 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1439 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1440 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
1441
1442 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1443 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1444 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
1445
1446 ### calculating methylated CpG percentage if applicable
1447 if ($percent_meCpG){
1448 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
1449 }
1450 else{
1451 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1452 }
1453
1454 ### 2-Context Output
1455 if ($merge_non_CpG){
1456 if ($percent_non_CpG_methylation){
1457 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
1458 }
1459 else{
1460 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
1461 }
1462 }
1463
1464 ### 3 Context Output
1465 else{
1466 ### calculating methylated CHG percentage if applicable
1467 if ($percent_meCHG){
1468 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
1469 }
1470 else{
1471 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1472 }
1473
1474 ### calculating methylated CHH percentage if applicable
1475 if ($percent_meCHH){
1476 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
1477 }
1478 else{
1479 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
1480 }
1481 }
1482 }
1483
1484 ### detailed information about Cs analysed for on-screen report
1485 print "Final Cytosine Methylation Report\n",'='x33,"\n";
1486
1487 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
1488 print "Total number of C's analysed:\t$total_number_of_C\n\n";
1489
1490 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1491 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1492 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
1493
1494 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1495 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1496 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
1497
1498 ### printing methylated CpG percentage if applicable
1499 if ($percent_meCpG){
1500 print "C methylated in CpG context:\t${percent_meCpG}%\n";
1501 }
1502 else{
1503 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1504 }
1505
1506 ### 2-Context Output
1507 if ($merge_non_CpG){
1508 if ($percent_non_CpG_methylation){
1509 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
1510 }
1511 else{
1512 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
1513 }
1514 }
1515
1516 ### 3-Context Output
1517 else{
1518 ### printing methylated CHG percentage if applicable
1519 if ($percent_meCHG){
1520 print "C methylated in CHG context:\t${percent_meCHG}%\n";
1521 }
1522 else{
1523 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1524 }
1525
1526 ### printing methylated CHH percentage if applicable
1527 if ($percent_meCHH){
1528 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
1529 }
1530 else{
1531 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
1532 }
1533 }
1534 }
1535
1536
1537
1538
1539
1540 sub print_individual_C_methylation_states_paired_end_files{
1541
1542 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_;
1543 my @methylation_calls = split(//,$meth_call);
1544
1545 #################################################################
1546 ### . for bases not involving cytosines ###
1547 ### X for methylated C in CHG context (was protected) ###
1548 ### x for not methylated C in CHG context (was converted) ###
1549 ### H for methylated C in CHH context (was protected) ###
1550 ### h for not methylated C in CHH context (was converted) ###
1551 ### Z for methylated C in CpG context (was protected) ###
1552 ### z for not methylated C in CpG context (was converted) ###
1553 #################################################################
1554
1555 my $methyl_CHG_count = 0;
1556 my $methyl_CHH_count = 0;
1557 my $methyl_CpG_count = 0;
1558 my $unmethylated_CHG_count = 0;
1559 my $unmethylated_CHH_count = 0;
1560 my $unmethylated_CpG_count = 0;
1561
1562 my @len;
1563 my @ops;
1564 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
1565 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
1566 my @comp_cigar;
1567
1568 if ($cigar){ # parsing CIGAR string
1569 @len = split (/\D+/,$cigar); # storing the length per operation
1570 @ops = split (/\d+/,$cigar); # storing the operation
1571 shift @ops; # remove the empty first element
1572 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1573
1574 foreach my $index (0..$#len){
1575 foreach (1..$len[$index]){
1576 # print "$ops[$index]";
1577 push @comp_cigar, $ops[$index];
1578 }
1579 }
1580 # warn "\nDetected CIGAR string: $cigar\n";
1581 # warn "Length of methylation call: ",length $meth_call,"\n";
1582 # warn "number of operations: ",scalar @ops,"\n";
1583 # warn "number of length digits: ",scalar @len,"\n\n";
1584 # print @comp_cigar,"\n";
1585 # print "$meth_call\n\n";
1586 # sleep (1);
1587 }
1588
1589
1590 if ($strand eq '-') {
1591
1592 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
1593 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1594 # print "reverse CIGAR string: @comp_cigar\n";
1595
1596 ### the start position of paired-end files has already been corrected, see above
1597 }
1598
1599 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
1600
1601 if ($merge_non_CpG) {
1602
1603 if ($no_overlap) {
1604
1605 ### single-file CpG and non-CpG context output
1606 if ($full) {
1607 if ($strand eq '+') {
1608 for my $index (0..$#methylation_calls) {
1609
1610 if ($cigar){ # only needed for SAM files
1611 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1612 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1613 $cigar_offset += $cigar_mod;
1614 $pos_offset += $pos_mod;
1615 }
1616
1617 ### Returning as soon as the methylation calls start overlapping
1618 if ($start+$index+$pos_offset >= $end_read_1) {
1619 return;
1620 }
1621
1622 if ($methylation_calls[$index] eq 'X') {
1623 $counting{total_meCHG_count}++;
1624 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1625 } elsif ($methylation_calls[$index] eq 'x') {
1626 $counting{total_unmethylated_CHG_count}++;
1627 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1628 } elsif ($methylation_calls[$index] eq 'Z') {
1629 $counting{total_meCpG_count}++;
1630 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1631 } elsif ($methylation_calls[$index] eq 'z') {
1632 $counting{total_unmethylated_CpG_count}++;
1633 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1634 } elsif ($methylation_calls[$index] eq 'H') {
1635 $counting{total_meCHH_count}++;
1636 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1637 } elsif ($methylation_calls[$index] eq 'h') {
1638 $counting{total_unmethylated_CHH_count}++;
1639 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1640 }
1641 elsif ($methylation_calls[$index] eq '.'){}
1642 else{
1643 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1644 }
1645 }
1646 } elsif ($strand eq '-') {
1647 for my $index (0..$#methylation_calls) {
1648
1649 if ($cigar){ # only needed for SAM files
1650 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1651 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1652 $cigar_offset += $cigar_mod;
1653 $pos_offset += $pos_mod;
1654 }
1655
1656 ### Returning as soon as the methylation calls start overlapping
1657 if ($start-$index+$pos_offset <= $end_read_1) {
1658 return;
1659 }
1660
1661 if ($methylation_calls[$index] eq 'X') {
1662 $counting{total_meCHG_count}++;
1663 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1664 } elsif ($methylation_calls[$index] eq 'x') {
1665 $counting{total_unmethylated_CHG_count}++;
1666 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1667 } elsif ($methylation_calls[$index] eq 'Z') {
1668 $counting{total_meCpG_count}++;
1669 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1670 } elsif ($methylation_calls[$index] eq 'z') {
1671 $counting{total_unmethylated_CpG_count}++;
1672 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1673 } elsif ($methylation_calls[$index] eq 'H') {
1674 $counting{total_meCHH_count}++;
1675 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1676 } elsif ($methylation_calls[$index] eq 'h') {
1677 $counting{total_unmethylated_CHH_count}++;
1678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1679 }
1680 elsif ($methylation_calls[$index] eq '.') {}
1681 else{
1682 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1683 }
1684 }
1685 } else {
1686 die "The read orientation was neither + nor -: '$strand'\n";
1687 }
1688 }
1689
1690 ### strand-specific methylation output
1691 else {
1692 if ($strand eq '+') {
1693 for my $index (0..$#methylation_calls) {
1694
1695 if ($cigar){ # only needed for SAM files
1696 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1697 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1698 $cigar_offset += $cigar_mod;
1699 $pos_offset += $pos_mod;
1700 }
1701
1702 ### Returning as soon as the methylation calls start overlapping
1703 if ($start+$index+$pos_offset >= $end_read_1) {
1704 return;
1705 }
1706
1707 if ($methylation_calls[$index] eq 'X') {
1708 $counting{total_meCHG_count}++;
1709 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1710 } elsif ($methylation_calls[$index] eq 'x') {
1711 $counting{total_unmethylated_CHG_count}++;
1712 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1713 } elsif ($methylation_calls[$index] eq 'Z') {
1714 $counting{total_meCpG_count}++;
1715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1716 } elsif ($methylation_calls[$index] eq 'z') {
1717 $counting{total_unmethylated_CpG_count}++;
1718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1719 } elsif ($methylation_calls[$index] eq 'H') {
1720 $counting{total_meCHH_count}++;
1721 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1722 } elsif ($methylation_calls[$index] eq 'h') {
1723 $counting{total_unmethylated_CHH_count}++;
1724 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1725 }
1726 elsif ($methylation_calls[$index] eq '.') {}
1727 else{
1728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1729 }
1730 }
1731 } elsif ($strand eq '-') {
1732 for my $index (0..$#methylation_calls) {
1733
1734 if ($cigar){ # only needed for SAM files
1735 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1736 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1737 $cigar_offset += $cigar_mod;
1738 $pos_offset += $pos_mod;
1739 }
1740
1741 ### Returning as soon as the methylation calls start overlapping
1742 if ($start-$index+$pos_offset <= $end_read_1) {
1743 return;
1744 }
1745
1746 if ($methylation_calls[$index] eq 'X') {
1747 $counting{total_meCHG_count}++;
1748 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1749 } elsif ($methylation_calls[$index] eq 'x') {
1750 $counting{total_unmethylated_CHG_count}++;
1751 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1752 } elsif ($methylation_calls[$index] eq 'Z') {
1753 $counting{total_meCpG_count}++;
1754 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1755 } elsif ($methylation_calls[$index] eq 'z') {
1756 $counting{total_unmethylated_CpG_count}++;
1757 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1758 } elsif ($methylation_calls[$index] eq 'H') {
1759 $counting{total_meCHH_count}++;
1760 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1761 } elsif ($methylation_calls[$index] eq 'h') {
1762 $counting{total_unmethylated_CHH_count}++;
1763 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1764 }
1765 elsif ($methylation_calls[$index] eq '.') {}
1766 else{
1767 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1768 }
1769 }
1770 } else {
1771 die "The strand orientation was neither + nor -: '$strand'/n";
1772 }
1773 }
1774 }
1775
1776 ### this is the default paired-end procedure allowing overlaps and using every single C position
1777 ### Still within the 2-CONTEXT ONLY optional section
1778 else {
1779 ### single-file CpG and non-CpG context output
1780 if ($full) {
1781 if ($strand eq '+') {
1782 for my $index (0..$#methylation_calls) {
1783
1784 if ($cigar){ # only needed for SAM files
1785 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1786 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1787 $cigar_offset += $cigar_mod;
1788 $pos_offset += $pos_mod;
1789 }
1790
1791 if ($methylation_calls[$index] eq 'X') {
1792 $counting{total_meCHG_count}++;
1793 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1794 } elsif ($methylation_calls[$index] eq 'x') {
1795 $counting{total_unmethylated_CHG_count}++;
1796 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1797 } elsif ($methylation_calls[$index] eq 'Z') {
1798 $counting{total_meCpG_count}++;
1799 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1800 } elsif ($methylation_calls[$index] eq 'z') {
1801 $counting{total_unmethylated_CpG_count}++;
1802 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1803 } elsif ($methylation_calls[$index] eq 'H') {
1804 $counting{total_meCHH_count}++;
1805 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1806 } elsif ($methylation_calls[$index] eq 'h') {
1807 $counting{total_unmethylated_CHH_count}++;
1808 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1809 }
1810 elsif ($methylation_calls[$index] eq '.') {}
1811 else{
1812 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1813 }
1814 }
1815 } elsif ($strand eq '-') {
1816 for my $index (0..$#methylation_calls) {
1817
1818 if ($cigar){ # only needed for SAM files
1819 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1820 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1821 $cigar_offset += $cigar_mod;
1822 $pos_offset += $pos_mod;
1823 }
1824
1825 if ($methylation_calls[$index] eq 'X') {
1826 $counting{total_meCHG_count}++;
1827 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1828 } elsif ($methylation_calls[$index] eq 'x') {
1829 $counting{total_unmethylated_CHG_count}++;
1830 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1831 } elsif ($methylation_calls[$index] eq 'Z') {
1832 $counting{total_meCpG_count}++;
1833 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1834 } elsif ($methylation_calls[$index] eq 'z') {
1835 $counting{total_unmethylated_CpG_count}++;
1836 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1837 } elsif ($methylation_calls[$index] eq 'H') {
1838 $counting{total_meCHH_count}++;
1839 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1840 } elsif ($methylation_calls[$index] eq 'h') {
1841 $counting{total_unmethylated_CHH_count}++;
1842 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1843 }
1844 elsif ($methylation_calls[$index] eq '.') {}
1845 else{
1846 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1847 }
1848 }
1849 } else {
1850 die "The strand orientation as neither + nor -: '$strand'\n";
1851 }
1852 }
1853
1854 ### strand-specific methylation output
1855 ### still within the 2-CONTEXT optional section
1856 else {
1857 if ($strand eq '+') {
1858 for my $index (0..$#methylation_calls) {
1859
1860 if ($cigar){ # only needed for SAM files
1861 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1862 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1863 $cigar_offset += $cigar_mod;
1864 $pos_offset += $pos_mod;
1865 }
1866
1867 if ($methylation_calls[$index] eq 'X') {
1868 $counting{total_meCHG_count}++;
1869 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1870 } elsif ($methylation_calls[$index] eq 'x') {
1871 $counting{total_unmethylated_CHG_count}++;
1872 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1873 } elsif ($methylation_calls[$index] eq 'Z') {
1874 $counting{total_meCpG_count}++;
1875 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1876 } elsif ($methylation_calls[$index] eq 'z') {
1877 $counting{total_unmethylated_CpG_count}++;
1878 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1879 } elsif ($methylation_calls[$index] eq 'H') {
1880 $counting{total_meCHH_count}++;
1881 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1882 } elsif ($methylation_calls[$index] eq 'h') {
1883 $counting{total_unmethylated_CHH_count}++;
1884 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1885 }
1886 elsif ($methylation_calls[$index] eq '.') {}
1887 else{
1888 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1889 }
1890 }
1891 } elsif ($strand eq '-') {
1892 for my $index (0..$#methylation_calls) {
1893
1894 if ($cigar){ # only needed for SAM files
1895 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1896 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1897 $cigar_offset += $cigar_mod;
1898 $pos_offset += $pos_mod;
1899 }
1900
1901 if ($methylation_calls[$index] eq 'X') {
1902 $counting{total_meCHG_count}++;
1903 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1904 } elsif ($methylation_calls[$index] eq 'x') {
1905 $counting{total_unmethylated_CHG_count}++;
1906 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1907 } elsif ($methylation_calls[$index] eq 'Z') {
1908 $counting{total_meCpG_count}++;
1909 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1910 } elsif ($methylation_calls[$index] eq 'z') {
1911 $counting{total_unmethylated_CpG_count}++;
1912 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1913 } elsif ($methylation_calls[$index] eq 'H') {
1914 $counting{total_meCHH_count}++;
1915 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1916 } elsif ($methylation_calls[$index] eq 'h') {
1917 $counting{total_unmethylated_CHH_count}++;
1918 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1919 }
1920 elsif ($methylation_calls[$index] eq '.') {}
1921 else{
1922 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1923 }
1924 }
1925 } else {
1926 die "The strand orientation as neither + nor -: '$strand'\n";
1927 }
1928 }
1929 }
1930 }
1931
1932 ############################################
1933 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
1934 ############################################
1935
1936 elsif ($no_overlap) {
1937 ### single-file CpG, CHG and CHH context output
1938 if ($full) {
1939 if ($strand eq '+') {
1940 for my $index (0..$#methylation_calls) {
1941
1942 if ($cigar){ # only needed for SAM files
1943 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1944 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
1945 $cigar_offset += $cigar_mod;
1946 $pos_offset += $pos_mod;
1947 }
1948
1949 ### Returning as soon as the methylation calls start overlapping
1950 if ($start+$index+$pos_offset >= $end_read_1) {
1951 return;
1952 }
1953
1954 if ($methylation_calls[$index] eq 'X') {
1955 $counting{total_meCHG_count}++;
1956 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1957 } elsif ($methylation_calls[$index] eq 'x') {
1958 $counting{total_unmethylated_CHG_count}++;
1959 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1960 } elsif ($methylation_calls[$index] eq 'Z') {
1961 $counting{total_meCpG_count}++;
1962 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1963 } elsif ($methylation_calls[$index] eq 'z') {
1964 $counting{total_unmethylated_CpG_count}++;
1965 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1966 } elsif ($methylation_calls[$index] eq 'H') {
1967 $counting{total_meCHH_count}++;
1968 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1969 } elsif ($methylation_calls[$index] eq 'h') {
1970 $counting{total_unmethylated_CHH_count}++;
1971 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
1972 }
1973 elsif ($methylation_calls[$index] eq '.') {}
1974 else{
1975 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
1976 }
1977 }
1978 } elsif ($strand eq '-') {
1979 for my $index (0..$#methylation_calls) {
1980
1981 if ($cigar){ # only needed for SAM files
1982 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
1983 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
1984 $cigar_offset += $cigar_mod;
1985 $pos_offset += $pos_mod;
1986 }
1987
1988 ### Returning as soon as the methylation calls start overlapping
1989 if ($start-$index+$pos_offset <= $end_read_1) {
1990 return;
1991 }
1992
1993 if ($methylation_calls[$index] eq 'X') {
1994 $counting{total_meCHG_count}++;
1995 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1996 } elsif ($methylation_calls[$index] eq 'x') {
1997 $counting{total_unmethylated_CHG_count}++;
1998 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
1999 } elsif ($methylation_calls[$index] eq 'Z') {
2000 $counting{total_meCpG_count}++;
2001 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2002 } elsif ($methylation_calls[$index] eq 'z') {
2003 $counting{total_unmethylated_CpG_count}++;
2004 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2005 } elsif ($methylation_calls[$index] eq 'H') {
2006 $counting{total_meCHH_count}++;
2007 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2008 } elsif ($methylation_calls[$index] eq 'h') {
2009 $counting{total_unmethylated_CHH_count}++;
2010 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2011 }
2012 elsif ($methylation_calls[$index] eq '.') {}
2013 else{
2014 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2015 }
2016 }
2017 } else {
2018 die "The strand orientation as neither + nor -: '$strand'\n";
2019 }
2020 }
2021
2022 ### strand-specific methylation output
2023 else {
2024 if ($strand eq '+') {
2025 for my $index (0..$#methylation_calls) {
2026
2027 if ($cigar){ # only needed for SAM files
2028 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2029 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2030 $cigar_offset += $cigar_mod;
2031 $pos_offset += $pos_mod;
2032 }
2033
2034 ### Returning as soon as the methylation calls start overlapping
2035 if ($start+$index+$pos_offset >= $end_read_1) {
2036 return;
2037 }
2038
2039 if ($methylation_calls[$index] eq 'X') {
2040 $counting{total_meCHG_count}++;
2041 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2042 } elsif ($methylation_calls[$index] eq 'x') {
2043 $counting{total_unmethylated_CHG_count}++;
2044 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2045 } elsif ($methylation_calls[$index] eq 'Z') {
2046 $counting{total_meCpG_count}++;
2047 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2048 } elsif ($methylation_calls[$index] eq 'z') {
2049 $counting{total_unmethylated_CpG_count}++;
2050 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2051 } elsif ($methylation_calls[$index] eq 'H') {
2052 $counting{total_meCHH_count}++;
2053 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2054 } elsif ($methylation_calls[$index] eq 'h') {
2055 $counting{total_unmethylated_CHH_count}++;
2056 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2057 }
2058 elsif ($methylation_calls[$index] eq '.') {}
2059 else{
2060 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2061 }
2062 }
2063 } elsif ($strand eq '-') {
2064 for my $index (0..$#methylation_calls) {
2065
2066 if ($cigar){ # only needed for SAM files
2067 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2068 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2069 $cigar_offset += $cigar_mod;
2070 $pos_offset += $pos_mod;
2071 }
2072
2073 ### Returning as soon as the methylation calls start overlapping
2074 if ($start-$index+$pos_offset <= $end_read_1) {
2075 return;
2076 }
2077
2078 if ($methylation_calls[$index] eq 'X') {
2079 $counting{total_meCHG_count}++;
2080 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2081 } elsif ($methylation_calls[$index] eq 'x') {
2082 $counting{total_unmethylated_CHG_count}++;
2083 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2084 } elsif ($methylation_calls[$index] eq 'Z') {
2085 $counting{total_meCpG_count}++;
2086 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2087 } elsif ($methylation_calls[$index] eq 'z') {
2088 $counting{total_unmethylated_CpG_count}++;
2089 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2090 } elsif ($methylation_calls[$index] eq 'H') {
2091 $counting{total_meCHH_count}++;
2092 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2093 } elsif ($methylation_calls[$index] eq 'h') {
2094 $counting{total_unmethylated_CHH_count}++;
2095 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2096 }
2097 elsif ($methylation_calls[$index] eq '.') {}
2098 else{
2099 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2100 }
2101 }
2102 } else {
2103 die "The strand orientation as neither + nor -: '$strand'\n";
2104 }
2105 }
2106 }
2107
2108 ### this is the default paired-end procedure allowing overlaps and using every single C position
2109 else {
2110 ### single-file CpG, CHG and CHH context output
2111 if ($full) {
2112 if ($strand eq '+') {
2113 for my $index (0..$#methylation_calls) {
2114
2115 if ($cigar){ # only needed for SAM files
2116 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2117 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2118 $cigar_offset += $cigar_mod;
2119 $pos_offset += $pos_mod;
2120 }
2121
2122 if ($methylation_calls[$index] eq 'X') {
2123 $counting{total_meCHG_count}++;
2124 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2125 } elsif ($methylation_calls[$index] eq 'x') {
2126 $counting{total_unmethylated_CHG_count}++;
2127 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2128 } elsif ($methylation_calls[$index] eq 'Z') {
2129 $counting{total_meCpG_count}++;
2130 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2131 } elsif ($methylation_calls[$index] eq 'z') {
2132 $counting{total_unmethylated_CpG_count}++;
2133 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2134 } elsif ($methylation_calls[$index] eq 'H') {
2135 $counting{total_meCHH_count}++;
2136 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2137 } elsif ($methylation_calls[$index] eq 'h') {
2138 $counting{total_unmethylated_CHH_count}++;
2139 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2140 }
2141 elsif ($methylation_calls[$index] eq '.') {}
2142 else{
2143 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2144 }
2145 }
2146 } elsif ($strand eq '-') {
2147 for my $index (0..$#methylation_calls) {
2148
2149 if ($cigar){ # only needed for SAM files
2150 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2151 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2152 $cigar_offset += $cigar_mod;
2153 $pos_offset += $pos_mod;
2154 }
2155
2156 if ($methylation_calls[$index] eq 'X') {
2157 $counting{total_meCHG_count}++;
2158 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2159 } elsif ($methylation_calls[$index] eq 'x') {
2160 $counting{total_unmethylated_CHG_count}++;
2161 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2162 } elsif ($methylation_calls[$index] eq 'Z') {
2163 $counting{total_meCpG_count}++;
2164 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2165 } elsif ($methylation_calls[$index] eq 'z') {
2166 $counting{total_unmethylated_CpG_count}++;
2167 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2168 } elsif ($methylation_calls[$index] eq 'H') {
2169 $counting{total_meCHH_count}++;
2170 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2171 } elsif ($methylation_calls[$index] eq 'h') {
2172 $counting{total_unmethylated_CHH_count}++;
2173 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2174 }
2175 elsif ($methylation_calls[$index] eq '.') {}
2176 else{
2177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2178 }
2179 }
2180 } else {
2181 die "The strand orientation as neither + nor -: '$strand'\n";
2182 }
2183 }
2184
2185 ### strand-specific methylation output
2186 else {
2187 if ($strand eq '+') {
2188 for my $index (0..$#methylation_calls) {
2189
2190 if ($cigar){ # only needed for SAM files
2191 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2192 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2193 $cigar_offset += $cigar_mod;
2194 $pos_offset += $pos_mod;
2195 }
2196
2197 if ($methylation_calls[$index] eq 'X') {
2198 $counting{total_meCHG_count}++;
2199 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2200 } elsif ($methylation_calls[$index] eq 'x') {
2201 $counting{total_unmethylated_CHG_count}++;
2202 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2203 } elsif ($methylation_calls[$index] eq 'Z') {
2204 $counting{total_meCpG_count}++;
2205 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2206 } elsif ($methylation_calls[$index] eq 'z') {
2207 $counting{total_unmethylated_CpG_count}++;
2208 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2209 } elsif ($methylation_calls[$index] eq 'H') {
2210 $counting{total_meCHH_count}++;
2211 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2212 } elsif ($methylation_calls[$index] eq 'h') {
2213 $counting{total_unmethylated_CHH_count}++;
2214 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2215 }
2216 elsif ($methylation_calls[$index] eq '.') {}
2217 else{
2218 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2219 }
2220 }
2221 } elsif ($strand eq '-') {
2222 for my $index (0..$#methylation_calls) {
2223
2224 if ($cigar){ # only needed for SAM files
2225 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2226 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2227 $cigar_offset += $cigar_mod;
2228 $pos_offset += $pos_mod;
2229 }
2230
2231 if ($methylation_calls[$index] eq 'X') {
2232 $counting{total_meCHG_count}++;
2233 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2234 } elsif ($methylation_calls[$index] eq 'x') {
2235 $counting{total_unmethylated_CHG_count}++;
2236 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2237 } elsif ($methylation_calls[$index] eq 'Z') {
2238 $counting{total_meCpG_count}++;
2239 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2240 } elsif ($methylation_calls[$index] eq 'z') {
2241 $counting{total_unmethylated_CpG_count}++;
2242 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2243 } elsif ($methylation_calls[$index] eq 'H') {
2244 $counting{total_meCHH_count}++;
2245 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2246 } elsif ($methylation_calls[$index] eq 'h') {
2247 $counting{total_unmethylated_CHH_count}++;
2248 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2249 }
2250 elsif ($methylation_calls[$index] eq '.') {}
2251 else{
2252 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2253 }
2254 }
2255 } else {
2256 die "The strand orientation as neither + nor -: '$strand'\n";
2257 }
2258 }
2259 }
2260 }
2261
2262 sub check_cigar_string {
2263 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
2264 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
2265 my ($new_cigar_offset,$new_pos_offset) = (0,0);
2266
2267 if ($strand eq '+') {
2268 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
2269
2270 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2271 # warn "position needs no adjustment\n";
2272 }
2273
2274 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
2275 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
2276 # warn "adjusted genomic position by -1 bp (insertion)\n";
2277 }
2278
2279 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2280 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2281 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
2282 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
2283
2284 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
2285 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2286 # warn "position needs no adjustment\n";
2287 last;
2288 }
2289 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
2290 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
2291 # warn "adjusted genomic position by another -1 bp (insertion)\n";
2292 last;
2293 }
2294 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2295 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2296 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
2297 # warn "adjusted genomic position by another +1 bp (deletion)\n";
2298 }
2299 else{
2300 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2301 }
2302 }
2303 }
2304 else{
2305 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2306 }
2307 }
2308
2309 elsif ($strand eq '-') {
2310 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
2311
2312 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2313 # warn "position needs no adjustment\n";
2314 }
2315
2316 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
2317 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
2318 # warn "adjusted genomic position by +1 bp (insertion)\n";
2319 }
2320
2321 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2322 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2323 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
2324 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
2325
2326 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
2327 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2328 # warn "Found new 'M' operation; position needs no adjustment\n";
2329 last;
2330 }
2331 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
2332 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
2333 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
2334 last;
2335 }
2336 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2337 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2338 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
2339 # warn "adjusted genomic position by another -1 bp (deletion)\n";
2340 }
2341 else{
2342 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2343 }
2344 }
2345 }
2346 else{
2347 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2348 }
2349 }
2350 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
2351 return ($new_cigar_offset,$new_pos_offset);
2352 }
2353
2354 sub print_individual_C_methylation_states_single_end{
2355
2356 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
2357 my @methylation_calls = split(//,$meth_call);
2358
2359 #################################################################
2360 ### . for bases not involving cytosines ###
2361 ### X for methylated C in CHG context (was protected) ###
2362 ### x for not methylated C in CHG context (was converted) ###
2363 ### H for methylated C in CHH context (was protected) ###
2364 ### h for not methylated C in CHH context (was converted) ###
2365 ### Z for methylated C in CpG context (was protected) ###
2366 ### z for not methylated C in CpG context (was converted) ###
2367 #################################################################
2368
2369 my $methyl_CHG_count = 0;
2370 my $methyl_CHH_count = 0;
2371 my $methyl_CpG_count = 0;
2372 my $unmethylated_CHG_count = 0;
2373 my $unmethylated_CHH_count = 0;
2374 my $unmethylated_CpG_count = 0;
2375
2376 my @len;
2377 my @ops;
2378 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2379 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2380
2381 my @comp_cigar;
2382
2383 if ($cigar){ # parsing CIGAR string
2384 @len = split (/\D+/,$cigar); # storing the length per operation
2385 @ops = split (/\d+/,$cigar); # storing the operation
2386 shift @ops; # remove the empty first element
2387 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2388
2389 foreach my $index (0..$#len){
2390 foreach (1..$len[$index]){
2391 # print "$ops[$index]";
2392 push @comp_cigar, $ops[$index];
2393 }
2394 }
2395 # warn "\nDetected CIGAR string: $cigar\n";
2396 # warn "Length of methylation call: ",length $meth_call,"\n";
2397 # warn "number of operations: ",scalar @ops,"\n";
2398 # warn "number of length digits: ",scalar @len,"\n\n";
2399 # print @comp_cigar,"\n";
2400 # print "$meth_call\n\n";
2401 # sleep (1);
2402 }
2403
2404 ### adjusting the start position for all reads mapping to the reverse strand
2405 if ($strand eq '-') {
2406
2407 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2408 # print @comp_cigar,"\n";
2409
2410 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
2411
2412 if ($cigar){ ### SAM format
2413 my $MD_count = 0;
2414 foreach (@comp_cigar){
2415 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2416 }
2417 $start += $MD_count - 1;
2418 }
2419 else{ ### vanilla format
2420 $start += length($meth_call)-1;
2421 }
2422 }
2423 }
2424
2425 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
2426
2427 ### single-file CpG and other-context output
2428 if ($full and $merge_non_CpG) {
2429 if ($strand eq '+') {
2430 for my $index (0..$#methylation_calls) {
2431
2432 if ($cigar){ # only needed for SAM files
2433 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2434 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2435 $cigar_offset += $cigar_mod;
2436 $pos_offset += $pos_mod;
2437 }
2438
2439 ### methylated Cs (any context) will receive a forward (+) orientation
2440 ### not methylated Cs (any context) will receive a reverse (-) orientation
2441 if ($methylation_calls[$index] eq 'X') {
2442 $counting{total_meCHG_count}++;
2443 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2444 }
2445 elsif ($methylation_calls[$index] eq 'x') {
2446 $counting{total_unmethylated_CHG_count}++;
2447 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2448 }
2449 elsif ($methylation_calls[$index] eq 'Z') {
2450 $counting{total_meCpG_count}++;
2451 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2452 }
2453 elsif ($methylation_calls[$index] eq 'z') {
2454 $counting{total_unmethylated_CpG_count}++;
2455 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2456 }
2457 elsif ($methylation_calls[$index] eq 'H') {
2458 $counting{total_meCHH_count}++;
2459 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2460 }
2461 elsif ($methylation_calls[$index] eq 'h') {
2462 $counting{total_unmethylated_CHH_count}++;
2463 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2464 }
2465 elsif ($methylation_calls[$index] eq '.') {
2466 }
2467 else{
2468 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2469 }
2470 }
2471 }
2472 elsif ($strand eq '-') {
2473
2474 for my $index (0..$#methylation_calls) {
2475 ### methylated Cs (any context) will receive a forward (+) orientation
2476 ### not methylated Cs (any context) will receive a reverse (-) orientation
2477
2478 if ($cigar){ # only needed for SAM files
2479 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2480 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2481 $cigar_offset += $cigar_mod;
2482 $pos_offset += $pos_mod;
2483 }
2484
2485 if ($methylation_calls[$index] eq 'X') {
2486 $counting{total_meCHG_count}++;
2487 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2488 }
2489 elsif ($methylation_calls[$index] eq 'x') {
2490 $counting{total_unmethylated_CHG_count}++;
2491 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2492 }
2493 elsif ($methylation_calls[$index] eq 'Z') {
2494 $counting{total_meCpG_count}++;
2495 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2496 }
2497 elsif ($methylation_calls[$index] eq 'z') {
2498 $counting{total_unmethylated_CpG_count}++;
2499 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2500 }
2501 elsif ($methylation_calls[$index] eq 'H') {
2502 $counting{total_meCHH_count}++;
2503 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2504 }
2505 elsif ($methylation_calls[$index] eq 'h') {
2506 $counting{total_unmethylated_CHH_count}++;
2507 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2508 }
2509 elsif ($methylation_calls[$index] eq '.'){
2510 }
2511 else{
2512 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2513 }
2514 }
2515 }
2516 else {
2517 die "The strand information was neither + nor -: $strand\n";
2518 }
2519 }
2520
2521 ### strand-specific methylation output
2522 elsif ($merge_non_CpG) {
2523 if ($strand eq '+') {
2524 for my $index (0..$#methylation_calls) {
2525 ### methylated Cs (any context) will receive a forward (+) orientation
2526 ### not methylated Cs (any context) will receive a reverse (-) orientation
2527
2528 if ($cigar){ # only needed for SAM files
2529 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2530 $cigar_offset += $cigar_mod;
2531 $pos_offset += $pos_mod;
2532 }
2533
2534 if ($methylation_calls[$index] eq 'X') {
2535 $counting{total_meCHG_count}++;
2536 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2537 }
2538 elsif ($methylation_calls[$index] eq 'x') {
2539 $counting{total_unmethylated_CHG_count}++;
2540 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2541 }
2542 elsif ($methylation_calls[$index] eq 'Z') {
2543 $counting{total_meCpG_count}++;
2544 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2545 }
2546 elsif ($methylation_calls[$index] eq 'z') {
2547 $counting{total_unmethylated_CpG_count}++;
2548 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2549 }
2550 elsif ($methylation_calls[$index] eq 'H') {
2551 $counting{total_meCHH_count}++;
2552 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2553 }
2554 elsif ($methylation_calls[$index] eq 'h') {
2555 $counting{total_unmethylated_CHH_count}++;
2556 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2557 }
2558 elsif ($methylation_calls[$index] eq '.') {
2559 }
2560 else{
2561 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2562 }
2563 }
2564 }
2565 elsif ($strand eq '-') {
2566
2567 for my $index (0..$#methylation_calls) {
2568 ### methylated Cs (any context) will receive a forward (+) orientation
2569 ### not methylated Cs (any context) will receive a reverse (-) orientation
2570
2571 if ($cigar){ # only needed for SAM files
2572 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2573 $cigar_offset += $cigar_mod;
2574 $pos_offset += $pos_mod;
2575 }
2576
2577 if ($methylation_calls[$index] eq 'X') {
2578 $counting{total_meCHG_count}++;
2579 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2580 }
2581 elsif ($methylation_calls[$index] eq 'x') {
2582 $counting{total_unmethylated_CHG_count}++;
2583 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2584 }
2585 elsif ($methylation_calls[$index] eq 'Z') {
2586 $counting{total_meCpG_count}++;
2587 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2588 }
2589 elsif ($methylation_calls[$index] eq 'z') {
2590 $counting{total_unmethylated_CpG_count}++;
2591 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2592 }
2593 elsif ($methylation_calls[$index] eq 'H') {
2594 $counting{total_meCHH_count}++;
2595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2596 }
2597 elsif ($methylation_calls[$index] eq 'h') {
2598 $counting{total_unmethylated_CHH_count}++;
2599 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2600 }
2601 elsif ($methylation_calls[$index] eq '.') {
2602 }
2603 else{
2604 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2605 }
2606 }
2607 }
2608 else {
2609 die "The strand information was neither + nor -: $strand\n";
2610 }
2611 }
2612
2613 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
2614
2615 elsif ($full) {
2616 if ($strand eq '+') {
2617 for my $index (0..$#methylation_calls) {
2618 ### methylated Cs (any context) will receive a forward (+) orientation
2619 ### not methylated Cs (any context) will receive a reverse (-) orientation
2620
2621 if ($cigar){ # only needed for SAM files
2622 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2623 $cigar_offset += $cigar_mod;
2624 $pos_offset += $pos_mod;
2625 }
2626
2627 if ($methylation_calls[$index] eq 'X') {
2628 $counting{total_meCHG_count}++;
2629 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2630 } elsif ($methylation_calls[$index] eq 'x') {
2631 $counting{total_unmethylated_CHG_count}++;
2632 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2633 } elsif ($methylation_calls[$index] eq 'Z') {
2634 $counting{total_meCpG_count}++;
2635 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2636 } elsif ($methylation_calls[$index] eq 'z') {
2637 $counting{total_unmethylated_CpG_count}++;
2638 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2639 } elsif ($methylation_calls[$index] eq 'H') {
2640 $counting{total_meCHH_count}++;
2641 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2642 } elsif ($methylation_calls[$index] eq 'h') {
2643 $counting{total_unmethylated_CHH_count}++;
2644 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2645 }
2646 elsif ($methylation_calls[$index] eq '.') {}
2647 else{
2648 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2649 }
2650 }
2651 }
2652 elsif ($strand eq '-') {
2653
2654 for my $index (0..$#methylation_calls) {
2655 ### methylated Cs (any context) will receive a forward (+) orientation
2656 ### not methylated Cs (any context) will receive a reverse (-) orientation
2657
2658 if ($cigar){ # only needed for SAM files
2659 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2660 $cigar_offset += $cigar_mod;
2661 $pos_offset += $pos_mod;
2662 }
2663
2664 if ($methylation_calls[$index] eq 'X') {
2665 $counting{total_meCHG_count}++;
2666 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2667 } elsif ($methylation_calls[$index] eq 'x') {
2668 $counting{total_unmethylated_CHG_count}++;
2669 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2670 } elsif ($methylation_calls[$index] eq 'Z') {
2671 $counting{total_meCpG_count}++;
2672 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2673 } elsif ($methylation_calls[$index] eq 'z') {
2674 $counting{total_unmethylated_CpG_count}++;
2675 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2676 } elsif ($methylation_calls[$index] eq 'H') {
2677 $counting{total_meCHH_count}++;
2678 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2679 } elsif ($methylation_calls[$index] eq 'h') {
2680 $counting{total_unmethylated_CHH_count}++;
2681 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2682 }
2683 elsif ($methylation_calls[$index] eq '.') {}
2684 else{
2685 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2686 }
2687 }
2688 }
2689 else {
2690 die "The read had a strand orientation which was neither + nor -: $strand\n";
2691 }
2692 }
2693
2694 ### strand-specific methylation output
2695 else {
2696 if ($strand eq '+') {
2697 for my $index (0..$#methylation_calls) {
2698 ### methylated Cs (any context) will receive a forward (+) orientation
2699 ### not methylated Cs (any context) will receive a reverse (-) orientation
2700
2701 if ($cigar){ # only needed for SAM files
2702 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2703 $cigar_offset += $cigar_mod;
2704 $pos_offset += $pos_mod;
2705 }
2706
2707 if ($methylation_calls[$index] eq 'X') {
2708 $counting{total_meCHG_count}++;
2709 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2710 } elsif ($methylation_calls[$index] eq 'x') {
2711 $counting{total_unmethylated_CHG_count}++;
2712 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2713 } elsif ($methylation_calls[$index] eq 'Z') {
2714 $counting{total_meCpG_count}++;
2715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2716 } elsif ($methylation_calls[$index] eq 'z') {
2717 $counting{total_unmethylated_CpG_count}++;
2718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2719 } elsif ($methylation_calls[$index] eq 'H') {
2720 $counting{total_meCHH_count}++;
2721 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2722 } elsif ($methylation_calls[$index] eq 'h') {
2723 $counting{total_unmethylated_CHH_count}++;
2724 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2725 }
2726 elsif ($methylation_calls[$index] eq '.') {}
2727 else{
2728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2729 }
2730 }
2731 }
2732 elsif ($strand eq '-') {
2733
2734 for my $index (0..$#methylation_calls) {
2735 ### methylated Cs (any context) will receive a forward (+) orientation
2736 ### not methylated Cs (any context) will receive a reverse (-) orientation
2737
2738 if ($cigar){ # only needed for SAM files
2739 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2740 $cigar_offset += $cigar_mod;
2741 $pos_offset += $pos_mod;
2742 }
2743
2744 if ($methylation_calls[$index] eq 'X') {
2745 $counting{total_meCHG_count}++;
2746 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2747 } elsif ($methylation_calls[$index] eq 'x') {
2748 $counting{total_unmethylated_CHG_count}++;
2749 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2750 } elsif ($methylation_calls[$index] eq 'Z') {
2751 $counting{total_meCpG_count}++;
2752 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2753 } elsif ($methylation_calls[$index] eq 'z') {
2754 $counting{total_unmethylated_CpG_count}++;
2755 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2756 } elsif ($methylation_calls[$index] eq 'H') {
2757 $counting{total_meCHH_count}++;
2758 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2759 } elsif ($methylation_calls[$index] eq 'h') {
2760 $counting{total_unmethylated_CHH_count}++;
2761 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2762 }
2763 elsif ($methylation_calls[$index] eq '.') {}
2764 else{
2765 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2766 }
2767 }
2768 }
2769 else {
2770 die "The strand information was neither + nor -: $strand\n";
2771 }
2772 }
2773 }
2774
2775
2776
2777 #######################################################################################################################################
2778 ### bismark2bedGaph section - START
2779 #######################################################################################################################################
2780
2781 sub process_bedGraph_output{
2782 warn "="x64,"\n";
2783 warn "Methylation information will now be written into a bedGraph file\n";
2784 warn "="x64,"\n\n";
2785 sleep (2);
2786
2787 ### Closing all filehandles so that the Bismark methtylation extractor output doesn't get truncated due to buffering issues
2788 foreach my $fh (keys %fhs) {
2789 if ($fh =~ /^[1230]$/) {
2790 foreach my $context (keys %{$fhs{$fh}}) {
2791 close $fhs{$fh}->{$context} or die $!;
2792 }
2793 } else {
2794 close $fhs{$fh} or die $!;
2795 }
2796 }
2797
2798 ### deciding which files to use for bedGraph conversion
2799 foreach my $filename (@sorting_files){
2800 # warn "$filename\n";
2801 if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again
2802 $filename =~ s/.*\///; # replacing everything up to the last slash in the filename
2803 # warn "$filename\n";
2804 }
2805
2806 if ($CX_context){
2807 push @bedfiles,$filename;
2808 }
2809 else{ ## CpG context only (default)
2810 if ($filename =~ /^CpG_/){
2811 push @bedfiles,$filename;
2812 }
2813 else{
2814 # skipping CHH or CHG files
2815 }
2816 }
2817 }
2818
2819 warn "Using the following files as Input:\n";
2820 print join ("\t",@bedfiles),"\n\n";
2821 sleep (2);
2822
2823 my %temp_fhs;
2824 my @temp_files; # writing all context files (default CpG only) to these files prior to sorting
2825
2826 ### changing to the output directory
2827 unless ($output_dir eq ''){ # default
2828 chdir $output_dir or die "Failed to change directory to $output_dir\n";
2829 warn "Changed directory to $output_dir\n";
2830 }
2831
2832 foreach my $infile (@bedfiles) {
2833
2834 if ($remove) {
2835 warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
2836
2837 open (READ,$infile) or die $!;
2838
2839 my $removed_spaces_outfile = $infile;
2840 $removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
2841
2842 open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
2843
2844 unless ($no_header){
2845 $_ = <READ>; ### Bismark version header
2846 print REM $_; ### Bismark version header
2847 }
2848
2849 while (<READ>) {
2850 chomp;
2851 my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
2852 $id =~ s/\s+/_/g;
2853 print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
2854 }
2855
2856 close READ or die $!;
2857 close REM or die $!;
2858
2859 ### changing the infile name to the new file without spaces
2860 $infile = $removed_spaces_outfile;
2861 }
2862
2863 warn "Now writing methylation information for file $infile to individual files for each chromosome\n";
2864 open (IN,$infile) or die $!;
2865
2866 ## always ignoring the version header
2867 unless ($no_header){
2868 $_ = <IN>; ### Bismark version header
2869 }
2870
2871 while (<IN>) {
2872 chomp;
2873 my ($chr) = (split (/\t/))[2];
2874
2875 unless (exists $temp_fhs{$chr}) {
2876 open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!";
2877 }
2878 print {$temp_fhs{$chr}} "$_\n";
2879 }
2880
2881 warn "Finished writing out individual chromosome files for $infile\n";
2882 }
2883 warn "\n";
2884
2885 @temp_files = <*.meth_extractor.temp>;
2886
2887 warn "Collecting temporary chromosome file information...\n";
2888 sleep (1);
2889 warn "processing the following input file(s):\n";
2890 warn join ("\n",@temp_files),"\n\n";
2891 sleep (1);
2892
2893 foreach my $in (@temp_files) {
2894 warn "Sorting input file $in by positions\n";
2895 open my $ifh, "sort -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!";
2896 # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n";
2897
2898 ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data {
2899
2900 my $name;
2901 my $meth_state;
2902 my $chr = "";
2903 my $pos = 0;
2904 my $meth_state2;
2905
2906 my $last_pos;
2907 my $last_chr;
2908
2909 ############################################# }
2910
2911 while (my $line = <$ifh>) {
2912 next if $line =~ /^Bismark/;
2913 chomp $line;
2914
2915 ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement {
2916
2917 $last_chr = $chr;
2918 $last_pos = $pos;
2919 ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
2920
2921 if (($last_pos ne $pos) || ($last_chr ne $chr)) {
2922 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
2923 @methylcalls = qw (0 0 0);
2924 }
2925
2926 ############################################# }
2927
2928 my $validated = validate_methylation_call($meth_state, $meth_state2);
2929 unless($validated){
2930 warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
2931 next;
2932 }
2933 if ($meth_state eq "+") {
2934 $methylcalls[0]++;
2935 $methylcalls[2]++;
2936 } else {
2937 $methylcalls[1]++;
2938 $methylcalls[2]++;
2939 }
2940 }
2941
2942 ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) {
2943
2944 $last_chr = $chr;
2945 $last_pos = $pos;
2946 if ($methylcalls[2] > 0) {
2947 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
2948 }
2949 ############################################# }
2950
2951 close $ifh or die $!;
2952
2953 @methylcalls = qw (0 0 0); # resetting @methylcalls
2954
2955 ### deleting temporary files
2956 my $delete = unlink $in;
2957 if ($delete) {
2958 warn "Successfully deleted the temporary input file $in\n\n";
2959 }
2960 else {
2961 warn "The temporary inputfile $in could not be deleted $!\n\n";
2962 }
2963 }
2964 }
2965
2966 sub generate_output{
2967 my $methcount = $methylcalls[0];
2968 my $nonmethcount = $methylcalls[1];
2969 my $totalcount = $methylcalls[2];
2970 my $last_chr = shift;
2971 my $last_pos = shift;
2972 croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0;
2973 croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount);
2974
2975 ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation {
2976
2977 my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
2978 my $meth_percentage;
2979 ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
2980 # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
2981 if (defined $meth_percentage){
2982 if ($counts){
2983 print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
2984 }
2985 else{
2986 print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n";
2987 }
2988 }
2989 ############################################# }
2990 }
2991
2992 sub validate_methylation_call{
2993 my $meth_state = shift;
2994 croak "Missing (+/-) methylation call" unless defined $meth_state;
2995 my $meth_state2 = shift;
2996 croak "Missing alphabetical methylation call" unless defined $meth_state2;
2997 my $is_consistent;
2998 ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
2999 : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
3000 return 1 if $is_consistent;
3001 return 0;
3002 }
3003
3004 sub check_CpG_methylation_call{
3005 my $meth1 = shift;
3006 my $meth2 = shift;
3007 return 1 if($meth1 eq "+" && $meth2 eq "Z");
3008 return 1 if($meth1 eq "-" && $meth2 eq "z");
3009 return 0;
3010 }
3011
3012 sub check_nonCpG_methylation_call{
3013 my $meth1 = shift;
3014 my $meth2 = shift;
3015 return 1 if($meth1 eq "+" && $meth2 eq "C");
3016 return 1 if($meth1 eq "+" && $meth2 eq "X");
3017 return 1 if($meth1 eq "+" && $meth2 eq "H");
3018 return 1 if($meth1 eq "-" && $meth2 eq "c");
3019 return 1 if($meth1 eq "-" && $meth2 eq "x");
3020 return 1 if($meth1 eq "-" && $meth2 eq "h");
3021 return 0;
3022 }
3023
3024 #######################################################################################################################################
3025 ### bismark2bedGaph section - END
3026 #######################################################################################################################################
3027
3028
3029
3030
3031
3032
3033 #######################################################################################################################################
3034 ### genome-wide cytosine methylation report - START
3035 #######################################################################################################################################
3036
3037 sub generate_genome_wide_cytosine_report {
3038
3039 warn "="x78,"\n";
3040 warn "Methylation information will now be written into a genome-wide cytosine report\n";
3041 warn "="x78,"\n\n";
3042 sleep (2);
3043
3044 ### changing to the output directory again
3045 unless ($output_dir eq ''){ # default
3046 chdir $output_dir or die "Failed to change directory to $output_dir\n";
3047 # warn "Changed directory to $output_dir\n";
3048 }
3049
3050 my $in = shift;
3051 open (IN,$in) or die $!;
3052
3053 my $cytosine_out = shift;
3054
3055 if ($CX_context){
3056 $cytosine_out =~ s/$/genome-wide_CX_report.txt/;
3057 }
3058 else{
3059 $cytosine_out =~ s/$/genome_wide_CpG_report.txt/;
3060 }
3061
3062 ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
3063 unless ($split_by_chromosome){ ### writing all output to a single file (default)
3064 open (CYT,'>',$cytosine_out) or die $!;
3065 warn "Writing genome-wide cytosine report to: $cytosine_out\n";
3066 sleep (3);
3067 }
3068
3069 my $last_chr;
3070 my %chr; # storing reads for one chromosome at a time
3071
3072 my $count = 0;
3073 while (<IN>){
3074 chomp;
3075 ++$count;
3076 my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
3077
3078 # defining the first chromosome
3079 unless (defined $last_chr){
3080 $last_chr = $chr;
3081 # warn "Storing all covered cytosine positions for chromosome: $chr\n";
3082 }
3083
3084 if ($chr eq $last_chr){
3085 $chr{$chr}->{$start}->{meth} = $meth;
3086 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
3087 }
3088 else{
3089 warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
3090
3091 if ($split_by_chromosome){ ## writing output to 1 file per chromosome
3092 my $chromosome_out = $cytosine_out;
3093 $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
3094 open (CYT,'>',$chromosome_out) or die $!;
3095 }
3096
3097 while ( $chromosomes{$last_chr} =~ /([CG])/g){
3098
3099 my $tri_nt = '';
3100 my $context = '';
3101 my $pos = pos$chromosomes{$last_chr};
3102
3103 my $strand;
3104 my $meth = 0;
3105 my $nonmeth = 0;
3106
3107 if ($1 eq 'C'){ # C on forward strand
3108 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
3109 $strand = '+';
3110 }
3111 elsif ($1 eq 'G'){ # C on reverse strand
3112 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
3113 $tri_nt = reverse $tri_nt;
3114 $tri_nt =~ tr/ACTG/TGAC/;
3115 $strand = '-';
3116 }
3117 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
3118
3119 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
3120 $meth = $chr{$last_chr}->{$pos-1}->{meth};
3121 $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
3122 }
3123
3124 ### determining cytosine context
3125 if ($tri_nt =~ /^CG/){
3126 $context = 'CG';
3127 }
3128 elsif ($tri_nt =~ /^C.{1}G$/){
3129 $context = 'CHG';
3130 }
3131 elsif ($tri_nt =~ /^C.{2}$/){
3132 $context = 'CHH';
3133 }
3134 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
3135 warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
3136 next;
3137 }
3138
3139 if ($CpG_only){
3140 if ($tri_nt =~ /^CG/){ # CpG context is the default
3141 if ($zero){ # zero based coordinates
3142 $pos -= 1;
3143 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3144 }
3145 else{ # default
3146 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3147 }
3148 }
3149 }
3150 else{ ## all cytosines, specified with --CX
3151 if ($zero){ # zero based coordinates
3152 $pos -= 1;
3153 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3154 }
3155 else{ # default
3156 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3157 }
3158 }
3159 }
3160
3161 %chr = (); # resetting the hash
3162
3163 # new first entry
3164 $last_chr = $chr;
3165 $chr{$chr}->{$start}->{meth} = $meth;
3166 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
3167 }
3168 }
3169
3170 # Last found chromosome
3171 warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
3172
3173 if ($split_by_chromosome){ ## writing output to 1 file per chromosome
3174 my $chromosome_out = $cytosine_out;
3175 $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
3176 open (CYT,'>',$chromosome_out) or die $!;
3177 }
3178
3179 while ( $chromosomes{$last_chr} =~ /([CG])/g){
3180
3181 my $tri_nt;
3182 my $context;
3183 my $pos = pos$chromosomes{$last_chr};
3184
3185 my $strand;
3186 my $meth = 0;
3187 my $nonmeth = 0;
3188
3189 if ($1 eq 'C'){ # C on forward strand
3190 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
3191 $strand = '+';
3192 }
3193 elsif ($1 eq 'G'){ # C on reverse strand
3194 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
3195 $tri_nt = reverse $tri_nt;
3196 $tri_nt =~ tr/ACTG/TGAC/;
3197 $strand = '-';
3198 }
3199
3200 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
3201 $meth = $chr{$last_chr}->{$pos-1}->{meth};
3202 $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
3203 }
3204
3205 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
3206
3207 ### determining cytosine context
3208 if ($tri_nt =~ /^CG/){
3209 $context = 'CG';
3210 }
3211 elsif ($tri_nt =~ /^C.{1}G$/){
3212 $context = 'CHG';
3213 }
3214 elsif ($tri_nt =~ /^C.{2}$/){
3215 $context = 'CHH';
3216 }
3217 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
3218 warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
3219 next;
3220 }
3221
3222 if ($CpG_only){
3223 if ($tri_nt =~ /^CG/){ # CpG context is the default
3224 if ($zero){ # zero-based coordinates
3225 $pos -= 1;
3226 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3227 }
3228 else{ # default
3229 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3230 }
3231 }
3232 }
3233 else{ ## all cytosines, specified with --CX
3234 if ($zero){ # zero based coordinates
3235 $pos -= 1;
3236 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3237 }
3238 else{ # default
3239 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3240 }
3241 }
3242 }
3243 close CYT or die $!;
3244 }
3245
3246
3247 sub read_genome_into_memory{
3248
3249 ## reading in and storing the specified genome in the %chromosomes hash
3250 chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
3251 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
3252
3253 my @chromosome_filenames = <*.fa>;
3254
3255 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
3256 unless (@chromosome_filenames){
3257 @chromosome_filenames = <*.fasta>;
3258 }
3259 unless (@chromosome_filenames){
3260 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
3261 }
3262
3263 foreach my $chromosome_filename (@chromosome_filenames){
3264
3265 # skipping the tophat entire mouse genome fasta file
3266 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
3267
3268 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
3269 ### first line needs to be a fastA header
3270 my $first_line = <CHR_IN>;
3271 chomp $first_line;
3272 $first_line =~ s/\r//; # removing /r carriage returns
3273
3274 ### Extracting chromosome name from the FastA header
3275 my $chromosome_name = extract_chromosome_name($first_line);
3276
3277 my $sequence;
3278 while (<CHR_IN>){
3279 chomp;
3280 $_ =~ s/\r//; # removing /r carriage returns
3281
3282 if ($_ =~ /^>/){
3283 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
3284 if (exists $chromosomes{$chromosome_name}){
3285 warn "chr $chromosome_name (",length $sequence ," bp)\n";
3286 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
3287 }
3288 else {
3289 if (length($sequence) == 0){
3290 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
3291 }
3292 warn "chr $chromosome_name (",length $sequence ," bp)\n";
3293 $chromosomes{$chromosome_name} = $sequence;
3294 }
3295 ### resetting the sequence variable
3296 $sequence = '';
3297 ### setting new chromosome name
3298 $chromosome_name = extract_chromosome_name($_);
3299 }
3300 else{
3301 $sequence .= uc$_;
3302 }
3303 }
3304
3305 if (exists $chromosomes{$chromosome_name}){
3306 warn "chr $chromosome_name (",length $sequence ," bp)\t";
3307 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
3308 }
3309 else{
3310 if (length($sequence) == 0){
3311 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
3312 }
3313 warn "chr $chromosome_name (",length $sequence ," bp)\n";
3314 $chromosomes{$chromosome_name} = $sequence;
3315 }
3316 }
3317 warn "\n";
3318 chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
3319 }
3320
3321 sub extract_chromosome_name {
3322 ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
3323 my $fasta_header = shift;
3324 if ($fasta_header =~ s/^>//){
3325 my ($chromosome_name) = split (/\s+/,$fasta_header);
3326 return $chromosome_name;
3327 }
3328 else{
3329 die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
3330 }
3331 }
3332
3333 #######################################################################################################################################
3334 ### genome-wide cytosine methylation report - END
3335 #######################################################################################################################################
3336
3337
3338
3339
3340 sub print_helpfile{
3341
3342 print << 'HOW_TO';
3343
3344
3345 DESCRIPTION
3346
3347 The following is a brief description of all options to control the Bismark
3348 methylation extractor. The script reads in a bisulfite read alignment results file
3349 produced by the Bismark bisulfite mapper and extracts the methylation information
3350 for individual cytosines. This information is found in the methylation call field
3351 which can contain the following characters:
3352
3353 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3354 ~~~ X for methylated C in CHG context (was protected) ~~~
3355 ~~~ x for not methylated C CHG (was converted) ~~~
3356 ~~~ H for methylated C in CHH context (was protected) ~~~
3357 ~~~ h for not methylated C in CHH context (was converted) ~~~
3358 ~~~ Z for methylated C in CpG context (was protected) ~~~
3359 ~~~ z for not methylated C in CpG context (was converted) ~~~
3360 ~~~ . for any bases not involving cytosines ~~~
3361 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3362
3363 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
3364 context (this distinction is actually already made in Bismark itself). As the methylation
3365 information for every C analysed can produce files which easily have tens or even hundreds of
3366 millions of lines, file sizes can become very large and more difficult to handle. The C
3367 methylation info additionally splits cytosine methylation calls up into one of the four possible
3368 strands a given bisulfite read aligned against:
3369
3370 OT original top strand
3371 CTOT complementary to original top strand
3372
3373 OB original bottom strand
3374 CTOB complementary to original bottom strand
3375
3376 Thus, by default twelve individual output files are being generated per input file (unless
3377 --comprehensive is specified, see below). The output files can be imported into a genome
3378 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
3379 unless the bisulfite reads were generated preserving directionality it doesn't make any
3380 sense to look at the data in a strand-specific manner). Strand-specific output files can
3381 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
3382 will be generated. For both the strand-specific and comprehensive outputs there is also
3383 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
3384
3385
3386 The output files are in the following format (tab delimited):
3387
3388 <sequence_id> <strand> <chromosome> <position> <methylation call>
3389
3390
3391 USAGE: methylation_extractor [options] <filenames>
3392
3393
3394 ARGUMENTS:
3395
3396 <filenames> A space-separated list of Bismark result files in SAM format from
3397 which methylation information is extracted for every cytosine in
3398 the reads. For alignment files in the older custom Bismark output
3399 see option '--vanilla'.
3400
3401 OPTIONS:
3402
3403 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
3404 read data. Specifying either --single-end or --paired-end is
3405 mandatory.
3406
3407 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
3408 read data. Specifying either --paired-end or --single-end is
3409 mandatory.
3410
3411 --vanilla The Bismark result input file(s) are in the old custom Bismark format
3412 (up to version 0.5.x) and not in SAM format which is the default as
3413 of Bismark version 0.6.x or higher. Default: OFF.
3414
3415 --no_overlap For paired-end reads it is theoretically possible that read_1 and
3416 read_2 overlap. This option avoids scoring overlapping methylation
3417 calls twice (only methylation calls of read 1 are used for in the process
3418 since read 1 has historically higher quality basecalls than read 2).
3419 Whilst this option removes a bias towards more methylation calls
3420 in the center of sequenced fragments it may de facto remove a sizable
3421 proportion of the data. This option is highly recommended for paired-end
3422 data.
3423
3424 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the
3425 methylation call string. This can remove e.g. a restriction enzyme site
3426 at the start of each read.
3427
3428 --comprehensive Specifying this option will merge all four possible strand-specific
3429 methylation info into context-dependent output files. The default
3430 contexts are:
3431 - CpG context
3432 - CHG context
3433 - CHH context
3434
3435 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
3436 strand-specific output files (default) for Cs in
3437 - CpG context
3438 - non-CpG context
3439
3440 --report Prints out a short methylation summary as well as the paramaters used to run
3441 this script.
3442
3443 --no_header Suppresses the Bismark version header line in all output files for more convenient
3444 batch processing.
3445
3446 -o/--output DIR Allows specification of a different output directory (absolute or relative
3447 path). If not specified explicitely, the output will be written to the current directory.
3448
3449 --version Displays version information.
3450
3451 -h/--help Displays this help file and exits.
3452
3453
3454
3455 bedGraph specific options:
3456
3457 --bedGraph After finishing the methylation extraction, the methylation output is written into a
3458 sorted bedGraph file that reports the position of a given cytosine and its methylation
3459 state (in %, seem details below). The methylation extractor output is temporarily split up into
3460 temporary files, one per chromosome (written into the current directory or folder
3461 specified with -o/--output); these temp files are then used for sorting and deleted
3462 afterwards. By default, only cytosines in CpG context will be sorted. The option
3463 '--CX_context' may be used to report all cyosines irrespective of sequence context
3464 (this will take MUCH longer!).
3465
3466
3467 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
3468 before its methylation percentage is reported. Default: 1.
3469
3470 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
3471
3472
3473 --counts Adds two additional columns to the output file to enable further calculations:
3474 col 5: number of methylated calls
3475 col 6: number of unmethylated calls
3476 This option is required if '--cytosine_report' is specified (and will be set automatically if
3477 necessary).
3478
3479 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
3480 in the experiment irrespective of its sequence context. This applies to both forward and
3481 reverse strands. Please be aware that this option may generate large temporary and output files
3482 and may take a long time to sort (up to many hours). Default: OFF.
3483 (i.e. Default = CpG context only).
3484
3485
3486
3487 Genome-wide cytosine methylation report specific options:
3488
3489 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
3490 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
3491 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all
3492 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
3493 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
3494 covered).
3495
3496 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
3497 its context. This applies to both forward and reverse strands. Please be aware that this will
3498 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
3499 Default: OFF (i.e. Default = CpG context only).
3500
3501 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.
3502
3503 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
3504 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
3505
3506 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
3507 will be named to include the input filename and the chromosome number.
3508
3509
3510
3511 OUTPUT:
3512
3513 The bismark_methylation_extractor output is in the form:
3514 ========================================================
3515 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
3516
3517 * Methylated cytosines receive a '+' orientation,
3518 * Unmethylated cytosines receive a '-' orientation.
3519
3520
3521
3522 The bedGraph output (optional) looks like this (tab-delimited):
3523 ===============================================================
3524 <chromosome> <start position> <end position> <methylation percentage>
3525
3526
3527
3528 The genome-wide cytosine methylation output file is tab-delimited in the following format:
3529 ==========================================================================================
3530 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
3531
3532
3533
3534 This script was last modified on 02 Oct 2012.
3535
3536 HOW_TO
3537 }