Mercurial > repos > bgruening > bismark
comparison bismark_methylation_extractor @ 0:62c6da72dd4a draft
Uploaded
author | bgruening |
---|---|
date | Sat, 06 Jul 2013 09:57:36 -0400 |
parents | |
children | 91f07ff056ca |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:62c6da72dd4a |
---|---|
1 #!/usr/bin/perl | |
2 use warnings; | |
3 use strict; | |
4 $|++; | |
5 use Getopt::Long; | |
6 use Cwd; | |
7 use Carp; | |
8 use FindBin qw($Bin); | |
9 use lib "$Bin/../lib"; | |
10 | |
11 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) | |
12 | |
13 ## This program is free software: you can redistribute it and/or modify | |
14 ## it under the terms of the GNU General Public License as published by | |
15 ## the Free Software Foundation, either version 3 of the License, or | |
16 ## (at your option) any later version. | |
17 | |
18 ## This program is distributed in the hope that it will be useful, | |
19 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ## GNU General Public License for more details. | |
22 | |
23 ## You should have received a copy of the GNU General Public License | |
24 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
25 | |
26 my @filenames; # input files | |
27 my %counting; | |
28 my $parent_dir = getcwd(); | |
29 | |
30 my %fhs; | |
31 | |
32 my $version = 'v0.7.11'; | |
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip) = process_commandline(); | |
34 | |
35 | |
36 ### only needed for bedGraph output | |
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files | |
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total | |
39 my @bedfiles; | |
40 | |
41 ### only needed for genome-wide cytosine methylation report | |
42 my %chromosomes; | |
43 | |
44 ############################################################################################## | |
45 ### Summarising Run Parameters | |
46 ############################################################################################## | |
47 | |
48 ### METHYLATION EXTRACTOR | |
49 | |
50 warn "Summarising Bismark methylation extractor parameters:\n"; | |
51 warn '='x63,"\n"; | |
52 | |
53 if ($single){ | |
54 if ($vanilla){ | |
55 warn "Bismark single-end vanilla format specified\n"; | |
56 } | |
57 else{ | |
58 warn "Bismark single-end SAM format specified (default)\n"; # default | |
59 } | |
60 } | |
61 elsif ($paired){ | |
62 if ($vanilla){ | |
63 warn "Bismark paired-end vanilla format specified\n"; | |
64 } | |
65 else{ | |
66 warn "Bismark paired-end SAM format specified (default)\n"; # default | |
67 } | |
68 } | |
69 | |
70 if ($ignore){ | |
71 warn "First $ignore bases will be disregarded when processing the methylation call string\n"; | |
72 } | |
73 | |
74 if ($full){ | |
75 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; | |
76 } | |
77 if ($merge_non_CpG){ | |
78 warn "Merge CHG and CHH context to non-CpG context specified\n"; | |
79 } | |
80 ### output directory | |
81 if ($output_dir eq ''){ | |
82 warn "Output will be written to the current directory ('$parent_dir')\n"; | |
83 } | |
84 else{ | |
85 warn "Output path specified as: $output_dir\n"; | |
86 } | |
87 | |
88 | |
89 sleep (1); | |
90 | |
91 ### BEDGRAPH | |
92 | |
93 if ($bedGraph){ | |
94 warn "\n\nSummarising bedGraph parameters:\n"; | |
95 warn '='x63,"\n"; | |
96 | |
97 if ($counts){ | |
98 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n"; | |
99 } | |
100 else{ | |
101 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; | |
102 } | |
103 | |
104 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; | |
105 | |
106 if ($CX_context){ | |
107 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; | |
108 } | |
109 else{ # default | |
110 $CpG_only = 1; | |
111 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; | |
112 } | |
113 | |
114 if ($remove){ | |
115 warn "White spaces in read ID names will be removed prior to sorting\n"; | |
116 } | |
117 | |
118 if (defined $sort_size){ | |
119 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; | |
120 } | |
121 else{ | |
122 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; | |
123 } | |
124 | |
125 | |
126 | |
127 sleep (1); | |
128 | |
129 if ($cytosine_report){ | |
130 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; | |
131 warn '='x63,"\n"; | |
132 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n"; | |
133 | |
134 | |
135 if ($CX_context){ | |
136 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; | |
137 } | |
138 else{ # default | |
139 $CpG_only = 1; | |
140 warn "Reporting cytosine methylation in CpG context only (default)\n"; | |
141 } | |
142 | |
143 if ($split_by_chromosome){ | |
144 warn "Splitting the cytosine report output up into individual files for each chromosome\n"; | |
145 } | |
146 | |
147 ### Zero-based coordinates | |
148 if ($zero){ | |
149 warn "Using zero-based genomic coordinates (user-defined)\n"; | |
150 } | |
151 else{ # default, 1-based coords | |
152 warn "Using 1-based genomic coordinates (default)\n"; | |
153 } | |
154 | |
155 ### GENOME folder | |
156 if ($genome_folder){ | |
157 unless ($genome_folder =~/\/$/){ | |
158 $genome_folder =~ s/$/\//; | |
159 } | |
160 warn "Genome folder was specified as $genome_folder\n"; | |
161 } | |
162 else{ | |
163 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; | |
164 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; | |
165 } | |
166 sleep (1); | |
167 } | |
168 } | |
169 | |
170 warn "\n"; | |
171 sleep (5); | |
172 | |
173 ###################################################### | |
174 ### PROCESSING FILES | |
175 ###################################################### | |
176 | |
177 foreach my $filename (@filenames){ | |
178 # resetting counters and filehandles | |
179 %fhs = (); | |
180 %counting =( | |
181 total_meCHG_count => 0, | |
182 total_meCHH_count => 0, | |
183 total_meCpG_count => 0, | |
184 total_unmethylated_CHG_count => 0, | |
185 total_unmethylated_CHH_count => 0, | |
186 total_unmethylated_CpG_count => 0, | |
187 sequences_count => 0, | |
188 ); | |
189 @sorting_files = (); | |
190 @bedfiles = (); | |
191 | |
192 process_Bismark_results_file($filename); | |
193 | |
194 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues | |
195 foreach my $fh (keys %fhs) { | |
196 if ($fh =~ /^[1230]$/) { | |
197 foreach my $context (keys %{$fhs{$fh}}) { | |
198 close $fhs{$fh}->{$context} or die $!; | |
199 | |
200 } | |
201 } else { | |
202 close $fhs{$fh} or die $!; | |
203 } | |
204 } | |
205 | |
206 if ($bedGraph){ | |
207 | |
208 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
209 $out =~ s/gz$//; | |
210 $out =~ s/sam$//; | |
211 $out =~ s/bam$//; | |
212 $out =~ s/txt$//; | |
213 $out =~ s/$/bedGraph/; | |
214 | |
215 | |
216 | |
217 my $bedGraph_output = $out; | |
218 my @args; | |
219 | |
220 if ($remove){ | |
221 push @args, '--remove'; | |
222 } | |
223 if ($CX_context){ | |
224 push @args, '--CX_context'; | |
225 } | |
226 if ($no_header){ | |
227 push @args, '--no_header'; | |
228 } | |
229 if ($counts){ | |
230 push @args, "--counts"; | |
231 } | |
232 | |
233 push @args, "--buffer_size $sort_size"; | |
234 push @args, "--cutoff $coverage_threshold"; | |
235 push @args, "--output $bedGraph_output"; | |
236 push @args, "--dir '$output_dir'"; | |
237 | |
238 ### adding all files to be sorted to @args | |
239 foreach my $f (@sorting_files){ | |
240 push @args, $f; | |
241 } | |
242 | |
243 # print join "\t",@args,"\n"; | |
244 | |
245 system ("$Bin/bismark2bedGraph @args"); | |
246 | |
247 warn "Finished BedGraph conversion ...\n\n"; | |
248 sleep(3); | |
249 | |
250 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; | |
251 # warn "Writing bedGraph to file: $bedGraph_output\n"; | |
252 # process_bedGraph_output(); | |
253 # close OUT or die $!; | |
254 | |
255 ### genome-wide cytosine methylation report requires bedGraph processing anyway | |
256 if ($cytosine_report){ | |
257 @args = (); # resetting @args | |
258 my $cytosine_out = $out; | |
259 $cytosine_out =~ s/bedGraph$//; | |
260 | |
261 if ($CX_context){ | |
262 $cytosine_out =~ s/$/CX_report.txt/; | |
263 } | |
264 else{ | |
265 $cytosine_out =~ s/$/CpG_report.txt/; | |
266 } | |
267 | |
268 push @args, "--output $cytosine_out"; | |
269 push @args, "--dir '$output_dir'"; | |
270 push @args, "--genome '$genome_folder'"; | |
271 push @args, "--parent_dir '$parent_dir'"; | |
272 | |
273 if ($zero){ | |
274 push @args, "--zero"; | |
275 } | |
276 if ($CX_context){ | |
277 push @args, '--CX_context'; | |
278 } | |
279 if ($split_by_chromosome){ | |
280 push @args, '--split_by_chromosome'; | |
281 } | |
282 | |
283 push @args, $bedGraph_output; # this will be the infile | |
284 | |
285 system ("$Bin/bedGraph2cytosine @args"); | |
286 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); | |
287 warn "\n\nFinished generating genome-wide cytosine report\n\n"; | |
288 } | |
289 } | |
290 } | |
291 | |
292 | |
293 sub process_commandline{ | |
294 my $help; | |
295 my $single_end; | |
296 my $paired_end; | |
297 my $ignore; | |
298 my $genomic_fasta; | |
299 my $full; | |
300 my $report; | |
301 my $extractor_version; | |
302 my $no_overlap; | |
303 my $merge_non_CpG; | |
304 my $vanilla; | |
305 my $output_dir; | |
306 my $no_header; | |
307 my $bedGraph; | |
308 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status | |
309 my $remove; | |
310 my $counts; | |
311 my $cytosine_report; | |
312 my $genome_folder; | |
313 my $zero; | |
314 my $CpG_only; | |
315 my $CX_context; | |
316 my $split_by_chromosome; | |
317 my $sort_size; | |
318 my $samtools_path; | |
319 my $gzip; | |
320 | |
321 my $command_line = GetOptions ('help|man' => \$help, | |
322 'p|paired-end' => \$paired_end, | |
323 's|single-end' => \$single_end, | |
324 'fasta' => \$genomic_fasta, | |
325 'ignore=i' => \$ignore, | |
326 'comprehensive' => \$full, | |
327 'report' => \$report, | |
328 'version' => \$extractor_version, | |
329 'no_overlap' => \$no_overlap, | |
330 'merge_non_CpG' => \$merge_non_CpG, | |
331 'vanilla' => \$vanilla, | |
332 'o|output=s' => \$output_dir, | |
333 'no_header' => \$no_header, | |
334 'bedGraph' => \$bedGraph, | |
335 "cutoff=i" => \$coverage_threshold, | |
336 "remove_spaces" => \$remove, | |
337 "counts" => \$counts, | |
338 "cytosine_report" => \$cytosine_report, | |
339 'g|genome_folder=s' => \$genome_folder, | |
340 "zero_based" => \$zero, | |
341 "CX|CX_context" => \$CX_context, | |
342 "split_by_chromosome" => \$split_by_chromosome, | |
343 "buffer_size=s" => \$sort_size, | |
344 'samtools_path=s' => \$samtools_path, | |
345 "gzip" => \$gzip, | |
346 ); | |
347 | |
348 ### EXIT ON ERROR if there were errors with any of the supplied options | |
349 unless ($command_line){ | |
350 die "Please respecify command line options\n"; | |
351 } | |
352 | |
353 ### HELPFILE | |
354 if ($help){ | |
355 print_helpfile(); | |
356 exit; | |
357 } | |
358 | |
359 if ($extractor_version){ | |
360 print << "VERSION"; | |
361 | |
362 | |
363 Bismark Methylation Extractor | |
364 | |
365 Bismark Extractor Version: $version | |
366 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics | |
367 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
368 | |
369 | |
370 VERSION | |
371 exit; | |
372 } | |
373 | |
374 | |
375 ### no files provided | |
376 unless (@ARGV){ | |
377 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; | |
378 } | |
379 @filenames = @ARGV; | |
380 | |
381 warn "\n *** Bismark methylation extractor version $version ***\n\n"; | |
382 | |
383 ### IGNORING <INT> bases at the start of the read when processing the methylation call string | |
384 unless ($ignore){ | |
385 $ignore = 0; | |
386 } | |
387 ### PRINT A REPORT | |
388 unless ($report){ | |
389 $report = 0; | |
390 } | |
391 | |
392 ### OUTPUT DIR PATH | |
393 if ($output_dir){ | |
394 unless ($output_dir =~ /\/$/){ | |
395 $output_dir =~ s/$/\//; | |
396 } | |
397 } | |
398 else{ | |
399 $output_dir = ''; | |
400 } | |
401 | |
402 ### NO HEADER | |
403 unless ($no_header){ | |
404 $no_header = 0; | |
405 } | |
406 | |
407 ### OLD (VANILLA) OUTPUT FORMAT | |
408 unless ($vanilla){ | |
409 $vanilla = 0; | |
410 } | |
411 | |
412 if ($single_end){ | |
413 $paired_end = 0; ### SINGLE END ALIGNMENTS | |
414 } | |
415 elsif ($paired_end){ | |
416 $single_end = 0; ### PAIRED-END ALIGNMENTS | |
417 } | |
418 else{ | |
419 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; | |
420 } | |
421 | |
422 ### NO OVERLAP | |
423 if ($no_overlap){ | |
424 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); | |
425 } | |
426 else{ | |
427 $no_overlap = 0; | |
428 } | |
429 | |
430 ### COMPREHENSIVE OUTPUT | |
431 unless ($full){ | |
432 $full = 0; | |
433 } | |
434 | |
435 ### MERGE NON-CpG context | |
436 unless ($merge_non_CpG){ | |
437 $merge_non_CpG = 0; | |
438 } | |
439 | |
440 ### remove white spaces in read ID (needed for sorting using the sort command | |
441 unless ($remove){ | |
442 $remove = 0; | |
443 } | |
444 | |
445 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT | |
446 if (defined $coverage_threshold){ | |
447 unless ($coverage_threshold > 0){ | |
448 die "Please select a coverage greater than 0 (positive integers only)\n"; | |
449 } | |
450 } | |
451 else{ | |
452 $coverage_threshold = 1; | |
453 } | |
454 | |
455 ### SORT buffer size | |
456 if (defined $sort_size){ | |
457 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ | |
458 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; | |
459 } | |
460 } | |
461 else{ | |
462 $sort_size = '2G'; | |
463 } | |
464 | |
465 if ($zero){ | |
466 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report); | |
467 } | |
468 | |
469 if ($CX_context){ | |
470 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); | |
471 } | |
472 else{ | |
473 $CX_context = 0; | |
474 } | |
475 | |
476 unless ($counts){ | |
477 $counts = 0; | |
478 } | |
479 | |
480 if ($cytosine_report){ | |
481 | |
482 ### GENOME folder | |
483 if ($genome_folder){ | |
484 unless ($genome_folder =~/\/$/){ | |
485 $genome_folder =~ s/$/\//; | |
486 } | |
487 } | |
488 else{ | |
489 die "Please specify a genome folder to proceed (full path only)\n"; | |
490 } | |
491 | |
492 unless ($bedGraph){ | |
493 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; | |
494 $bedGraph = 1; | |
495 } | |
496 unless ($counts){ | |
497 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; | |
498 $counts = 1; | |
499 } | |
500 warn "\n"; | |
501 } | |
502 | |
503 ### PATH TO SAMTOOLS | |
504 if (defined $samtools_path){ | |
505 # if Samtools was specified as full command | |
506 if ($samtools_path =~ /samtools$/){ | |
507 if (-e $samtools_path){ | |
508 # Samtools executable found | |
509 } | |
510 else{ | |
511 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
512 } | |
513 } | |
514 else{ | |
515 unless ($samtools_path =~ /\/$/){ | |
516 $samtools_path =~ s/$/\//; | |
517 } | |
518 $samtools_path .= 'samtools'; | |
519 if (-e $samtools_path){ | |
520 # Samtools executable found | |
521 } | |
522 else{ | |
523 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
524 } | |
525 } | |
526 } | |
527 # Check whether Samtools is in the PATH if no path was supplied by the user | |
528 else{ | |
529 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH | |
530 $samtools_path = `which samtools`; | |
531 chomp $samtools_path; | |
532 } | |
533 } | |
534 | |
535 unless (defined $samtools_path){ | |
536 $samtools_path = ''; | |
537 } | |
538 | |
539 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip); | |
540 } | |
541 | |
542 | |
543 sub process_Bismark_results_file{ | |
544 my $filename = shift; | |
545 | |
546 warn "\nNow reading in Bismark result file $filename\n\n"; | |
547 | |
548 if ($filename =~ /\.gz$/) { | |
549 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
550 } | |
551 elsif ($filename =~ /bam$/) { | |
552 if ($samtools_path){ | |
553 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; | |
554 } | |
555 else{ | |
556 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; | |
557 } | |
558 } | |
559 else { | |
560 open (IN,$filename) or die "Can't open file $filename: $!\n"; | |
561 } | |
562 | |
563 ### Vanilla and SAM output need to read different numbers of header lines | |
564 if ($vanilla) { | |
565 my $bismark_version = <IN>; ## discarding the Bismark version info | |
566 chomp $bismark_version; | |
567 $bismark_version =~ s/\r//; # replaces \r line feed | |
568 $bismark_version =~ s/Bismark version: //; | |
569 if ($bismark_version =~ /^\@/) { | |
570 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; | |
571 sleep (2); | |
572 } | |
573 | |
574 unless ($version eq $bismark_version){ | |
575 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; | |
576 } | |
577 } else { | |
578 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. | |
579 # We are reading from it further down | |
580 } | |
581 | |
582 my $output_filename = (split (/\//,$filename))[-1]; | |
583 | |
584 ### OPENING OUTPUT-FILEHANDLES | |
585 if ($report) { | |
586 my $report_filename = $output_filename; | |
587 $report_filename =~ s/\.sam$//; | |
588 $report_filename =~ s/\.txt$//; | |
589 $report_filename =~ s/$/_splitting_report.txt/; | |
590 $report_filename = $output_dir . $report_filename; | |
591 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; | |
592 } | |
593 | |
594 if ($report) { | |
595 print REPORT "$output_filename\n\n"; | |
596 print REPORT "Parameters used to extract methylation information:\n"; | |
597 if ($paired) { | |
598 if ($vanilla) { | |
599 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; | |
600 } else { | |
601 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default | |
602 } | |
603 } | |
604 | |
605 if ($single) { | |
606 if ($vanilla) { | |
607 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; | |
608 } else { | |
609 print REPORT "Bismark result file: single-end (SAM format)\n"; # default | |
610 } | |
611 } | |
612 | |
613 if ($ignore) { | |
614 print REPORT "Ignoring first $ignore bases\n"; | |
615 } | |
616 | |
617 if ($full) { | |
618 print REPORT "Output specified: comprehensive\n"; | |
619 } else { | |
620 print REPORT "Output specified: strand-specific (default)\n"; | |
621 } | |
622 | |
623 if ($no_overlap) { | |
624 print REPORT "No overlapping methylation calls specified\n"; | |
625 } | |
626 if ($genomic_fasta) { | |
627 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; | |
628 } | |
629 if ($merge_non_CpG) { | |
630 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; | |
631 } | |
632 | |
633 print REPORT "\n"; | |
634 } | |
635 | |
636 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; | |
637 | |
638 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL | |
639 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file | |
640 if ($full and $merge_non_CpG) { | |
641 my $cpg_output = my $other_c_output = $output_filename; | |
642 ### C in CpG context | |
643 $cpg_output =~ s/^/CpG_context_/; | |
644 $cpg_output =~ s/sam$/txt/; | |
645 $cpg_output =~ s/bam$/txt/; | |
646 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
647 $cpg_output = $output_dir . $cpg_output; | |
648 | |
649 if ($gzip){ | |
650 $cpg_output .= '.gz'; | |
651 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; | |
652 } | |
653 else{ | |
654 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | |
655 } | |
656 | |
657 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | |
658 push @sorting_files,$cpg_output; | |
659 | |
660 unless ($no_header) { | |
661 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | |
662 } | |
663 | |
664 ### C in any other context than CpG | |
665 $other_c_output =~ s/^/Non_CpG_context_/; | |
666 $other_c_output =~ s/sam$/txt/; | |
667 $other_c_output =~ s/bam$/txt/; | |
668 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); | |
669 $other_c_output = $output_dir . $other_c_output; | |
670 | |
671 if ($gzip){ | |
672 $other_c_output .= '.gz'; | |
673 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n"; | |
674 } | |
675 else{ | |
676 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n"; | |
677 } | |
678 | |
679 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n"; | |
680 push @sorting_files,$other_c_output; | |
681 | |
682 | |
683 unless ($no_header) { | |
684 print {$fhs{other_context}} "Bismark methylation extractor version $version\n"; | |
685 } | |
686 } | |
687 | |
688 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found | |
689 elsif ($merge_non_CpG) { | |
690 | |
691 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
692 | |
693 ### For cytosines in CpG context | |
694 $cpg_ot =~ s/^/CpG_OT_/; | |
695 $cpg_ot =~ s/sam$/txt/; | |
696 $cpg_ot =~ s/bam$/txt/; | |
697 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
698 $cpg_ot = $output_dir . $cpg_ot; | |
699 | |
700 if ($gzip){ | |
701 $cpg_ot .= '.gz'; | |
702 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; | |
703 } | |
704 else{ | |
705 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | |
706 } | |
707 | |
708 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | |
709 push @sorting_files,$cpg_ot; | |
710 | |
711 unless($no_header){ | |
712 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | |
713 } | |
714 | |
715 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
716 $cpg_ctot =~ s/sam$/txt/; | |
717 $cpg_ctot =~ s/bam$/txt/; | |
718 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
719 $cpg_ctot = $output_dir . $cpg_ctot; | |
720 | |
721 if ($gzip){ | |
722 $cpg_ctot .= '.gz'; | |
723 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; | |
724 } | |
725 else{ | |
726 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | |
727 } | |
728 | |
729 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | |
730 push @sorting_files,$cpg_ctot; | |
731 | |
732 unless($no_header){ | |
733 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | |
734 } | |
735 | |
736 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
737 $cpg_ctob =~ s/sam$/txt/; | |
738 $cpg_ctob =~ s/bam$/txt/; | |
739 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
740 $cpg_ctob = $output_dir . $cpg_ctob; | |
741 | |
742 if ($gzip){ | |
743 $cpg_ctob .= '.gz'; | |
744 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; | |
745 } | |
746 else{ | |
747 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | |
748 } | |
749 | |
750 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | |
751 push @sorting_files,$cpg_ctob; | |
752 | |
753 unless($no_header){ | |
754 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | |
755 } | |
756 | |
757 $cpg_ob =~ s/^/CpG_OB_/; | |
758 $cpg_ob =~ s/sam$/txt/; | |
759 $cpg_ob =~ s/bam$/txt/; | |
760 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
761 $cpg_ob = $output_dir . $cpg_ob; | |
762 | |
763 if ($gzip){ | |
764 $cpg_ob .= '.gz'; | |
765 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; | |
766 } | |
767 else{ | |
768 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | |
769 } | |
770 | |
771 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | |
772 push @sorting_files,$cpg_ob; | |
773 | |
774 unless($no_header){ | |
775 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | |
776 } | |
777 | |
778 ### For cytosines in Non-CpG (CC, CT or CA) context | |
779 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; | |
780 | |
781 $other_c_ot =~ s/^/Non_CpG_OT_/; | |
782 $other_c_ot =~ s/sam$/txt/; | |
783 $other_c_ot =~ s/bam$/txt/; | |
784 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); | |
785 $other_c_ot = $output_dir . $other_c_ot; | |
786 | |
787 if ($gzip){ | |
788 $other_c_ot .= '.gz'; | |
789 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n"; | |
790 } | |
791 else{ | |
792 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n"; | |
793 } | |
794 | |
795 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n"; | |
796 push @sorting_files,$other_c_ot; | |
797 | |
798 unless($no_header){ | |
799 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n"; | |
800 } | |
801 | |
802 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; | |
803 $other_c_ctot =~ s/sam$/txt/; | |
804 $other_c_ctot =~ s/bam$/txt/; | |
805 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); | |
806 $other_c_ctot = $output_dir . $other_c_ctot; | |
807 | |
808 if ($gzip){ | |
809 $other_c_ctot .= '.gz'; | |
810 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n"; | |
811 } | |
812 else{ | |
813 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n"; | |
814 } | |
815 | |
816 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n"; | |
817 push @sorting_files,$other_c_ctot; | |
818 | |
819 unless($no_header){ | |
820 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n"; | |
821 } | |
822 | |
823 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; | |
824 $other_c_ctob =~ s/sam$/txt/; | |
825 $other_c_ctob =~ s/bam$/txt/; | |
826 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); | |
827 $other_c_ctob = $output_dir . $other_c_ctob; | |
828 | |
829 if ($gzip){ | |
830 $other_c_ctob .= '.gz'; | |
831 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n"; | |
832 } | |
833 else{ | |
834 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n"; | |
835 } | |
836 | |
837 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n"; | |
838 push @sorting_files,$other_c_ctob; | |
839 | |
840 unless($no_header){ | |
841 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n"; | |
842 } | |
843 | |
844 $other_c_ob =~ s/^/Non_CpG_OB_/; | |
845 $other_c_ob =~ s/sam$/txt/; | |
846 $other_c_ob =~ s/sam$/txt/; | |
847 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); | |
848 $other_c_ob = $output_dir . $other_c_ob; | |
849 | |
850 if ($gzip){ | |
851 $other_c_ob .= '.gz'; | |
852 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n"; | |
853 } | |
854 else{ | |
855 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n"; | |
856 } | |
857 | |
858 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n"; | |
859 push @sorting_files,$other_c_ob; | |
860 | |
861 unless($no_header){ | |
862 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n"; | |
863 } | |
864 } | |
865 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) | |
866 | |
867 ### if --comprehensive was specified we are only writing one file per context | |
868 elsif ($full) { | |
869 my $cpg_output = my $chg_output = my $chh_output = $output_filename; | |
870 ### C in CpG context | |
871 $cpg_output =~ s/^/CpG_context_/; | |
872 $cpg_output =~ s/sam$/txt/; | |
873 $cpg_output =~ s/bam$/txt/; | |
874 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
875 $cpg_output = $output_dir . $cpg_output; | |
876 | |
877 if ($gzip){ | |
878 $cpg_output .= '.gz'; | |
879 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; | |
880 } | |
881 else{ | |
882 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | |
883 } | |
884 | |
885 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | |
886 push @sorting_files,$cpg_output; | |
887 | |
888 unless($no_header){ | |
889 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | |
890 } | |
891 | |
892 ### C in CHG context | |
893 $chg_output =~ s/^/CHG_context_/; | |
894 $chg_output =~ s/sam$/txt/; | |
895 $chg_output =~ s/bam$/txt/; | |
896 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); | |
897 $chg_output = $output_dir . $chg_output; | |
898 | |
899 if ($gzip){ | |
900 $chg_output .= '.gz'; | |
901 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n"; | |
902 } | |
903 else{ | |
904 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n"; | |
905 } | |
906 | |
907 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n"; | |
908 push @sorting_files,$chg_output; | |
909 | |
910 unless($no_header){ | |
911 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n"; | |
912 } | |
913 | |
914 ### C in CHH context | |
915 $chh_output =~ s/^/CHH_context_/; | |
916 $chh_output =~ s/sam$/txt/; | |
917 $chh_output =~ s/bam$/txt/; | |
918 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); | |
919 $chh_output = $output_dir . $chh_output; | |
920 | |
921 if ($gzip){ | |
922 $chh_output .= '.gz'; | |
923 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n"; | |
924 } | |
925 else{ | |
926 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n"; | |
927 } | |
928 | |
929 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n"; | |
930 push @sorting_files, $chh_output; | |
931 | |
932 unless($no_header){ | |
933 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; | |
934 } | |
935 } | |
936 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found | |
937 else { | |
938 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
939 | |
940 ### For cytosines in CpG context | |
941 $cpg_ot =~ s/^/CpG_OT_/; | |
942 $cpg_ot =~ s/sam$/txt/; | |
943 $cpg_ot =~ s/bam$/txt/; | |
944 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
945 $cpg_ot = $output_dir . $cpg_ot; | |
946 | |
947 if ($gzip){ | |
948 $cpg_ot .= '.gz'; | |
949 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; | |
950 } | |
951 else{ | |
952 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | |
953 } | |
954 | |
955 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | |
956 push @sorting_files,$cpg_ot; | |
957 | |
958 unless($no_header){ | |
959 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | |
960 } | |
961 | |
962 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
963 $cpg_ctot =~ s/sam$/txt/; | |
964 $cpg_ctot =~ s/bam$/txt/; | |
965 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
966 $cpg_ctot = $output_dir . $cpg_ctot; | |
967 | |
968 if ($gzip){ | |
969 $cpg_ctot .= '.gz'; | |
970 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; | |
971 } | |
972 else{ | |
973 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | |
974 } | |
975 | |
976 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | |
977 push @sorting_files,$cpg_ctot; | |
978 | |
979 unless($no_header){ | |
980 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | |
981 } | |
982 | |
983 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
984 $cpg_ctob =~ s/sam$/txt/; | |
985 $cpg_ctob =~ s/bam$/txt/; | |
986 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
987 $cpg_ctob = $output_dir . $cpg_ctob; | |
988 | |
989 if ($gzip){ | |
990 $cpg_ctob .= '.gz'; | |
991 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; | |
992 } | |
993 else{ | |
994 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | |
995 } | |
996 | |
997 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | |
998 push @sorting_files,$cpg_ctob; | |
999 | |
1000 unless($no_header){ | |
1001 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | |
1002 } | |
1003 | |
1004 $cpg_ob =~ s/^/CpG_OB_/; | |
1005 $cpg_ob =~ s/sam$/txt/; | |
1006 $cpg_ob =~ s/bam$/txt/; | |
1007 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
1008 $cpg_ob = $output_dir . $cpg_ob; | |
1009 | |
1010 if ($gzip){ | |
1011 $cpg_ob .= '.gz'; | |
1012 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; | |
1013 } | |
1014 else{ | |
1015 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | |
1016 } | |
1017 | |
1018 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | |
1019 push @sorting_files,$cpg_ob; | |
1020 | |
1021 unless($no_header){ | |
1022 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | |
1023 } | |
1024 | |
1025 ### For cytosines in CHG context | |
1026 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; | |
1027 | |
1028 $chg_ot =~ s/^/CHG_OT_/; | |
1029 $chg_ot =~ s/sam$/txt/; | |
1030 $chg_ot =~ s/bam$/txt/; | |
1031 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); | |
1032 $chg_ot = $output_dir . $chg_ot; | |
1033 | |
1034 if ($gzip){ | |
1035 $chg_ot .= '.gz'; | |
1036 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n"; | |
1037 } | |
1038 else{ | |
1039 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n"; | |
1040 } | |
1041 | |
1042 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n"; | |
1043 push @sorting_files,$chg_ot; | |
1044 | |
1045 unless($no_header){ | |
1046 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n"; | |
1047 } | |
1048 | |
1049 $chg_ctot =~ s/^/CHG_CTOT_/; | |
1050 $chg_ctot =~ s/sam$/txt/; | |
1051 $chg_ctot =~ s/bam$/txt/; | |
1052 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); | |
1053 $chg_ctot = $output_dir . $chg_ctot; | |
1054 | |
1055 if ($gzip){ | |
1056 $chg_ctot .= '.gz'; | |
1057 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n"; | |
1058 } | |
1059 else{ | |
1060 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n"; | |
1061 } | |
1062 | |
1063 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n"; | |
1064 push @sorting_files,$chg_ctot; | |
1065 | |
1066 unless($no_header){ | |
1067 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n"; | |
1068 } | |
1069 | |
1070 $chg_ctob =~ s/^/CHG_CTOB_/; | |
1071 $chg_ctob =~ s/sam$/txt/; | |
1072 $chg_ctob =~ s/bam$/txt/; | |
1073 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); | |
1074 $chg_ctob = $output_dir . $chg_ctob; | |
1075 | |
1076 if ($gzip){ | |
1077 $chg_ctob .= '.gz'; | |
1078 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n"; | |
1079 } | |
1080 else{ | |
1081 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n"; | |
1082 } | |
1083 | |
1084 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n"; | |
1085 push @sorting_files,$chg_ctob; | |
1086 | |
1087 unless($no_header){ | |
1088 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n"; | |
1089 } | |
1090 | |
1091 $chg_ob =~ s/^/CHG_OB_/; | |
1092 $chg_ob =~ s/sam$/txt/; | |
1093 $chg_ob =~ s/bam$/txt/; | |
1094 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); | |
1095 $chg_ob = $output_dir . $chg_ob; | |
1096 | |
1097 if ($gzip){ | |
1098 $chg_ob .= '.gz'; | |
1099 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n"; | |
1100 } | |
1101 else{ | |
1102 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n"; | |
1103 } | |
1104 | |
1105 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n"; | |
1106 push @sorting_files,$chg_ob; | |
1107 | |
1108 unless($no_header){ | |
1109 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n"; | |
1110 } | |
1111 | |
1112 ### For cytosines in CHH context | |
1113 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; | |
1114 | |
1115 $chh_ot =~ s/^/CHH_OT_/; | |
1116 $chh_ot =~ s/sam$/txt/; | |
1117 $chh_ot =~ s/bam$/txt/; | |
1118 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); | |
1119 $chh_ot = $output_dir . $chh_ot; | |
1120 | |
1121 if ($gzip){ | |
1122 $chh_ot .= '.gz'; | |
1123 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n"; | |
1124 } | |
1125 else{ | |
1126 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n"; | |
1127 } | |
1128 | |
1129 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n"; | |
1130 push @sorting_files,$chh_ot; | |
1131 | |
1132 unless($no_header){ | |
1133 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n"; | |
1134 } | |
1135 | |
1136 $chh_ctot =~ s/^/CHH_CTOT_/; | |
1137 $chh_ctot =~ s/sam$/txt/; | |
1138 $chh_ctot =~ s/bam$/txt/; | |
1139 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); | |
1140 $chh_ctot = $output_dir . $chh_ctot; | |
1141 | |
1142 if ($gzip){ | |
1143 $chh_ctot .= '.gz'; | |
1144 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n"; | |
1145 } | |
1146 else{ | |
1147 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n"; | |
1148 } | |
1149 | |
1150 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n"; | |
1151 push @sorting_files,$chh_ctot; | |
1152 | |
1153 unless($no_header){ | |
1154 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n"; | |
1155 } | |
1156 | |
1157 $chh_ctob =~ s/^/CHH_CTOB_/; | |
1158 $chh_ctob =~ s/sam$/txt/; | |
1159 $chh_ctob =~ s/bam$/txt/; | |
1160 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); | |
1161 $chh_ctob = $output_dir . $chh_ctob; | |
1162 | |
1163 if ($gzip){ | |
1164 $chh_ctob .= '.gz'; | |
1165 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n"; | |
1166 } | |
1167 else{ | |
1168 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n"; | |
1169 } | |
1170 | |
1171 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n"; | |
1172 push @sorting_files,$chh_ctob; | |
1173 | |
1174 unless($no_header){ | |
1175 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n"; | |
1176 } | |
1177 | |
1178 $chh_ob =~ s/^/CHH_OB_/; | |
1179 $chh_ob =~ s/sam$/txt/; | |
1180 $chh_ob =~ s/bam$/txt/; | |
1181 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); | |
1182 $chh_ob = $output_dir . $chh_ob; | |
1183 | |
1184 if ($gzip){ | |
1185 $chh_ob .= '.gz'; | |
1186 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n"; | |
1187 } | |
1188 else{ | |
1189 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n"; | |
1190 } | |
1191 | |
1192 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n"; | |
1193 push @sorting_files,$chh_ob; | |
1194 | |
1195 unless($no_header){ | |
1196 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n"; | |
1197 } | |
1198 } | |
1199 | |
1200 my $methylation_call_strings_processed = 0; | |
1201 my $line_count = 0; | |
1202 | |
1203 ### proceeding differently now for single-end or paired-end Bismark files | |
1204 | |
1205 ### PROCESSING SINGLE-END RESULT FILES | |
1206 if ($single) { | |
1207 | |
1208 ### also proceeding differently now for SAM format or vanilla Bismark format files | |
1209 if ($vanilla) { # old vanilla Bismark output format | |
1210 while (<IN>) { | |
1211 ++$line_count; | |
1212 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
1213 | |
1214 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
1215 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; | |
1216 | |
1217 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or | |
1218 ### last position | |
1219 chomp $genome_conversion; | |
1220 | |
1221 my $index; | |
1222 if ($meth_call) { | |
1223 | |
1224 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
1225 $index = 0; | |
1226 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
1227 $index = 1; | |
1228 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
1229 $index = 3; | |
1230 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
1231 $index = 2; | |
1232 } else { | |
1233 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
1234 } | |
1235 | |
1236 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
1237 if ($ignore) { | |
1238 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
1239 | |
1240 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
1241 if ($strand eq '+') { | |
1242 $start += $ignore; | |
1243 } elsif ($strand eq '-') { | |
1244 $start += length($meth_call)-1; ## $meth_call is already shortened! | |
1245 } else { | |
1246 die "Alignment did not have proper strand information: $strand\n"; | |
1247 } | |
1248 } | |
1249 ### printing out the methylation state of every C in the read | |
1250 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); | |
1251 | |
1252 ++$methylation_call_strings_processed; # 1 per single-end result | |
1253 } | |
1254 } | |
1255 } else { # processing single-end SAM format (default) | |
1256 while (<IN>) { | |
1257 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
1258 if (/^\@/) { # skipping header lines (starting with @) | |
1259 warn "skipping SAM header line:\t$_"; | |
1260 next; | |
1261 } | |
1262 | |
1263 ++$line_count; | |
1264 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
1265 | |
1266 # example read in SAM format | |
1267 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
1268 ### | |
1269 | |
1270 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
1271 # < 0.7.6 $meth_call =~ s/^XM:Z://; | |
1272 # < 0.7.6 $read_conversion =~ s/^XR:Z://; | |
1273 # < 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
1274 | |
1275 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; | |
1276 | |
1277 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1278 my $meth_call; ### Thanks to Zachary Zeno for this solution | |
1279 my $read_conversion; | |
1280 my $genome_conversion; | |
1281 | |
1282 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
1283 my $tag = $1; | |
1284 my $value = $2; | |
1285 | |
1286 if ($tag eq "XM") { | |
1287 $meth_call = $value; | |
1288 $meth_call =~ s/\r//; | |
1289 } elsif ($tag eq "XR") { | |
1290 $read_conversion = $value; | |
1291 $read_conversion =~ s/\r//; | |
1292 } elsif ($tag eq "XG") { | |
1293 $genome_conversion = $value; | |
1294 $genome_conversion =~ s/\r//; | |
1295 } | |
1296 } | |
1297 | |
1298 my $strand; | |
1299 chomp $genome_conversion; | |
1300 # print "$meth_call\n$read_conversion\n$genome_conversion\n"; | |
1301 | |
1302 my $index; | |
1303 if ($meth_call) { | |
1304 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
1305 $index = 0; | |
1306 $strand = '+'; | |
1307 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
1308 $index = 1; | |
1309 $strand = '-'; | |
1310 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
1311 $index = 2; | |
1312 $strand = '+'; | |
1313 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
1314 $index = 3; | |
1315 $strand = '-'; | |
1316 } else { | |
1317 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
1318 } | |
1319 | |
1320 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output | |
1321 if ($strand eq '-') { | |
1322 $meth_call = reverse $meth_call; | |
1323 } | |
1324 | |
1325 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
1326 if ($ignore) { | |
1327 # print "\n\n$meth_call\n"; | |
1328 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
1329 # print "$meth_call\n"; | |
1330 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
1331 | |
1332 my @len = split (/\D+/,$cigar); # storing the length per operation | |
1333 my @ops = split (/\d+/,$cigar); # storing the operation | |
1334 shift @ops; # remove the empty first element | |
1335 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
1336 | |
1337 my @comp_cigar; # building an array with all CIGAR operations | |
1338 foreach my $index (0..$#len) { | |
1339 foreach (1..$len[$index]) { | |
1340 # print "$ops[$index]"; | |
1341 push @comp_cigar, $ops[$index]; | |
1342 } | |
1343 } | |
1344 # print "original CIGAR: $cigar\n"; | |
1345 # print "original CIGAR: @comp_cigar\n"; | |
1346 | |
1347 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
1348 if ($strand eq '+') { | |
1349 | |
1350 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions | |
1351 my $I_count = 0; | |
1352 | |
1353 for (1..$ignore) { | |
1354 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start | |
1355 # print "$_ deleted $op\n"; | |
1356 | |
1357 while ($op eq 'D') { # repeating this for deletions (D) | |
1358 $D_count++; | |
1359 $op = shift @comp_cigar; | |
1360 # print "$_ deleted $op\n"; | |
1361 } | |
1362 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
1363 $I_count++; | |
1364 } | |
1365 } | |
1366 $start += $ignore + $D_count - $I_count; | |
1367 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; | |
1368 } elsif ($strand eq '-') { | |
1369 | |
1370 for (1..$ignore) { | |
1371 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
1372 while ($op eq 'D') { # repeating this for deletions (D) | |
1373 $op = pop @comp_cigar; | |
1374 } | |
1375 } | |
1376 | |
1377 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR | |
1378 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence | |
1379 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position | |
1380 foreach (@comp_cigar) { | |
1381 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); | |
1382 } | |
1383 $start += $MD_count - 1; | |
1384 } | |
1385 | |
1386 ### reconstituting shortened CIGAR string | |
1387 my $new_cigar; | |
1388 my $count = 0; | |
1389 my $last_op; | |
1390 # print "ignore adjusted: @comp_cigar\n"; | |
1391 foreach my $op (@comp_cigar) { | |
1392 unless (defined $last_op){ | |
1393 $last_op = $op; | |
1394 ++$count; | |
1395 next; | |
1396 } | |
1397 if ($last_op eq $op) { | |
1398 ++$count; | |
1399 } else { | |
1400 $new_cigar .= "$count$last_op"; | |
1401 $last_op = $op; | |
1402 $count = 1; | |
1403 } | |
1404 } | |
1405 $new_cigar .= "$count$last_op"; # appending the last operation and count | |
1406 $cigar = $new_cigar; | |
1407 # print "ignore adjusted scalar: $cigar\n"; | |
1408 } | |
1409 } | |
1410 ### printing out the methylation state of every C in the read | |
1411 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); | |
1412 | |
1413 ++$methylation_call_strings_processed; # 1 per single-end result | |
1414 } | |
1415 } | |
1416 } | |
1417 | |
1418 ### PROCESSING PAIRED-END RESULT FILES | |
1419 elsif ($paired) { | |
1420 | |
1421 ### proceeding differently now for SAM format or vanilla Bismark format files | |
1422 if ($vanilla) { # old vanilla Bismark paired-end output format | |
1423 while (<IN>) { | |
1424 ++$line_count; | |
1425 warn "processed line: $line_count\n" if ($line_count%500000==0); | |
1426 | |
1427 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
1428 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; | |
1429 | |
1430 my $index; | |
1431 chomp $genome_conversion; | |
1432 | |
1433 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
1434 $index = 0; ## this is OT | |
1435 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
1436 $index = 2; ## this is CTOB!!! | |
1437 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
1438 $index = 1; ## this is CTOT!!! | |
1439 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
1440 $index = 3; ## this is OB | |
1441 } else { | |
1442 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
1443 } | |
1444 | |
1445 if ($meth_call_1 and $meth_call_2) { | |
1446 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
1447 if ($ignore) { | |
1448 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
1449 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | |
1450 | |
1451 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified | |
1452 $start_read_1 += $ignore; | |
1453 $end_read_2 -= $ignore; | |
1454 } | |
1455 my $end_read_1; | |
1456 my $start_read_2; | |
1457 | |
1458 if ($strand eq '+') { | |
1459 | |
1460 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
1461 $start_read_2 = $end_read_2-length($meth_call_2)+1; | |
1462 | |
1463 ## we first pass the first read which is in + orientation on the forward strand | |
1464 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0); | |
1465 | |
1466 # we next pass the second read which is in - orientation on the reverse strand | |
1467 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
1468 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1); | |
1469 } else { | |
1470 | |
1471 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! | |
1472 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! | |
1473 | |
1474 ## we first pass the first read which is in - orientation on the reverse strand | |
1475 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0); | |
1476 | |
1477 # we next pass the second read which is in + orientation on the forward strand | |
1478 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
1479 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2); | |
1480 } | |
1481 | |
1482 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
1483 } | |
1484 } | |
1485 } else { # Bismark paired-end SAM output format (default) | |
1486 while (<IN>) { | |
1487 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
1488 if (/^\@/) { # skipping header lines (starting with @) | |
1489 warn "skipping SAM header line:\t$_"; | |
1490 next; | |
1491 } | |
1492 | |
1493 ++$line_count; | |
1494 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
1495 | |
1496 # example paired-end reads in SAM format (2 consecutive lines) | |
1497 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
1498 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT | |
1499 | |
1500 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
1501 | |
1502 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1503 my $meth_call_1; | |
1504 my $first_read_conversion; | |
1505 my $genome_conversion; | |
1506 | |
1507 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
1508 my $tag = $1; | |
1509 my $value = $2; | |
1510 | |
1511 if ($tag eq "XM") { | |
1512 $meth_call_1 = $value; | |
1513 $meth_call_1 =~ s/\r//; | |
1514 } elsif ($tag eq "XR") { | |
1515 $first_read_conversion = $value; | |
1516 $first_read_conversion =~ s/\r//; | |
1517 } elsif ($tag eq "XG") { | |
1518 $genome_conversion = $value; | |
1519 $genome_conversion =~ s/\r//; | |
1520 } | |
1521 } | |
1522 | |
1523 $_ = <IN>; # reading in the paired read | |
1524 | |
1525 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; | |
1526 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; | |
1527 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; | |
1528 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; | |
1529 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; | |
1530 | |
1531 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1532 | |
1533 my $meth_call_2; | |
1534 my $second_read_conversion; | |
1535 | |
1536 while ( /(XM|XR):Z:([^\t]+)/g ) { | |
1537 my $tag = $1; | |
1538 my $value = $2; | |
1539 | |
1540 if ($tag eq "XM") { | |
1541 $meth_call_2 = $value; | |
1542 $meth_call_2 =~ s/\r//; | |
1543 } elsif ($tag eq "XR") { | |
1544 $second_read_conversion = $value; | |
1545 $second_read_conversion = s/\r//; | |
1546 } | |
1547 } | |
1548 | |
1549 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
1550 chomp $genome_conversion; # in case it captured a new line character | |
1551 | |
1552 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; | |
1553 | |
1554 my $index; | |
1555 my $strand; | |
1556 | |
1557 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
1558 $index = 0; ## this is OT | |
1559 $strand = '+'; | |
1560 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
1561 $index = 1; ## this is CTOT | |
1562 $strand = '-'; | |
1563 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
1564 $index = 2; ## this is CTOB | |
1565 $strand = '+'; | |
1566 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
1567 $index = 3; ## this is OB | |
1568 $strand = '-'; | |
1569 } else { | |
1570 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
1571 } | |
1572 | |
1573 ### reversing the methylation call of the read that was reverse-complemented | |
1574 if ($strand eq '+') { | |
1575 $meth_call_2 = reverse $meth_call_2; | |
1576 } else { | |
1577 $meth_call_1 = reverse $meth_call_1; | |
1578 } | |
1579 | |
1580 if ($meth_call_1 and $meth_call_2) { | |
1581 | |
1582 my $end_read_1; | |
1583 | |
1584 ### READ 1 | |
1585 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation | |
1586 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation | |
1587 shift @ops_1; # remove the empty first element | |
1588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); | |
1589 | |
1590 my @comp_cigar_1; # building an array with all CIGAR operations | |
1591 foreach my $index (0..$#len_1) { | |
1592 foreach (1..$len_1[$index]) { | |
1593 # print "$ops_1[$index]"; | |
1594 push @comp_cigar_1, $ops_1[$index]; | |
1595 } | |
1596 } | |
1597 # print "original CIGAR read 1: $cigar_1\n"; | |
1598 # print "original CIGAR read 1: @comp_cigar_1\n"; | |
1599 | |
1600 ### READ 2 | |
1601 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation | |
1602 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation | |
1603 shift @ops_2; # remove the empty first element | |
1604 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); | |
1605 my @comp_cigar_2; # building an array with all CIGAR operations for read 2 | |
1606 foreach my $index (0..$#len_2) { | |
1607 foreach (1..$len_2[$index]) { | |
1608 # print "$ops_2[$index]"; | |
1609 push @comp_cigar_2, $ops_2[$index]; | |
1610 } | |
1611 } | |
1612 # print "original CIGAR read 2: $cigar_2\n"; | |
1613 # print "original CIGAR read 2: @comp_cigar_2\n"; | |
1614 | |
1615 if ($ignore) { | |
1616 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
1617 ### the methylation calls have already been reversed where necessary | |
1618 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
1619 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | |
1620 | |
1621 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
1622 | |
1623 if ($strand eq '+') { | |
1624 | |
1625 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start | |
1626 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions | |
1627 my $I_count_1 = 0; | |
1628 | |
1629 for (1..$ignore) { | |
1630 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start | |
1631 # print "$_ deleted $op\n"; | |
1632 | |
1633 while ($op eq 'D') { # repeating this for deletions (D) | |
1634 $D_count_1++; | |
1635 $op = shift @comp_cigar_1; | |
1636 # print "$_ deleted $op\n"; | |
1637 } | |
1638 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
1639 $I_count_1++; | |
1640 } | |
1641 } | |
1642 | |
1643 $start_read_1 += $ignore + $D_count_1 - $I_count_1; | |
1644 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; | |
1645 | |
1646 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back | |
1647 | |
1648 for (1..$ignore) { | |
1649 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
1650 while ($op eq 'D') { # repeating this for deletions (D) | |
1651 $op = pop @comp_cigar_2; | |
1652 } | |
1653 } | |
1654 # the start position of reads mapping to the reverse strand is being adjusted further below | |
1655 } elsif ($strand eq '-') { | |
1656 | |
1657 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back | |
1658 for (1..$ignore) { | |
1659 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
1660 while ($op eq 'D') { # repeating this for deletions (D) | |
1661 $op = pop @comp_cigar_1; | |
1662 } | |
1663 } | |
1664 # the start position of reads mapping to the reverse strand is being adjusted further below | |
1665 | |
1666 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start | |
1667 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions | |
1668 my $I_count_2 = 0; | |
1669 | |
1670 for (1..$ignore) { | |
1671 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start | |
1672 # print "$_ deleted $op\n"; | |
1673 | |
1674 while ($op eq 'D') { # repeating this for deletions (D) | |
1675 $D_count_2++; | |
1676 $op = shift @comp_cigar_2; | |
1677 # print "$_ deleted $op\n"; | |
1678 } | |
1679 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
1680 $I_count_2++; | |
1681 } | |
1682 } | |
1683 | |
1684 $start_read_2 += $ignore + $D_count_2 - $I_count_2; | |
1685 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; | |
1686 | |
1687 } | |
1688 | |
1689 ### reconstituting shortened CIGAR string 1 | |
1690 my $new_cigar_1; | |
1691 my $count_1 = 0; | |
1692 my $last_op_1; | |
1693 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; | |
1694 foreach my $op (@comp_cigar_1) { | |
1695 unless (defined $last_op_1){ | |
1696 $last_op_1 = $op; | |
1697 ++$count_1; | |
1698 next; | |
1699 } | |
1700 if ($last_op_1 eq $op) { | |
1701 ++$count_1; | |
1702 } else { | |
1703 $new_cigar_1 .= "$count_1$last_op_1"; | |
1704 $last_op_1 = $op; | |
1705 $count_1 = 1; | |
1706 } | |
1707 } | |
1708 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count | |
1709 $cigar_1 = $new_cigar_1; | |
1710 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; | |
1711 | |
1712 ### reconstituting shortened CIGAR string 2 | |
1713 my $new_cigar_2; | |
1714 my $count_2 = 0; | |
1715 my $last_op_2; | |
1716 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; | |
1717 foreach my $op (@comp_cigar_2) { | |
1718 unless (defined $last_op_2){ | |
1719 $last_op_2 = $op; | |
1720 ++$count_2; | |
1721 next; | |
1722 } | |
1723 if ($last_op_2 eq $op) { | |
1724 ++$count_2; | |
1725 } else { | |
1726 $new_cigar_2 .= "$count_2$last_op_2"; | |
1727 $last_op_2 = $op; | |
1728 $count_2 = 1; | |
1729 } | |
1730 } | |
1731 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count | |
1732 $cigar_2 = $new_cigar_2; | |
1733 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n"; | |
1734 | |
1735 } | |
1736 | |
1737 if ($strand eq '+') { | |
1738 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 | |
1739 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1740 # print "reverse: @comp_cigar_2\n"; | |
1741 | |
1742 my $MD_count_1 = 0; | |
1743 foreach (@comp_cigar_1) { | |
1744 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1745 } | |
1746 | |
1747 my $MD_count_2 = 0; | |
1748 foreach (@comp_cigar_2) { | |
1749 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1750 } | |
1751 | |
1752 $end_read_1 = $start_read_1 + $MD_count_1 - 1; | |
1753 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand | |
1754 } else { | |
1755 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 | |
1756 | |
1757 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1758 # print "reverse: @comp_cigar_1\n"; | |
1759 | |
1760 my $MD_count_1 = 0; | |
1761 foreach (@comp_cigar_1) { | |
1762 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1763 } | |
1764 | |
1765 $end_read_1 = $start_read_1; | |
1766 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand | |
1767 | |
1768 } | |
1769 | |
1770 if ($strand eq '+') { | |
1771 ## we first pass the first read which is in + orientation on the forward strand | |
1772 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1); | |
1773 | |
1774 # we next pass the second read which is in - orientation on the reverse strand | |
1775 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
1776 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2); | |
1777 } else { | |
1778 ## we first pass the first read which is in - orientation on the reverse strand | |
1779 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1); | |
1780 | |
1781 # we next pass the second read which is in + orientation on the forward strand | |
1782 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
1783 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2); | |
1784 } | |
1785 | |
1786 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
1787 } | |
1788 } | |
1789 } | |
1790 } else { | |
1791 die "Single-end or paired-end reads not specified properly\n"; | |
1792 } | |
1793 | |
1794 print "\n\nProcessed $line_count lines from $filename in total\n"; | |
1795 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
1796 if ($report) { | |
1797 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
1798 } | |
1799 print_splitting_report (); | |
1800 } | |
1801 | |
1802 | |
1803 | |
1804 sub print_splitting_report{ | |
1805 | |
1806 ### Calculating methylation percentages if applicable | |
1807 | |
1808 my $percent_meCpG; | |
1809 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | |
1810 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | |
1811 } | |
1812 | |
1813 my $percent_meCHG; | |
1814 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
1815 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | |
1816 } | |
1817 | |
1818 my $percent_meCHH; | |
1819 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ | |
1820 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); | |
1821 } | |
1822 | |
1823 my $percent_non_CpG_methylation; | |
1824 if ($merge_non_CpG){ | |
1825 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
1826 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); | |
1827 } | |
1828 } | |
1829 | |
1830 if ($report){ | |
1831 ### detailed information about Cs analysed | |
1832 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
1833 | |
1834 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
1835 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
1836 | |
1837 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
1838 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
1839 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
1840 | |
1841 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
1842 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
1843 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
1844 | |
1845 ### calculating methylated CpG percentage if applicable | |
1846 if ($percent_meCpG){ | |
1847 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
1848 } | |
1849 else{ | |
1850 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
1851 } | |
1852 | |
1853 ### 2-Context Output | |
1854 if ($merge_non_CpG){ | |
1855 if ($percent_non_CpG_methylation){ | |
1856 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
1857 } | |
1858 else{ | |
1859 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
1860 } | |
1861 } | |
1862 | |
1863 ### 3 Context Output | |
1864 else{ | |
1865 ### calculating methylated CHG percentage if applicable | |
1866 if ($percent_meCHG){ | |
1867 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
1868 } | |
1869 else{ | |
1870 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
1871 } | |
1872 | |
1873 ### calculating methylated CHH percentage if applicable | |
1874 if ($percent_meCHH){ | |
1875 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
1876 } | |
1877 else{ | |
1878 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
1879 } | |
1880 } | |
1881 } | |
1882 | |
1883 ### detailed information about Cs analysed for on-screen report | |
1884 print "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
1885 | |
1886 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
1887 print "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
1888 | |
1889 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
1890 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
1891 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
1892 | |
1893 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
1894 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
1895 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
1896 | |
1897 ### printing methylated CpG percentage if applicable | |
1898 if ($percent_meCpG){ | |
1899 print "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
1900 } | |
1901 else{ | |
1902 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
1903 } | |
1904 | |
1905 ### 2-Context Output | |
1906 if ($merge_non_CpG){ | |
1907 if ($percent_non_CpG_methylation){ | |
1908 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
1909 } | |
1910 else{ | |
1911 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
1912 } | |
1913 } | |
1914 | |
1915 ### 3-Context Output | |
1916 else{ | |
1917 ### printing methylated CHG percentage if applicable | |
1918 if ($percent_meCHG){ | |
1919 print "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
1920 } | |
1921 else{ | |
1922 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
1923 } | |
1924 | |
1925 ### printing methylated CHH percentage if applicable | |
1926 if ($percent_meCHH){ | |
1927 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
1928 } | |
1929 else{ | |
1930 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
1931 } | |
1932 } | |
1933 } | |
1934 | |
1935 | |
1936 | |
1937 | |
1938 | |
1939 sub print_individual_C_methylation_states_paired_end_files{ | |
1940 | |
1941 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_; | |
1942 my @methylation_calls = split(//,$meth_call); | |
1943 | |
1944 ################################################################# | |
1945 ### . for bases not involving cytosines ### | |
1946 ### X for methylated C in CHG context (was protected) ### | |
1947 ### x for not methylated C in CHG context (was converted) ### | |
1948 ### H for methylated C in CHH context (was protected) ### | |
1949 ### h for not methylated C in CHH context (was converted) ### | |
1950 ### Z for methylated C in CpG context (was protected) ### | |
1951 ### z for not methylated C in CpG context (was converted) ### | |
1952 ################################################################# | |
1953 | |
1954 my $methyl_CHG_count = 0; | |
1955 my $methyl_CHH_count = 0; | |
1956 my $methyl_CpG_count = 0; | |
1957 my $unmethylated_CHG_count = 0; | |
1958 my $unmethylated_CHH_count = 0; | |
1959 my $unmethylated_CpG_count = 0; | |
1960 | |
1961 | |
1962 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
1963 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
1964 my @comp_cigar; | |
1965 | |
1966 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing | |
1967 if ($cigar =~ /^\d+M$/){ | |
1968 } | |
1969 else{ # parsing CIGAR string | |
1970 my @len; | |
1971 my @ops; | |
1972 @len = split (/\D+/,$cigar); # storing the length per operation | |
1973 @ops = split (/\d+/,$cigar); # storing the operation | |
1974 shift @ops; # remove the empty first element | |
1975 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
1976 | |
1977 foreach my $index (0..$#len){ | |
1978 foreach (1..$len[$index]){ | |
1979 # print "$ops[$index]"; | |
1980 push @comp_cigar, $ops[$index]; | |
1981 } | |
1982 } | |
1983 # warn "\nDetected CIGAR string: $cigar\n"; | |
1984 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
1985 # warn "number of operations: ",scalar @ops,"\n"; | |
1986 # warn "number of length digits: ",scalar @len,"\n\n"; | |
1987 # print @comp_cigar,"\n"; | |
1988 # print "$meth_call\n\n"; | |
1989 # sleep (1); | |
1990 } | |
1991 | |
1992 if ($strand eq '-') { | |
1993 | |
1994 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above | |
1995 if (@comp_cigar){ | |
1996 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1997 } | |
1998 # print "reverse CIGAR string: @comp_cigar\n"; | |
1999 | |
2000 ### the start position of paired-end files has already been corrected, see above | |
2001 } | |
2002 | |
2003 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified | |
2004 | |
2005 if ($merge_non_CpG) { | |
2006 | |
2007 if ($no_overlap) { | |
2008 | |
2009 ### single-file CpG and non-CpG context output | |
2010 if ($full) { | |
2011 if ($strand eq '+') { | |
2012 for my $index (0..$#methylation_calls) { | |
2013 | |
2014 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2015 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2016 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2017 $cigar_offset += $cigar_mod; | |
2018 $pos_offset += $pos_mod; | |
2019 } | |
2020 | |
2021 ### Returning as soon as the methylation calls start overlapping | |
2022 if ($start+$index+$pos_offset >= $end_read_1) { | |
2023 return; | |
2024 } | |
2025 | |
2026 if ($methylation_calls[$index] eq 'X') { | |
2027 $counting{total_meCHG_count}++; | |
2028 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2029 } elsif ($methylation_calls[$index] eq 'x') { | |
2030 $counting{total_unmethylated_CHG_count}++; | |
2031 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2032 } elsif ($methylation_calls[$index] eq 'Z') { | |
2033 $counting{total_meCpG_count}++; | |
2034 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2035 } elsif ($methylation_calls[$index] eq 'z') { | |
2036 $counting{total_unmethylated_CpG_count}++; | |
2037 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2038 } elsif ($methylation_calls[$index] eq 'H') { | |
2039 $counting{total_meCHH_count}++; | |
2040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2041 } elsif ($methylation_calls[$index] eq 'h') { | |
2042 $counting{total_unmethylated_CHH_count}++; | |
2043 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2044 } | |
2045 elsif ($methylation_calls[$index] eq '.'){} | |
2046 else{ | |
2047 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2048 } | |
2049 } | |
2050 } elsif ($strand eq '-') { | |
2051 for my $index (0..$#methylation_calls) { | |
2052 | |
2053 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2054 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2055 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2056 $cigar_offset += $cigar_mod; | |
2057 $pos_offset += $pos_mod; | |
2058 } | |
2059 | |
2060 ### Returning as soon as the methylation calls start overlapping | |
2061 if ($start-$index+$pos_offset <= $end_read_1) { | |
2062 return; | |
2063 } | |
2064 | |
2065 if ($methylation_calls[$index] eq 'X') { | |
2066 $counting{total_meCHG_count}++; | |
2067 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2068 } elsif ($methylation_calls[$index] eq 'x') { | |
2069 $counting{total_unmethylated_CHG_count}++; | |
2070 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2071 } elsif ($methylation_calls[$index] eq 'Z') { | |
2072 $counting{total_meCpG_count}++; | |
2073 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2074 } elsif ($methylation_calls[$index] eq 'z') { | |
2075 $counting{total_unmethylated_CpG_count}++; | |
2076 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2077 } elsif ($methylation_calls[$index] eq 'H') { | |
2078 $counting{total_meCHH_count}++; | |
2079 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2080 } elsif ($methylation_calls[$index] eq 'h') { | |
2081 $counting{total_unmethylated_CHH_count}++; | |
2082 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2083 } | |
2084 elsif ($methylation_calls[$index] eq '.') {} | |
2085 else{ | |
2086 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2087 } | |
2088 } | |
2089 } else { | |
2090 die "The read orientation was neither + nor -: '$strand'\n"; | |
2091 } | |
2092 } | |
2093 | |
2094 ### strand-specific methylation output | |
2095 else { | |
2096 if ($strand eq '+') { | |
2097 for my $index (0..$#methylation_calls) { | |
2098 | |
2099 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2100 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2101 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2102 $cigar_offset += $cigar_mod; | |
2103 $pos_offset += $pos_mod; | |
2104 } | |
2105 | |
2106 ### Returning as soon as the methylation calls start overlapping | |
2107 if ($start+$index+$pos_offset >= $end_read_1) { | |
2108 return; | |
2109 } | |
2110 | |
2111 if ($methylation_calls[$index] eq 'X') { | |
2112 $counting{total_meCHG_count}++; | |
2113 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2114 } elsif ($methylation_calls[$index] eq 'x') { | |
2115 $counting{total_unmethylated_CHG_count}++; | |
2116 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2117 } elsif ($methylation_calls[$index] eq 'Z') { | |
2118 $counting{total_meCpG_count}++; | |
2119 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2120 } elsif ($methylation_calls[$index] eq 'z') { | |
2121 $counting{total_unmethylated_CpG_count}++; | |
2122 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2123 } elsif ($methylation_calls[$index] eq 'H') { | |
2124 $counting{total_meCHH_count}++; | |
2125 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2126 } elsif ($methylation_calls[$index] eq 'h') { | |
2127 $counting{total_unmethylated_CHH_count}++; | |
2128 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2129 } | |
2130 elsif ($methylation_calls[$index] eq '.') {} | |
2131 else{ | |
2132 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2133 } | |
2134 } | |
2135 } elsif ($strand eq '-') { | |
2136 for my $index (0..$#methylation_calls) { | |
2137 | |
2138 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2139 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2140 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2141 $cigar_offset += $cigar_mod; | |
2142 $pos_offset += $pos_mod; | |
2143 } | |
2144 | |
2145 ### Returning as soon as the methylation calls start overlapping | |
2146 if ($start-$index+$pos_offset <= $end_read_1) { | |
2147 return; | |
2148 } | |
2149 | |
2150 if ($methylation_calls[$index] eq 'X') { | |
2151 $counting{total_meCHG_count}++; | |
2152 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2153 } elsif ($methylation_calls[$index] eq 'x') { | |
2154 $counting{total_unmethylated_CHG_count}++; | |
2155 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2156 } elsif ($methylation_calls[$index] eq 'Z') { | |
2157 $counting{total_meCpG_count}++; | |
2158 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2159 } elsif ($methylation_calls[$index] eq 'z') { | |
2160 $counting{total_unmethylated_CpG_count}++; | |
2161 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2162 } elsif ($methylation_calls[$index] eq 'H') { | |
2163 $counting{total_meCHH_count}++; | |
2164 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2165 } elsif ($methylation_calls[$index] eq 'h') { | |
2166 $counting{total_unmethylated_CHH_count}++; | |
2167 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2168 } | |
2169 elsif ($methylation_calls[$index] eq '.') {} | |
2170 else{ | |
2171 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2172 } | |
2173 } | |
2174 } else { | |
2175 die "The strand orientation was neither + nor -: '$strand'/n"; | |
2176 } | |
2177 } | |
2178 } | |
2179 | |
2180 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
2181 ### Still within the 2-CONTEXT ONLY optional section | |
2182 else { | |
2183 ### single-file CpG and non-CpG context output | |
2184 if ($full) { | |
2185 if ($strand eq '+') { | |
2186 for my $index (0..$#methylation_calls) { | |
2187 | |
2188 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2189 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2190 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2191 $cigar_offset += $cigar_mod; | |
2192 $pos_offset += $pos_mod; | |
2193 } | |
2194 | |
2195 if ($methylation_calls[$index] eq 'X') { | |
2196 $counting{total_meCHG_count}++; | |
2197 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2198 } elsif ($methylation_calls[$index] eq 'x') { | |
2199 $counting{total_unmethylated_CHG_count}++; | |
2200 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2201 } elsif ($methylation_calls[$index] eq 'Z') { | |
2202 $counting{total_meCpG_count}++; | |
2203 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2204 } elsif ($methylation_calls[$index] eq 'z') { | |
2205 $counting{total_unmethylated_CpG_count}++; | |
2206 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2207 } elsif ($methylation_calls[$index] eq 'H') { | |
2208 $counting{total_meCHH_count}++; | |
2209 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2210 } elsif ($methylation_calls[$index] eq 'h') { | |
2211 $counting{total_unmethylated_CHH_count}++; | |
2212 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2213 } | |
2214 elsif ($methylation_calls[$index] eq '.') {} | |
2215 else{ | |
2216 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2217 } | |
2218 } | |
2219 } elsif ($strand eq '-') { | |
2220 for my $index (0..$#methylation_calls) { | |
2221 | |
2222 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2223 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2224 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2225 $cigar_offset += $cigar_mod; | |
2226 $pos_offset += $pos_mod; | |
2227 } | |
2228 | |
2229 if ($methylation_calls[$index] eq 'X') { | |
2230 $counting{total_meCHG_count}++; | |
2231 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2232 } elsif ($methylation_calls[$index] eq 'x') { | |
2233 $counting{total_unmethylated_CHG_count}++; | |
2234 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2235 } elsif ($methylation_calls[$index] eq 'Z') { | |
2236 $counting{total_meCpG_count}++; | |
2237 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2238 } elsif ($methylation_calls[$index] eq 'z') { | |
2239 $counting{total_unmethylated_CpG_count}++; | |
2240 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2241 } elsif ($methylation_calls[$index] eq 'H') { | |
2242 $counting{total_meCHH_count}++; | |
2243 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2244 } elsif ($methylation_calls[$index] eq 'h') { | |
2245 $counting{total_unmethylated_CHH_count}++; | |
2246 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2247 } | |
2248 elsif ($methylation_calls[$index] eq '.') {} | |
2249 else{ | |
2250 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2251 } | |
2252 } | |
2253 } else { | |
2254 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2255 } | |
2256 } | |
2257 | |
2258 ### strand-specific methylation output | |
2259 ### still within the 2-CONTEXT optional section | |
2260 else { | |
2261 if ($strand eq '+') { | |
2262 for my $index (0..$#methylation_calls) { | |
2263 | |
2264 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2265 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2266 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2267 $cigar_offset += $cigar_mod; | |
2268 $pos_offset += $pos_mod; | |
2269 } | |
2270 | |
2271 if ($methylation_calls[$index] eq 'X') { | |
2272 $counting{total_meCHG_count}++; | |
2273 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2274 } elsif ($methylation_calls[$index] eq 'x') { | |
2275 $counting{total_unmethylated_CHG_count}++; | |
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2277 } elsif ($methylation_calls[$index] eq 'Z') { | |
2278 $counting{total_meCpG_count}++; | |
2279 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2280 } elsif ($methylation_calls[$index] eq 'z') { | |
2281 $counting{total_unmethylated_CpG_count}++; | |
2282 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2283 } elsif ($methylation_calls[$index] eq 'H') { | |
2284 $counting{total_meCHH_count}++; | |
2285 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2286 } elsif ($methylation_calls[$index] eq 'h') { | |
2287 $counting{total_unmethylated_CHH_count}++; | |
2288 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2289 } | |
2290 elsif ($methylation_calls[$index] eq '.') {} | |
2291 else{ | |
2292 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2293 } | |
2294 } | |
2295 } elsif ($strand eq '-') { | |
2296 for my $index (0..$#methylation_calls) { | |
2297 | |
2298 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2299 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2300 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2301 $cigar_offset += $cigar_mod; | |
2302 $pos_offset += $pos_mod; | |
2303 } | |
2304 | |
2305 if ($methylation_calls[$index] eq 'X') { | |
2306 $counting{total_meCHG_count}++; | |
2307 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2308 } elsif ($methylation_calls[$index] eq 'x') { | |
2309 $counting{total_unmethylated_CHG_count}++; | |
2310 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2311 } elsif ($methylation_calls[$index] eq 'Z') { | |
2312 $counting{total_meCpG_count}++; | |
2313 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2314 } elsif ($methylation_calls[$index] eq 'z') { | |
2315 $counting{total_unmethylated_CpG_count}++; | |
2316 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2317 } elsif ($methylation_calls[$index] eq 'H') { | |
2318 $counting{total_meCHH_count}++; | |
2319 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2320 } elsif ($methylation_calls[$index] eq 'h') { | |
2321 $counting{total_unmethylated_CHH_count}++; | |
2322 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2323 } | |
2324 elsif ($methylation_calls[$index] eq '.') {} | |
2325 else{ | |
2326 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2327 } | |
2328 } | |
2329 } else { | |
2330 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2331 } | |
2332 } | |
2333 } | |
2334 } | |
2335 | |
2336 ############################################ | |
2337 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### | |
2338 ############################################ | |
2339 | |
2340 elsif ($no_overlap) { | |
2341 ### single-file CpG, CHG and CHH context output | |
2342 if ($full) { | |
2343 if ($strand eq '+') { | |
2344 for my $index (0..$#methylation_calls) { | |
2345 | |
2346 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2347 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2348 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2349 $cigar_offset += $cigar_mod; | |
2350 $pos_offset += $pos_mod; | |
2351 } | |
2352 | |
2353 ### Returning as soon as the methylation calls start overlapping | |
2354 if ($start+$index+$pos_offset >= $end_read_1) { | |
2355 return; | |
2356 } | |
2357 | |
2358 if ($methylation_calls[$index] eq 'X') { | |
2359 $counting{total_meCHG_count}++; | |
2360 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2361 } elsif ($methylation_calls[$index] eq 'x') { | |
2362 $counting{total_unmethylated_CHG_count}++; | |
2363 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2364 } elsif ($methylation_calls[$index] eq 'Z') { | |
2365 $counting{total_meCpG_count}++; | |
2366 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2367 } elsif ($methylation_calls[$index] eq 'z') { | |
2368 $counting{total_unmethylated_CpG_count}++; | |
2369 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2370 } elsif ($methylation_calls[$index] eq 'H') { | |
2371 $counting{total_meCHH_count}++; | |
2372 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2373 } elsif ($methylation_calls[$index] eq 'h') { | |
2374 $counting{total_unmethylated_CHH_count}++; | |
2375 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2376 } | |
2377 elsif ($methylation_calls[$index] eq '.') {} | |
2378 else{ | |
2379 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2380 } | |
2381 } | |
2382 } elsif ($strand eq '-') { | |
2383 for my $index (0..$#methylation_calls) { | |
2384 | |
2385 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2386 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2387 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2388 $cigar_offset += $cigar_mod; | |
2389 $pos_offset += $pos_mod; | |
2390 } | |
2391 | |
2392 ### Returning as soon as the methylation calls start overlapping | |
2393 if ($start-$index+$pos_offset <= $end_read_1) { | |
2394 return; | |
2395 } | |
2396 | |
2397 if ($methylation_calls[$index] eq 'X') { | |
2398 $counting{total_meCHG_count}++; | |
2399 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2400 } elsif ($methylation_calls[$index] eq 'x') { | |
2401 $counting{total_unmethylated_CHG_count}++; | |
2402 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2403 } elsif ($methylation_calls[$index] eq 'Z') { | |
2404 $counting{total_meCpG_count}++; | |
2405 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2406 } elsif ($methylation_calls[$index] eq 'z') { | |
2407 $counting{total_unmethylated_CpG_count}++; | |
2408 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2409 } elsif ($methylation_calls[$index] eq 'H') { | |
2410 $counting{total_meCHH_count}++; | |
2411 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2412 } elsif ($methylation_calls[$index] eq 'h') { | |
2413 $counting{total_unmethylated_CHH_count}++; | |
2414 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2415 } | |
2416 elsif ($methylation_calls[$index] eq '.') {} | |
2417 else{ | |
2418 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2419 } | |
2420 } | |
2421 } else { | |
2422 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2423 } | |
2424 } | |
2425 | |
2426 ### strand-specific methylation output | |
2427 else { | |
2428 if ($strand eq '+') { | |
2429 for my $index (0..$#methylation_calls) { | |
2430 | |
2431 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2432 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2433 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2434 $cigar_offset += $cigar_mod; | |
2435 $pos_offset += $pos_mod; | |
2436 } | |
2437 | |
2438 ### Returning as soon as the methylation calls start overlapping | |
2439 if ($start+$index+$pos_offset >= $end_read_1) { | |
2440 return; | |
2441 } | |
2442 | |
2443 if ($methylation_calls[$index] eq 'X') { | |
2444 $counting{total_meCHG_count}++; | |
2445 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2446 } elsif ($methylation_calls[$index] eq 'x') { | |
2447 $counting{total_unmethylated_CHG_count}++; | |
2448 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2449 } elsif ($methylation_calls[$index] eq 'Z') { | |
2450 $counting{total_meCpG_count}++; | |
2451 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2452 } elsif ($methylation_calls[$index] eq 'z') { | |
2453 $counting{total_unmethylated_CpG_count}++; | |
2454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2455 } elsif ($methylation_calls[$index] eq 'H') { | |
2456 $counting{total_meCHH_count}++; | |
2457 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2458 } elsif ($methylation_calls[$index] eq 'h') { | |
2459 $counting{total_unmethylated_CHH_count}++; | |
2460 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2461 } | |
2462 elsif ($methylation_calls[$index] eq '.') {} | |
2463 else{ | |
2464 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2465 } | |
2466 } | |
2467 } elsif ($strand eq '-') { | |
2468 for my $index (0..$#methylation_calls) { | |
2469 | |
2470 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2471 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2472 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2473 $cigar_offset += $cigar_mod; | |
2474 $pos_offset += $pos_mod; | |
2475 } | |
2476 | |
2477 ### Returning as soon as the methylation calls start overlapping | |
2478 if ($start-$index+$pos_offset <= $end_read_1) { | |
2479 return; | |
2480 } | |
2481 | |
2482 if ($methylation_calls[$index] eq 'X') { | |
2483 $counting{total_meCHG_count}++; | |
2484 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2485 } elsif ($methylation_calls[$index] eq 'x') { | |
2486 $counting{total_unmethylated_CHG_count}++; | |
2487 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2488 } elsif ($methylation_calls[$index] eq 'Z') { | |
2489 $counting{total_meCpG_count}++; | |
2490 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2491 } elsif ($methylation_calls[$index] eq 'z') { | |
2492 $counting{total_unmethylated_CpG_count}++; | |
2493 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2494 } elsif ($methylation_calls[$index] eq 'H') { | |
2495 $counting{total_meCHH_count}++; | |
2496 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2497 } elsif ($methylation_calls[$index] eq 'h') { | |
2498 $counting{total_unmethylated_CHH_count}++; | |
2499 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2500 } | |
2501 elsif ($methylation_calls[$index] eq '.') {} | |
2502 else{ | |
2503 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2504 } | |
2505 } | |
2506 } else { | |
2507 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2508 } | |
2509 } | |
2510 } | |
2511 | |
2512 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
2513 else { | |
2514 ### single-file CpG, CHG and CHH context output | |
2515 if ($full) { | |
2516 if ($strand eq '+') { | |
2517 for my $index (0..$#methylation_calls) { | |
2518 | |
2519 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2520 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2521 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2522 $cigar_offset += $cigar_mod; | |
2523 $pos_offset += $pos_mod; | |
2524 } | |
2525 | |
2526 if ($methylation_calls[$index] eq 'X') { | |
2527 $counting{total_meCHG_count}++; | |
2528 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2529 } elsif ($methylation_calls[$index] eq 'x') { | |
2530 $counting{total_unmethylated_CHG_count}++; | |
2531 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2532 } elsif ($methylation_calls[$index] eq 'Z') { | |
2533 $counting{total_meCpG_count}++; | |
2534 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2535 } elsif ($methylation_calls[$index] eq 'z') { | |
2536 $counting{total_unmethylated_CpG_count}++; | |
2537 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2538 } elsif ($methylation_calls[$index] eq 'H') { | |
2539 $counting{total_meCHH_count}++; | |
2540 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2541 } elsif ($methylation_calls[$index] eq 'h') { | |
2542 $counting{total_unmethylated_CHH_count}++; | |
2543 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2544 } | |
2545 elsif ($methylation_calls[$index] eq '.') {} | |
2546 else{ | |
2547 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2548 } | |
2549 } | |
2550 } elsif ($strand eq '-') { | |
2551 for my $index (0..$#methylation_calls) { | |
2552 | |
2553 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2554 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2555 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2556 $cigar_offset += $cigar_mod; | |
2557 $pos_offset += $pos_mod; | |
2558 } | |
2559 | |
2560 if ($methylation_calls[$index] eq 'X') { | |
2561 $counting{total_meCHG_count}++; | |
2562 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2563 } elsif ($methylation_calls[$index] eq 'x') { | |
2564 $counting{total_unmethylated_CHG_count}++; | |
2565 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2566 } elsif ($methylation_calls[$index] eq 'Z') { | |
2567 $counting{total_meCpG_count}++; | |
2568 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2569 } elsif ($methylation_calls[$index] eq 'z') { | |
2570 $counting{total_unmethylated_CpG_count}++; | |
2571 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2572 } elsif ($methylation_calls[$index] eq 'H') { | |
2573 $counting{total_meCHH_count}++; | |
2574 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2575 } elsif ($methylation_calls[$index] eq 'h') { | |
2576 $counting{total_unmethylated_CHH_count}++; | |
2577 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2578 } | |
2579 elsif ($methylation_calls[$index] eq '.') {} | |
2580 else{ | |
2581 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2582 } | |
2583 } | |
2584 } else { | |
2585 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2586 } | |
2587 } | |
2588 | |
2589 ### strand-specific methylation output | |
2590 else { | |
2591 if ($strand eq '+') { | |
2592 for my $index (0..$#methylation_calls) { | |
2593 | |
2594 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2595 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2596 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2597 $cigar_offset += $cigar_mod; | |
2598 $pos_offset += $pos_mod; | |
2599 } | |
2600 | |
2601 if ($methylation_calls[$index] eq 'X') { | |
2602 $counting{total_meCHG_count}++; | |
2603 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2604 } elsif ($methylation_calls[$index] eq 'x') { | |
2605 $counting{total_unmethylated_CHG_count}++; | |
2606 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2607 } elsif ($methylation_calls[$index] eq 'Z') { | |
2608 $counting{total_meCpG_count}++; | |
2609 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2610 } elsif ($methylation_calls[$index] eq 'z') { | |
2611 $counting{total_unmethylated_CpG_count}++; | |
2612 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2613 } elsif ($methylation_calls[$index] eq 'H') { | |
2614 $counting{total_meCHH_count}++; | |
2615 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2616 } elsif ($methylation_calls[$index] eq 'h') { | |
2617 $counting{total_unmethylated_CHH_count}++; | |
2618 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2619 } | |
2620 elsif ($methylation_calls[$index] eq '.') {} | |
2621 else{ | |
2622 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2623 } | |
2624 } | |
2625 } elsif ($strand eq '-') { | |
2626 for my $index (0..$#methylation_calls) { | |
2627 | |
2628 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2629 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2630 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2631 $cigar_offset += $cigar_mod; | |
2632 $pos_offset += $pos_mod; | |
2633 } | |
2634 | |
2635 if ($methylation_calls[$index] eq 'X') { | |
2636 $counting{total_meCHG_count}++; | |
2637 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2638 } elsif ($methylation_calls[$index] eq 'x') { | |
2639 $counting{total_unmethylated_CHG_count}++; | |
2640 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2641 } elsif ($methylation_calls[$index] eq 'Z') { | |
2642 $counting{total_meCpG_count}++; | |
2643 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2644 } elsif ($methylation_calls[$index] eq 'z') { | |
2645 $counting{total_unmethylated_CpG_count}++; | |
2646 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2647 } elsif ($methylation_calls[$index] eq 'H') { | |
2648 $counting{total_meCHH_count}++; | |
2649 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2650 } elsif ($methylation_calls[$index] eq 'h') { | |
2651 $counting{total_unmethylated_CHH_count}++; | |
2652 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2653 } | |
2654 elsif ($methylation_calls[$index] eq '.') {} | |
2655 else{ | |
2656 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2657 } | |
2658 } | |
2659 } else { | |
2660 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2661 } | |
2662 } | |
2663 } | |
2664 } | |
2665 | |
2666 sub check_cigar_string { | |
2667 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; | |
2668 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; | |
2669 my ($new_cigar_offset,$new_pos_offset) = (0,0); | |
2670 | |
2671 if ($strand eq '+') { | |
2672 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
2673 | |
2674 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2675 # warn "position needs no adjustment\n"; | |
2676 } | |
2677 | |
2678 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
2679 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
2680 # warn "adjusted genomic position by -1 bp (insertion)\n"; | |
2681 } | |
2682 | |
2683 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2684 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2685 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
2686 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
2687 | |
2688 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
2689 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2690 # warn "position needs no adjustment\n"; | |
2691 last; | |
2692 } | |
2693 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
2694 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
2695 # warn "adjusted genomic position by another -1 bp (insertion)\n"; | |
2696 last; | |
2697 } | |
2698 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2699 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2700 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
2701 # warn "adjusted genomic position by another +1 bp (deletion)\n"; | |
2702 } | |
2703 else{ | |
2704 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2705 } | |
2706 } | |
2707 } | |
2708 else{ | |
2709 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2710 } | |
2711 } | |
2712 | |
2713 elsif ($strand eq '-') { | |
2714 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
2715 | |
2716 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2717 # warn "position needs no adjustment\n"; | |
2718 } | |
2719 | |
2720 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
2721 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position | |
2722 # warn "adjusted genomic position by +1 bp (insertion)\n"; | |
2723 } | |
2724 | |
2725 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2726 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2727 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
2728 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
2729 | |
2730 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
2731 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2732 # warn "Found new 'M' operation; position needs no adjustment\n"; | |
2733 last; | |
2734 } | |
2735 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
2736 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position | |
2737 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; | |
2738 last; | |
2739 } | |
2740 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2741 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2742 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
2743 # warn "adjusted genomic position by another -1 bp (deletion)\n"; | |
2744 } | |
2745 else{ | |
2746 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2747 } | |
2748 } | |
2749 } | |
2750 else{ | |
2751 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2752 } | |
2753 } | |
2754 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; | |
2755 return ($new_cigar_offset,$new_pos_offset); | |
2756 } | |
2757 | |
2758 sub print_individual_C_methylation_states_single_end{ | |
2759 | |
2760 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; | |
2761 my @methylation_calls = split(//,$meth_call); | |
2762 | |
2763 ################################################################# | |
2764 ### . for bases not involving cytosines ### | |
2765 ### X for methylated C in CHG context (was protected) ### | |
2766 ### x for not methylated C in CHG context (was converted) ### | |
2767 ### H for methylated C in CHH context (was protected) ### | |
2768 ### h for not methylated C in CHH context (was converted) ### | |
2769 ### Z for methylated C in CpG context (was protected) ### | |
2770 ### z for not methylated C in CpG context (was converted) ### | |
2771 ################################################################# | |
2772 | |
2773 my $methyl_CHG_count = 0; | |
2774 my $methyl_CHH_count = 0; | |
2775 my $methyl_CpG_count = 0; | |
2776 my $unmethylated_CHG_count = 0; | |
2777 my $unmethylated_CHH_count = 0; | |
2778 my $unmethylated_CpG_count = 0; | |
2779 | |
2780 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
2781 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
2782 | |
2783 my @comp_cigar; | |
2784 | |
2785 if ($cigar){ # parsing CIGAR string | |
2786 | |
2787 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing | |
2788 if ($cigar =~ /^\d+M$/){ | |
2789 # warn "See!? I told you so! $cigar\n"; | |
2790 # sleep(1); | |
2791 } | |
2792 else{ | |
2793 | |
2794 my @len; | |
2795 my @ops; | |
2796 | |
2797 @len = split (/\D+/,$cigar); # storing the length per operation | |
2798 @ops = split (/\d+/,$cigar); # storing the operation | |
2799 shift @ops; # remove the empty first element | |
2800 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
2801 | |
2802 foreach my $index (0..$#len){ | |
2803 foreach (1..$len[$index]){ | |
2804 # print "$ops[$index]"; | |
2805 push @comp_cigar, $ops[$index]; | |
2806 } | |
2807 } | |
2808 } | |
2809 # warn "\nDetected CIGAR string: $cigar\n"; | |
2810 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
2811 # warn "number of operations: ",scalar @ops,"\n"; | |
2812 # warn "number of length digits: ",scalar @len,"\n\n"; | |
2813 # print @comp_cigar,"\n"; | |
2814 # print "$meth_call\n\n"; | |
2815 # sleep (1); | |
2816 } | |
2817 | |
2818 ### adjusting the start position for all reads mapping to the reverse strand | |
2819 if ($strand eq '-') { | |
2820 | |
2821 if (@comp_cigar){ # only needed for SAM reads with InDels | |
2822 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
2823 # print @comp_cigar,"\n"; | |
2824 } | |
2825 | |
2826 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | |
2827 | |
2828 if ($cigar){ ### SAM format | |
2829 if ($cigar =~ /^(\d+)M$/){ # linear match | |
2830 $start += $1 - 1; | |
2831 } | |
2832 else{ # InDel read | |
2833 my $MD_count = 0; | |
2834 foreach (@comp_cigar){ | |
2835 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
2836 } | |
2837 $start += $MD_count - 1; | |
2838 } | |
2839 } | |
2840 else{ ### vanilla format | |
2841 $start += length($meth_call)-1; | |
2842 } | |
2843 } | |
2844 } | |
2845 | |
2846 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) | |
2847 | |
2848 ### single-file CpG and other-context output | |
2849 if ($full and $merge_non_CpG) { | |
2850 if ($strand eq '+') { | |
2851 for my $index (0..$#methylation_calls) { | |
2852 | |
2853 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels | |
2854 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2855 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2856 $cigar_offset += $cigar_mod; | |
2857 $pos_offset += $pos_mod; | |
2858 } | |
2859 | |
2860 ### methylated Cs (any context) will receive a forward (+) orientation | |
2861 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2862 if ($methylation_calls[$index] eq 'X') { | |
2863 $counting{total_meCHG_count}++; | |
2864 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2865 } | |
2866 elsif ($methylation_calls[$index] eq 'x') { | |
2867 $counting{total_unmethylated_CHG_count}++; | |
2868 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2869 } | |
2870 elsif ($methylation_calls[$index] eq 'Z') { | |
2871 $counting{total_meCpG_count}++; | |
2872 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2873 } | |
2874 elsif ($methylation_calls[$index] eq 'z') { | |
2875 $counting{total_unmethylated_CpG_count}++; | |
2876 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2877 } | |
2878 elsif ($methylation_calls[$index] eq 'H') { | |
2879 $counting{total_meCHH_count}++; | |
2880 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2881 } | |
2882 elsif ($methylation_calls[$index] eq 'h') { | |
2883 $counting{total_unmethylated_CHH_count}++; | |
2884 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2885 } | |
2886 elsif ($methylation_calls[$index] eq '.') { | |
2887 } | |
2888 else{ | |
2889 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2890 } | |
2891 } | |
2892 } | |
2893 elsif ($strand eq '-') { | |
2894 | |
2895 for my $index (0..$#methylation_calls) { | |
2896 ### methylated Cs (any context) will receive a forward (+) orientation | |
2897 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2898 | |
2899 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels | |
2900 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2901 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2902 $cigar_offset += $cigar_mod; | |
2903 $pos_offset += $pos_mod; | |
2904 } | |
2905 | |
2906 if ($methylation_calls[$index] eq 'X') { | |
2907 $counting{total_meCHG_count}++; | |
2908 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2909 } | |
2910 elsif ($methylation_calls[$index] eq 'x') { | |
2911 $counting{total_unmethylated_CHG_count}++; | |
2912 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2913 } | |
2914 elsif ($methylation_calls[$index] eq 'Z') { | |
2915 $counting{total_meCpG_count}++; | |
2916 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2917 } | |
2918 elsif ($methylation_calls[$index] eq 'z') { | |
2919 $counting{total_unmethylated_CpG_count}++; | |
2920 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2921 } | |
2922 elsif ($methylation_calls[$index] eq 'H') { | |
2923 $counting{total_meCHH_count}++; | |
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2925 } | |
2926 elsif ($methylation_calls[$index] eq 'h') { | |
2927 $counting{total_unmethylated_CHH_count}++; | |
2928 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2929 } | |
2930 elsif ($methylation_calls[$index] eq '.'){ | |
2931 } | |
2932 else{ | |
2933 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2934 } | |
2935 } | |
2936 } | |
2937 else { | |
2938 die "The strand information was neither + nor -: $strand\n"; | |
2939 } | |
2940 } | |
2941 | |
2942 ### strand-specific methylation output | |
2943 elsif ($merge_non_CpG) { | |
2944 if ($strand eq '+') { | |
2945 for my $index (0..$#methylation_calls) { | |
2946 ### methylated Cs (any context) will receive a forward (+) orientation | |
2947 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2948 | |
2949 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
2950 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2951 $cigar_offset += $cigar_mod; | |
2952 $pos_offset += $pos_mod; | |
2953 } | |
2954 | |
2955 if ($methylation_calls[$index] eq 'X') { | |
2956 $counting{total_meCHG_count}++; | |
2957 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2958 } | |
2959 elsif ($methylation_calls[$index] eq 'x') { | |
2960 $counting{total_unmethylated_CHG_count}++; | |
2961 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2962 } | |
2963 elsif ($methylation_calls[$index] eq 'Z') { | |
2964 $counting{total_meCpG_count}++; | |
2965 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2966 } | |
2967 elsif ($methylation_calls[$index] eq 'z') { | |
2968 $counting{total_unmethylated_CpG_count}++; | |
2969 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2970 } | |
2971 elsif ($methylation_calls[$index] eq 'H') { | |
2972 $counting{total_meCHH_count}++; | |
2973 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2974 } | |
2975 elsif ($methylation_calls[$index] eq 'h') { | |
2976 $counting{total_unmethylated_CHH_count}++; | |
2977 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2978 } | |
2979 elsif ($methylation_calls[$index] eq '.') { | |
2980 } | |
2981 else{ | |
2982 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2983 } | |
2984 } | |
2985 } | |
2986 elsif ($strand eq '-') { | |
2987 | |
2988 for my $index (0..$#methylation_calls) { | |
2989 ### methylated Cs (any context) will receive a forward (+) orientation | |
2990 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2991 | |
2992 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2994 $cigar_offset += $cigar_mod; | |
2995 $pos_offset += $pos_mod; | |
2996 } | |
2997 | |
2998 if ($methylation_calls[$index] eq 'X') { | |
2999 $counting{total_meCHG_count}++; | |
3000 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3001 } | |
3002 elsif ($methylation_calls[$index] eq 'x') { | |
3003 $counting{total_unmethylated_CHG_count}++; | |
3004 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3005 } | |
3006 elsif ($methylation_calls[$index] eq 'Z') { | |
3007 $counting{total_meCpG_count}++; | |
3008 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3009 } | |
3010 elsif ($methylation_calls[$index] eq 'z') { | |
3011 $counting{total_unmethylated_CpG_count}++; | |
3012 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3013 } | |
3014 elsif ($methylation_calls[$index] eq 'H') { | |
3015 $counting{total_meCHH_count}++; | |
3016 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3017 } | |
3018 elsif ($methylation_calls[$index] eq 'h') { | |
3019 $counting{total_unmethylated_CHH_count}++; | |
3020 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3021 } | |
3022 elsif ($methylation_calls[$index] eq '.') { | |
3023 } | |
3024 else{ | |
3025 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3026 } | |
3027 } | |
3028 } | |
3029 else { | |
3030 die "The strand information was neither + nor -: $strand\n"; | |
3031 } | |
3032 } | |
3033 | |
3034 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION | |
3035 | |
3036 elsif ($full) { | |
3037 if ($strand eq '+') { | |
3038 for my $index (0..$#methylation_calls) { | |
3039 ### methylated Cs (any context) will receive a forward (+) orientation | |
3040 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
3041 | |
3042 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
3043 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3044 $cigar_offset += $cigar_mod; | |
3045 $pos_offset += $pos_mod; | |
3046 } | |
3047 | |
3048 if ($methylation_calls[$index] eq 'X') { | |
3049 $counting{total_meCHG_count}++; | |
3050 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3051 } elsif ($methylation_calls[$index] eq 'x') { | |
3052 $counting{total_unmethylated_CHG_count}++; | |
3053 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3054 } elsif ($methylation_calls[$index] eq 'Z') { | |
3055 $counting{total_meCpG_count}++; | |
3056 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3057 } elsif ($methylation_calls[$index] eq 'z') { | |
3058 $counting{total_unmethylated_CpG_count}++; | |
3059 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3060 } elsif ($methylation_calls[$index] eq 'H') { | |
3061 $counting{total_meCHH_count}++; | |
3062 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3063 } elsif ($methylation_calls[$index] eq 'h') { | |
3064 $counting{total_unmethylated_CHH_count}++; | |
3065 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3066 } | |
3067 elsif ($methylation_calls[$index] eq '.') {} | |
3068 else{ | |
3069 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3070 } | |
3071 } | |
3072 } | |
3073 elsif ($strand eq '-') { | |
3074 | |
3075 for my $index (0..$#methylation_calls) { | |
3076 ### methylated Cs (any context) will receive a forward (+) orientation | |
3077 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
3078 | |
3079 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
3080 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3081 $cigar_offset += $cigar_mod; | |
3082 $pos_offset += $pos_mod; | |
3083 } | |
3084 | |
3085 if ($methylation_calls[$index] eq 'X') { | |
3086 $counting{total_meCHG_count}++; | |
3087 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3088 } elsif ($methylation_calls[$index] eq 'x') { | |
3089 $counting{total_unmethylated_CHG_count}++; | |
3090 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3091 } elsif ($methylation_calls[$index] eq 'Z') { | |
3092 $counting{total_meCpG_count}++; | |
3093 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3094 } elsif ($methylation_calls[$index] eq 'z') { | |
3095 $counting{total_unmethylated_CpG_count}++; | |
3096 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3097 } elsif ($methylation_calls[$index] eq 'H') { | |
3098 $counting{total_meCHH_count}++; | |
3099 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3100 } elsif ($methylation_calls[$index] eq 'h') { | |
3101 $counting{total_unmethylated_CHH_count}++; | |
3102 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3103 } | |
3104 elsif ($methylation_calls[$index] eq '.') {} | |
3105 else{ | |
3106 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3107 } | |
3108 } | |
3109 } | |
3110 else { | |
3111 die "The read had a strand orientation which was neither + nor -: $strand\n"; | |
3112 } | |
3113 } | |
3114 | |
3115 ### strand-specific methylation output | |
3116 else { | |
3117 if ($strand eq '+') { | |
3118 for my $index (0..$#methylation_calls) { | |
3119 ### methylated Cs (any context) will receive a forward (+) orientation | |
3120 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
3121 | |
3122 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
3123 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3124 $cigar_offset += $cigar_mod; | |
3125 $pos_offset += $pos_mod; | |
3126 } | |
3127 | |
3128 if ($methylation_calls[$index] eq 'X') { | |
3129 $counting{total_meCHG_count}++; | |
3130 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3131 } elsif ($methylation_calls[$index] eq 'x') { | |
3132 $counting{total_unmethylated_CHG_count}++; | |
3133 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3134 } elsif ($methylation_calls[$index] eq 'Z') { | |
3135 $counting{total_meCpG_count}++; | |
3136 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3137 } elsif ($methylation_calls[$index] eq 'z') { | |
3138 $counting{total_unmethylated_CpG_count}++; | |
3139 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3140 } elsif ($methylation_calls[$index] eq 'H') { | |
3141 $counting{total_meCHH_count}++; | |
3142 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3143 } elsif ($methylation_calls[$index] eq 'h') { | |
3144 $counting{total_unmethylated_CHH_count}++; | |
3145 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3146 } | |
3147 elsif ($methylation_calls[$index] eq '.') {} | |
3148 else{ | |
3149 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3150 } | |
3151 } | |
3152 } | |
3153 elsif ($strand eq '-') { | |
3154 | |
3155 for my $index (0..$#methylation_calls) { | |
3156 ### methylated Cs (any context) will receive a forward (+) orientation | |
3157 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
3158 | |
3159 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
3160 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3161 $cigar_offset += $cigar_mod; | |
3162 $pos_offset += $pos_mod; | |
3163 } | |
3164 | |
3165 if ($methylation_calls[$index] eq 'X') { | |
3166 $counting{total_meCHG_count}++; | |
3167 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3168 } elsif ($methylation_calls[$index] eq 'x') { | |
3169 $counting{total_unmethylated_CHG_count}++; | |
3170 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3171 } elsif ($methylation_calls[$index] eq 'Z') { | |
3172 $counting{total_meCpG_count}++; | |
3173 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3174 } elsif ($methylation_calls[$index] eq 'z') { | |
3175 $counting{total_unmethylated_CpG_count}++; | |
3176 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3177 } elsif ($methylation_calls[$index] eq 'H') { | |
3178 $counting{total_meCHH_count}++; | |
3179 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3180 } elsif ($methylation_calls[$index] eq 'h') { | |
3181 $counting{total_unmethylated_CHH_count}++; | |
3182 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
3183 } | |
3184 elsif ($methylation_calls[$index] eq '.') {} | |
3185 else{ | |
3186 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3187 } | |
3188 } | |
3189 } | |
3190 else { | |
3191 die "The strand information was neither + nor -: $strand\n"; | |
3192 } | |
3193 } | |
3194 } | |
3195 | |
3196 | |
3197 | |
3198 ####################################################################################################################################### | |
3199 ### bismark2bedGaph section - START | |
3200 ####################################################################################################################################### | |
3201 | |
3202 ### has now been moved to the external script bismark2bedGraph | |
3203 | |
3204 # sub process_bedGraph_output{ | |
3205 # warn "="x64,"\n"; | |
3206 # warn "Methylation information will now be written into a bedGraph file\n"; | |
3207 # warn "="x64,"\n\n"; | |
3208 # sleep (2); | |
3209 | |
3210 # ### Closing all filehandles so that the Bismark methtylation extractor output doesn't get truncated due to buffering issues | |
3211 # foreach my $fh (keys %fhs) { | |
3212 # if ($fh =~ /^[1230]$/) { | |
3213 # foreach my $context (keys %{$fhs{$fh}}) { | |
3214 # close $fhs{$fh}->{$context} or die $!; | |
3215 # } | |
3216 # } else { | |
3217 # close $fhs{$fh} or die $!; | |
3218 # } | |
3219 # } | |
3220 | |
3221 # ### deciding which files to use for bedGraph conversion | |
3222 # foreach my $filename (@sorting_files){ | |
3223 # # warn "$filename\n"; | |
3224 # if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again | |
3225 # $filename =~ s/.*\///; # replacing everything up to the last slash in the filename | |
3226 # # warn "$filename\n"; | |
3227 # } | |
3228 | |
3229 # if ($CX_context){ | |
3230 # push @bedfiles,$filename; | |
3231 # } | |
3232 # else{ ## CpG context only (default) | |
3233 # if ($filename =~ /^CpG_/){ | |
3234 # push @bedfiles,$filename; | |
3235 # } | |
3236 # else{ | |
3237 # # skipping CHH or CHG files | |
3238 # } | |
3239 # } | |
3240 # } | |
3241 | |
3242 # warn "Using the following files as Input:\n"; | |
3243 # print join ("\t",@bedfiles),"\n\n"; | |
3244 # sleep (2); | |
3245 | |
3246 # my %temp_fhs; | |
3247 # my @temp_files; # writing all context files (default CpG only) to these files prior to sorting | |
3248 | |
3249 # ### changing to the output directory | |
3250 # unless ($output_dir eq ''){ # default | |
3251 # chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
3252 # warn "Changed directory to $output_dir\n"; | |
3253 # } | |
3254 | |
3255 # foreach my $infile (@bedfiles) { | |
3256 | |
3257 # if ($remove) { | |
3258 # warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n"; | |
3259 | |
3260 # if ($infile =~ /gz$/){ | |
3261 # open (READ,"zcat $infile |") or die $!; | |
3262 # } | |
3263 # else{ | |
3264 # open (READ,$infile) or die $!; | |
3265 # } | |
3266 | |
3267 # my $removed_spaces_outfile = $infile; | |
3268 # $removed_spaces_outfile =~ s/$/.spaces_removed.txt/; | |
3269 | |
3270 # open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n"; | |
3271 | |
3272 # unless ($no_header){ | |
3273 # $_ = <READ>; ### Bismark version header | |
3274 # print REM $_; ### Bismark version header | |
3275 # } | |
3276 | |
3277 # while (<READ>) { | |
3278 # chomp; | |
3279 # my ($id,$strand,$chr,$pos,$context) = (split (/\t/)); | |
3280 # $id =~ s/\s+/_/g; | |
3281 # print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n"; | |
3282 # } | |
3283 | |
3284 # close READ or die $!; | |
3285 # close REM or die $!; | |
3286 | |
3287 # ### changing the infile name to the new file without spaces | |
3288 # $infile = $removed_spaces_outfile; | |
3289 # } | |
3290 | |
3291 # warn "Now writing methylation information for file $infile to individual files for each chromosome\n"; | |
3292 # if ($infile =~ /gz$/){ | |
3293 # open (IN,"zcat $infile |") or die $!; | |
3294 # } | |
3295 # else{ | |
3296 # open (IN,$infile) or die $!; | |
3297 # } | |
3298 | |
3299 # ## always ignoring the version header | |
3300 # unless ($no_header){ | |
3301 # $_ = <IN>; ### Bismark version header | |
3302 # } | |
3303 | |
3304 # while (<IN>) { | |
3305 # chomp; | |
3306 # my ($chr) = (split (/\t/))[2]; | |
3307 # # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n"; | |
3308 # $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names | |
3309 # # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n"; | |
3310 | |
3311 # unless (exists $temp_fhs{$chr}) { | |
3312 # open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!"; | |
3313 # } | |
3314 # print {$temp_fhs{$chr}} "$_\n"; | |
3315 # } | |
3316 | |
3317 # warn "Finished writing out individual chromosome files for $infile\n"; | |
3318 # } | |
3319 # warn "\n"; | |
3320 | |
3321 # @temp_files = <*.meth_extractor.temp>; | |
3322 | |
3323 # warn "Collecting temporary chromosome file information...\n"; | |
3324 # sleep (1); | |
3325 # warn "processing the following input file(s):\n"; | |
3326 # warn join ("\n",@temp_files),"\n\n"; | |
3327 # sleep (1); | |
3328 | |
3329 # foreach my $in (@temp_files) { | |
3330 # if ($sort_size){ | |
3331 # warn "Sorting input file $in by positions (using -S of '$sort_size')\n"; | |
3332 # } | |
3333 # else{ | |
3334 # warn "Sorting input file $in by positions (using default memory settings)\n"; | |
3335 # } | |
3336 # my $sort_dir = $output_dir; | |
3337 # if ($sort_dir eq ''){ | |
3338 # $sort_dir = './'; | |
3339 # } | |
3340 # open my $ifh, "sort -S $sort_size -T $sort_dir -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!"; | |
3341 # # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; | |
3342 | |
3343 # ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data { | |
3344 | |
3345 # my $name; | |
3346 # my $meth_state; | |
3347 # my $chr = ""; | |
3348 # my $pos = 0; | |
3349 # my $meth_state2; | |
3350 | |
3351 # my $last_pos; | |
3352 # my $last_chr; | |
3353 | |
3354 # ############################################# } | |
3355 | |
3356 # while (my $line = <$ifh>) { | |
3357 # next if $line =~ /^Bismark/; | |
3358 # chomp $line; | |
3359 | |
3360 # ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement { | |
3361 | |
3362 # $last_chr = $chr; | |
3363 # $last_pos = $pos; | |
3364 # ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; | |
3365 | |
3366 # if (($last_pos ne $pos) || ($last_chr ne $chr)) { | |
3367 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
3368 # @methylcalls = qw (0 0 0); | |
3369 # } | |
3370 | |
3371 # ############################################# } | |
3372 | |
3373 # my $validated = validate_methylation_call($meth_state, $meth_state2); | |
3374 # unless($validated){ | |
3375 # warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; | |
3376 # next; | |
3377 # } | |
3378 # if ($meth_state eq "+") { | |
3379 # $methylcalls[0]++; | |
3380 # $methylcalls[2]++; | |
3381 # } else { | |
3382 # $methylcalls[1]++; | |
3383 # $methylcalls[2]++; | |
3384 # } | |
3385 # } | |
3386 | |
3387 # ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) { | |
3388 | |
3389 # $last_chr = $chr; | |
3390 # $last_pos = $pos; | |
3391 # if ($methylcalls[2] > 0) { | |
3392 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
3393 # } | |
3394 # ############################################# } | |
3395 | |
3396 # close $ifh or die $!; | |
3397 | |
3398 # @methylcalls = qw (0 0 0); # resetting @methylcalls | |
3399 | |
3400 # ### deleting temporary files | |
3401 # my $delete = unlink $in; | |
3402 # if ($delete) { | |
3403 # warn "Successfully deleted the temporary input file $in\n\n"; | |
3404 # } | |
3405 # else { | |
3406 # warn "The temporary inputfile $in could not be deleted $!\n\n"; | |
3407 # } | |
3408 # } | |
3409 # } | |
3410 | |
3411 # sub generate_output{ | |
3412 # my $methcount = $methylcalls[0]; | |
3413 # my $nonmethcount = $methylcalls[1]; | |
3414 # my $totalcount = $methylcalls[2]; | |
3415 # my $last_chr = shift; | |
3416 # my $last_pos = shift; | |
3417 # croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0; | |
3418 # croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount); | |
3419 | |
3420 # ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation { | |
3421 | |
3422 # my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based. | |
3423 # my $meth_percentage; | |
3424 # ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef); | |
3425 # # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/; | |
3426 # if (defined $meth_percentage){ | |
3427 # if ($counts){ | |
3428 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; | |
3429 # } | |
3430 # else{ | |
3431 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n"; | |
3432 # } | |
3433 # } | |
3434 # ############################################# } | |
3435 # } | |
3436 | |
3437 # sub validate_methylation_call{ | |
3438 # my $meth_state = shift; | |
3439 # croak "Missing (+/-) methylation call" unless defined $meth_state; | |
3440 # my $meth_state2 = shift; | |
3441 # croak "Missing alphabetical methylation call" unless defined $meth_state2; | |
3442 # my $is_consistent; | |
3443 # ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) | |
3444 # : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2)); | |
3445 # return 1 if $is_consistent; | |
3446 # return 0; | |
3447 # } | |
3448 | |
3449 # sub check_CpG_methylation_call{ | |
3450 # my $meth1 = shift; | |
3451 # my $meth2 = shift; | |
3452 # return 1 if($meth1 eq "+" && $meth2 eq "Z"); | |
3453 # return 1 if($meth1 eq "-" && $meth2 eq "z"); | |
3454 # return 0; | |
3455 # } | |
3456 | |
3457 # sub check_nonCpG_methylation_call{ | |
3458 # my $meth1 = shift; | |
3459 # my $meth2 = shift; | |
3460 # return 1 if($meth1 eq "+" && $meth2 eq "C"); | |
3461 # return 1 if($meth1 eq "+" && $meth2 eq "X"); | |
3462 # return 1 if($meth1 eq "+" && $meth2 eq "H"); | |
3463 # return 1 if($meth1 eq "-" && $meth2 eq "c"); | |
3464 # return 1 if($meth1 eq "-" && $meth2 eq "x"); | |
3465 # return 1 if($meth1 eq "-" && $meth2 eq "h"); | |
3466 # return 0; | |
3467 # } | |
3468 | |
3469 ####################################################################################################################################### | |
3470 ### bismark2bedGaph section - END | |
3471 ####################################################################################################################################### | |
3472 | |
3473 | |
3474 | |
3475 | |
3476 | |
3477 | |
3478 # ####################################################################################################################################### | |
3479 # ### genome-wide cytosine methylation report - START | |
3480 # ####################################################################################################################################### | |
3481 | |
3482 ### has now been moved to the external script bedGraph2cytosine | |
3483 | |
3484 # sub generate_genome_wide_cytosine_report { | |
3485 | |
3486 # warn "="x78,"\n"; | |
3487 # warn "Methylation information will now be written into a genome-wide cytosine report\n"; | |
3488 # warn "="x78,"\n\n"; | |
3489 # sleep (2); | |
3490 | |
3491 # ### changing to the output directory again | |
3492 # unless ($output_dir eq ''){ # default | |
3493 # chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
3494 # # warn "Changed directory to $output_dir\n"; | |
3495 # } | |
3496 | |
3497 # my $in = shift; | |
3498 # open (IN,$in) or die $!; | |
3499 | |
3500 # my $cytosine_out = shift; | |
3501 | |
3502 # if ($CX_context){ | |
3503 # $cytosine_out =~ s/$/genome-wide_CX_report.txt/; | |
3504 # } | |
3505 # else{ | |
3506 # $cytosine_out =~ s/$/genome_wide_CpG_report.txt/; | |
3507 # } | |
3508 | |
3509 # ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands | |
3510 # unless ($split_by_chromosome){ ### writing all output to a single file (default) | |
3511 # open (CYT,'>',$cytosine_out) or die $!; | |
3512 # warn "Writing genome-wide cytosine report to: $cytosine_out\n"; | |
3513 # sleep (3); | |
3514 # } | |
3515 | |
3516 # my $last_chr; | |
3517 # my %chr; # storing reads for one chromosome at a time | |
3518 | |
3519 # my $count = 0; | |
3520 # while (<IN>){ | |
3521 # chomp; | |
3522 # ++$count; | |
3523 # my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); | |
3524 | |
3525 # # defining the first chromosome | |
3526 # unless (defined $last_chr){ | |
3527 # $last_chr = $chr; | |
3528 # # warn "Storing all covered cytosine positions for chromosome: $chr\n"; | |
3529 # } | |
3530 | |
3531 # if ($chr eq $last_chr){ | |
3532 # $chr{$chr}->{$start}->{meth} = $meth; | |
3533 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
3534 # } | |
3535 # else{ | |
3536 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
3537 | |
3538 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
3539 # my $chromosome_out = $cytosine_out; | |
3540 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/; | |
3541 # open (CYT,'>',$chromosome_out) or die $!; | |
3542 # } | |
3543 | |
3544 # while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
3545 | |
3546 # my $tri_nt = ''; | |
3547 # my $context = ''; | |
3548 # my $pos = pos$chromosomes{$last_chr}; | |
3549 | |
3550 # my $strand; | |
3551 # my $meth = 0; | |
3552 # my $nonmeth = 0; | |
3553 | |
3554 # if ($1 eq 'C'){ # C on forward strand | |
3555 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
3556 # $strand = '+'; | |
3557 # } | |
3558 # elsif ($1 eq 'G'){ # C on reverse strand | |
3559 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
3560 # $tri_nt = reverse $tri_nt; | |
3561 # $tri_nt =~ tr/ACTG/TGAC/; | |
3562 # $strand = '-'; | |
3563 # } | |
3564 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
3565 | |
3566 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! | |
3567 # $meth = $chr{$last_chr}->{$pos-1}->{meth}; | |
3568 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
3569 # } | |
3570 | |
3571 # ### determining cytosine context | |
3572 # if ($tri_nt =~ /^CG/){ | |
3573 # $context = 'CG'; | |
3574 # } | |
3575 # elsif ($tri_nt =~ /^C.{1}G$/){ | |
3576 # $context = 'CHG'; | |
3577 # } | |
3578 # elsif ($tri_nt =~ /^C.{2}$/){ | |
3579 # $context = 'CHH'; | |
3580 # } | |
3581 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
3582 # warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
3583 # next; | |
3584 # } | |
3585 | |
3586 # if ($CpG_only){ | |
3587 # if ($tri_nt =~ /^CG/){ # CpG context is the default | |
3588 # if ($zero){ # zero based coordinates | |
3589 # $pos -= 1; | |
3590 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3591 # } | |
3592 # else{ # default | |
3593 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3594 # } | |
3595 # } | |
3596 # } | |
3597 # else{ ## all cytosines, specified with --CX | |
3598 # if ($zero){ # zero based coordinates | |
3599 # $pos -= 1; | |
3600 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3601 # } | |
3602 # else{ # default | |
3603 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3604 # } | |
3605 # } | |
3606 # } | |
3607 | |
3608 # %chr = (); # resetting the hash | |
3609 | |
3610 # # new first entry | |
3611 # $last_chr = $chr; | |
3612 # $chr{$chr}->{$start}->{meth} = $meth; | |
3613 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
3614 # } | |
3615 # } | |
3616 | |
3617 # # Last found chromosome | |
3618 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
3619 | |
3620 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
3621 # my $chromosome_out = $cytosine_out; | |
3622 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/; | |
3623 # open (CYT,'>',$chromosome_out) or die $!; | |
3624 # } | |
3625 | |
3626 # while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
3627 | |
3628 # my $tri_nt; | |
3629 # my $context; | |
3630 # my $pos = pos$chromosomes{$last_chr}; | |
3631 | |
3632 # my $strand; | |
3633 # my $meth = 0; | |
3634 # my $nonmeth = 0; | |
3635 | |
3636 # if ($1 eq 'C'){ # C on forward strand | |
3637 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
3638 # $strand = '+'; | |
3639 # } | |
3640 # elsif ($1 eq 'G'){ # C on reverse strand | |
3641 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
3642 # $tri_nt = reverse $tri_nt; | |
3643 # $tri_nt =~ tr/ACTG/TGAC/; | |
3644 # $strand = '-'; | |
3645 # } | |
3646 | |
3647 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! | |
3648 # $meth = $chr{$last_chr}->{$pos-1}->{meth}; | |
3649 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
3650 # } | |
3651 | |
3652 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
3653 | |
3654 # ### determining cytosine context | |
3655 # if ($tri_nt =~ /^CG/){ | |
3656 # $context = 'CG'; | |
3657 # } | |
3658 # elsif ($tri_nt =~ /^C.{1}G$/){ | |
3659 # $context = 'CHG'; | |
3660 # } | |
3661 # elsif ($tri_nt =~ /^C.{2}$/){ | |
3662 # $context = 'CHH'; | |
3663 # } | |
3664 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
3665 # warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
3666 # next; | |
3667 # } | |
3668 | |
3669 # if ($CpG_only){ | |
3670 # if ($tri_nt =~ /^CG/){ # CpG context is the default | |
3671 # if ($zero){ # zero-based coordinates | |
3672 # $pos -= 1; | |
3673 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3674 # } | |
3675 # else{ # default | |
3676 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3677 # } | |
3678 # } | |
3679 # } | |
3680 # else{ ## all cytosines, specified with --CX | |
3681 # if ($zero){ # zero based coordinates | |
3682 # $pos -= 1; | |
3683 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3684 # } | |
3685 # else{ # default | |
3686 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3687 # } | |
3688 # } | |
3689 # } | |
3690 # close CYT or die $!; | |
3691 # } | |
3692 | |
3693 | |
3694 # sub read_genome_into_memory{ | |
3695 | |
3696 # ## reading in and storing the specified genome in the %chromosomes hash | |
3697 # chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; | |
3698 # warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; | |
3699 | |
3700 # my @chromosome_filenames = <*.fa>; | |
3701 | |
3702 # ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta | |
3703 # unless (@chromosome_filenames){ | |
3704 # @chromosome_filenames = <*.fasta>; | |
3705 # } | |
3706 # unless (@chromosome_filenames){ | |
3707 # die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; | |
3708 # } | |
3709 | |
3710 # foreach my $chromosome_filename (@chromosome_filenames){ | |
3711 | |
3712 # # skipping the tophat entire mouse genome fasta file | |
3713 # next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); | |
3714 | |
3715 # open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; | |
3716 # ### first line needs to be a fastA header | |
3717 # my $first_line = <CHR_IN>; | |
3718 # chomp $first_line; | |
3719 # $first_line =~ s/\r//; # removing /r carriage returns | |
3720 | |
3721 # ### Extracting chromosome name from the FastA header | |
3722 # my $chromosome_name = extract_chromosome_name($first_line); | |
3723 | |
3724 # my $sequence; | |
3725 # while (<CHR_IN>){ | |
3726 # chomp; | |
3727 # $_ =~ s/\r//; # removing /r carriage returns | |
3728 | |
3729 # if ($_ =~ /^>/){ | |
3730 # ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) | |
3731 # if (exists $chromosomes{$chromosome_name}){ | |
3732 # warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3733 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; | |
3734 # } | |
3735 # else { | |
3736 # if (length($sequence) == 0){ | |
3737 # warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; | |
3738 # } | |
3739 # warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3740 # $chromosomes{$chromosome_name} = $sequence; | |
3741 # } | |
3742 # ### resetting the sequence variable | |
3743 # $sequence = ''; | |
3744 # ### setting new chromosome name | |
3745 # $chromosome_name = extract_chromosome_name($_); | |
3746 # } | |
3747 # else{ | |
3748 # $sequence .= uc$_; | |
3749 # } | |
3750 # } | |
3751 | |
3752 # if (exists $chromosomes{$chromosome_name}){ | |
3753 # warn "chr $chromosome_name (",length $sequence ," bp)\t"; | |
3754 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; | |
3755 # } | |
3756 # else{ | |
3757 # if (length($sequence) == 0){ | |
3758 # warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; | |
3759 # } | |
3760 # warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3761 # $chromosomes{$chromosome_name} = $sequence; | |
3762 # } | |
3763 # } | |
3764 # warn "\n"; | |
3765 # chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; | |
3766 # } | |
3767 | |
3768 # sub extract_chromosome_name { | |
3769 # ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well | |
3770 # my $fasta_header = shift; | |
3771 # if ($fasta_header =~ s/^>//){ | |
3772 # my ($chromosome_name) = split (/\s+/,$fasta_header); | |
3773 # return $chromosome_name; | |
3774 # } | |
3775 # else{ | |
3776 # die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; | |
3777 # } | |
3778 # } | |
3779 | |
3780 # ####################################################################################################################################### | |
3781 # ### genome-wide cytosine methylation report - END | |
3782 # ####################################################################################################################################### | |
3783 | |
3784 | |
3785 | |
3786 | |
3787 sub print_helpfile{ | |
3788 | |
3789 print << 'HOW_TO'; | |
3790 | |
3791 | |
3792 DESCRIPTION | |
3793 | |
3794 The following is a brief description of all options to control the Bismark | |
3795 methylation extractor. The script reads in a bisulfite read alignment results file | |
3796 produced by the Bismark bisulfite mapper and extracts the methylation information | |
3797 for individual cytosines. This information is found in the methylation call field | |
3798 which can contain the following characters: | |
3799 | |
3800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
3801 ~~~ X for methylated C in CHG context (was protected) ~~~ | |
3802 ~~~ x for not methylated C CHG (was converted) ~~~ | |
3803 ~~~ H for methylated C in CHH context (was protected) ~~~ | |
3804 ~~~ h for not methylated C in CHH context (was converted) ~~~ | |
3805 ~~~ Z for methylated C in CpG context (was protected) ~~~ | |
3806 ~~~ z for not methylated C in CpG context (was converted) ~~~ | |
3807 ~~~ . for any bases not involving cytosines ~~~ | |
3808 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
3809 | |
3810 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH | |
3811 context (this distinction is actually already made in Bismark itself). As the methylation | |
3812 information for every C analysed can produce files which easily have tens or even hundreds of | |
3813 millions of lines, file sizes can become very large and more difficult to handle. The C | |
3814 methylation info additionally splits cytosine methylation calls up into one of the four possible | |
3815 strands a given bisulfite read aligned against: | |
3816 | |
3817 OT original top strand | |
3818 CTOT complementary to original top strand | |
3819 | |
3820 OB original bottom strand | |
3821 CTOB complementary to original bottom strand | |
3822 | |
3823 Thus, by default twelve individual output files are being generated per input file (unless | |
3824 --comprehensive is specified, see below). The output files can be imported into a genome | |
3825 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact | |
3826 unless the bisulfite reads were generated preserving directionality it doesn't make any | |
3827 sense to look at the data in a strand-specific manner). Strand-specific output files can | |
3828 optionally be skipped, in which case only three output files for CpG, CHG or CHH context | |
3829 will be generated. For both the strand-specific and comprehensive outputs there is also | |
3830 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. | |
3831 | |
3832 | |
3833 The output files are in the following format (tab delimited): | |
3834 | |
3835 <sequence_id> <strand> <chromosome> <position> <methylation call> | |
3836 | |
3837 | |
3838 USAGE: methylation_extractor [options] <filenames> | |
3839 | |
3840 | |
3841 ARGUMENTS: | |
3842 | |
3843 <filenames> A space-separated list of Bismark result files in SAM format from | |
3844 which methylation information is extracted for every cytosine in | |
3845 the reads. For alignment files in the older custom Bismark output | |
3846 see option '--vanilla'. | |
3847 | |
3848 OPTIONS: | |
3849 | |
3850 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end | |
3851 read data. Specifying either --single-end or --paired-end is | |
3852 mandatory. | |
3853 | |
3854 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end | |
3855 read data. Specifying either --paired-end or --single-end is | |
3856 mandatory. | |
3857 | |
3858 --vanilla The Bismark result input file(s) are in the old custom Bismark format | |
3859 (up to version 0.5.x) and not in SAM format which is the default as | |
3860 of Bismark version 0.6.x or higher. Default: OFF. | |
3861 | |
3862 --no_overlap For paired-end reads it is theoretically possible that read_1 and | |
3863 read_2 overlap. This option avoids scoring overlapping methylation | |
3864 calls twice (only methylation calls of read 1 are used for in the process | |
3865 since read 1 has historically higher quality basecalls than read 2). | |
3866 Whilst this option removes a bias towards more methylation calls | |
3867 in the center of sequenced fragments it may de facto remove a sizable | |
3868 proportion of the data. This option is highly recommended for paired-end | |
3869 data. | |
3870 | |
3871 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the | |
3872 methylation call string. This can remove e.g. a restriction enzyme site | |
3873 at the start of each read. | |
3874 | |
3875 --comprehensive Specifying this option will merge all four possible strand-specific | |
3876 methylation info into context-dependent output files. The default | |
3877 contexts are: | |
3878 - CpG context | |
3879 - CHG context | |
3880 - CHH context | |
3881 | |
3882 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight | |
3883 strand-specific output files (default) for Cs in | |
3884 - CpG context | |
3885 - non-CpG context | |
3886 | |
3887 --report Prints out a short methylation summary as well as the paramaters used to run | |
3888 this script. | |
3889 | |
3890 --no_header Suppresses the Bismark version header line in all output files for more convenient | |
3891 batch processing. | |
3892 | |
3893 -o/--output DIR Allows specification of a different output directory (absolute or relative | |
3894 path). If not specified explicitely, the output will be written to the current directory. | |
3895 | |
3896 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified | |
3897 explicitly if Samtools is in the PATH already. | |
3898 | |
3899 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in | |
3900 a GZIP compressed form to save disk space. This option does not work on bedGraph and | |
3901 genome-wide cytosine reports as they are 'tiny' anyway. | |
3902 | |
3903 --version Displays version information. | |
3904 | |
3905 -h/--help Displays this help file and exits. | |
3906 | |
3907 | |
3908 | |
3909 bedGraph specific options: | |
3910 | |
3911 --bedGraph After finishing the methylation extraction, the methylation output is written into a | |
3912 sorted bedGraph file that reports the position of a given cytosine and its methylation | |
3913 state (in %, see details below). The methylation extractor output is temporarily split up into | |
3914 temporary files, one per chromosome (written into the current directory or folder | |
3915 specified with -o/--output); these temp files are then used for sorting and deleted | |
3916 afterwards. By default, only cytosines in CpG context will be sorted. The option | |
3917 '--CX_context' may be used to report all cytosines irrespective of sequence context | |
3918 (this will take MUCH longer!). The default folder for temporary files during the sorting | |
3919 process is the output directory. The bedGraph conversion step is performed by the external | |
3920 module 'bismark2bedGraph'; this script needs to reside in the same folder as the | |
3921 bismark_methylation_extractor itself. | |
3922 | |
3923 | |
3924 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide | |
3925 before its methylation percentage is reported. Default: 1. | |
3926 | |
3927 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. | |
3928 | |
3929 | |
3930 --counts Adds two additional columns to the output file to enable further calculations: | |
3931 col 5: number of methylated calls | |
3932 col 6: number of unmethylated calls | |
3933 This option is required if '--cytosine_report' is specified (and will be set automatically if | |
3934 necessary). | |
3935 | |
3936 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered | |
3937 in the experiment irrespective of its sequence context. This applies to both forward and | |
3938 reverse strands. Please be aware that this option may generate large temporary and output files | |
3939 and may take a long time to sort (up to many hours). Default: OFF. | |
3940 (i.e. Default = CpG context only). | |
3941 | |
3942 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information. | |
3943 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or | |
3944 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. | |
3945 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. | |
3946 Defaults to 2G. | |
3947 | |
3948 | |
3949 Genome-wide cytosine methylation report specific options: | |
3950 | |
3951 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a | |
3952 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based | |
3953 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all | |
3954 cytosine context is optional). The output considers all Cs on both forward and reverse strands and | |
3955 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not | |
3956 covered). The cytsoine report conversion step is performed by the external module | |
3957 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor | |
3958 itself. | |
3959 | |
3960 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of | |
3961 its context. This applies to both forward and reverse strands. Please be aware that this will | |
3962 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. | |
3963 Default: OFF (i.e. Default = CpG context only). | |
3964 | |
3965 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF. | |
3966 | |
3967 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted | |
3968 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. | |
3969 | |
3970 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files | |
3971 will be named to include the input filename and the chromosome number. | |
3972 | |
3973 | |
3974 | |
3975 OUTPUT: | |
3976 | |
3977 The bismark_methylation_extractor output is in the form: | |
3978 ======================================================== | |
3979 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call> | |
3980 | |
3981 * Methylated cytosines receive a '+' orientation, | |
3982 * Unmethylated cytosines receive a '-' orientation. | |
3983 | |
3984 | |
3985 | |
3986 The bedGraph output (optional) looks like this (tab-delimited): | |
3987 =============================================================== | |
3988 <chromosome> <start position> <end position> <methylation percentage> | |
3989 | |
3990 The bedGraph output with '--counts' specified looks like this (tab-delimited): | |
3991 | |
3992 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> | |
3993 | |
3994 | |
3995 | |
3996 The genome-wide cytosine methylation output file is tab-delimited in the following format: | |
3997 ========================================================================================== | |
3998 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> | |
3999 | |
4000 | |
4001 | |
4002 This script was last modified on 21 April 2013. | |
4003 | |
4004 HOW_TO | |
4005 } |