Mercurial > repos > bjoern-gruening > bismark
comparison bismark_methylation_extractor @ 0:36d124f44c0a draft
inital commit
author | bjoern-gruening |
---|---|
date | Tue, 25 Dec 2012 05:45:46 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:36d124f44c0a |
---|---|
1 #!/usr/bin/perl | |
2 use warnings; | |
3 use strict; | |
4 $|++; | |
5 use Getopt::Long; | |
6 use Cwd; | |
7 use Carp; | |
8 | |
9 my @filenames; # input files | |
10 my %counting; | |
11 my $parent_dir = getcwd(); | |
12 | |
13 my %fhs; | |
14 | |
15 my $version = 'v0.7.7'; | |
16 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome) = process_commandline(); | |
17 | |
18 | |
19 ### only needed for bedGraph output | |
20 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files | |
21 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total | |
22 my @bedfiles; | |
23 | |
24 ### only needed for genome-wide cytosine methylation report | |
25 my %chromosomes; | |
26 | |
27 ############################################################################################## | |
28 ### Summarising Run Parameters | |
29 ############################################################################################## | |
30 | |
31 ### METHYLATION EXTRACTOR | |
32 | |
33 warn "Summarising Bismark methylation extractor parameters:\n"; | |
34 warn '='x63,"\n"; | |
35 | |
36 if ($single){ | |
37 if ($vanilla){ | |
38 warn "Bismark single-end vanilla format specified\n"; | |
39 } | |
40 else{ | |
41 warn "Bismark single-end SAM format specified (default)\n"; # default | |
42 } | |
43 } | |
44 elsif ($paired){ | |
45 if ($vanilla){ | |
46 warn "Bismark paired-end vanilla format specified\n"; | |
47 } | |
48 else{ | |
49 warn "Bismark paired-end SAM format specified (default)\n"; # default | |
50 } | |
51 } | |
52 | |
53 if ($ignore){ | |
54 warn "First $ignore bases will be disregarded when processing the methylation call string\n"; | |
55 } | |
56 | |
57 if ($full){ | |
58 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; | |
59 } | |
60 if ($merge_non_CpG){ | |
61 warn "Merge CHG and CHH context to non-CpG context specified\n"; | |
62 } | |
63 ### output directory | |
64 if ($output_dir eq ''){ | |
65 warn "Output will be written to the current directory ('$parent_dir')\n"; | |
66 } | |
67 else{ | |
68 warn "Output path specified as: $output_dir\n"; | |
69 } | |
70 | |
71 | |
72 sleep (1); | |
73 | |
74 ### BEDGRAPH | |
75 | |
76 if ($bedGraph){ | |
77 warn "\n\nSummarising bedGraph parameters:\n"; | |
78 warn '='x63,"\n"; | |
79 | |
80 if ($counts){ | |
81 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n"; | |
82 } | |
83 else{ | |
84 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; | |
85 } | |
86 | |
87 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; | |
88 | |
89 if ($CX_context){ | |
90 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; | |
91 } | |
92 else{ # default | |
93 $CpG_only = 1; | |
94 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; | |
95 } | |
96 | |
97 if ($remove){ | |
98 warn "White spaces in read ID names will be removed prior to sorting\n"; | |
99 } | |
100 | |
101 sleep (1); | |
102 | |
103 if ($cytosine_report){ | |
104 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; | |
105 warn '='x63,"\n"; | |
106 warn "Generating comprehensive genome-wide cytosine report (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> )\n"; | |
107 | |
108 | |
109 if ($CX_context){ | |
110 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; | |
111 } | |
112 else{ # default | |
113 $CpG_only = 1; | |
114 warn "Reporting cytosine methylation in CpG context only (default)\n"; | |
115 } | |
116 | |
117 if ($split_by_chromosome){ | |
118 warn "Splitting the cytosine report output up into individual files for each chromosome\n"; | |
119 } | |
120 | |
121 ### Zero-based coordinates | |
122 if ($zero){ | |
123 warn "Using zero-based genomic coordinates (user-defined)\n"; | |
124 } | |
125 else{ # default, 1-based coords | |
126 warn "Using 1-based genomic coordinates (default)\n"; | |
127 } | |
128 | |
129 ### GENOME folder | |
130 if ($genome_folder){ | |
131 unless ($genome_folder =~/\/$/){ | |
132 $genome_folder =~ s/$/\//; | |
133 } | |
134 warn "Genome folder was specified as $genome_folder\n"; | |
135 } | |
136 else{ | |
137 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; | |
138 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; | |
139 } | |
140 sleep (1); | |
141 } | |
142 } | |
143 | |
144 warn "\n"; | |
145 sleep (5); | |
146 | |
147 ###################################################### | |
148 ### PROCESSING FILES | |
149 ###################################################### | |
150 | |
151 foreach my $filename (@filenames){ | |
152 # resetting counters and filehandles | |
153 %fhs = (); | |
154 %counting =( | |
155 total_meCHG_count => 0, | |
156 total_meCHH_count => 0, | |
157 total_meCpG_count => 0, | |
158 total_unmethylated_CHG_count => 0, | |
159 total_unmethylated_CHH_count => 0, | |
160 total_unmethylated_CpG_count => 0, | |
161 sequences_count => 0, | |
162 ); | |
163 @sorting_files = (); | |
164 @bedfiles = (); | |
165 | |
166 process_Bismark_results_file($filename); | |
167 | |
168 if ($bedGraph){ | |
169 my $out = $filename; | |
170 $out =~ s/sam$//; | |
171 $out =~ s/txt$//; | |
172 $out =~ s/$/bedGraph/; | |
173 | |
174 my $bedGraph_output = $out; | |
175 open (OUT,'>',$output_dir.$out) or die $!; | |
176 # warn "Writing bedGraph to file: $out\n"; | |
177 | |
178 process_bedGraph_output(); | |
179 close OUT or die $!; | |
180 | |
181 ### genome-wide cytosine methylation report requires bedGraph processing anyway | |
182 if ($cytosine_report){ | |
183 my $cytosine_out = $out; | |
184 $cytosine_out =~ s/bedGraph$//; | |
185 | |
186 read_genome_into_memory(); | |
187 generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); | |
188 } | |
189 } | |
190 } | |
191 | |
192 | |
193 sub process_commandline{ | |
194 my $help; | |
195 my $single_end; | |
196 my $paired_end; | |
197 my $ignore; | |
198 my $genomic_fasta; | |
199 my $full; | |
200 my $report; | |
201 my $extractor_version; | |
202 my $no_overlap; | |
203 my $merge_non_CpG; | |
204 my $vanilla; | |
205 my $output_dir; | |
206 my $no_header; | |
207 my $bedGraph; | |
208 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status | |
209 my $remove; | |
210 my $counts; | |
211 my $cytosine_report; | |
212 my $genome_folder; | |
213 my $zero; | |
214 my $CpG_only; | |
215 my $CX_context; | |
216 my $split_by_chromosome; | |
217 | |
218 | |
219 my $command_line = GetOptions ('help|man' => \$help, | |
220 'p|paired-end' => \$paired_end, | |
221 's|single-end' => \$single_end, | |
222 'fasta' => \$genomic_fasta, | |
223 'ignore=i' => \$ignore, | |
224 'comprehensive' => \$full, | |
225 'report' => \$report, | |
226 'version' => \$extractor_version, | |
227 'no_overlap' => \$no_overlap, | |
228 'merge_non_CpG' => \$merge_non_CpG, | |
229 'vanilla' => \$vanilla, | |
230 'o|output=s' => \$output_dir, | |
231 'no_header' => \$no_header, | |
232 'bedGraph' => \$bedGraph, | |
233 "cutoff=i" => \$coverage_threshold, | |
234 "remove_spaces" => \$remove, | |
235 "counts" => \$counts, | |
236 "cytosine_report" => \$cytosine_report, | |
237 'g|genome_folder=s' => \$genome_folder, | |
238 "zero_based" => \$zero, | |
239 "CX|CX_context" => \$CX_context, | |
240 "split_by_chromosome" => \$split_by_chromosome, | |
241 ); | |
242 | |
243 ### EXIT ON ERROR if there were errors with any of the supplied options | |
244 unless ($command_line){ | |
245 die "Please respecify command line options\n"; | |
246 } | |
247 | |
248 ### HELPFILE | |
249 if ($help){ | |
250 print_helpfile(); | |
251 exit; | |
252 } | |
253 | |
254 if ($extractor_version){ | |
255 print << "VERSION"; | |
256 | |
257 | |
258 Bismark Methylation Extractor | |
259 | |
260 Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics | |
261 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
262 | |
263 | |
264 VERSION | |
265 exit; | |
266 } | |
267 | |
268 | |
269 ### no files provided | |
270 unless (@ARGV){ | |
271 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; | |
272 } | |
273 @filenames = @ARGV; | |
274 | |
275 warn "\n *** Bismark methylation extractor version $version ***\n\n"; | |
276 | |
277 ### IGNORING <INT> bases at the start of the read when processing the methylation call string | |
278 unless ($ignore){ | |
279 $ignore = 0; | |
280 } | |
281 ### PRINT A REPORT | |
282 unless ($report){ | |
283 $report = 0; | |
284 } | |
285 | |
286 ### OUTPUT DIR PATH | |
287 if ($output_dir){ | |
288 unless ($output_dir =~ /\/$/){ | |
289 $output_dir =~ s/$/\//; | |
290 } | |
291 } | |
292 else{ | |
293 $output_dir = ''; | |
294 } | |
295 | |
296 ### NO HEADER | |
297 unless ($no_header){ | |
298 $no_header = 0; | |
299 } | |
300 | |
301 ### OLD (VANILLA) OUTPUT FORMAT | |
302 unless ($vanilla){ | |
303 $vanilla = 0; | |
304 } | |
305 | |
306 if ($single_end){ | |
307 $paired_end = 0; ### SINGLE END ALIGNMENTS | |
308 } | |
309 elsif ($paired_end){ | |
310 $single_end = 0; ### PAIRED-END ALIGNMENTS | |
311 } | |
312 else{ | |
313 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; | |
314 } | |
315 | |
316 ### NO OVERLAP | |
317 if ($no_overlap){ | |
318 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); | |
319 } | |
320 else{ | |
321 $no_overlap = 0; | |
322 } | |
323 | |
324 ### COMPREHENSIVE OUTPUT | |
325 unless ($full){ | |
326 $full = 0; | |
327 } | |
328 | |
329 ### MERGE NON-CpG context | |
330 unless ($merge_non_CpG){ | |
331 $merge_non_CpG = 0; | |
332 } | |
333 | |
334 ### remove white spaces in read ID (needed for sorting using the sort command | |
335 unless ($remove){ | |
336 $remove = 0; | |
337 } | |
338 | |
339 ### COVERAGE THRESHOLD FOR gedGraph OUTPUT | |
340 unless (defined $coverage_threshold){ | |
341 unless ($coverage_threshold > 0){ | |
342 die "Please select a coverage greater than 0 (positive integers only)\n"; | |
343 } | |
344 $coverage_threshold = 1; | |
345 } | |
346 | |
347 if ($zero){ | |
348 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report); | |
349 } | |
350 | |
351 if ($CX_context){ | |
352 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); | |
353 } | |
354 | |
355 if ($cytosine_report){ | |
356 | |
357 ### GENOME folder | |
358 if ($genome_folder){ | |
359 unless ($genome_folder =~/\/$/){ | |
360 $genome_folder =~ s/$/\//; | |
361 } | |
362 } | |
363 else{ | |
364 die "Please specify a genome folder to proceed (full path only)\n"; | |
365 } | |
366 | |
367 unless ($bedGraph){ | |
368 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; | |
369 $bedGraph = 1; | |
370 } | |
371 unless ($counts){ | |
372 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; | |
373 $counts = 1; | |
374 } | |
375 warn "\n"; | |
376 } | |
377 | |
378 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome); | |
379 } | |
380 | |
381 | |
382 sub process_Bismark_results_file{ | |
383 my $filename = shift; | |
384 | |
385 warn "\nNow reading in Bismark result file $filename\n\n"; | |
386 | |
387 if ($filename =~ /\.gz$/) { | |
388 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
389 } else { | |
390 open (IN,$filename) or die "Can't open file $filename: $!\n"; | |
391 } | |
392 | |
393 ### Vanilla and SAM output need to read different numbers of header lines | |
394 if ($vanilla) { | |
395 my $bismark_version = <IN>; ## discarding the Bismark version info | |
396 chomp $bismark_version; | |
397 $bismark_version =~ s/\r//; # replaces \r line feed | |
398 $bismark_version =~ s/Bismark version: //; | |
399 if ($bismark_version =~ /^\@/) { | |
400 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; | |
401 sleep (2); | |
402 } | |
403 | |
404 unless ($version eq $bismark_version){ | |
405 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; | |
406 } | |
407 } else { | |
408 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. | |
409 # We are reading from it further down | |
410 } | |
411 | |
412 my $output_filename = (split (/\//,$filename))[-1]; | |
413 | |
414 ### OPENING OUTPUT-FILEHANDLES | |
415 if ($report) { | |
416 my $report_filename = $output_filename; | |
417 $report_filename =~ s/[\.sam|\.txt]$//; | |
418 $report_filename =~ s/$/_splitting_report.txt/; | |
419 $report_filename = $output_dir . $report_filename; | |
420 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; | |
421 } | |
422 | |
423 if ($report) { | |
424 print REPORT "$output_filename\n\n"; | |
425 print REPORT "Parameters used to extract methylation information:\n"; | |
426 if ($paired) { | |
427 if ($vanilla) { | |
428 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; | |
429 } else { | |
430 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default | |
431 } | |
432 } | |
433 | |
434 if ($single) { | |
435 if ($vanilla) { | |
436 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; | |
437 } else { | |
438 print REPORT "Bismark result file: single-end (SAM format)\n"; # default | |
439 } | |
440 } | |
441 | |
442 if ($ignore) { | |
443 print REPORT "Ignoring first $ignore bases\n"; | |
444 } | |
445 | |
446 if ($full) { | |
447 print REPORT "Output specified: comprehensive\n"; | |
448 } else { | |
449 print REPORT "Output specified: strand-specific (default)\n"; | |
450 } | |
451 | |
452 if ($no_overlap) { | |
453 print REPORT "No overlapping methylation calls specified\n"; | |
454 } | |
455 if ($genomic_fasta) { | |
456 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; | |
457 } | |
458 if ($merge_non_CpG) { | |
459 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; | |
460 } | |
461 | |
462 print REPORT "\n"; | |
463 } | |
464 | |
465 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL | |
466 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file | |
467 if ($full and $merge_non_CpG) { | |
468 my $cpg_output = my $other_c_output = $output_filename; | |
469 ### C in CpG context | |
470 $cpg_output =~ s/^/CpG_context_/; | |
471 $cpg_output =~ s/sam$/txt/; | |
472 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
473 $cpg_output = $output_dir . $cpg_output; | |
474 push @sorting_files,$cpg_output; | |
475 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | |
476 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | |
477 | |
478 unless ($no_header) { | |
479 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | |
480 } | |
481 | |
482 ### C in any other context than CpG | |
483 $other_c_output =~ s/^/Non_CpG_context_/; | |
484 $other_c_output =~ s/sam$/txt/; | |
485 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); | |
486 $other_c_output = $output_dir . $other_c_output; | |
487 push @sorting_files,$other_c_output; | |
488 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n"; | |
489 print "Writing result file containing methylation information for C in any other context to $other_c_output\n"; | |
490 | |
491 unless ($no_header) { | |
492 print {$fhs{other_context}} "Bismark methylation extractor version $version\n"; | |
493 } | |
494 } | |
495 | |
496 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found | |
497 elsif ($merge_non_CpG) { | |
498 | |
499 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
500 | |
501 ### For cytosines in CpG context | |
502 $cpg_ot =~ s/^/CpG_OT_/; | |
503 $cpg_ot =~ s/sam$/txt/; | |
504 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
505 $cpg_ot = $output_dir . $cpg_ot; | |
506 push @sorting_files,$cpg_ot; | |
507 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | |
508 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | |
509 | |
510 unless($no_header){ | |
511 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | |
512 } | |
513 | |
514 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
515 $cpg_ctot =~ s/sam$/txt/; | |
516 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
517 $cpg_ctot = $output_dir . $cpg_ctot; | |
518 push @sorting_files,$cpg_ctot; | |
519 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | |
520 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | |
521 | |
522 unless($no_header){ | |
523 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | |
524 } | |
525 | |
526 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
527 $cpg_ctob =~ s/sam$/txt/; | |
528 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
529 $cpg_ctob = $output_dir . $cpg_ctob; | |
530 push @sorting_files,$cpg_ctob; | |
531 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | |
532 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | |
533 | |
534 unless($no_header){ | |
535 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | |
536 } | |
537 | |
538 $cpg_ob =~ s/^/CpG_OB_/; | |
539 $cpg_ob =~ s/sam$/txt/; | |
540 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
541 $cpg_ob = $output_dir . $cpg_ob; | |
542 push @sorting_files,$cpg_ob; | |
543 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | |
544 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | |
545 | |
546 unless($no_header){ | |
547 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | |
548 } | |
549 | |
550 ### For cytosines in Non-CpG (CC, CT or CA) context | |
551 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; | |
552 | |
553 $other_c_ot =~ s/^/Non_CpG_OT_/; | |
554 $other_c_ot =~ s/sam$/txt/; | |
555 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); | |
556 $other_c_ot = $output_dir . $other_c_ot; | |
557 push @sorting_files,$other_c_ot; | |
558 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n"; | |
559 print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n"; | |
560 | |
561 unless($no_header){ | |
562 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n"; | |
563 } | |
564 | |
565 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; | |
566 $other_c_ctot =~ s/sam$/txt/; | |
567 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); | |
568 $other_c_ctot = $output_dir . $other_c_ctot; | |
569 push @sorting_files,$other_c_ctot; | |
570 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n"; | |
571 print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n"; | |
572 | |
573 unless($no_header){ | |
574 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n"; | |
575 } | |
576 | |
577 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; | |
578 $other_c_ctob =~ s/sam$/txt/; | |
579 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); | |
580 $other_c_ctob = $output_dir . $other_c_ctob; | |
581 push @sorting_files,$other_c_ctob; | |
582 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n"; | |
583 print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n"; | |
584 | |
585 unless($no_header){ | |
586 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n"; | |
587 } | |
588 | |
589 $other_c_ob =~ s/^/Non_CpG_OB_/; | |
590 $other_c_ob =~ s/sam$/txt/; | |
591 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); | |
592 $other_c_ob = $output_dir . $other_c_ob; | |
593 push @sorting_files,$other_c_ob; | |
594 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n"; | |
595 print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n"; | |
596 | |
597 unless($no_header){ | |
598 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n"; | |
599 } | |
600 } | |
601 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) | |
602 | |
603 ### if --comprehensive was specified we are only writing one file per context | |
604 elsif ($full) { | |
605 my $cpg_output = my $chg_output = my $chh_output = $output_filename; | |
606 ### C in CpG context | |
607 $cpg_output =~ s/^/CpG_context_/; | |
608 $cpg_output =~ s/sam$/txt/; | |
609 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
610 $cpg_output = $output_dir . $cpg_output; | |
611 push @sorting_files,$cpg_output; | |
612 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | |
613 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | |
614 | |
615 unless($no_header){ | |
616 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | |
617 } | |
618 | |
619 ### C in CHG context | |
620 $chg_output =~ s/^/CHG_context_/; | |
621 $chg_output =~ s/sam$/txt/; | |
622 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); | |
623 $chg_output = $output_dir . $chg_output; | |
624 push @sorting_files,$chg_output; | |
625 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n"; | |
626 print "Writing result file containing methylation information for C in CHG context to $chg_output\n"; | |
627 | |
628 unless($no_header){ | |
629 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n"; | |
630 } | |
631 | |
632 ### C in CHH context | |
633 $chh_output =~ s/^/CHH_context_/; | |
634 $chh_output =~ s/sam$/txt/; | |
635 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); | |
636 $chh_output = $output_dir . $chh_output; | |
637 push @sorting_files, $chh_output; | |
638 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n"; | |
639 print "Writing result file containing methylation information for C in CHH context to $chh_output\n"; | |
640 | |
641 unless($no_header){ | |
642 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; | |
643 } | |
644 } | |
645 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found | |
646 else { | |
647 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
648 | |
649 ### For cytosines in CpG context | |
650 $cpg_ot =~ s/^/CpG_OT_/; | |
651 $cpg_ot =~ s/sam$/txt/; | |
652 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
653 $cpg_ot = $output_dir . $cpg_ot; | |
654 push @sorting_files,$cpg_ot; | |
655 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | |
656 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | |
657 | |
658 unless($no_header){ | |
659 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | |
660 } | |
661 | |
662 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
663 $cpg_ctot =~ s/sam$/txt/; | |
664 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
665 $cpg_ctot = $output_dir . $cpg_ctot; | |
666 push @sorting_files,$cpg_ctot; | |
667 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | |
668 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | |
669 | |
670 unless($no_header){ | |
671 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | |
672 } | |
673 | |
674 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
675 $cpg_ctob =~ s/sam$/txt/; | |
676 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
677 $cpg_ctob = $output_dir . $cpg_ctob; | |
678 push @sorting_files,$cpg_ctob; | |
679 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | |
680 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | |
681 | |
682 unless($no_header){ | |
683 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | |
684 } | |
685 | |
686 $cpg_ob =~ s/^/CpG_OB_/; | |
687 $cpg_ob =~ s/sam$/txt/; | |
688 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
689 $cpg_ob = $output_dir . $cpg_ob; | |
690 push @sorting_files,$cpg_ob; | |
691 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | |
692 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | |
693 | |
694 unless($no_header){ | |
695 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | |
696 } | |
697 | |
698 ### For cytosines in CHG context | |
699 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; | |
700 | |
701 $chg_ot =~ s/^/CHG_OT_/; | |
702 $chg_ot =~ s/sam$/txt/; | |
703 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); | |
704 $chg_ot = $output_dir . $chg_ot; | |
705 push @sorting_files,$chg_ot; | |
706 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n"; | |
707 print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n"; | |
708 | |
709 unless($no_header){ | |
710 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n"; | |
711 } | |
712 | |
713 $chg_ctot =~ s/^/CHG_CTOT_/; | |
714 $chg_ctot =~ s/sam$/txt/; | |
715 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); | |
716 $chg_ctot = $output_dir . $chg_ctot; | |
717 push @sorting_files,$chg_ctot; | |
718 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n"; | |
719 print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n"; | |
720 | |
721 unless($no_header){ | |
722 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n"; | |
723 } | |
724 | |
725 $chg_ctob =~ s/^/CHG_CTOB_/; | |
726 $chg_ctob =~ s/sam$/txt/; | |
727 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); | |
728 $chg_ctob = $output_dir . $chg_ctob; | |
729 push @sorting_files,$chg_ctob; | |
730 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n"; | |
731 print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n"; | |
732 | |
733 unless($no_header){ | |
734 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n"; | |
735 } | |
736 | |
737 $chg_ob =~ s/^/CHG_OB_/; | |
738 $chg_ob =~ s/sam$/txt/; | |
739 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); | |
740 $chg_ob = $output_dir . $chg_ob; | |
741 push @sorting_files,$chg_ob; | |
742 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n"; | |
743 print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n"; | |
744 | |
745 unless($no_header){ | |
746 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n"; | |
747 } | |
748 | |
749 ### For cytosines in CHH context | |
750 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; | |
751 | |
752 $chh_ot =~ s/^/CHH_OT_/; | |
753 $chh_ot =~ s/sam$/txt/; | |
754 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); | |
755 $chh_ot = $output_dir . $chh_ot; | |
756 push @sorting_files,$chh_ot; | |
757 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n"; | |
758 print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n"; | |
759 | |
760 unless($no_header){ | |
761 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n"; | |
762 } | |
763 | |
764 $chh_ctot =~ s/^/CHH_CTOT_/; | |
765 $chh_ctot =~ s/sam$/txt/; | |
766 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); | |
767 $chh_ctot = $output_dir . $chh_ctot; | |
768 push @sorting_files,$chh_ctot; | |
769 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n"; | |
770 print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n"; | |
771 | |
772 unless($no_header){ | |
773 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n"; | |
774 } | |
775 | |
776 $chh_ctob =~ s/^/CHH_CTOB_/; | |
777 $chh_ctob =~ s/sam$/txt/; | |
778 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); | |
779 $chh_ctob = $output_dir . $chh_ctob; | |
780 push @sorting_files,$chh_ctob; | |
781 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n"; | |
782 print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n"; | |
783 | |
784 unless($no_header){ | |
785 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n"; | |
786 } | |
787 | |
788 $chh_ob =~ s/^/CHH_OB_/; | |
789 $chh_ob =~ s/sam$/txt/; | |
790 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); | |
791 $chh_ob = $output_dir . $chh_ob; | |
792 push @sorting_files,$chh_ob; | |
793 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n"; | |
794 print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n"; | |
795 | |
796 unless($no_header){ | |
797 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n"; | |
798 } | |
799 } | |
800 | |
801 my $methylation_call_strings_processed = 0; | |
802 my $line_count = 0; | |
803 | |
804 ### proceeding differently now for single-end or paired-end Bismark files | |
805 | |
806 ### PROCESSING SINGLE-END RESULT FILES | |
807 if ($single) { | |
808 | |
809 ### also proceeding differently now for SAM format or vanilla Bismark format files | |
810 if ($vanilla) { # old vanilla Bismark output format | |
811 while (<IN>) { | |
812 ++$line_count; | |
813 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
814 | |
815 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
816 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; | |
817 | |
818 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or | |
819 ### last position | |
820 chomp $genome_conversion; | |
821 | |
822 my $index; | |
823 if ($meth_call) { | |
824 | |
825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
826 $index = 0; | |
827 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
828 $index = 1; | |
829 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
830 $index = 3; | |
831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
832 $index = 2; | |
833 } else { | |
834 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
835 } | |
836 | |
837 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
838 if ($ignore) { | |
839 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
840 | |
841 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
842 if ($strand eq '+') { | |
843 $start += $ignore; | |
844 } elsif ($strand eq '-') { | |
845 $start += length($meth_call)-1; ## $meth_call is already shortened! | |
846 } else { | |
847 die "Alignment did not have proper strand information: $strand\n"; | |
848 } | |
849 } | |
850 ### printing out the methylation state of every C in the read | |
851 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); | |
852 | |
853 ++$methylation_call_strings_processed; # 1 per single-end result | |
854 } | |
855 } | |
856 } else { # processing single-end SAM format (default) | |
857 while (<IN>) { | |
858 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
859 if (/^\@/) { # skipping header lines (starting with @) | |
860 warn "skipping SAM header line:\t$_"; | |
861 next; | |
862 } | |
863 | |
864 ++$line_count; | |
865 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
866 | |
867 # example read in SAM format | |
868 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
869 ### | |
870 | |
871 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
872 # < 0.7.6 $meth_call =~ s/^XM:Z://; | |
873 # < 0.7.6 $read_conversion =~ s/^XR:Z://; | |
874 # < 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
875 | |
876 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; | |
877 | |
878 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
879 my $meth_call; ### Thanks to Zachary Zeno for this solution | |
880 my $read_conversion; | |
881 my $genome_conversion; | |
882 | |
883 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
884 my $tag = $1; | |
885 my $value = $2; | |
886 | |
887 if ($tag eq "XM") { | |
888 $meth_call = $value; | |
889 $meth_call =~ s/\r//; | |
890 } elsif ($tag eq "XR") { | |
891 $read_conversion = $value; | |
892 $read_conversion =~ s/\r//; | |
893 } elsif ($tag eq "XG") { | |
894 $genome_conversion = $value; | |
895 $genome_conversion =~ s/\r//; | |
896 } | |
897 } | |
898 | |
899 my $strand; | |
900 chomp $genome_conversion; | |
901 # print "$meth_call\n$read_conversion\n$genome_conversion\n"; | |
902 | |
903 my $index; | |
904 if ($meth_call) { | |
905 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
906 $index = 0; | |
907 $strand = '+'; | |
908 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
909 $index = 1; | |
910 $strand = '-'; | |
911 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
912 $index = 2; | |
913 $strand = '+'; | |
914 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
915 $index = 3; | |
916 $strand = '-'; | |
917 } else { | |
918 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
919 } | |
920 | |
921 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output | |
922 if ($strand eq '-') { | |
923 $meth_call = reverse $meth_call; | |
924 } | |
925 | |
926 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
927 if ($ignore) { | |
928 # print "\n\n$meth_call\n"; | |
929 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
930 # print "$meth_call\n"; | |
931 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
932 | |
933 my @len = split (/\D+/,$cigar); # storing the length per operation | |
934 my @ops = split (/\d+/,$cigar); # storing the operation | |
935 shift @ops; # remove the empty first element | |
936 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
937 | |
938 my @comp_cigar; # building an array with all CIGAR operations | |
939 foreach my $index (0..$#len) { | |
940 foreach (1..$len[$index]) { | |
941 # print "$ops[$index]"; | |
942 push @comp_cigar, $ops[$index]; | |
943 } | |
944 } | |
945 # print "original CIGAR: $cigar\n"; | |
946 # print "original CIGAR: @comp_cigar\n"; | |
947 | |
948 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
949 if ($strand eq '+') { | |
950 | |
951 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions | |
952 my $I_count = 0; | |
953 | |
954 for (1..$ignore) { | |
955 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start | |
956 # print "$_ deleted $op\n"; | |
957 | |
958 while ($op eq 'D') { # repeating this for deletions (D) | |
959 $D_count++; | |
960 $op = shift @comp_cigar; | |
961 # print "$_ deleted $op\n"; | |
962 } | |
963 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
964 $I_count++; | |
965 } | |
966 } | |
967 $start += $ignore + $D_count - $I_count; | |
968 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; | |
969 } elsif ($strand eq '-') { | |
970 | |
971 for (1..$ignore) { | |
972 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
973 while ($op eq 'D') { # repeating this for deletions (D) | |
974 $op = pop @comp_cigar; | |
975 } | |
976 } | |
977 | |
978 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR | |
979 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence | |
980 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position | |
981 foreach (@comp_cigar) { | |
982 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); | |
983 } | |
984 $start += $MD_count - 1; | |
985 } | |
986 | |
987 ### reconstituting shortened CIGAR string | |
988 my $new_cigar; | |
989 my $count = 0; | |
990 my $last_op; | |
991 # print "ignore adjusted: @comp_cigar\n"; | |
992 foreach my $op (@comp_cigar) { | |
993 unless (defined $last_op){ | |
994 $last_op = $op; | |
995 ++$count; | |
996 next; | |
997 } | |
998 if ($last_op eq $op) { | |
999 ++$count; | |
1000 } else { | |
1001 $new_cigar .= "$count$last_op"; | |
1002 $last_op = $op; | |
1003 $count = 1; | |
1004 } | |
1005 } | |
1006 $new_cigar .= "$count$last_op"; # appending the last operation and count | |
1007 $cigar = $new_cigar; | |
1008 # print "ignore adjusted scalar: $cigar\n"; | |
1009 } | |
1010 } | |
1011 ### printing out the methylation state of every C in the read | |
1012 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); | |
1013 | |
1014 ++$methylation_call_strings_processed; # 1 per single-end result | |
1015 } | |
1016 } | |
1017 } | |
1018 | |
1019 ### PROCESSING PAIRED-END RESULT FILES | |
1020 elsif ($paired) { | |
1021 | |
1022 ### proceeding differently now for SAM format or vanilla Bismark format files | |
1023 if ($vanilla) { # old vanilla Bismark paired-end output format | |
1024 while (<IN>) { | |
1025 ++$line_count; | |
1026 warn "processed line: $line_count\n" if ($line_count%500000==0); | |
1027 | |
1028 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
1029 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; | |
1030 | |
1031 my $index; | |
1032 chomp $genome_conversion; | |
1033 | |
1034 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
1035 $index = 0; ## this is OT | |
1036 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
1037 $index = 2; ## this is CTOB!!! | |
1038 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
1039 $index = 1; ## this is CTOT!!! | |
1040 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
1041 $index = 3; ## this is OB | |
1042 } else { | |
1043 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
1044 } | |
1045 | |
1046 if ($meth_call_1 and $meth_call_2) { | |
1047 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
1048 if ($ignore) { | |
1049 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
1050 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | |
1051 | |
1052 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified | |
1053 $start_read_1 += $ignore; | |
1054 $end_read_2 -= $ignore; | |
1055 } | |
1056 my $end_read_1; | |
1057 my $start_read_2; | |
1058 | |
1059 if ($strand eq '+') { | |
1060 | |
1061 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
1062 $start_read_2 = $end_read_2-length($meth_call_2)+1; | |
1063 | |
1064 ## we first pass the first read which is in + orientation on the forward strand | |
1065 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0); | |
1066 | |
1067 # we next pass the second read which is in - orientation on the reverse strand | |
1068 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
1069 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1); | |
1070 } else { | |
1071 | |
1072 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! | |
1073 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! | |
1074 | |
1075 ## we first pass the first read which is in - orientation on the reverse strand | |
1076 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0); | |
1077 | |
1078 # we next pass the second read which is in + orientation on the forward strand | |
1079 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
1080 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2); | |
1081 } | |
1082 | |
1083 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
1084 } | |
1085 } | |
1086 } else { # Bismark paired-end SAM output format (default) | |
1087 while (<IN>) { | |
1088 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
1089 if (/^\@/) { # skipping header lines (starting with @) | |
1090 warn "skipping SAM header line:\t$_"; | |
1091 next; | |
1092 } | |
1093 | |
1094 ++$line_count; | |
1095 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
1096 | |
1097 # example paired-end reads in SAM format (2 consecutive lines) | |
1098 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
1099 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT | |
1100 | |
1101 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
1102 | |
1103 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1104 my $meth_call_1; | |
1105 my $first_read_conversion; | |
1106 my $genome_conversion; | |
1107 | |
1108 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
1109 my $tag = $1; | |
1110 my $value = $2; | |
1111 | |
1112 if ($tag eq "XM") { | |
1113 $meth_call_1 = $value; | |
1114 $meth_call_1 =~ s/\r//; | |
1115 } elsif ($tag eq "XR") { | |
1116 $first_read_conversion = $value; | |
1117 $first_read_conversion =~ s/\r//; | |
1118 } elsif ($tag eq "XG") { | |
1119 $genome_conversion = $value; | |
1120 $genome_conversion =~ s/\r//; | |
1121 } | |
1122 } | |
1123 | |
1124 $_ = <IN>; # reading in the paired read | |
1125 | |
1126 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; | |
1127 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; | |
1128 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; | |
1129 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; | |
1130 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; | |
1131 | |
1132 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1133 | |
1134 my $meth_call_2; | |
1135 my $second_read_conversion; | |
1136 | |
1137 while ( /(XM|XR):Z:([^\t]+)/g ) { | |
1138 my $tag = $1; | |
1139 my $value = $2; | |
1140 | |
1141 if ($tag eq "XM") { | |
1142 $meth_call_2 = $value; | |
1143 $meth_call_2 =~ s/\r//; | |
1144 } elsif ($tag eq "XR") { | |
1145 $second_read_conversion = $value; | |
1146 $second_read_conversion = s/\r//; | |
1147 } | |
1148 } | |
1149 | |
1150 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
1151 chomp $genome_conversion; # in case it captured a new line character | |
1152 | |
1153 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; | |
1154 | |
1155 my $index; | |
1156 my $strand; | |
1157 | |
1158 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
1159 $index = 0; ## this is OT | |
1160 $strand = '+'; | |
1161 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
1162 $index = 1; ## this is CTOT | |
1163 $strand = '-'; | |
1164 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
1165 $index = 2; ## this is CTOB | |
1166 $strand = '+'; | |
1167 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
1168 $index = 3; ## this is OB | |
1169 $strand = '-'; | |
1170 } else { | |
1171 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
1172 } | |
1173 | |
1174 ### reversing the methylation call of the read that was reverse-complemented | |
1175 if ($strand eq '+') { | |
1176 $meth_call_2 = reverse $meth_call_2; | |
1177 } else { | |
1178 $meth_call_1 = reverse $meth_call_1; | |
1179 } | |
1180 | |
1181 if ($meth_call_1 and $meth_call_2) { | |
1182 | |
1183 my $end_read_1; | |
1184 | |
1185 ### READ 1 | |
1186 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation | |
1187 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation | |
1188 shift @ops_1; # remove the empty first element | |
1189 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); | |
1190 | |
1191 my @comp_cigar_1; # building an array with all CIGAR operations | |
1192 foreach my $index (0..$#len_1) { | |
1193 foreach (1..$len_1[$index]) { | |
1194 # print "$ops_1[$index]"; | |
1195 push @comp_cigar_1, $ops_1[$index]; | |
1196 } | |
1197 } | |
1198 # print "original CIGAR read 1: $cigar_1\n"; | |
1199 # print "original CIGAR read 1: @comp_cigar_1\n"; | |
1200 | |
1201 ### READ 2 | |
1202 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation | |
1203 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation | |
1204 shift @ops_2; # remove the empty first element | |
1205 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); | |
1206 my @comp_cigar_2; # building an array with all CIGAR operations for read 2 | |
1207 foreach my $index (0..$#len_2) { | |
1208 foreach (1..$len_2[$index]) { | |
1209 # print "$ops_2[$index]"; | |
1210 push @comp_cigar_2, $ops_2[$index]; | |
1211 } | |
1212 } | |
1213 # print "original CIGAR read 2: $cigar_2\n"; | |
1214 # print "original CIGAR read 2: @comp_cigar_2\n"; | |
1215 | |
1216 if ($ignore) { | |
1217 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
1218 ### the methylation calls have already been reversed where necessary | |
1219 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
1220 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | |
1221 | |
1222 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
1223 | |
1224 if ($strand eq '+') { | |
1225 | |
1226 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start | |
1227 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions | |
1228 my $I_count_1 = 0; | |
1229 | |
1230 for (1..$ignore) { | |
1231 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start | |
1232 # print "$_ deleted $op\n"; | |
1233 | |
1234 while ($op eq 'D') { # repeating this for deletions (D) | |
1235 $D_count_1++; | |
1236 $op = shift @comp_cigar_1; | |
1237 # print "$_ deleted $op\n"; | |
1238 } | |
1239 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
1240 $I_count_1++; | |
1241 } | |
1242 } | |
1243 | |
1244 $start_read_1 += $ignore + $D_count_1 - $I_count_1; | |
1245 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; | |
1246 | |
1247 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back | |
1248 | |
1249 for (1..$ignore) { | |
1250 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
1251 while ($op eq 'D') { # repeating this for deletions (D) | |
1252 $op = pop @comp_cigar_2; | |
1253 } | |
1254 } | |
1255 # the start position of reads mapping to the reverse strand is being adjusted further below | |
1256 } elsif ($strand eq '-') { | |
1257 | |
1258 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back | |
1259 for (1..$ignore) { | |
1260 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
1261 while ($op eq 'D') { # repeating this for deletions (D) | |
1262 $op = pop @comp_cigar_1; | |
1263 } | |
1264 } | |
1265 # the start position of reads mapping to the reverse strand is being adjusted further below | |
1266 | |
1267 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start | |
1268 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions | |
1269 my $I_count_2 = 0; | |
1270 | |
1271 for (1..$ignore) { | |
1272 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start | |
1273 # print "$_ deleted $op\n"; | |
1274 | |
1275 while ($op eq 'D') { # repeating this for deletions (D) | |
1276 $D_count_2++; | |
1277 $op = shift @comp_cigar_2; | |
1278 # print "$_ deleted $op\n"; | |
1279 } | |
1280 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
1281 $I_count_2++; | |
1282 } | |
1283 } | |
1284 | |
1285 $start_read_2 += $ignore + $D_count_2 - $I_count_2; | |
1286 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; | |
1287 | |
1288 } | |
1289 | |
1290 ### reconstituting shortened CIGAR string 1 | |
1291 my $new_cigar_1; | |
1292 my $count_1 = 0; | |
1293 my $last_op_1; | |
1294 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; | |
1295 foreach my $op (@comp_cigar_1) { | |
1296 unless (defined $last_op_1){ | |
1297 $last_op_1 = $op; | |
1298 ++$count_1; | |
1299 next; | |
1300 } | |
1301 if ($last_op_1 eq $op) { | |
1302 ++$count_1; | |
1303 } else { | |
1304 $new_cigar_1 .= "$count_1$last_op_1"; | |
1305 $last_op_1 = $op; | |
1306 $count_1 = 1; | |
1307 } | |
1308 } | |
1309 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count | |
1310 $cigar_1 = $new_cigar_1; | |
1311 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; | |
1312 | |
1313 ### reconstituting shortened CIGAR string 2 | |
1314 my $new_cigar_2; | |
1315 my $count_2 = 0; | |
1316 my $last_op_2; | |
1317 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; | |
1318 foreach my $op (@comp_cigar_2) { | |
1319 unless (defined $last_op_2){ | |
1320 $last_op_2 = $op; | |
1321 ++$count_2; | |
1322 next; | |
1323 } | |
1324 if ($last_op_2 eq $op) { | |
1325 ++$count_2; | |
1326 } else { | |
1327 $new_cigar_2 .= "$count_2$last_op_2"; | |
1328 $last_op_2 = $op; | |
1329 $count_2 = 1; | |
1330 } | |
1331 } | |
1332 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count | |
1333 $cigar_2 = $new_cigar_2; | |
1334 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n"; | |
1335 | |
1336 } | |
1337 | |
1338 if ($strand eq '+') { | |
1339 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 | |
1340 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1341 # print "reverse: @comp_cigar_2\n"; | |
1342 | |
1343 my $MD_count_1 = 0; | |
1344 foreach (@comp_cigar_1) { | |
1345 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1346 } | |
1347 | |
1348 my $MD_count_2 = 0; | |
1349 foreach (@comp_cigar_2) { | |
1350 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1351 } | |
1352 | |
1353 $end_read_1 = $start_read_1 + $MD_count_1 - 1; | |
1354 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand | |
1355 } else { | |
1356 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 | |
1357 | |
1358 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1359 # print "reverse: @comp_cigar_1\n"; | |
1360 | |
1361 my $MD_count_1 = 0; | |
1362 foreach (@comp_cigar_1) { | |
1363 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1364 } | |
1365 | |
1366 $end_read_1 = $start_read_1; | |
1367 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand | |
1368 | |
1369 } | |
1370 | |
1371 if ($strand eq '+') { | |
1372 ## we first pass the first read which is in + orientation on the forward strand | |
1373 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1); | |
1374 | |
1375 # we next pass the second read which is in - orientation on the reverse strand | |
1376 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
1377 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2); | |
1378 } else { | |
1379 ## we first pass the first read which is in - orientation on the reverse strand | |
1380 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1); | |
1381 | |
1382 # we next pass the second read which is in + orientation on the forward strand | |
1383 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
1384 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2); | |
1385 } | |
1386 | |
1387 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
1388 } | |
1389 } | |
1390 } | |
1391 } else { | |
1392 die "Single-end or paired-end reads not specified properly\n"; | |
1393 } | |
1394 | |
1395 print "\n\nProcessed $line_count lines from $filename in total\n"; | |
1396 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
1397 if ($report) { | |
1398 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
1399 } | |
1400 print_splitting_report (); | |
1401 } | |
1402 | |
1403 | |
1404 | |
1405 sub print_splitting_report{ | |
1406 | |
1407 ### Calculating methylation percentages if applicable | |
1408 | |
1409 my $percent_meCpG; | |
1410 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | |
1411 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | |
1412 } | |
1413 | |
1414 my $percent_meCHG; | |
1415 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
1416 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | |
1417 } | |
1418 | |
1419 my $percent_meCHH; | |
1420 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ | |
1421 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); | |
1422 } | |
1423 | |
1424 my $percent_non_CpG_methylation; | |
1425 if ($merge_non_CpG){ | |
1426 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
1427 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); | |
1428 } | |
1429 } | |
1430 | |
1431 if ($report){ | |
1432 ### detailed information about Cs analysed | |
1433 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
1434 | |
1435 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
1436 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
1437 | |
1438 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
1439 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
1440 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
1441 | |
1442 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
1443 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
1444 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
1445 | |
1446 ### calculating methylated CpG percentage if applicable | |
1447 if ($percent_meCpG){ | |
1448 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
1449 } | |
1450 else{ | |
1451 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
1452 } | |
1453 | |
1454 ### 2-Context Output | |
1455 if ($merge_non_CpG){ | |
1456 if ($percent_non_CpG_methylation){ | |
1457 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
1458 } | |
1459 else{ | |
1460 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
1461 } | |
1462 } | |
1463 | |
1464 ### 3 Context Output | |
1465 else{ | |
1466 ### calculating methylated CHG percentage if applicable | |
1467 if ($percent_meCHG){ | |
1468 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
1469 } | |
1470 else{ | |
1471 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
1472 } | |
1473 | |
1474 ### calculating methylated CHH percentage if applicable | |
1475 if ($percent_meCHH){ | |
1476 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
1477 } | |
1478 else{ | |
1479 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
1480 } | |
1481 } | |
1482 } | |
1483 | |
1484 ### detailed information about Cs analysed for on-screen report | |
1485 print "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
1486 | |
1487 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
1488 print "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
1489 | |
1490 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
1491 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
1492 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
1493 | |
1494 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
1495 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
1496 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
1497 | |
1498 ### printing methylated CpG percentage if applicable | |
1499 if ($percent_meCpG){ | |
1500 print "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
1501 } | |
1502 else{ | |
1503 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
1504 } | |
1505 | |
1506 ### 2-Context Output | |
1507 if ($merge_non_CpG){ | |
1508 if ($percent_non_CpG_methylation){ | |
1509 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
1510 } | |
1511 else{ | |
1512 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
1513 } | |
1514 } | |
1515 | |
1516 ### 3-Context Output | |
1517 else{ | |
1518 ### printing methylated CHG percentage if applicable | |
1519 if ($percent_meCHG){ | |
1520 print "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
1521 } | |
1522 else{ | |
1523 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
1524 } | |
1525 | |
1526 ### printing methylated CHH percentage if applicable | |
1527 if ($percent_meCHH){ | |
1528 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
1529 } | |
1530 else{ | |
1531 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
1532 } | |
1533 } | |
1534 } | |
1535 | |
1536 | |
1537 | |
1538 | |
1539 | |
1540 sub print_individual_C_methylation_states_paired_end_files{ | |
1541 | |
1542 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_; | |
1543 my @methylation_calls = split(//,$meth_call); | |
1544 | |
1545 ################################################################# | |
1546 ### . for bases not involving cytosines ### | |
1547 ### X for methylated C in CHG context (was protected) ### | |
1548 ### x for not methylated C in CHG context (was converted) ### | |
1549 ### H for methylated C in CHH context (was protected) ### | |
1550 ### h for not methylated C in CHH context (was converted) ### | |
1551 ### Z for methylated C in CpG context (was protected) ### | |
1552 ### z for not methylated C in CpG context (was converted) ### | |
1553 ################################################################# | |
1554 | |
1555 my $methyl_CHG_count = 0; | |
1556 my $methyl_CHH_count = 0; | |
1557 my $methyl_CpG_count = 0; | |
1558 my $unmethylated_CHG_count = 0; | |
1559 my $unmethylated_CHH_count = 0; | |
1560 my $unmethylated_CpG_count = 0; | |
1561 | |
1562 my @len; | |
1563 my @ops; | |
1564 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
1565 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
1566 my @comp_cigar; | |
1567 | |
1568 if ($cigar){ # parsing CIGAR string | |
1569 @len = split (/\D+/,$cigar); # storing the length per operation | |
1570 @ops = split (/\d+/,$cigar); # storing the operation | |
1571 shift @ops; # remove the empty first element | |
1572 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
1573 | |
1574 foreach my $index (0..$#len){ | |
1575 foreach (1..$len[$index]){ | |
1576 # print "$ops[$index]"; | |
1577 push @comp_cigar, $ops[$index]; | |
1578 } | |
1579 } | |
1580 # warn "\nDetected CIGAR string: $cigar\n"; | |
1581 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
1582 # warn "number of operations: ",scalar @ops,"\n"; | |
1583 # warn "number of length digits: ",scalar @len,"\n\n"; | |
1584 # print @comp_cigar,"\n"; | |
1585 # print "$meth_call\n\n"; | |
1586 # sleep (1); | |
1587 } | |
1588 | |
1589 | |
1590 if ($strand eq '-') { | |
1591 | |
1592 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above | |
1593 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1594 # print "reverse CIGAR string: @comp_cigar\n"; | |
1595 | |
1596 ### the start position of paired-end files has already been corrected, see above | |
1597 } | |
1598 | |
1599 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified | |
1600 | |
1601 if ($merge_non_CpG) { | |
1602 | |
1603 if ($no_overlap) { | |
1604 | |
1605 ### single-file CpG and non-CpG context output | |
1606 if ($full) { | |
1607 if ($strand eq '+') { | |
1608 for my $index (0..$#methylation_calls) { | |
1609 | |
1610 if ($cigar){ # only needed for SAM files | |
1611 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1612 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1613 $cigar_offset += $cigar_mod; | |
1614 $pos_offset += $pos_mod; | |
1615 } | |
1616 | |
1617 ### Returning as soon as the methylation calls start overlapping | |
1618 if ($start+$index+$pos_offset >= $end_read_1) { | |
1619 return; | |
1620 } | |
1621 | |
1622 if ($methylation_calls[$index] eq 'X') { | |
1623 $counting{total_meCHG_count}++; | |
1624 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1625 } elsif ($methylation_calls[$index] eq 'x') { | |
1626 $counting{total_unmethylated_CHG_count}++; | |
1627 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1628 } elsif ($methylation_calls[$index] eq 'Z') { | |
1629 $counting{total_meCpG_count}++; | |
1630 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1631 } elsif ($methylation_calls[$index] eq 'z') { | |
1632 $counting{total_unmethylated_CpG_count}++; | |
1633 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1634 } elsif ($methylation_calls[$index] eq 'H') { | |
1635 $counting{total_meCHH_count}++; | |
1636 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1637 } elsif ($methylation_calls[$index] eq 'h') { | |
1638 $counting{total_unmethylated_CHH_count}++; | |
1639 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1640 } | |
1641 elsif ($methylation_calls[$index] eq '.'){} | |
1642 else{ | |
1643 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1644 } | |
1645 } | |
1646 } elsif ($strand eq '-') { | |
1647 for my $index (0..$#methylation_calls) { | |
1648 | |
1649 if ($cigar){ # only needed for SAM files | |
1650 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1651 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1652 $cigar_offset += $cigar_mod; | |
1653 $pos_offset += $pos_mod; | |
1654 } | |
1655 | |
1656 ### Returning as soon as the methylation calls start overlapping | |
1657 if ($start-$index+$pos_offset <= $end_read_1) { | |
1658 return; | |
1659 } | |
1660 | |
1661 if ($methylation_calls[$index] eq 'X') { | |
1662 $counting{total_meCHG_count}++; | |
1663 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1664 } elsif ($methylation_calls[$index] eq 'x') { | |
1665 $counting{total_unmethylated_CHG_count}++; | |
1666 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1667 } elsif ($methylation_calls[$index] eq 'Z') { | |
1668 $counting{total_meCpG_count}++; | |
1669 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1670 } elsif ($methylation_calls[$index] eq 'z') { | |
1671 $counting{total_unmethylated_CpG_count}++; | |
1672 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1673 } elsif ($methylation_calls[$index] eq 'H') { | |
1674 $counting{total_meCHH_count}++; | |
1675 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1676 } elsif ($methylation_calls[$index] eq 'h') { | |
1677 $counting{total_unmethylated_CHH_count}++; | |
1678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1679 } | |
1680 elsif ($methylation_calls[$index] eq '.') {} | |
1681 else{ | |
1682 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1683 } | |
1684 } | |
1685 } else { | |
1686 die "The read orientation was neither + nor -: '$strand'\n"; | |
1687 } | |
1688 } | |
1689 | |
1690 ### strand-specific methylation output | |
1691 else { | |
1692 if ($strand eq '+') { | |
1693 for my $index (0..$#methylation_calls) { | |
1694 | |
1695 if ($cigar){ # only needed for SAM files | |
1696 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1697 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1698 $cigar_offset += $cigar_mod; | |
1699 $pos_offset += $pos_mod; | |
1700 } | |
1701 | |
1702 ### Returning as soon as the methylation calls start overlapping | |
1703 if ($start+$index+$pos_offset >= $end_read_1) { | |
1704 return; | |
1705 } | |
1706 | |
1707 if ($methylation_calls[$index] eq 'X') { | |
1708 $counting{total_meCHG_count}++; | |
1709 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1710 } elsif ($methylation_calls[$index] eq 'x') { | |
1711 $counting{total_unmethylated_CHG_count}++; | |
1712 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1713 } elsif ($methylation_calls[$index] eq 'Z') { | |
1714 $counting{total_meCpG_count}++; | |
1715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1716 } elsif ($methylation_calls[$index] eq 'z') { | |
1717 $counting{total_unmethylated_CpG_count}++; | |
1718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1719 } elsif ($methylation_calls[$index] eq 'H') { | |
1720 $counting{total_meCHH_count}++; | |
1721 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1722 } elsif ($methylation_calls[$index] eq 'h') { | |
1723 $counting{total_unmethylated_CHH_count}++; | |
1724 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1725 } | |
1726 elsif ($methylation_calls[$index] eq '.') {} | |
1727 else{ | |
1728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1729 } | |
1730 } | |
1731 } elsif ($strand eq '-') { | |
1732 for my $index (0..$#methylation_calls) { | |
1733 | |
1734 if ($cigar){ # only needed for SAM files | |
1735 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1736 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1737 $cigar_offset += $cigar_mod; | |
1738 $pos_offset += $pos_mod; | |
1739 } | |
1740 | |
1741 ### Returning as soon as the methylation calls start overlapping | |
1742 if ($start-$index+$pos_offset <= $end_read_1) { | |
1743 return; | |
1744 } | |
1745 | |
1746 if ($methylation_calls[$index] eq 'X') { | |
1747 $counting{total_meCHG_count}++; | |
1748 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1749 } elsif ($methylation_calls[$index] eq 'x') { | |
1750 $counting{total_unmethylated_CHG_count}++; | |
1751 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1752 } elsif ($methylation_calls[$index] eq 'Z') { | |
1753 $counting{total_meCpG_count}++; | |
1754 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1755 } elsif ($methylation_calls[$index] eq 'z') { | |
1756 $counting{total_unmethylated_CpG_count}++; | |
1757 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1758 } elsif ($methylation_calls[$index] eq 'H') { | |
1759 $counting{total_meCHH_count}++; | |
1760 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1761 } elsif ($methylation_calls[$index] eq 'h') { | |
1762 $counting{total_unmethylated_CHH_count}++; | |
1763 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1764 } | |
1765 elsif ($methylation_calls[$index] eq '.') {} | |
1766 else{ | |
1767 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1768 } | |
1769 } | |
1770 } else { | |
1771 die "The strand orientation was neither + nor -: '$strand'/n"; | |
1772 } | |
1773 } | |
1774 } | |
1775 | |
1776 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
1777 ### Still within the 2-CONTEXT ONLY optional section | |
1778 else { | |
1779 ### single-file CpG and non-CpG context output | |
1780 if ($full) { | |
1781 if ($strand eq '+') { | |
1782 for my $index (0..$#methylation_calls) { | |
1783 | |
1784 if ($cigar){ # only needed for SAM files | |
1785 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1786 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1787 $cigar_offset += $cigar_mod; | |
1788 $pos_offset += $pos_mod; | |
1789 } | |
1790 | |
1791 if ($methylation_calls[$index] eq 'X') { | |
1792 $counting{total_meCHG_count}++; | |
1793 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1794 } elsif ($methylation_calls[$index] eq 'x') { | |
1795 $counting{total_unmethylated_CHG_count}++; | |
1796 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1797 } elsif ($methylation_calls[$index] eq 'Z') { | |
1798 $counting{total_meCpG_count}++; | |
1799 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1800 } elsif ($methylation_calls[$index] eq 'z') { | |
1801 $counting{total_unmethylated_CpG_count}++; | |
1802 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1803 } elsif ($methylation_calls[$index] eq 'H') { | |
1804 $counting{total_meCHH_count}++; | |
1805 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1806 } elsif ($methylation_calls[$index] eq 'h') { | |
1807 $counting{total_unmethylated_CHH_count}++; | |
1808 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1809 } | |
1810 elsif ($methylation_calls[$index] eq '.') {} | |
1811 else{ | |
1812 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1813 } | |
1814 } | |
1815 } elsif ($strand eq '-') { | |
1816 for my $index (0..$#methylation_calls) { | |
1817 | |
1818 if ($cigar){ # only needed for SAM files | |
1819 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1820 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1821 $cigar_offset += $cigar_mod; | |
1822 $pos_offset += $pos_mod; | |
1823 } | |
1824 | |
1825 if ($methylation_calls[$index] eq 'X') { | |
1826 $counting{total_meCHG_count}++; | |
1827 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1828 } elsif ($methylation_calls[$index] eq 'x') { | |
1829 $counting{total_unmethylated_CHG_count}++; | |
1830 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1831 } elsif ($methylation_calls[$index] eq 'Z') { | |
1832 $counting{total_meCpG_count}++; | |
1833 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1834 } elsif ($methylation_calls[$index] eq 'z') { | |
1835 $counting{total_unmethylated_CpG_count}++; | |
1836 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1837 } elsif ($methylation_calls[$index] eq 'H') { | |
1838 $counting{total_meCHH_count}++; | |
1839 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1840 } elsif ($methylation_calls[$index] eq 'h') { | |
1841 $counting{total_unmethylated_CHH_count}++; | |
1842 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1843 } | |
1844 elsif ($methylation_calls[$index] eq '.') {} | |
1845 else{ | |
1846 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1847 } | |
1848 } | |
1849 } else { | |
1850 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1851 } | |
1852 } | |
1853 | |
1854 ### strand-specific methylation output | |
1855 ### still within the 2-CONTEXT optional section | |
1856 else { | |
1857 if ($strand eq '+') { | |
1858 for my $index (0..$#methylation_calls) { | |
1859 | |
1860 if ($cigar){ # only needed for SAM files | |
1861 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1862 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1863 $cigar_offset += $cigar_mod; | |
1864 $pos_offset += $pos_mod; | |
1865 } | |
1866 | |
1867 if ($methylation_calls[$index] eq 'X') { | |
1868 $counting{total_meCHG_count}++; | |
1869 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1870 } elsif ($methylation_calls[$index] eq 'x') { | |
1871 $counting{total_unmethylated_CHG_count}++; | |
1872 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1873 } elsif ($methylation_calls[$index] eq 'Z') { | |
1874 $counting{total_meCpG_count}++; | |
1875 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1876 } elsif ($methylation_calls[$index] eq 'z') { | |
1877 $counting{total_unmethylated_CpG_count}++; | |
1878 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1879 } elsif ($methylation_calls[$index] eq 'H') { | |
1880 $counting{total_meCHH_count}++; | |
1881 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1882 } elsif ($methylation_calls[$index] eq 'h') { | |
1883 $counting{total_unmethylated_CHH_count}++; | |
1884 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1885 } | |
1886 elsif ($methylation_calls[$index] eq '.') {} | |
1887 else{ | |
1888 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1889 } | |
1890 } | |
1891 } elsif ($strand eq '-') { | |
1892 for my $index (0..$#methylation_calls) { | |
1893 | |
1894 if ($cigar){ # only needed for SAM files | |
1895 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1896 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1897 $cigar_offset += $cigar_mod; | |
1898 $pos_offset += $pos_mod; | |
1899 } | |
1900 | |
1901 if ($methylation_calls[$index] eq 'X') { | |
1902 $counting{total_meCHG_count}++; | |
1903 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1904 } elsif ($methylation_calls[$index] eq 'x') { | |
1905 $counting{total_unmethylated_CHG_count}++; | |
1906 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1907 } elsif ($methylation_calls[$index] eq 'Z') { | |
1908 $counting{total_meCpG_count}++; | |
1909 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1910 } elsif ($methylation_calls[$index] eq 'z') { | |
1911 $counting{total_unmethylated_CpG_count}++; | |
1912 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1913 } elsif ($methylation_calls[$index] eq 'H') { | |
1914 $counting{total_meCHH_count}++; | |
1915 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1916 } elsif ($methylation_calls[$index] eq 'h') { | |
1917 $counting{total_unmethylated_CHH_count}++; | |
1918 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1919 } | |
1920 elsif ($methylation_calls[$index] eq '.') {} | |
1921 else{ | |
1922 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1923 } | |
1924 } | |
1925 } else { | |
1926 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1927 } | |
1928 } | |
1929 } | |
1930 } | |
1931 | |
1932 ############################################ | |
1933 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### | |
1934 ############################################ | |
1935 | |
1936 elsif ($no_overlap) { | |
1937 ### single-file CpG, CHG and CHH context output | |
1938 if ($full) { | |
1939 if ($strand eq '+') { | |
1940 for my $index (0..$#methylation_calls) { | |
1941 | |
1942 if ($cigar){ # only needed for SAM files | |
1943 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1944 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1945 $cigar_offset += $cigar_mod; | |
1946 $pos_offset += $pos_mod; | |
1947 } | |
1948 | |
1949 ### Returning as soon as the methylation calls start overlapping | |
1950 if ($start+$index+$pos_offset >= $end_read_1) { | |
1951 return; | |
1952 } | |
1953 | |
1954 if ($methylation_calls[$index] eq 'X') { | |
1955 $counting{total_meCHG_count}++; | |
1956 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1957 } elsif ($methylation_calls[$index] eq 'x') { | |
1958 $counting{total_unmethylated_CHG_count}++; | |
1959 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1960 } elsif ($methylation_calls[$index] eq 'Z') { | |
1961 $counting{total_meCpG_count}++; | |
1962 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1963 } elsif ($methylation_calls[$index] eq 'z') { | |
1964 $counting{total_unmethylated_CpG_count}++; | |
1965 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1966 } elsif ($methylation_calls[$index] eq 'H') { | |
1967 $counting{total_meCHH_count}++; | |
1968 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1969 } elsif ($methylation_calls[$index] eq 'h') { | |
1970 $counting{total_unmethylated_CHH_count}++; | |
1971 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1972 } | |
1973 elsif ($methylation_calls[$index] eq '.') {} | |
1974 else{ | |
1975 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1976 } | |
1977 } | |
1978 } elsif ($strand eq '-') { | |
1979 for my $index (0..$#methylation_calls) { | |
1980 | |
1981 if ($cigar){ # only needed for SAM files | |
1982 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1983 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1984 $cigar_offset += $cigar_mod; | |
1985 $pos_offset += $pos_mod; | |
1986 } | |
1987 | |
1988 ### Returning as soon as the methylation calls start overlapping | |
1989 if ($start-$index+$pos_offset <= $end_read_1) { | |
1990 return; | |
1991 } | |
1992 | |
1993 if ($methylation_calls[$index] eq 'X') { | |
1994 $counting{total_meCHG_count}++; | |
1995 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1996 } elsif ($methylation_calls[$index] eq 'x') { | |
1997 $counting{total_unmethylated_CHG_count}++; | |
1998 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1999 } elsif ($methylation_calls[$index] eq 'Z') { | |
2000 $counting{total_meCpG_count}++; | |
2001 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2002 } elsif ($methylation_calls[$index] eq 'z') { | |
2003 $counting{total_unmethylated_CpG_count}++; | |
2004 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2005 } elsif ($methylation_calls[$index] eq 'H') { | |
2006 $counting{total_meCHH_count}++; | |
2007 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2008 } elsif ($methylation_calls[$index] eq 'h') { | |
2009 $counting{total_unmethylated_CHH_count}++; | |
2010 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2011 } | |
2012 elsif ($methylation_calls[$index] eq '.') {} | |
2013 else{ | |
2014 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2015 } | |
2016 } | |
2017 } else { | |
2018 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2019 } | |
2020 } | |
2021 | |
2022 ### strand-specific methylation output | |
2023 else { | |
2024 if ($strand eq '+') { | |
2025 for my $index (0..$#methylation_calls) { | |
2026 | |
2027 if ($cigar){ # only needed for SAM files | |
2028 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2029 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2030 $cigar_offset += $cigar_mod; | |
2031 $pos_offset += $pos_mod; | |
2032 } | |
2033 | |
2034 ### Returning as soon as the methylation calls start overlapping | |
2035 if ($start+$index+$pos_offset >= $end_read_1) { | |
2036 return; | |
2037 } | |
2038 | |
2039 if ($methylation_calls[$index] eq 'X') { | |
2040 $counting{total_meCHG_count}++; | |
2041 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2042 } elsif ($methylation_calls[$index] eq 'x') { | |
2043 $counting{total_unmethylated_CHG_count}++; | |
2044 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2045 } elsif ($methylation_calls[$index] eq 'Z') { | |
2046 $counting{total_meCpG_count}++; | |
2047 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2048 } elsif ($methylation_calls[$index] eq 'z') { | |
2049 $counting{total_unmethylated_CpG_count}++; | |
2050 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2051 } elsif ($methylation_calls[$index] eq 'H') { | |
2052 $counting{total_meCHH_count}++; | |
2053 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2054 } elsif ($methylation_calls[$index] eq 'h') { | |
2055 $counting{total_unmethylated_CHH_count}++; | |
2056 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2057 } | |
2058 elsif ($methylation_calls[$index] eq '.') {} | |
2059 else{ | |
2060 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2061 } | |
2062 } | |
2063 } elsif ($strand eq '-') { | |
2064 for my $index (0..$#methylation_calls) { | |
2065 | |
2066 if ($cigar){ # only needed for SAM files | |
2067 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2068 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2069 $cigar_offset += $cigar_mod; | |
2070 $pos_offset += $pos_mod; | |
2071 } | |
2072 | |
2073 ### Returning as soon as the methylation calls start overlapping | |
2074 if ($start-$index+$pos_offset <= $end_read_1) { | |
2075 return; | |
2076 } | |
2077 | |
2078 if ($methylation_calls[$index] eq 'X') { | |
2079 $counting{total_meCHG_count}++; | |
2080 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2081 } elsif ($methylation_calls[$index] eq 'x') { | |
2082 $counting{total_unmethylated_CHG_count}++; | |
2083 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2084 } elsif ($methylation_calls[$index] eq 'Z') { | |
2085 $counting{total_meCpG_count}++; | |
2086 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2087 } elsif ($methylation_calls[$index] eq 'z') { | |
2088 $counting{total_unmethylated_CpG_count}++; | |
2089 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2090 } elsif ($methylation_calls[$index] eq 'H') { | |
2091 $counting{total_meCHH_count}++; | |
2092 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2093 } elsif ($methylation_calls[$index] eq 'h') { | |
2094 $counting{total_unmethylated_CHH_count}++; | |
2095 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2096 } | |
2097 elsif ($methylation_calls[$index] eq '.') {} | |
2098 else{ | |
2099 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2100 } | |
2101 } | |
2102 } else { | |
2103 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2104 } | |
2105 } | |
2106 } | |
2107 | |
2108 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
2109 else { | |
2110 ### single-file CpG, CHG and CHH context output | |
2111 if ($full) { | |
2112 if ($strand eq '+') { | |
2113 for my $index (0..$#methylation_calls) { | |
2114 | |
2115 if ($cigar){ # only needed for SAM files | |
2116 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2117 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2118 $cigar_offset += $cigar_mod; | |
2119 $pos_offset += $pos_mod; | |
2120 } | |
2121 | |
2122 if ($methylation_calls[$index] eq 'X') { | |
2123 $counting{total_meCHG_count}++; | |
2124 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2125 } elsif ($methylation_calls[$index] eq 'x') { | |
2126 $counting{total_unmethylated_CHG_count}++; | |
2127 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2128 } elsif ($methylation_calls[$index] eq 'Z') { | |
2129 $counting{total_meCpG_count}++; | |
2130 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2131 } elsif ($methylation_calls[$index] eq 'z') { | |
2132 $counting{total_unmethylated_CpG_count}++; | |
2133 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2134 } elsif ($methylation_calls[$index] eq 'H') { | |
2135 $counting{total_meCHH_count}++; | |
2136 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2137 } elsif ($methylation_calls[$index] eq 'h') { | |
2138 $counting{total_unmethylated_CHH_count}++; | |
2139 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2140 } | |
2141 elsif ($methylation_calls[$index] eq '.') {} | |
2142 else{ | |
2143 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2144 } | |
2145 } | |
2146 } elsif ($strand eq '-') { | |
2147 for my $index (0..$#methylation_calls) { | |
2148 | |
2149 if ($cigar){ # only needed for SAM files | |
2150 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2151 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2152 $cigar_offset += $cigar_mod; | |
2153 $pos_offset += $pos_mod; | |
2154 } | |
2155 | |
2156 if ($methylation_calls[$index] eq 'X') { | |
2157 $counting{total_meCHG_count}++; | |
2158 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2159 } elsif ($methylation_calls[$index] eq 'x') { | |
2160 $counting{total_unmethylated_CHG_count}++; | |
2161 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2162 } elsif ($methylation_calls[$index] eq 'Z') { | |
2163 $counting{total_meCpG_count}++; | |
2164 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2165 } elsif ($methylation_calls[$index] eq 'z') { | |
2166 $counting{total_unmethylated_CpG_count}++; | |
2167 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2168 } elsif ($methylation_calls[$index] eq 'H') { | |
2169 $counting{total_meCHH_count}++; | |
2170 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2171 } elsif ($methylation_calls[$index] eq 'h') { | |
2172 $counting{total_unmethylated_CHH_count}++; | |
2173 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2174 } | |
2175 elsif ($methylation_calls[$index] eq '.') {} | |
2176 else{ | |
2177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2178 } | |
2179 } | |
2180 } else { | |
2181 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2182 } | |
2183 } | |
2184 | |
2185 ### strand-specific methylation output | |
2186 else { | |
2187 if ($strand eq '+') { | |
2188 for my $index (0..$#methylation_calls) { | |
2189 | |
2190 if ($cigar){ # only needed for SAM files | |
2191 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2192 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2193 $cigar_offset += $cigar_mod; | |
2194 $pos_offset += $pos_mod; | |
2195 } | |
2196 | |
2197 if ($methylation_calls[$index] eq 'X') { | |
2198 $counting{total_meCHG_count}++; | |
2199 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2200 } elsif ($methylation_calls[$index] eq 'x') { | |
2201 $counting{total_unmethylated_CHG_count}++; | |
2202 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2203 } elsif ($methylation_calls[$index] eq 'Z') { | |
2204 $counting{total_meCpG_count}++; | |
2205 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2206 } elsif ($methylation_calls[$index] eq 'z') { | |
2207 $counting{total_unmethylated_CpG_count}++; | |
2208 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2209 } elsif ($methylation_calls[$index] eq 'H') { | |
2210 $counting{total_meCHH_count}++; | |
2211 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2212 } elsif ($methylation_calls[$index] eq 'h') { | |
2213 $counting{total_unmethylated_CHH_count}++; | |
2214 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2215 } | |
2216 elsif ($methylation_calls[$index] eq '.') {} | |
2217 else{ | |
2218 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2219 } | |
2220 } | |
2221 } elsif ($strand eq '-') { | |
2222 for my $index (0..$#methylation_calls) { | |
2223 | |
2224 if ($cigar){ # only needed for SAM files | |
2225 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2226 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2227 $cigar_offset += $cigar_mod; | |
2228 $pos_offset += $pos_mod; | |
2229 } | |
2230 | |
2231 if ($methylation_calls[$index] eq 'X') { | |
2232 $counting{total_meCHG_count}++; | |
2233 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2234 } elsif ($methylation_calls[$index] eq 'x') { | |
2235 $counting{total_unmethylated_CHG_count}++; | |
2236 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2237 } elsif ($methylation_calls[$index] eq 'Z') { | |
2238 $counting{total_meCpG_count}++; | |
2239 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2240 } elsif ($methylation_calls[$index] eq 'z') { | |
2241 $counting{total_unmethylated_CpG_count}++; | |
2242 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2243 } elsif ($methylation_calls[$index] eq 'H') { | |
2244 $counting{total_meCHH_count}++; | |
2245 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2246 } elsif ($methylation_calls[$index] eq 'h') { | |
2247 $counting{total_unmethylated_CHH_count}++; | |
2248 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2249 } | |
2250 elsif ($methylation_calls[$index] eq '.') {} | |
2251 else{ | |
2252 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2253 } | |
2254 } | |
2255 } else { | |
2256 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2257 } | |
2258 } | |
2259 } | |
2260 } | |
2261 | |
2262 sub check_cigar_string { | |
2263 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; | |
2264 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; | |
2265 my ($new_cigar_offset,$new_pos_offset) = (0,0); | |
2266 | |
2267 if ($strand eq '+') { | |
2268 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
2269 | |
2270 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2271 # warn "position needs no adjustment\n"; | |
2272 } | |
2273 | |
2274 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
2275 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
2276 # warn "adjusted genomic position by -1 bp (insertion)\n"; | |
2277 } | |
2278 | |
2279 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2280 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2281 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
2282 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
2283 | |
2284 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
2285 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2286 # warn "position needs no adjustment\n"; | |
2287 last; | |
2288 } | |
2289 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
2290 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
2291 # warn "adjusted genomic position by another -1 bp (insertion)\n"; | |
2292 last; | |
2293 } | |
2294 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2295 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2296 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
2297 # warn "adjusted genomic position by another +1 bp (deletion)\n"; | |
2298 } | |
2299 else{ | |
2300 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2301 } | |
2302 } | |
2303 } | |
2304 else{ | |
2305 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2306 } | |
2307 } | |
2308 | |
2309 elsif ($strand eq '-') { | |
2310 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
2311 | |
2312 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2313 # warn "position needs no adjustment\n"; | |
2314 } | |
2315 | |
2316 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
2317 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position | |
2318 # warn "adjusted genomic position by +1 bp (insertion)\n"; | |
2319 } | |
2320 | |
2321 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2322 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2323 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
2324 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
2325 | |
2326 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
2327 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2328 # warn "Found new 'M' operation; position needs no adjustment\n"; | |
2329 last; | |
2330 } | |
2331 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
2332 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position | |
2333 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; | |
2334 last; | |
2335 } | |
2336 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2337 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2338 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
2339 # warn "adjusted genomic position by another -1 bp (deletion)\n"; | |
2340 } | |
2341 else{ | |
2342 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2343 } | |
2344 } | |
2345 } | |
2346 else{ | |
2347 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2348 } | |
2349 } | |
2350 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; | |
2351 return ($new_cigar_offset,$new_pos_offset); | |
2352 } | |
2353 | |
2354 sub print_individual_C_methylation_states_single_end{ | |
2355 | |
2356 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; | |
2357 my @methylation_calls = split(//,$meth_call); | |
2358 | |
2359 ################################################################# | |
2360 ### . for bases not involving cytosines ### | |
2361 ### X for methylated C in CHG context (was protected) ### | |
2362 ### x for not methylated C in CHG context (was converted) ### | |
2363 ### H for methylated C in CHH context (was protected) ### | |
2364 ### h for not methylated C in CHH context (was converted) ### | |
2365 ### Z for methylated C in CpG context (was protected) ### | |
2366 ### z for not methylated C in CpG context (was converted) ### | |
2367 ################################################################# | |
2368 | |
2369 my $methyl_CHG_count = 0; | |
2370 my $methyl_CHH_count = 0; | |
2371 my $methyl_CpG_count = 0; | |
2372 my $unmethylated_CHG_count = 0; | |
2373 my $unmethylated_CHH_count = 0; | |
2374 my $unmethylated_CpG_count = 0; | |
2375 | |
2376 my @len; | |
2377 my @ops; | |
2378 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
2379 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
2380 | |
2381 my @comp_cigar; | |
2382 | |
2383 if ($cigar){ # parsing CIGAR string | |
2384 @len = split (/\D+/,$cigar); # storing the length per operation | |
2385 @ops = split (/\d+/,$cigar); # storing the operation | |
2386 shift @ops; # remove the empty first element | |
2387 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
2388 | |
2389 foreach my $index (0..$#len){ | |
2390 foreach (1..$len[$index]){ | |
2391 # print "$ops[$index]"; | |
2392 push @comp_cigar, $ops[$index]; | |
2393 } | |
2394 } | |
2395 # warn "\nDetected CIGAR string: $cigar\n"; | |
2396 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
2397 # warn "number of operations: ",scalar @ops,"\n"; | |
2398 # warn "number of length digits: ",scalar @len,"\n\n"; | |
2399 # print @comp_cigar,"\n"; | |
2400 # print "$meth_call\n\n"; | |
2401 # sleep (1); | |
2402 } | |
2403 | |
2404 ### adjusting the start position for all reads mapping to the reverse strand | |
2405 if ($strand eq '-') { | |
2406 | |
2407 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
2408 # print @comp_cigar,"\n"; | |
2409 | |
2410 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | |
2411 | |
2412 if ($cigar){ ### SAM format | |
2413 my $MD_count = 0; | |
2414 foreach (@comp_cigar){ | |
2415 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
2416 } | |
2417 $start += $MD_count - 1; | |
2418 } | |
2419 else{ ### vanilla format | |
2420 $start += length($meth_call)-1; | |
2421 } | |
2422 } | |
2423 } | |
2424 | |
2425 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) | |
2426 | |
2427 ### single-file CpG and other-context output | |
2428 if ($full and $merge_non_CpG) { | |
2429 if ($strand eq '+') { | |
2430 for my $index (0..$#methylation_calls) { | |
2431 | |
2432 if ($cigar){ # only needed for SAM files | |
2433 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2434 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2435 $cigar_offset += $cigar_mod; | |
2436 $pos_offset += $pos_mod; | |
2437 } | |
2438 | |
2439 ### methylated Cs (any context) will receive a forward (+) orientation | |
2440 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2441 if ($methylation_calls[$index] eq 'X') { | |
2442 $counting{total_meCHG_count}++; | |
2443 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2444 } | |
2445 elsif ($methylation_calls[$index] eq 'x') { | |
2446 $counting{total_unmethylated_CHG_count}++; | |
2447 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2448 } | |
2449 elsif ($methylation_calls[$index] eq 'Z') { | |
2450 $counting{total_meCpG_count}++; | |
2451 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2452 } | |
2453 elsif ($methylation_calls[$index] eq 'z') { | |
2454 $counting{total_unmethylated_CpG_count}++; | |
2455 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2456 } | |
2457 elsif ($methylation_calls[$index] eq 'H') { | |
2458 $counting{total_meCHH_count}++; | |
2459 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2460 } | |
2461 elsif ($methylation_calls[$index] eq 'h') { | |
2462 $counting{total_unmethylated_CHH_count}++; | |
2463 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2464 } | |
2465 elsif ($methylation_calls[$index] eq '.') { | |
2466 } | |
2467 else{ | |
2468 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2469 } | |
2470 } | |
2471 } | |
2472 elsif ($strand eq '-') { | |
2473 | |
2474 for my $index (0..$#methylation_calls) { | |
2475 ### methylated Cs (any context) will receive a forward (+) orientation | |
2476 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2477 | |
2478 if ($cigar){ # only needed for SAM files | |
2479 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2480 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2481 $cigar_offset += $cigar_mod; | |
2482 $pos_offset += $pos_mod; | |
2483 } | |
2484 | |
2485 if ($methylation_calls[$index] eq 'X') { | |
2486 $counting{total_meCHG_count}++; | |
2487 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2488 } | |
2489 elsif ($methylation_calls[$index] eq 'x') { | |
2490 $counting{total_unmethylated_CHG_count}++; | |
2491 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2492 } | |
2493 elsif ($methylation_calls[$index] eq 'Z') { | |
2494 $counting{total_meCpG_count}++; | |
2495 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2496 } | |
2497 elsif ($methylation_calls[$index] eq 'z') { | |
2498 $counting{total_unmethylated_CpG_count}++; | |
2499 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2500 } | |
2501 elsif ($methylation_calls[$index] eq 'H') { | |
2502 $counting{total_meCHH_count}++; | |
2503 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2504 } | |
2505 elsif ($methylation_calls[$index] eq 'h') { | |
2506 $counting{total_unmethylated_CHH_count}++; | |
2507 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2508 } | |
2509 elsif ($methylation_calls[$index] eq '.'){ | |
2510 } | |
2511 else{ | |
2512 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2513 } | |
2514 } | |
2515 } | |
2516 else { | |
2517 die "The strand information was neither + nor -: $strand\n"; | |
2518 } | |
2519 } | |
2520 | |
2521 ### strand-specific methylation output | |
2522 elsif ($merge_non_CpG) { | |
2523 if ($strand eq '+') { | |
2524 for my $index (0..$#methylation_calls) { | |
2525 ### methylated Cs (any context) will receive a forward (+) orientation | |
2526 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2527 | |
2528 if ($cigar){ # only needed for SAM files | |
2529 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2530 $cigar_offset += $cigar_mod; | |
2531 $pos_offset += $pos_mod; | |
2532 } | |
2533 | |
2534 if ($methylation_calls[$index] eq 'X') { | |
2535 $counting{total_meCHG_count}++; | |
2536 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2537 } | |
2538 elsif ($methylation_calls[$index] eq 'x') { | |
2539 $counting{total_unmethylated_CHG_count}++; | |
2540 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2541 } | |
2542 elsif ($methylation_calls[$index] eq 'Z') { | |
2543 $counting{total_meCpG_count}++; | |
2544 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2545 } | |
2546 elsif ($methylation_calls[$index] eq 'z') { | |
2547 $counting{total_unmethylated_CpG_count}++; | |
2548 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2549 } | |
2550 elsif ($methylation_calls[$index] eq 'H') { | |
2551 $counting{total_meCHH_count}++; | |
2552 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2553 } | |
2554 elsif ($methylation_calls[$index] eq 'h') { | |
2555 $counting{total_unmethylated_CHH_count}++; | |
2556 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2557 } | |
2558 elsif ($methylation_calls[$index] eq '.') { | |
2559 } | |
2560 else{ | |
2561 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2562 } | |
2563 } | |
2564 } | |
2565 elsif ($strand eq '-') { | |
2566 | |
2567 for my $index (0..$#methylation_calls) { | |
2568 ### methylated Cs (any context) will receive a forward (+) orientation | |
2569 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2570 | |
2571 if ($cigar){ # only needed for SAM files | |
2572 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2573 $cigar_offset += $cigar_mod; | |
2574 $pos_offset += $pos_mod; | |
2575 } | |
2576 | |
2577 if ($methylation_calls[$index] eq 'X') { | |
2578 $counting{total_meCHG_count}++; | |
2579 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2580 } | |
2581 elsif ($methylation_calls[$index] eq 'x') { | |
2582 $counting{total_unmethylated_CHG_count}++; | |
2583 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2584 } | |
2585 elsif ($methylation_calls[$index] eq 'Z') { | |
2586 $counting{total_meCpG_count}++; | |
2587 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2588 } | |
2589 elsif ($methylation_calls[$index] eq 'z') { | |
2590 $counting{total_unmethylated_CpG_count}++; | |
2591 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2592 } | |
2593 elsif ($methylation_calls[$index] eq 'H') { | |
2594 $counting{total_meCHH_count}++; | |
2595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2596 } | |
2597 elsif ($methylation_calls[$index] eq 'h') { | |
2598 $counting{total_unmethylated_CHH_count}++; | |
2599 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2600 } | |
2601 elsif ($methylation_calls[$index] eq '.') { | |
2602 } | |
2603 else{ | |
2604 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2605 } | |
2606 } | |
2607 } | |
2608 else { | |
2609 die "The strand information was neither + nor -: $strand\n"; | |
2610 } | |
2611 } | |
2612 | |
2613 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION | |
2614 | |
2615 elsif ($full) { | |
2616 if ($strand eq '+') { | |
2617 for my $index (0..$#methylation_calls) { | |
2618 ### methylated Cs (any context) will receive a forward (+) orientation | |
2619 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2620 | |
2621 if ($cigar){ # only needed for SAM files | |
2622 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2623 $cigar_offset += $cigar_mod; | |
2624 $pos_offset += $pos_mod; | |
2625 } | |
2626 | |
2627 if ($methylation_calls[$index] eq 'X') { | |
2628 $counting{total_meCHG_count}++; | |
2629 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2630 } elsif ($methylation_calls[$index] eq 'x') { | |
2631 $counting{total_unmethylated_CHG_count}++; | |
2632 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2633 } elsif ($methylation_calls[$index] eq 'Z') { | |
2634 $counting{total_meCpG_count}++; | |
2635 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2636 } elsif ($methylation_calls[$index] eq 'z') { | |
2637 $counting{total_unmethylated_CpG_count}++; | |
2638 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2639 } elsif ($methylation_calls[$index] eq 'H') { | |
2640 $counting{total_meCHH_count}++; | |
2641 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2642 } elsif ($methylation_calls[$index] eq 'h') { | |
2643 $counting{total_unmethylated_CHH_count}++; | |
2644 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2645 } | |
2646 elsif ($methylation_calls[$index] eq '.') {} | |
2647 else{ | |
2648 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2649 } | |
2650 } | |
2651 } | |
2652 elsif ($strand eq '-') { | |
2653 | |
2654 for my $index (0..$#methylation_calls) { | |
2655 ### methylated Cs (any context) will receive a forward (+) orientation | |
2656 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2657 | |
2658 if ($cigar){ # only needed for SAM files | |
2659 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2660 $cigar_offset += $cigar_mod; | |
2661 $pos_offset += $pos_mod; | |
2662 } | |
2663 | |
2664 if ($methylation_calls[$index] eq 'X') { | |
2665 $counting{total_meCHG_count}++; | |
2666 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2667 } elsif ($methylation_calls[$index] eq 'x') { | |
2668 $counting{total_unmethylated_CHG_count}++; | |
2669 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2670 } elsif ($methylation_calls[$index] eq 'Z') { | |
2671 $counting{total_meCpG_count}++; | |
2672 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2673 } elsif ($methylation_calls[$index] eq 'z') { | |
2674 $counting{total_unmethylated_CpG_count}++; | |
2675 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2676 } elsif ($methylation_calls[$index] eq 'H') { | |
2677 $counting{total_meCHH_count}++; | |
2678 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2679 } elsif ($methylation_calls[$index] eq 'h') { | |
2680 $counting{total_unmethylated_CHH_count}++; | |
2681 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2682 } | |
2683 elsif ($methylation_calls[$index] eq '.') {} | |
2684 else{ | |
2685 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2686 } | |
2687 } | |
2688 } | |
2689 else { | |
2690 die "The read had a strand orientation which was neither + nor -: $strand\n"; | |
2691 } | |
2692 } | |
2693 | |
2694 ### strand-specific methylation output | |
2695 else { | |
2696 if ($strand eq '+') { | |
2697 for my $index (0..$#methylation_calls) { | |
2698 ### methylated Cs (any context) will receive a forward (+) orientation | |
2699 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2700 | |
2701 if ($cigar){ # only needed for SAM files | |
2702 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2703 $cigar_offset += $cigar_mod; | |
2704 $pos_offset += $pos_mod; | |
2705 } | |
2706 | |
2707 if ($methylation_calls[$index] eq 'X') { | |
2708 $counting{total_meCHG_count}++; | |
2709 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2710 } elsif ($methylation_calls[$index] eq 'x') { | |
2711 $counting{total_unmethylated_CHG_count}++; | |
2712 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2713 } elsif ($methylation_calls[$index] eq 'Z') { | |
2714 $counting{total_meCpG_count}++; | |
2715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2716 } elsif ($methylation_calls[$index] eq 'z') { | |
2717 $counting{total_unmethylated_CpG_count}++; | |
2718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2719 } elsif ($methylation_calls[$index] eq 'H') { | |
2720 $counting{total_meCHH_count}++; | |
2721 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2722 } elsif ($methylation_calls[$index] eq 'h') { | |
2723 $counting{total_unmethylated_CHH_count}++; | |
2724 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2725 } | |
2726 elsif ($methylation_calls[$index] eq '.') {} | |
2727 else{ | |
2728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2729 } | |
2730 } | |
2731 } | |
2732 elsif ($strand eq '-') { | |
2733 | |
2734 for my $index (0..$#methylation_calls) { | |
2735 ### methylated Cs (any context) will receive a forward (+) orientation | |
2736 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2737 | |
2738 if ($cigar){ # only needed for SAM files | |
2739 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2740 $cigar_offset += $cigar_mod; | |
2741 $pos_offset += $pos_mod; | |
2742 } | |
2743 | |
2744 if ($methylation_calls[$index] eq 'X') { | |
2745 $counting{total_meCHG_count}++; | |
2746 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2747 } elsif ($methylation_calls[$index] eq 'x') { | |
2748 $counting{total_unmethylated_CHG_count}++; | |
2749 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2750 } elsif ($methylation_calls[$index] eq 'Z') { | |
2751 $counting{total_meCpG_count}++; | |
2752 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2753 } elsif ($methylation_calls[$index] eq 'z') { | |
2754 $counting{total_unmethylated_CpG_count}++; | |
2755 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2756 } elsif ($methylation_calls[$index] eq 'H') { | |
2757 $counting{total_meCHH_count}++; | |
2758 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2759 } elsif ($methylation_calls[$index] eq 'h') { | |
2760 $counting{total_unmethylated_CHH_count}++; | |
2761 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2762 } | |
2763 elsif ($methylation_calls[$index] eq '.') {} | |
2764 else{ | |
2765 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2766 } | |
2767 } | |
2768 } | |
2769 else { | |
2770 die "The strand information was neither + nor -: $strand\n"; | |
2771 } | |
2772 } | |
2773 } | |
2774 | |
2775 | |
2776 | |
2777 ####################################################################################################################################### | |
2778 ### bismark2bedGaph section - START | |
2779 ####################################################################################################################################### | |
2780 | |
2781 sub process_bedGraph_output{ | |
2782 warn "="x64,"\n"; | |
2783 warn "Methylation information will now be written into a bedGraph file\n"; | |
2784 warn "="x64,"\n\n"; | |
2785 sleep (2); | |
2786 | |
2787 ### Closing all filehandles so that the Bismark methtylation extractor output doesn't get truncated due to buffering issues | |
2788 foreach my $fh (keys %fhs) { | |
2789 if ($fh =~ /^[1230]$/) { | |
2790 foreach my $context (keys %{$fhs{$fh}}) { | |
2791 close $fhs{$fh}->{$context} or die $!; | |
2792 } | |
2793 } else { | |
2794 close $fhs{$fh} or die $!; | |
2795 } | |
2796 } | |
2797 | |
2798 ### deciding which files to use for bedGraph conversion | |
2799 foreach my $filename (@sorting_files){ | |
2800 # warn "$filename\n"; | |
2801 if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again | |
2802 $filename =~ s/.*\///; # replacing everything up to the last slash in the filename | |
2803 # warn "$filename\n"; | |
2804 } | |
2805 | |
2806 if ($CX_context){ | |
2807 push @bedfiles,$filename; | |
2808 } | |
2809 else{ ## CpG context only (default) | |
2810 if ($filename =~ /^CpG_/){ | |
2811 push @bedfiles,$filename; | |
2812 } | |
2813 else{ | |
2814 # skipping CHH or CHG files | |
2815 } | |
2816 } | |
2817 } | |
2818 | |
2819 warn "Using the following files as Input:\n"; | |
2820 print join ("\t",@bedfiles),"\n\n"; | |
2821 sleep (2); | |
2822 | |
2823 my %temp_fhs; | |
2824 my @temp_files; # writing all context files (default CpG only) to these files prior to sorting | |
2825 | |
2826 ### changing to the output directory | |
2827 unless ($output_dir eq ''){ # default | |
2828 chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
2829 warn "Changed directory to $output_dir\n"; | |
2830 } | |
2831 | |
2832 foreach my $infile (@bedfiles) { | |
2833 | |
2834 if ($remove) { | |
2835 warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n"; | |
2836 | |
2837 open (READ,$infile) or die $!; | |
2838 | |
2839 my $removed_spaces_outfile = $infile; | |
2840 $removed_spaces_outfile =~ s/$/.spaces_removed.txt/; | |
2841 | |
2842 open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n"; | |
2843 | |
2844 unless ($no_header){ | |
2845 $_ = <READ>; ### Bismark version header | |
2846 print REM $_; ### Bismark version header | |
2847 } | |
2848 | |
2849 while (<READ>) { | |
2850 chomp; | |
2851 my ($id,$strand,$chr,$pos,$context) = (split (/\t/)); | |
2852 $id =~ s/\s+/_/g; | |
2853 print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n"; | |
2854 } | |
2855 | |
2856 close READ or die $!; | |
2857 close REM or die $!; | |
2858 | |
2859 ### changing the infile name to the new file without spaces | |
2860 $infile = $removed_spaces_outfile; | |
2861 } | |
2862 | |
2863 warn "Now writing methylation information for file $infile to individual files for each chromosome\n"; | |
2864 open (IN,$infile) or die $!; | |
2865 | |
2866 ## always ignoring the version header | |
2867 unless ($no_header){ | |
2868 $_ = <IN>; ### Bismark version header | |
2869 } | |
2870 | |
2871 while (<IN>) { | |
2872 chomp; | |
2873 my ($chr) = (split (/\t/))[2]; | |
2874 | |
2875 unless (exists $temp_fhs{$chr}) { | |
2876 open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!"; | |
2877 } | |
2878 print {$temp_fhs{$chr}} "$_\n"; | |
2879 } | |
2880 | |
2881 warn "Finished writing out individual chromosome files for $infile\n"; | |
2882 } | |
2883 warn "\n"; | |
2884 | |
2885 @temp_files = <*.meth_extractor.temp>; | |
2886 | |
2887 warn "Collecting temporary chromosome file information...\n"; | |
2888 sleep (1); | |
2889 warn "processing the following input file(s):\n"; | |
2890 warn join ("\n",@temp_files),"\n\n"; | |
2891 sleep (1); | |
2892 | |
2893 foreach my $in (@temp_files) { | |
2894 warn "Sorting input file $in by positions\n"; | |
2895 open my $ifh, "sort -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!"; | |
2896 # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; | |
2897 | |
2898 ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data { | |
2899 | |
2900 my $name; | |
2901 my $meth_state; | |
2902 my $chr = ""; | |
2903 my $pos = 0; | |
2904 my $meth_state2; | |
2905 | |
2906 my $last_pos; | |
2907 my $last_chr; | |
2908 | |
2909 ############################################# } | |
2910 | |
2911 while (my $line = <$ifh>) { | |
2912 next if $line =~ /^Bismark/; | |
2913 chomp $line; | |
2914 | |
2915 ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement { | |
2916 | |
2917 $last_chr = $chr; | |
2918 $last_pos = $pos; | |
2919 ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; | |
2920 | |
2921 if (($last_pos ne $pos) || ($last_chr ne $chr)) { | |
2922 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
2923 @methylcalls = qw (0 0 0); | |
2924 } | |
2925 | |
2926 ############################################# } | |
2927 | |
2928 my $validated = validate_methylation_call($meth_state, $meth_state2); | |
2929 unless($validated){ | |
2930 warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; | |
2931 next; | |
2932 } | |
2933 if ($meth_state eq "+") { | |
2934 $methylcalls[0]++; | |
2935 $methylcalls[2]++; | |
2936 } else { | |
2937 $methylcalls[1]++; | |
2938 $methylcalls[2]++; | |
2939 } | |
2940 } | |
2941 | |
2942 ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) { | |
2943 | |
2944 $last_chr = $chr; | |
2945 $last_pos = $pos; | |
2946 if ($methylcalls[2] > 0) { | |
2947 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
2948 } | |
2949 ############################################# } | |
2950 | |
2951 close $ifh or die $!; | |
2952 | |
2953 @methylcalls = qw (0 0 0); # resetting @methylcalls | |
2954 | |
2955 ### deleting temporary files | |
2956 my $delete = unlink $in; | |
2957 if ($delete) { | |
2958 warn "Successfully deleted the temporary input file $in\n\n"; | |
2959 } | |
2960 else { | |
2961 warn "The temporary inputfile $in could not be deleted $!\n\n"; | |
2962 } | |
2963 } | |
2964 } | |
2965 | |
2966 sub generate_output{ | |
2967 my $methcount = $methylcalls[0]; | |
2968 my $nonmethcount = $methylcalls[1]; | |
2969 my $totalcount = $methylcalls[2]; | |
2970 my $last_chr = shift; | |
2971 my $last_pos = shift; | |
2972 croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0; | |
2973 croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount); | |
2974 | |
2975 ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation { | |
2976 | |
2977 my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based. | |
2978 my $meth_percentage; | |
2979 ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef); | |
2980 # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/; | |
2981 if (defined $meth_percentage){ | |
2982 if ($counts){ | |
2983 print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; | |
2984 } | |
2985 else{ | |
2986 print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n"; | |
2987 } | |
2988 } | |
2989 ############################################# } | |
2990 } | |
2991 | |
2992 sub validate_methylation_call{ | |
2993 my $meth_state = shift; | |
2994 croak "Missing (+/-) methylation call" unless defined $meth_state; | |
2995 my $meth_state2 = shift; | |
2996 croak "Missing alphabetical methylation call" unless defined $meth_state2; | |
2997 my $is_consistent; | |
2998 ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) | |
2999 : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2)); | |
3000 return 1 if $is_consistent; | |
3001 return 0; | |
3002 } | |
3003 | |
3004 sub check_CpG_methylation_call{ | |
3005 my $meth1 = shift; | |
3006 my $meth2 = shift; | |
3007 return 1 if($meth1 eq "+" && $meth2 eq "Z"); | |
3008 return 1 if($meth1 eq "-" && $meth2 eq "z"); | |
3009 return 0; | |
3010 } | |
3011 | |
3012 sub check_nonCpG_methylation_call{ | |
3013 my $meth1 = shift; | |
3014 my $meth2 = shift; | |
3015 return 1 if($meth1 eq "+" && $meth2 eq "C"); | |
3016 return 1 if($meth1 eq "+" && $meth2 eq "X"); | |
3017 return 1 if($meth1 eq "+" && $meth2 eq "H"); | |
3018 return 1 if($meth1 eq "-" && $meth2 eq "c"); | |
3019 return 1 if($meth1 eq "-" && $meth2 eq "x"); | |
3020 return 1 if($meth1 eq "-" && $meth2 eq "h"); | |
3021 return 0; | |
3022 } | |
3023 | |
3024 ####################################################################################################################################### | |
3025 ### bismark2bedGaph section - END | |
3026 ####################################################################################################################################### | |
3027 | |
3028 | |
3029 | |
3030 | |
3031 | |
3032 | |
3033 ####################################################################################################################################### | |
3034 ### genome-wide cytosine methylation report - START | |
3035 ####################################################################################################################################### | |
3036 | |
3037 sub generate_genome_wide_cytosine_report { | |
3038 | |
3039 warn "="x78,"\n"; | |
3040 warn "Methylation information will now be written into a genome-wide cytosine report\n"; | |
3041 warn "="x78,"\n\n"; | |
3042 sleep (2); | |
3043 | |
3044 ### changing to the output directory again | |
3045 unless ($output_dir eq ''){ # default | |
3046 chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
3047 # warn "Changed directory to $output_dir\n"; | |
3048 } | |
3049 | |
3050 my $in = shift; | |
3051 open (IN,$in) or die $!; | |
3052 | |
3053 my $cytosine_out = shift; | |
3054 | |
3055 if ($CX_context){ | |
3056 $cytosine_out =~ s/$/genome-wide_CX_report.txt/; | |
3057 } | |
3058 else{ | |
3059 $cytosine_out =~ s/$/genome_wide_CpG_report.txt/; | |
3060 } | |
3061 | |
3062 ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands | |
3063 unless ($split_by_chromosome){ ### writing all output to a single file (default) | |
3064 open (CYT,'>',$cytosine_out) or die $!; | |
3065 warn "Writing genome-wide cytosine report to: $cytosine_out\n"; | |
3066 sleep (3); | |
3067 } | |
3068 | |
3069 my $last_chr; | |
3070 my %chr; # storing reads for one chromosome at a time | |
3071 | |
3072 my $count = 0; | |
3073 while (<IN>){ | |
3074 chomp; | |
3075 ++$count; | |
3076 my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); | |
3077 | |
3078 # defining the first chromosome | |
3079 unless (defined $last_chr){ | |
3080 $last_chr = $chr; | |
3081 # warn "Storing all covered cytosine positions for chromosome: $chr\n"; | |
3082 } | |
3083 | |
3084 if ($chr eq $last_chr){ | |
3085 $chr{$chr}->{$start}->{meth} = $meth; | |
3086 $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
3087 } | |
3088 else{ | |
3089 warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
3090 | |
3091 if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
3092 my $chromosome_out = $cytosine_out; | |
3093 $chromosome_out =~ s/txt$/chr${last_chr}.txt/; | |
3094 open (CYT,'>',$chromosome_out) or die $!; | |
3095 } | |
3096 | |
3097 while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
3098 | |
3099 my $tri_nt = ''; | |
3100 my $context = ''; | |
3101 my $pos = pos$chromosomes{$last_chr}; | |
3102 | |
3103 my $strand; | |
3104 my $meth = 0; | |
3105 my $nonmeth = 0; | |
3106 | |
3107 if ($1 eq 'C'){ # C on forward strand | |
3108 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
3109 $strand = '+'; | |
3110 } | |
3111 elsif ($1 eq 'G'){ # C on reverse strand | |
3112 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
3113 $tri_nt = reverse $tri_nt; | |
3114 $tri_nt =~ tr/ACTG/TGAC/; | |
3115 $strand = '-'; | |
3116 } | |
3117 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
3118 | |
3119 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! | |
3120 $meth = $chr{$last_chr}->{$pos-1}->{meth}; | |
3121 $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
3122 } | |
3123 | |
3124 ### determining cytosine context | |
3125 if ($tri_nt =~ /^CG/){ | |
3126 $context = 'CG'; | |
3127 } | |
3128 elsif ($tri_nt =~ /^C.{1}G$/){ | |
3129 $context = 'CHG'; | |
3130 } | |
3131 elsif ($tri_nt =~ /^C.{2}$/){ | |
3132 $context = 'CHH'; | |
3133 } | |
3134 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
3135 warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
3136 next; | |
3137 } | |
3138 | |
3139 if ($CpG_only){ | |
3140 if ($tri_nt =~ /^CG/){ # CpG context is the default | |
3141 if ($zero){ # zero based coordinates | |
3142 $pos -= 1; | |
3143 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3144 } | |
3145 else{ # default | |
3146 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3147 } | |
3148 } | |
3149 } | |
3150 else{ ## all cytosines, specified with --CX | |
3151 if ($zero){ # zero based coordinates | |
3152 $pos -= 1; | |
3153 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3154 } | |
3155 else{ # default | |
3156 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3157 } | |
3158 } | |
3159 } | |
3160 | |
3161 %chr = (); # resetting the hash | |
3162 | |
3163 # new first entry | |
3164 $last_chr = $chr; | |
3165 $chr{$chr}->{$start}->{meth} = $meth; | |
3166 $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
3167 } | |
3168 } | |
3169 | |
3170 # Last found chromosome | |
3171 warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
3172 | |
3173 if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
3174 my $chromosome_out = $cytosine_out; | |
3175 $chromosome_out =~ s/txt$/chr${last_chr}.txt/; | |
3176 open (CYT,'>',$chromosome_out) or die $!; | |
3177 } | |
3178 | |
3179 while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
3180 | |
3181 my $tri_nt; | |
3182 my $context; | |
3183 my $pos = pos$chromosomes{$last_chr}; | |
3184 | |
3185 my $strand; | |
3186 my $meth = 0; | |
3187 my $nonmeth = 0; | |
3188 | |
3189 if ($1 eq 'C'){ # C on forward strand | |
3190 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
3191 $strand = '+'; | |
3192 } | |
3193 elsif ($1 eq 'G'){ # C on reverse strand | |
3194 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
3195 $tri_nt = reverse $tri_nt; | |
3196 $tri_nt =~ tr/ACTG/TGAC/; | |
3197 $strand = '-'; | |
3198 } | |
3199 | |
3200 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! | |
3201 $meth = $chr{$last_chr}->{$pos-1}->{meth}; | |
3202 $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
3203 } | |
3204 | |
3205 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
3206 | |
3207 ### determining cytosine context | |
3208 if ($tri_nt =~ /^CG/){ | |
3209 $context = 'CG'; | |
3210 } | |
3211 elsif ($tri_nt =~ /^C.{1}G$/){ | |
3212 $context = 'CHG'; | |
3213 } | |
3214 elsif ($tri_nt =~ /^C.{2}$/){ | |
3215 $context = 'CHH'; | |
3216 } | |
3217 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
3218 warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
3219 next; | |
3220 } | |
3221 | |
3222 if ($CpG_only){ | |
3223 if ($tri_nt =~ /^CG/){ # CpG context is the default | |
3224 if ($zero){ # zero-based coordinates | |
3225 $pos -= 1; | |
3226 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3227 } | |
3228 else{ # default | |
3229 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3230 } | |
3231 } | |
3232 } | |
3233 else{ ## all cytosines, specified with --CX | |
3234 if ($zero){ # zero based coordinates | |
3235 $pos -= 1; | |
3236 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3237 } | |
3238 else{ # default | |
3239 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3240 } | |
3241 } | |
3242 } | |
3243 close CYT or die $!; | |
3244 } | |
3245 | |
3246 | |
3247 sub read_genome_into_memory{ | |
3248 | |
3249 ## reading in and storing the specified genome in the %chromosomes hash | |
3250 chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; | |
3251 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; | |
3252 | |
3253 my @chromosome_filenames = <*.fa>; | |
3254 | |
3255 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta | |
3256 unless (@chromosome_filenames){ | |
3257 @chromosome_filenames = <*.fasta>; | |
3258 } | |
3259 unless (@chromosome_filenames){ | |
3260 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; | |
3261 } | |
3262 | |
3263 foreach my $chromosome_filename (@chromosome_filenames){ | |
3264 | |
3265 # skipping the tophat entire mouse genome fasta file | |
3266 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); | |
3267 | |
3268 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; | |
3269 ### first line needs to be a fastA header | |
3270 my $first_line = <CHR_IN>; | |
3271 chomp $first_line; | |
3272 $first_line =~ s/\r//; # removing /r carriage returns | |
3273 | |
3274 ### Extracting chromosome name from the FastA header | |
3275 my $chromosome_name = extract_chromosome_name($first_line); | |
3276 | |
3277 my $sequence; | |
3278 while (<CHR_IN>){ | |
3279 chomp; | |
3280 $_ =~ s/\r//; # removing /r carriage returns | |
3281 | |
3282 if ($_ =~ /^>/){ | |
3283 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) | |
3284 if (exists $chromosomes{$chromosome_name}){ | |
3285 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3286 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; | |
3287 } | |
3288 else { | |
3289 if (length($sequence) == 0){ | |
3290 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; | |
3291 } | |
3292 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3293 $chromosomes{$chromosome_name} = $sequence; | |
3294 } | |
3295 ### resetting the sequence variable | |
3296 $sequence = ''; | |
3297 ### setting new chromosome name | |
3298 $chromosome_name = extract_chromosome_name($_); | |
3299 } | |
3300 else{ | |
3301 $sequence .= uc$_; | |
3302 } | |
3303 } | |
3304 | |
3305 if (exists $chromosomes{$chromosome_name}){ | |
3306 warn "chr $chromosome_name (",length $sequence ," bp)\t"; | |
3307 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; | |
3308 } | |
3309 else{ | |
3310 if (length($sequence) == 0){ | |
3311 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; | |
3312 } | |
3313 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3314 $chromosomes{$chromosome_name} = $sequence; | |
3315 } | |
3316 } | |
3317 warn "\n"; | |
3318 chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; | |
3319 } | |
3320 | |
3321 sub extract_chromosome_name { | |
3322 ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well | |
3323 my $fasta_header = shift; | |
3324 if ($fasta_header =~ s/^>//){ | |
3325 my ($chromosome_name) = split (/\s+/,$fasta_header); | |
3326 return $chromosome_name; | |
3327 } | |
3328 else{ | |
3329 die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; | |
3330 } | |
3331 } | |
3332 | |
3333 ####################################################################################################################################### | |
3334 ### genome-wide cytosine methylation report - END | |
3335 ####################################################################################################################################### | |
3336 | |
3337 | |
3338 | |
3339 | |
3340 sub print_helpfile{ | |
3341 | |
3342 print << 'HOW_TO'; | |
3343 | |
3344 | |
3345 DESCRIPTION | |
3346 | |
3347 The following is a brief description of all options to control the Bismark | |
3348 methylation extractor. The script reads in a bisulfite read alignment results file | |
3349 produced by the Bismark bisulfite mapper and extracts the methylation information | |
3350 for individual cytosines. This information is found in the methylation call field | |
3351 which can contain the following characters: | |
3352 | |
3353 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
3354 ~~~ X for methylated C in CHG context (was protected) ~~~ | |
3355 ~~~ x for not methylated C CHG (was converted) ~~~ | |
3356 ~~~ H for methylated C in CHH context (was protected) ~~~ | |
3357 ~~~ h for not methylated C in CHH context (was converted) ~~~ | |
3358 ~~~ Z for methylated C in CpG context (was protected) ~~~ | |
3359 ~~~ z for not methylated C in CpG context (was converted) ~~~ | |
3360 ~~~ . for any bases not involving cytosines ~~~ | |
3361 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
3362 | |
3363 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH | |
3364 context (this distinction is actually already made in Bismark itself). As the methylation | |
3365 information for every C analysed can produce files which easily have tens or even hundreds of | |
3366 millions of lines, file sizes can become very large and more difficult to handle. The C | |
3367 methylation info additionally splits cytosine methylation calls up into one of the four possible | |
3368 strands a given bisulfite read aligned against: | |
3369 | |
3370 OT original top strand | |
3371 CTOT complementary to original top strand | |
3372 | |
3373 OB original bottom strand | |
3374 CTOB complementary to original bottom strand | |
3375 | |
3376 Thus, by default twelve individual output files are being generated per input file (unless | |
3377 --comprehensive is specified, see below). The output files can be imported into a genome | |
3378 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact | |
3379 unless the bisulfite reads were generated preserving directionality it doesn't make any | |
3380 sense to look at the data in a strand-specific manner). Strand-specific output files can | |
3381 optionally be skipped, in which case only three output files for CpG, CHG or CHH context | |
3382 will be generated. For both the strand-specific and comprehensive outputs there is also | |
3383 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. | |
3384 | |
3385 | |
3386 The output files are in the following format (tab delimited): | |
3387 | |
3388 <sequence_id> <strand> <chromosome> <position> <methylation call> | |
3389 | |
3390 | |
3391 USAGE: methylation_extractor [options] <filenames> | |
3392 | |
3393 | |
3394 ARGUMENTS: | |
3395 | |
3396 <filenames> A space-separated list of Bismark result files in SAM format from | |
3397 which methylation information is extracted for every cytosine in | |
3398 the reads. For alignment files in the older custom Bismark output | |
3399 see option '--vanilla'. | |
3400 | |
3401 OPTIONS: | |
3402 | |
3403 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end | |
3404 read data. Specifying either --single-end or --paired-end is | |
3405 mandatory. | |
3406 | |
3407 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end | |
3408 read data. Specifying either --paired-end or --single-end is | |
3409 mandatory. | |
3410 | |
3411 --vanilla The Bismark result input file(s) are in the old custom Bismark format | |
3412 (up to version 0.5.x) and not in SAM format which is the default as | |
3413 of Bismark version 0.6.x or higher. Default: OFF. | |
3414 | |
3415 --no_overlap For paired-end reads it is theoretically possible that read_1 and | |
3416 read_2 overlap. This option avoids scoring overlapping methylation | |
3417 calls twice (only methylation calls of read 1 are used for in the process | |
3418 since read 1 has historically higher quality basecalls than read 2). | |
3419 Whilst this option removes a bias towards more methylation calls | |
3420 in the center of sequenced fragments it may de facto remove a sizable | |
3421 proportion of the data. This option is highly recommended for paired-end | |
3422 data. | |
3423 | |
3424 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the | |
3425 methylation call string. This can remove e.g. a restriction enzyme site | |
3426 at the start of each read. | |
3427 | |
3428 --comprehensive Specifying this option will merge all four possible strand-specific | |
3429 methylation info into context-dependent output files. The default | |
3430 contexts are: | |
3431 - CpG context | |
3432 - CHG context | |
3433 - CHH context | |
3434 | |
3435 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight | |
3436 strand-specific output files (default) for Cs in | |
3437 - CpG context | |
3438 - non-CpG context | |
3439 | |
3440 --report Prints out a short methylation summary as well as the paramaters used to run | |
3441 this script. | |
3442 | |
3443 --no_header Suppresses the Bismark version header line in all output files for more convenient | |
3444 batch processing. | |
3445 | |
3446 -o/--output DIR Allows specification of a different output directory (absolute or relative | |
3447 path). If not specified explicitely, the output will be written to the current directory. | |
3448 | |
3449 --version Displays version information. | |
3450 | |
3451 -h/--help Displays this help file and exits. | |
3452 | |
3453 | |
3454 | |
3455 bedGraph specific options: | |
3456 | |
3457 --bedGraph After finishing the methylation extraction, the methylation output is written into a | |
3458 sorted bedGraph file that reports the position of a given cytosine and its methylation | |
3459 state (in %, seem details below). The methylation extractor output is temporarily split up into | |
3460 temporary files, one per chromosome (written into the current directory or folder | |
3461 specified with -o/--output); these temp files are then used for sorting and deleted | |
3462 afterwards. By default, only cytosines in CpG context will be sorted. The option | |
3463 '--CX_context' may be used to report all cyosines irrespective of sequence context | |
3464 (this will take MUCH longer!). | |
3465 | |
3466 | |
3467 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide | |
3468 before its methylation percentage is reported. Default: 1. | |
3469 | |
3470 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. | |
3471 | |
3472 | |
3473 --counts Adds two additional columns to the output file to enable further calculations: | |
3474 col 5: number of methylated calls | |
3475 col 6: number of unmethylated calls | |
3476 This option is required if '--cytosine_report' is specified (and will be set automatically if | |
3477 necessary). | |
3478 | |
3479 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered | |
3480 in the experiment irrespective of its sequence context. This applies to both forward and | |
3481 reverse strands. Please be aware that this option may generate large temporary and output files | |
3482 and may take a long time to sort (up to many hours). Default: OFF. | |
3483 (i.e. Default = CpG context only). | |
3484 | |
3485 | |
3486 | |
3487 Genome-wide cytosine methylation report specific options: | |
3488 | |
3489 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a | |
3490 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based | |
3491 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all | |
3492 cytosine context is optional). The output considers all Cs on both forward and reverse strands and | |
3493 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not | |
3494 covered). | |
3495 | |
3496 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of | |
3497 its context. This applies to both forward and reverse strands. Please be aware that this will | |
3498 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. | |
3499 Default: OFF (i.e. Default = CpG context only). | |
3500 | |
3501 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF. | |
3502 | |
3503 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted | |
3504 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. | |
3505 | |
3506 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files | |
3507 will be named to include the input filename and the chromosome number. | |
3508 | |
3509 | |
3510 | |
3511 OUTPUT: | |
3512 | |
3513 The bismark_methylation_extractor output is in the form: | |
3514 ======================================================== | |
3515 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call> | |
3516 | |
3517 * Methylated cytosines receive a '+' orientation, | |
3518 * Unmethylated cytosines receive a '-' orientation. | |
3519 | |
3520 | |
3521 | |
3522 The bedGraph output (optional) looks like this (tab-delimited): | |
3523 =============================================================== | |
3524 <chromosome> <start position> <end position> <methylation percentage> | |
3525 | |
3526 | |
3527 | |
3528 The genome-wide cytosine methylation output file is tab-delimited in the following format: | |
3529 ========================================================================================== | |
3530 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> | |
3531 | |
3532 | |
3533 | |
3534 This script was last modified on 02 Oct 2012. | |
3535 | |
3536 HOW_TO | |
3537 } |