0
|
1 #!/usr/bin/perl
|
|
2 use warnings;
|
|
3 use strict;
|
|
4 $|++;
|
|
5 use Getopt::Long;
|
|
6 use Cwd;
|
|
7 use Carp;
|
|
8 use FindBin qw($Bin);
|
|
9 use lib "$Bin/../lib";
|
|
10
|
|
11 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
|
|
12
|
|
13 ## This program is free software: you can redistribute it and/or modify
|
|
14 ## it under the terms of the GNU General Public License as published by
|
|
15 ## the Free Software Foundation, either version 3 of the License, or
|
|
16 ## (at your option) any later version.
|
|
17
|
|
18 ## This program is distributed in the hope that it will be useful,
|
|
19 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
20 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
21 ## GNU General Public License for more details.
|
|
22
|
|
23 ## You should have received a copy of the GNU General Public License
|
|
24 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
25
|
|
26 my @filenames; # input files
|
|
27 my %counting;
|
|
28 my $parent_dir = getcwd();
|
|
29
|
|
30 my %fhs;
|
|
31
|
|
32 my $version = 'v0.7.11';
|
|
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip) = process_commandline();
|
|
34
|
|
35
|
|
36 ### only needed for bedGraph output
|
|
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
|
|
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
|
|
39 my @bedfiles;
|
|
40
|
|
41 ### only needed for genome-wide cytosine methylation report
|
|
42 my %chromosomes;
|
|
43
|
|
44 ##############################################################################################
|
|
45 ### Summarising Run Parameters
|
|
46 ##############################################################################################
|
|
47
|
|
48 ### METHYLATION EXTRACTOR
|
|
49
|
|
50 warn "Summarising Bismark methylation extractor parameters:\n";
|
|
51 warn '='x63,"\n";
|
|
52
|
|
53 if ($single){
|
|
54 if ($vanilla){
|
|
55 warn "Bismark single-end vanilla format specified\n";
|
|
56 }
|
|
57 else{
|
|
58 warn "Bismark single-end SAM format specified (default)\n"; # default
|
|
59 }
|
|
60 }
|
|
61 elsif ($paired){
|
|
62 if ($vanilla){
|
|
63 warn "Bismark paired-end vanilla format specified\n";
|
|
64 }
|
|
65 else{
|
|
66 warn "Bismark paired-end SAM format specified (default)\n"; # default
|
|
67 }
|
|
68 }
|
|
69
|
|
70 if ($ignore){
|
|
71 warn "First $ignore bases will be disregarded when processing the methylation call string\n";
|
|
72 }
|
|
73
|
|
74 if ($full){
|
|
75 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
|
|
76 }
|
|
77 if ($merge_non_CpG){
|
|
78 warn "Merge CHG and CHH context to non-CpG context specified\n";
|
|
79 }
|
|
80 ### output directory
|
|
81 if ($output_dir eq ''){
|
|
82 warn "Output will be written to the current directory ('$parent_dir')\n";
|
|
83 }
|
|
84 else{
|
|
85 warn "Output path specified as: $output_dir\n";
|
|
86 }
|
|
87
|
|
88
|
|
89 sleep (1);
|
|
90
|
|
91 ### BEDGRAPH
|
|
92
|
|
93 if ($bedGraph){
|
|
94 warn "\n\nSummarising bedGraph parameters:\n";
|
|
95 warn '='x63,"\n";
|
|
96
|
|
97 if ($counts){
|
|
98 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n";
|
|
99 }
|
|
100 else{
|
|
101 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
|
|
102 }
|
|
103
|
|
104 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
|
|
105
|
|
106 if ($CX_context){
|
|
107 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
|
|
108 }
|
|
109 else{ # default
|
|
110 $CpG_only = 1;
|
|
111 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
|
|
112 }
|
|
113
|
|
114 if ($remove){
|
|
115 warn "White spaces in read ID names will be removed prior to sorting\n";
|
|
116 }
|
|
117
|
|
118 if (defined $sort_size){
|
|
119 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
|
|
120 }
|
|
121 else{
|
|
122 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
|
|
123 }
|
|
124
|
|
125
|
|
126
|
|
127 sleep (1);
|
|
128
|
|
129 if ($cytosine_report){
|
|
130 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
|
|
131 warn '='x63,"\n";
|
|
132 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n";
|
|
133
|
|
134
|
|
135 if ($CX_context){
|
|
136 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
|
|
137 }
|
|
138 else{ # default
|
|
139 $CpG_only = 1;
|
|
140 warn "Reporting cytosine methylation in CpG context only (default)\n";
|
|
141 }
|
|
142
|
|
143 if ($split_by_chromosome){
|
|
144 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
|
|
145 }
|
|
146
|
|
147 ### Zero-based coordinates
|
|
148 if ($zero){
|
|
149 warn "Using zero-based genomic coordinates (user-defined)\n";
|
|
150 }
|
|
151 else{ # default, 1-based coords
|
|
152 warn "Using 1-based genomic coordinates (default)\n";
|
|
153 }
|
|
154
|
|
155 ### GENOME folder
|
|
156 if ($genome_folder){
|
|
157 unless ($genome_folder =~/\/$/){
|
|
158 $genome_folder =~ s/$/\//;
|
|
159 }
|
|
160 warn "Genome folder was specified as $genome_folder\n";
|
|
161 }
|
|
162 else{
|
|
163 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
|
|
164 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
|
|
165 }
|
|
166 sleep (1);
|
|
167 }
|
|
168 }
|
|
169
|
|
170 warn "\n";
|
|
171 sleep (5);
|
|
172
|
|
173 ######################################################
|
|
174 ### PROCESSING FILES
|
|
175 ######################################################
|
|
176
|
|
177 foreach my $filename (@filenames){
|
|
178 # resetting counters and filehandles
|
|
179 %fhs = ();
|
|
180 %counting =(
|
|
181 total_meCHG_count => 0,
|
|
182 total_meCHH_count => 0,
|
|
183 total_meCpG_count => 0,
|
|
184 total_unmethylated_CHG_count => 0,
|
|
185 total_unmethylated_CHH_count => 0,
|
|
186 total_unmethylated_CpG_count => 0,
|
|
187 sequences_count => 0,
|
|
188 );
|
|
189 @sorting_files = ();
|
|
190 @bedfiles = ();
|
|
191
|
|
192 process_Bismark_results_file($filename);
|
|
193
|
|
194 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
|
|
195 foreach my $fh (keys %fhs) {
|
|
196 if ($fh =~ /^[1230]$/) {
|
|
197 foreach my $context (keys %{$fhs{$fh}}) {
|
|
198 close $fhs{$fh}->{$context} or die $!;
|
|
199
|
|
200 }
|
|
201 } else {
|
|
202 close $fhs{$fh} or die $!;
|
|
203 }
|
|
204 }
|
|
205
|
|
206 if ($bedGraph){
|
|
207
|
|
208 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
|
|
209 $out =~ s/gz$//;
|
|
210 $out =~ s/sam$//;
|
|
211 $out =~ s/bam$//;
|
|
212 $out =~ s/txt$//;
|
|
213 $out =~ s/$/bedGraph/;
|
|
214
|
|
215
|
|
216
|
|
217 my $bedGraph_output = $out;
|
|
218 my @args;
|
|
219
|
|
220 if ($remove){
|
|
221 push @args, '--remove';
|
|
222 }
|
|
223 if ($CX_context){
|
|
224 push @args, '--CX_context';
|
|
225 }
|
|
226 if ($no_header){
|
|
227 push @args, '--no_header';
|
|
228 }
|
|
229 if ($counts){
|
|
230 push @args, "--counts";
|
|
231 }
|
|
232
|
|
233 push @args, "--buffer_size $sort_size";
|
|
234 push @args, "--cutoff $coverage_threshold";
|
|
235 push @args, "--output $bedGraph_output";
|
|
236 push @args, "--dir '$output_dir'";
|
|
237
|
|
238 ### adding all files to be sorted to @args
|
|
239 foreach my $f (@sorting_files){
|
|
240 push @args, $f;
|
|
241 }
|
|
242
|
|
243 # print join "\t",@args,"\n";
|
|
244
|
|
245 system ("$Bin/bismark2bedGraph @args");
|
|
246
|
|
247 warn "Finished BedGraph conversion ...\n\n";
|
|
248 sleep(3);
|
|
249
|
|
250 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!";
|
|
251 # warn "Writing bedGraph to file: $bedGraph_output\n";
|
|
252 # process_bedGraph_output();
|
|
253 # close OUT or die $!;
|
|
254
|
|
255 ### genome-wide cytosine methylation report requires bedGraph processing anyway
|
|
256 if ($cytosine_report){
|
|
257 @args = (); # resetting @args
|
|
258 my $cytosine_out = $out;
|
|
259 $cytosine_out =~ s/bedGraph$//;
|
|
260
|
|
261 if ($CX_context){
|
|
262 $cytosine_out =~ s/$/CX_report.txt/;
|
|
263 }
|
|
264 else{
|
|
265 $cytosine_out =~ s/$/CpG_report.txt/;
|
|
266 }
|
|
267
|
|
268 push @args, "--output $cytosine_out";
|
|
269 push @args, "--dir '$output_dir'";
|
|
270 push @args, "--genome '$genome_folder'";
|
|
271 push @args, "--parent_dir '$parent_dir'";
|
|
272
|
|
273 if ($zero){
|
|
274 push @args, "--zero";
|
|
275 }
|
|
276 if ($CX_context){
|
|
277 push @args, '--CX_context';
|
|
278 }
|
|
279 if ($split_by_chromosome){
|
|
280 push @args, '--split_by_chromosome';
|
|
281 }
|
|
282
|
|
283 push @args, $bedGraph_output; # this will be the infile
|
|
284
|
|
285 system ("$Bin/bedGraph2cytosine @args");
|
|
286 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
|
|
287 warn "\n\nFinished generating genome-wide cytosine report\n\n";
|
|
288 }
|
|
289 }
|
|
290 }
|
|
291
|
|
292
|
|
293 sub process_commandline{
|
|
294 my $help;
|
|
295 my $single_end;
|
|
296 my $paired_end;
|
|
297 my $ignore;
|
|
298 my $genomic_fasta;
|
|
299 my $full;
|
|
300 my $report;
|
|
301 my $extractor_version;
|
|
302 my $no_overlap;
|
|
303 my $merge_non_CpG;
|
|
304 my $vanilla;
|
|
305 my $output_dir;
|
|
306 my $no_header;
|
|
307 my $bedGraph;
|
|
308 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
|
|
309 my $remove;
|
|
310 my $counts;
|
|
311 my $cytosine_report;
|
|
312 my $genome_folder;
|
|
313 my $zero;
|
|
314 my $CpG_only;
|
|
315 my $CX_context;
|
|
316 my $split_by_chromosome;
|
|
317 my $sort_size;
|
|
318 my $samtools_path;
|
|
319 my $gzip;
|
|
320
|
|
321 my $command_line = GetOptions ('help|man' => \$help,
|
|
322 'p|paired-end' => \$paired_end,
|
|
323 's|single-end' => \$single_end,
|
|
324 'fasta' => \$genomic_fasta,
|
|
325 'ignore=i' => \$ignore,
|
|
326 'comprehensive' => \$full,
|
|
327 'report' => \$report,
|
|
328 'version' => \$extractor_version,
|
|
329 'no_overlap' => \$no_overlap,
|
|
330 'merge_non_CpG' => \$merge_non_CpG,
|
|
331 'vanilla' => \$vanilla,
|
|
332 'o|output=s' => \$output_dir,
|
|
333 'no_header' => \$no_header,
|
|
334 'bedGraph' => \$bedGraph,
|
|
335 "cutoff=i" => \$coverage_threshold,
|
|
336 "remove_spaces" => \$remove,
|
|
337 "counts" => \$counts,
|
|
338 "cytosine_report" => \$cytosine_report,
|
|
339 'g|genome_folder=s' => \$genome_folder,
|
|
340 "zero_based" => \$zero,
|
|
341 "CX|CX_context" => \$CX_context,
|
|
342 "split_by_chromosome" => \$split_by_chromosome,
|
|
343 "buffer_size=s" => \$sort_size,
|
|
344 'samtools_path=s' => \$samtools_path,
|
|
345 "gzip" => \$gzip,
|
|
346 );
|
|
347
|
|
348 ### EXIT ON ERROR if there were errors with any of the supplied options
|
|
349 unless ($command_line){
|
|
350 die "Please respecify command line options\n";
|
|
351 }
|
|
352
|
|
353 ### HELPFILE
|
|
354 if ($help){
|
|
355 print_helpfile();
|
|
356 exit;
|
|
357 }
|
|
358
|
|
359 if ($extractor_version){
|
|
360 print << "VERSION";
|
|
361
|
|
362
|
|
363 Bismark Methylation Extractor
|
|
364
|
|
365 Bismark Extractor Version: $version
|
|
366 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
|
|
367 www.bioinformatics.babraham.ac.uk/projects/bismark/
|
|
368
|
|
369
|
|
370 VERSION
|
|
371 exit;
|
|
372 }
|
|
373
|
|
374
|
|
375 ### no files provided
|
|
376 unless (@ARGV){
|
|
377 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
|
|
378 }
|
|
379 @filenames = @ARGV;
|
|
380
|
|
381 warn "\n *** Bismark methylation extractor version $version ***\n\n";
|
|
382
|
|
383 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
|
|
384 unless ($ignore){
|
|
385 $ignore = 0;
|
|
386 }
|
|
387 ### PRINT A REPORT
|
|
388 unless ($report){
|
|
389 $report = 0;
|
|
390 }
|
|
391
|
|
392 ### OUTPUT DIR PATH
|
|
393 if ($output_dir){
|
|
394 unless ($output_dir =~ /\/$/){
|
|
395 $output_dir =~ s/$/\//;
|
|
396 }
|
|
397 }
|
|
398 else{
|
|
399 $output_dir = '';
|
|
400 }
|
|
401
|
|
402 ### NO HEADER
|
|
403 unless ($no_header){
|
|
404 $no_header = 0;
|
|
405 }
|
|
406
|
|
407 ### OLD (VANILLA) OUTPUT FORMAT
|
|
408 unless ($vanilla){
|
|
409 $vanilla = 0;
|
|
410 }
|
|
411
|
|
412 if ($single_end){
|
|
413 $paired_end = 0; ### SINGLE END ALIGNMENTS
|
|
414 }
|
|
415 elsif ($paired_end){
|
|
416 $single_end = 0; ### PAIRED-END ALIGNMENTS
|
|
417 }
|
|
418 else{
|
|
419 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
|
|
420 }
|
|
421
|
|
422 ### NO OVERLAP
|
|
423 if ($no_overlap){
|
|
424 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
|
|
425 }
|
|
426 else{
|
|
427 $no_overlap = 0;
|
|
428 }
|
|
429
|
|
430 ### COMPREHENSIVE OUTPUT
|
|
431 unless ($full){
|
|
432 $full = 0;
|
|
433 }
|
|
434
|
|
435 ### MERGE NON-CpG context
|
|
436 unless ($merge_non_CpG){
|
|
437 $merge_non_CpG = 0;
|
|
438 }
|
|
439
|
|
440 ### remove white spaces in read ID (needed for sorting using the sort command
|
|
441 unless ($remove){
|
|
442 $remove = 0;
|
|
443 }
|
|
444
|
|
445 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
|
|
446 if (defined $coverage_threshold){
|
|
447 unless ($coverage_threshold > 0){
|
|
448 die "Please select a coverage greater than 0 (positive integers only)\n";
|
|
449 }
|
|
450 }
|
|
451 else{
|
|
452 $coverage_threshold = 1;
|
|
453 }
|
|
454
|
|
455 ### SORT buffer size
|
|
456 if (defined $sort_size){
|
|
457 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
|
|
458 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
|
|
459 }
|
|
460 }
|
|
461 else{
|
|
462 $sort_size = '2G';
|
|
463 }
|
|
464
|
|
465 if ($zero){
|
|
466 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
|
|
467 }
|
|
468
|
|
469 if ($CX_context){
|
|
470 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
|
|
471 }
|
|
472 else{
|
|
473 $CX_context = 0;
|
|
474 }
|
|
475
|
|
476 unless ($counts){
|
|
477 $counts = 0;
|
|
478 }
|
|
479
|
|
480 if ($cytosine_report){
|
|
481
|
|
482 ### GENOME folder
|
|
483 if ($genome_folder){
|
|
484 unless ($genome_folder =~/\/$/){
|
|
485 $genome_folder =~ s/$/\//;
|
|
486 }
|
|
487 }
|
|
488 else{
|
|
489 die "Please specify a genome folder to proceed (full path only)\n";
|
|
490 }
|
|
491
|
|
492 unless ($bedGraph){
|
|
493 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
|
|
494 $bedGraph = 1;
|
|
495 }
|
|
496 unless ($counts){
|
|
497 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
|
|
498 $counts = 1;
|
|
499 }
|
|
500 warn "\n";
|
|
501 }
|
|
502
|
|
503 ### PATH TO SAMTOOLS
|
|
504 if (defined $samtools_path){
|
|
505 # if Samtools was specified as full command
|
|
506 if ($samtools_path =~ /samtools$/){
|
|
507 if (-e $samtools_path){
|
|
508 # Samtools executable found
|
|
509 }
|
|
510 else{
|
|
511 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
|
|
512 }
|
|
513 }
|
|
514 else{
|
|
515 unless ($samtools_path =~ /\/$/){
|
|
516 $samtools_path =~ s/$/\//;
|
|
517 }
|
|
518 $samtools_path .= 'samtools';
|
|
519 if (-e $samtools_path){
|
|
520 # Samtools executable found
|
|
521 }
|
|
522 else{
|
|
523 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
|
|
524 }
|
|
525 }
|
|
526 }
|
|
527 # Check whether Samtools is in the PATH if no path was supplied by the user
|
|
528 else{
|
|
529 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
|
|
530 $samtools_path = `which samtools`;
|
|
531 chomp $samtools_path;
|
|
532 }
|
|
533 }
|
|
534
|
|
535 unless (defined $samtools_path){
|
|
536 $samtools_path = '';
|
|
537 }
|
|
538
|
|
539 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip);
|
|
540 }
|
|
541
|
|
542
|
|
543 sub process_Bismark_results_file{
|
|
544 my $filename = shift;
|
|
545
|
|
546 warn "\nNow reading in Bismark result file $filename\n\n";
|
|
547
|
|
548 if ($filename =~ /\.gz$/) {
|
|
549 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
|
|
550 }
|
|
551 elsif ($filename =~ /bam$/) {
|
|
552 if ($samtools_path){
|
|
553 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
|
|
554 }
|
|
555 else{
|
|
556 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
|
|
557 }
|
|
558 }
|
|
559 else {
|
|
560 open (IN,$filename) or die "Can't open file $filename: $!\n";
|
|
561 }
|
|
562
|
|
563 ### Vanilla and SAM output need to read different numbers of header lines
|
|
564 if ($vanilla) {
|
|
565 my $bismark_version = <IN>; ## discarding the Bismark version info
|
|
566 chomp $bismark_version;
|
|
567 $bismark_version =~ s/\r//; # replaces \r line feed
|
|
568 $bismark_version =~ s/Bismark version: //;
|
|
569 if ($bismark_version =~ /^\@/) {
|
|
570 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
|
|
571 sleep (2);
|
|
572 }
|
|
573
|
|
574 unless ($version eq $bismark_version){
|
|
575 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
|
|
576 }
|
|
577 } else {
|
|
578 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
|
|
579 # We are reading from it further down
|
|
580 }
|
|
581
|
|
582 my $output_filename = (split (/\//,$filename))[-1];
|
|
583
|
|
584 ### OPENING OUTPUT-FILEHANDLES
|
|
585 if ($report) {
|
|
586 my $report_filename = $output_filename;
|
|
587 $report_filename =~ s/\.sam$//;
|
|
588 $report_filename =~ s/\.txt$//;
|
|
589 $report_filename =~ s/$/_splitting_report.txt/;
|
|
590 $report_filename = $output_dir . $report_filename;
|
|
591 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
|
|
592 }
|
|
593
|
|
594 if ($report) {
|
|
595 print REPORT "$output_filename\n\n";
|
|
596 print REPORT "Parameters used to extract methylation information:\n";
|
|
597 if ($paired) {
|
|
598 if ($vanilla) {
|
|
599 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
|
|
600 } else {
|
|
601 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
|
|
602 }
|
|
603 }
|
|
604
|
|
605 if ($single) {
|
|
606 if ($vanilla) {
|
|
607 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
|
|
608 } else {
|
|
609 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
|
|
610 }
|
|
611 }
|
|
612
|
|
613 if ($ignore) {
|
|
614 print REPORT "Ignoring first $ignore bases\n";
|
|
615 }
|
|
616
|
|
617 if ($full) {
|
|
618 print REPORT "Output specified: comprehensive\n";
|
|
619 } else {
|
|
620 print REPORT "Output specified: strand-specific (default)\n";
|
|
621 }
|
|
622
|
|
623 if ($no_overlap) {
|
|
624 print REPORT "No overlapping methylation calls specified\n";
|
|
625 }
|
|
626 if ($genomic_fasta) {
|
|
627 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
|
|
628 }
|
|
629 if ($merge_non_CpG) {
|
|
630 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
|
|
631 }
|
|
632
|
|
633 print REPORT "\n";
|
|
634 }
|
|
635
|
|
636 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
|
|
637
|
|
638 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
|
|
639 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
|
|
640 if ($full and $merge_non_CpG) {
|
|
641 my $cpg_output = my $other_c_output = $output_filename;
|
|
642 ### C in CpG context
|
|
643 $cpg_output =~ s/^/CpG_context_/;
|
|
644 $cpg_output =~ s/sam$/txt/;
|
|
645 $cpg_output =~ s/bam$/txt/;
|
|
646 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
|
|
647 $cpg_output = $output_dir . $cpg_output;
|
|
648
|
|
649 if ($gzip){
|
|
650 $cpg_output .= '.gz';
|
|
651 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n";
|
|
652 }
|
|
653 else{
|
|
654 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
|
|
655 }
|
|
656
|
|
657 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
|
|
658 push @sorting_files,$cpg_output;
|
|
659
|
|
660 unless ($no_header) {
|
|
661 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
|
|
662 }
|
|
663
|
|
664 ### C in any other context than CpG
|
|
665 $other_c_output =~ s/^/Non_CpG_context_/;
|
|
666 $other_c_output =~ s/sam$/txt/;
|
|
667 $other_c_output =~ s/bam$/txt/;
|
|
668 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
|
|
669 $other_c_output = $output_dir . $other_c_output;
|
|
670
|
|
671 if ($gzip){
|
|
672 $other_c_output .= '.gz';
|
|
673 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n";
|
|
674 }
|
|
675 else{
|
|
676 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n";
|
|
677 }
|
|
678
|
|
679 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n";
|
|
680 push @sorting_files,$other_c_output;
|
|
681
|
|
682
|
|
683 unless ($no_header) {
|
|
684 print {$fhs{other_context}} "Bismark methylation extractor version $version\n";
|
|
685 }
|
|
686 }
|
|
687
|
|
688 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
|
|
689 elsif ($merge_non_CpG) {
|
|
690
|
|
691 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
|
|
692
|
|
693 ### For cytosines in CpG context
|
|
694 $cpg_ot =~ s/^/CpG_OT_/;
|
|
695 $cpg_ot =~ s/sam$/txt/;
|
|
696 $cpg_ot =~ s/bam$/txt/;
|
|
697 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
|
|
698 $cpg_ot = $output_dir . $cpg_ot;
|
|
699
|
|
700 if ($gzip){
|
|
701 $cpg_ot .= '.gz';
|
|
702 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n";
|
|
703 }
|
|
704 else{
|
|
705 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
|
|
706 }
|
|
707
|
|
708 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
|
|
709 push @sorting_files,$cpg_ot;
|
|
710
|
|
711 unless($no_header){
|
|
712 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
713 }
|
|
714
|
|
715 $cpg_ctot =~ s/^/CpG_CTOT_/;
|
|
716 $cpg_ctot =~ s/sam$/txt/;
|
|
717 $cpg_ctot =~ s/bam$/txt/;
|
|
718 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
|
|
719 $cpg_ctot = $output_dir . $cpg_ctot;
|
|
720
|
|
721 if ($gzip){
|
|
722 $cpg_ctot .= '.gz';
|
|
723 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n";
|
|
724 }
|
|
725 else{
|
|
726 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
|
|
727 }
|
|
728
|
|
729 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
|
|
730 push @sorting_files,$cpg_ctot;
|
|
731
|
|
732 unless($no_header){
|
|
733 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
734 }
|
|
735
|
|
736 $cpg_ctob =~ s/^/CpG_CTOB_/;
|
|
737 $cpg_ctob =~ s/sam$/txt/;
|
|
738 $cpg_ctob =~ s/bam$/txt/;
|
|
739 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
|
|
740 $cpg_ctob = $output_dir . $cpg_ctob;
|
|
741
|
|
742 if ($gzip){
|
|
743 $cpg_ctob .= '.gz';
|
|
744 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n";
|
|
745 }
|
|
746 else{
|
|
747 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
|
|
748 }
|
|
749
|
|
750 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
|
|
751 push @sorting_files,$cpg_ctob;
|
|
752
|
|
753 unless($no_header){
|
|
754 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
755 }
|
|
756
|
|
757 $cpg_ob =~ s/^/CpG_OB_/;
|
|
758 $cpg_ob =~ s/sam$/txt/;
|
|
759 $cpg_ob =~ s/bam$/txt/;
|
|
760 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
|
|
761 $cpg_ob = $output_dir . $cpg_ob;
|
|
762
|
|
763 if ($gzip){
|
|
764 $cpg_ob .= '.gz';
|
|
765 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n";
|
|
766 }
|
|
767 else{
|
|
768 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
|
|
769 }
|
|
770
|
|
771 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
|
|
772 push @sorting_files,$cpg_ob;
|
|
773
|
|
774 unless($no_header){
|
|
775 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
776 }
|
|
777
|
|
778 ### For cytosines in Non-CpG (CC, CT or CA) context
|
|
779 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
|
|
780
|
|
781 $other_c_ot =~ s/^/Non_CpG_OT_/;
|
|
782 $other_c_ot =~ s/sam$/txt/;
|
|
783 $other_c_ot =~ s/bam$/txt/;
|
|
784 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
|
|
785 $other_c_ot = $output_dir . $other_c_ot;
|
|
786
|
|
787 if ($gzip){
|
|
788 $other_c_ot .= '.gz';
|
|
789 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n";
|
|
790 }
|
|
791 else{
|
|
792 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n";
|
|
793 }
|
|
794
|
|
795 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n";
|
|
796 push @sorting_files,$other_c_ot;
|
|
797
|
|
798 unless($no_header){
|
|
799 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
800 }
|
|
801
|
|
802 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
|
|
803 $other_c_ctot =~ s/sam$/txt/;
|
|
804 $other_c_ctot =~ s/bam$/txt/;
|
|
805 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
|
|
806 $other_c_ctot = $output_dir . $other_c_ctot;
|
|
807
|
|
808 if ($gzip){
|
|
809 $other_c_ctot .= '.gz';
|
|
810 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n";
|
|
811 }
|
|
812 else{
|
|
813 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n";
|
|
814 }
|
|
815
|
|
816 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n";
|
|
817 push @sorting_files,$other_c_ctot;
|
|
818
|
|
819 unless($no_header){
|
|
820 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
821 }
|
|
822
|
|
823 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
|
|
824 $other_c_ctob =~ s/sam$/txt/;
|
|
825 $other_c_ctob =~ s/bam$/txt/;
|
|
826 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
|
|
827 $other_c_ctob = $output_dir . $other_c_ctob;
|
|
828
|
|
829 if ($gzip){
|
|
830 $other_c_ctob .= '.gz';
|
|
831 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n";
|
|
832 }
|
|
833 else{
|
|
834 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n";
|
|
835 }
|
|
836
|
|
837 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n";
|
|
838 push @sorting_files,$other_c_ctob;
|
|
839
|
|
840 unless($no_header){
|
|
841 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
842 }
|
|
843
|
|
844 $other_c_ob =~ s/^/Non_CpG_OB_/;
|
|
845 $other_c_ob =~ s/sam$/txt/;
|
|
846 $other_c_ob =~ s/sam$/txt/;
|
|
847 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
|
|
848 $other_c_ob = $output_dir . $other_c_ob;
|
|
849
|
|
850 if ($gzip){
|
|
851 $other_c_ob .= '.gz';
|
|
852 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n";
|
|
853 }
|
|
854 else{
|
|
855 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n";
|
|
856 }
|
|
857
|
|
858 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n";
|
|
859 push @sorting_files,$other_c_ob;
|
|
860
|
|
861 unless($no_header){
|
|
862 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
863 }
|
|
864 }
|
|
865 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
|
|
866
|
|
867 ### if --comprehensive was specified we are only writing one file per context
|
|
868 elsif ($full) {
|
|
869 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
|
|
870 ### C in CpG context
|
|
871 $cpg_output =~ s/^/CpG_context_/;
|
|
872 $cpg_output =~ s/sam$/txt/;
|
|
873 $cpg_output =~ s/bam$/txt/;
|
|
874 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
|
|
875 $cpg_output = $output_dir . $cpg_output;
|
|
876
|
|
877 if ($gzip){
|
|
878 $cpg_output .= '.gz';
|
|
879 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n";
|
|
880 }
|
|
881 else{
|
|
882 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
|
|
883 }
|
|
884
|
|
885 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
|
|
886 push @sorting_files,$cpg_output;
|
|
887
|
|
888 unless($no_header){
|
|
889 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
|
|
890 }
|
|
891
|
|
892 ### C in CHG context
|
|
893 $chg_output =~ s/^/CHG_context_/;
|
|
894 $chg_output =~ s/sam$/txt/;
|
|
895 $chg_output =~ s/bam$/txt/;
|
|
896 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
|
|
897 $chg_output = $output_dir . $chg_output;
|
|
898
|
|
899 if ($gzip){
|
|
900 $chg_output .= '.gz';
|
|
901 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n";
|
|
902 }
|
|
903 else{
|
|
904 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n";
|
|
905 }
|
|
906
|
|
907 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n";
|
|
908 push @sorting_files,$chg_output;
|
|
909
|
|
910 unless($no_header){
|
|
911 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n";
|
|
912 }
|
|
913
|
|
914 ### C in CHH context
|
|
915 $chh_output =~ s/^/CHH_context_/;
|
|
916 $chh_output =~ s/sam$/txt/;
|
|
917 $chh_output =~ s/bam$/txt/;
|
|
918 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
|
|
919 $chh_output = $output_dir . $chh_output;
|
|
920
|
|
921 if ($gzip){
|
|
922 $chh_output .= '.gz';
|
|
923 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n";
|
|
924 }
|
|
925 else{
|
|
926 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n";
|
|
927 }
|
|
928
|
|
929 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n";
|
|
930 push @sorting_files, $chh_output;
|
|
931
|
|
932 unless($no_header){
|
|
933 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n";
|
|
934 }
|
|
935 }
|
|
936 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
|
|
937 else {
|
|
938 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
|
|
939
|
|
940 ### For cytosines in CpG context
|
|
941 $cpg_ot =~ s/^/CpG_OT_/;
|
|
942 $cpg_ot =~ s/sam$/txt/;
|
|
943 $cpg_ot =~ s/bam$/txt/;
|
|
944 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
|
|
945 $cpg_ot = $output_dir . $cpg_ot;
|
|
946
|
|
947 if ($gzip){
|
|
948 $cpg_ot .= '.gz';
|
|
949 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n";
|
|
950 }
|
|
951 else{
|
|
952 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
|
|
953 }
|
|
954
|
|
955 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
|
|
956 push @sorting_files,$cpg_ot;
|
|
957
|
|
958 unless($no_header){
|
|
959 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
960 }
|
|
961
|
|
962 $cpg_ctot =~ s/^/CpG_CTOT_/;
|
|
963 $cpg_ctot =~ s/sam$/txt/;
|
|
964 $cpg_ctot =~ s/bam$/txt/;
|
|
965 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
|
|
966 $cpg_ctot = $output_dir . $cpg_ctot;
|
|
967
|
|
968 if ($gzip){
|
|
969 $cpg_ctot .= '.gz';
|
|
970 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n";
|
|
971 }
|
|
972 else{
|
|
973 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
|
|
974 }
|
|
975
|
|
976 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
|
|
977 push @sorting_files,$cpg_ctot;
|
|
978
|
|
979 unless($no_header){
|
|
980 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
981 }
|
|
982
|
|
983 $cpg_ctob =~ s/^/CpG_CTOB_/;
|
|
984 $cpg_ctob =~ s/sam$/txt/;
|
|
985 $cpg_ctob =~ s/bam$/txt/;
|
|
986 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
|
|
987 $cpg_ctob = $output_dir . $cpg_ctob;
|
|
988
|
|
989 if ($gzip){
|
|
990 $cpg_ctob .= '.gz';
|
|
991 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n";
|
|
992 }
|
|
993 else{
|
|
994 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
|
|
995 }
|
|
996
|
|
997 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
|
|
998 push @sorting_files,$cpg_ctob;
|
|
999
|
|
1000 unless($no_header){
|
|
1001 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
1002 }
|
|
1003
|
|
1004 $cpg_ob =~ s/^/CpG_OB_/;
|
|
1005 $cpg_ob =~ s/sam$/txt/;
|
|
1006 $cpg_ob =~ s/bam$/txt/;
|
|
1007 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
|
|
1008 $cpg_ob = $output_dir . $cpg_ob;
|
|
1009
|
|
1010 if ($gzip){
|
|
1011 $cpg_ob .= '.gz';
|
|
1012 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n";
|
|
1013 }
|
|
1014 else{
|
|
1015 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
|
|
1016 }
|
|
1017
|
|
1018 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
|
|
1019 push @sorting_files,$cpg_ob;
|
|
1020
|
|
1021 unless($no_header){
|
|
1022 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
1023 }
|
|
1024
|
|
1025 ### For cytosines in CHG context
|
|
1026 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
|
|
1027
|
|
1028 $chg_ot =~ s/^/CHG_OT_/;
|
|
1029 $chg_ot =~ s/sam$/txt/;
|
|
1030 $chg_ot =~ s/bam$/txt/;
|
|
1031 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
|
|
1032 $chg_ot = $output_dir . $chg_ot;
|
|
1033
|
|
1034 if ($gzip){
|
|
1035 $chg_ot .= '.gz';
|
|
1036 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n";
|
|
1037 }
|
|
1038 else{
|
|
1039 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n";
|
|
1040 }
|
|
1041
|
|
1042 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n";
|
|
1043 push @sorting_files,$chg_ot;
|
|
1044
|
|
1045 unless($no_header){
|
|
1046 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
1047 }
|
|
1048
|
|
1049 $chg_ctot =~ s/^/CHG_CTOT_/;
|
|
1050 $chg_ctot =~ s/sam$/txt/;
|
|
1051 $chg_ctot =~ s/bam$/txt/;
|
|
1052 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
|
|
1053 $chg_ctot = $output_dir . $chg_ctot;
|
|
1054
|
|
1055 if ($gzip){
|
|
1056 $chg_ctot .= '.gz';
|
|
1057 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n";
|
|
1058 }
|
|
1059 else{
|
|
1060 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n";
|
|
1061 }
|
|
1062
|
|
1063 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n";
|
|
1064 push @sorting_files,$chg_ctot;
|
|
1065
|
|
1066 unless($no_header){
|
|
1067 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
1068 }
|
|
1069
|
|
1070 $chg_ctob =~ s/^/CHG_CTOB_/;
|
|
1071 $chg_ctob =~ s/sam$/txt/;
|
|
1072 $chg_ctob =~ s/bam$/txt/;
|
|
1073 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
|
|
1074 $chg_ctob = $output_dir . $chg_ctob;
|
|
1075
|
|
1076 if ($gzip){
|
|
1077 $chg_ctob .= '.gz';
|
|
1078 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n";
|
|
1079 }
|
|
1080 else{
|
|
1081 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n";
|
|
1082 }
|
|
1083
|
|
1084 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n";
|
|
1085 push @sorting_files,$chg_ctob;
|
|
1086
|
|
1087 unless($no_header){
|
|
1088 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
1089 }
|
|
1090
|
|
1091 $chg_ob =~ s/^/CHG_OB_/;
|
|
1092 $chg_ob =~ s/sam$/txt/;
|
|
1093 $chg_ob =~ s/bam$/txt/;
|
|
1094 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
|
|
1095 $chg_ob = $output_dir . $chg_ob;
|
|
1096
|
|
1097 if ($gzip){
|
|
1098 $chg_ob .= '.gz';
|
|
1099 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n";
|
|
1100 }
|
|
1101 else{
|
|
1102 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n";
|
|
1103 }
|
|
1104
|
|
1105 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n";
|
|
1106 push @sorting_files,$chg_ob;
|
|
1107
|
|
1108 unless($no_header){
|
|
1109 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
1110 }
|
|
1111
|
|
1112 ### For cytosines in CHH context
|
|
1113 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
|
|
1114
|
|
1115 $chh_ot =~ s/^/CHH_OT_/;
|
|
1116 $chh_ot =~ s/sam$/txt/;
|
|
1117 $chh_ot =~ s/bam$/txt/;
|
|
1118 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
|
|
1119 $chh_ot = $output_dir . $chh_ot;
|
|
1120
|
|
1121 if ($gzip){
|
|
1122 $chh_ot .= '.gz';
|
|
1123 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n";
|
|
1124 }
|
|
1125 else{
|
|
1126 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n";
|
|
1127 }
|
|
1128
|
|
1129 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n";
|
|
1130 push @sorting_files,$chh_ot;
|
|
1131
|
|
1132 unless($no_header){
|
|
1133 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
1134 }
|
|
1135
|
|
1136 $chh_ctot =~ s/^/CHH_CTOT_/;
|
|
1137 $chh_ctot =~ s/sam$/txt/;
|
|
1138 $chh_ctot =~ s/bam$/txt/;
|
|
1139 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
|
|
1140 $chh_ctot = $output_dir . $chh_ctot;
|
|
1141
|
|
1142 if ($gzip){
|
|
1143 $chh_ctot .= '.gz';
|
|
1144 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n";
|
|
1145 }
|
|
1146 else{
|
|
1147 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n";
|
|
1148 }
|
|
1149
|
|
1150 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n";
|
|
1151 push @sorting_files,$chh_ctot;
|
|
1152
|
|
1153 unless($no_header){
|
|
1154 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
1155 }
|
|
1156
|
|
1157 $chh_ctob =~ s/^/CHH_CTOB_/;
|
|
1158 $chh_ctob =~ s/sam$/txt/;
|
|
1159 $chh_ctob =~ s/bam$/txt/;
|
|
1160 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
|
|
1161 $chh_ctob = $output_dir . $chh_ctob;
|
|
1162
|
|
1163 if ($gzip){
|
|
1164 $chh_ctob .= '.gz';
|
|
1165 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n";
|
|
1166 }
|
|
1167 else{
|
|
1168 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n";
|
|
1169 }
|
|
1170
|
|
1171 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n";
|
|
1172 push @sorting_files,$chh_ctob;
|
|
1173
|
|
1174 unless($no_header){
|
|
1175 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
1176 }
|
|
1177
|
|
1178 $chh_ob =~ s/^/CHH_OB_/;
|
|
1179 $chh_ob =~ s/sam$/txt/;
|
|
1180 $chh_ob =~ s/bam$/txt/;
|
|
1181 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
|
|
1182 $chh_ob = $output_dir . $chh_ob;
|
|
1183
|
|
1184 if ($gzip){
|
|
1185 $chh_ob .= '.gz';
|
|
1186 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n";
|
|
1187 }
|
|
1188 else{
|
|
1189 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n";
|
|
1190 }
|
|
1191
|
|
1192 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n";
|
|
1193 push @sorting_files,$chh_ob;
|
|
1194
|
|
1195 unless($no_header){
|
|
1196 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
1197 }
|
|
1198 }
|
|
1199
|
|
1200 my $methylation_call_strings_processed = 0;
|
|
1201 my $line_count = 0;
|
|
1202
|
|
1203 ### proceeding differently now for single-end or paired-end Bismark files
|
|
1204
|
|
1205 ### PROCESSING SINGLE-END RESULT FILES
|
|
1206 if ($single) {
|
|
1207
|
|
1208 ### also proceeding differently now for SAM format or vanilla Bismark format files
|
|
1209 if ($vanilla) { # old vanilla Bismark output format
|
|
1210 while (<IN>) {
|
|
1211 ++$line_count;
|
|
1212 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
1213
|
|
1214 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
|
|
1215 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
|
|
1216
|
|
1217 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
|
|
1218 ### last position
|
|
1219 chomp $genome_conversion;
|
|
1220
|
|
1221 my $index;
|
|
1222 if ($meth_call) {
|
|
1223
|
|
1224 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
|
|
1225 $index = 0;
|
|
1226 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
|
|
1227 $index = 1;
|
|
1228 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
|
|
1229 $index = 3;
|
|
1230 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
|
|
1231 $index = 2;
|
|
1232 } else {
|
|
1233 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
|
|
1234 }
|
|
1235
|
|
1236 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
|
|
1237 if ($ignore) {
|
|
1238 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
|
|
1239
|
|
1240 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
|
|
1241 if ($strand eq '+') {
|
|
1242 $start += $ignore;
|
|
1243 } elsif ($strand eq '-') {
|
|
1244 $start += length($meth_call)-1; ## $meth_call is already shortened!
|
|
1245 } else {
|
|
1246 die "Alignment did not have proper strand information: $strand\n";
|
|
1247 }
|
|
1248 }
|
|
1249 ### printing out the methylation state of every C in the read
|
|
1250 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
|
|
1251
|
|
1252 ++$methylation_call_strings_processed; # 1 per single-end result
|
|
1253 }
|
|
1254 }
|
|
1255 } else { # processing single-end SAM format (default)
|
|
1256 while (<IN>) {
|
|
1257 ### SAM format can either start with header lines (starting with @) or start with alignments directly
|
|
1258 if (/^\@/) { # skipping header lines (starting with @)
|
|
1259 warn "skipping SAM header line:\t$_";
|
|
1260 next;
|
|
1261 }
|
|
1262
|
|
1263 ++$line_count;
|
|
1264 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
1265
|
|
1266 # example read in SAM format
|
|
1267 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
|
|
1268 ###
|
|
1269
|
|
1270 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
|
|
1271 # < 0.7.6 $meth_call =~ s/^XM:Z://;
|
|
1272 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
|
|
1273 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
|
|
1274
|
|
1275 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
|
|
1276
|
|
1277 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
1278 my $meth_call; ### Thanks to Zachary Zeno for this solution
|
|
1279 my $read_conversion;
|
|
1280 my $genome_conversion;
|
|
1281
|
|
1282 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
|
|
1283 my $tag = $1;
|
|
1284 my $value = $2;
|
|
1285
|
|
1286 if ($tag eq "XM") {
|
|
1287 $meth_call = $value;
|
|
1288 $meth_call =~ s/\r//;
|
|
1289 } elsif ($tag eq "XR") {
|
|
1290 $read_conversion = $value;
|
|
1291 $read_conversion =~ s/\r//;
|
|
1292 } elsif ($tag eq "XG") {
|
|
1293 $genome_conversion = $value;
|
|
1294 $genome_conversion =~ s/\r//;
|
|
1295 }
|
|
1296 }
|
|
1297
|
|
1298 my $strand;
|
|
1299 chomp $genome_conversion;
|
|
1300 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
|
|
1301
|
|
1302 my $index;
|
|
1303 if ($meth_call) {
|
|
1304 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
|
|
1305 $index = 0;
|
|
1306 $strand = '+';
|
|
1307 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
|
|
1308 $index = 1;
|
|
1309 $strand = '-';
|
|
1310 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
|
|
1311 $index = 2;
|
|
1312 $strand = '+';
|
|
1313 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
|
|
1314 $index = 3;
|
|
1315 $strand = '-';
|
|
1316 } else {
|
|
1317 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
|
|
1318 }
|
|
1319
|
|
1320 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
|
|
1321 if ($strand eq '-') {
|
|
1322 $meth_call = reverse $meth_call;
|
|
1323 }
|
|
1324
|
|
1325 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
|
|
1326 if ($ignore) {
|
|
1327 # print "\n\n$meth_call\n";
|
|
1328 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
|
|
1329 # print "$meth_call\n";
|
|
1330 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
|
|
1331
|
|
1332 my @len = split (/\D+/,$cigar); # storing the length per operation
|
|
1333 my @ops = split (/\d+/,$cigar); # storing the operation
|
|
1334 shift @ops; # remove the empty first element
|
|
1335 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
1336
|
|
1337 my @comp_cigar; # building an array with all CIGAR operations
|
|
1338 foreach my $index (0..$#len) {
|
|
1339 foreach (1..$len[$index]) {
|
|
1340 # print "$ops[$index]";
|
|
1341 push @comp_cigar, $ops[$index];
|
|
1342 }
|
|
1343 }
|
|
1344 # print "original CIGAR: $cigar\n";
|
|
1345 # print "original CIGAR: @comp_cigar\n";
|
|
1346
|
|
1347 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
|
|
1348 if ($strand eq '+') {
|
|
1349
|
|
1350 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
|
|
1351 my $I_count = 0;
|
|
1352
|
|
1353 for (1..$ignore) {
|
|
1354 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
|
|
1355 # print "$_ deleted $op\n";
|
|
1356
|
|
1357 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1358 $D_count++;
|
|
1359 $op = shift @comp_cigar;
|
|
1360 # print "$_ deleted $op\n";
|
|
1361 }
|
|
1362 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
|
|
1363 $I_count++;
|
|
1364 }
|
|
1365 }
|
|
1366 $start += $ignore + $D_count - $I_count;
|
|
1367 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
|
|
1368 } elsif ($strand eq '-') {
|
|
1369
|
|
1370 for (1..$ignore) {
|
|
1371 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
1372 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1373 $op = pop @comp_cigar;
|
|
1374 }
|
|
1375 }
|
|
1376
|
|
1377 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
|
|
1378 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
|
|
1379 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
|
|
1380 foreach (@comp_cigar) {
|
|
1381 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
|
|
1382 }
|
|
1383 $start += $MD_count - 1;
|
|
1384 }
|
|
1385
|
|
1386 ### reconstituting shortened CIGAR string
|
|
1387 my $new_cigar;
|
|
1388 my $count = 0;
|
|
1389 my $last_op;
|
|
1390 # print "ignore adjusted: @comp_cigar\n";
|
|
1391 foreach my $op (@comp_cigar) {
|
|
1392 unless (defined $last_op){
|
|
1393 $last_op = $op;
|
|
1394 ++$count;
|
|
1395 next;
|
|
1396 }
|
|
1397 if ($last_op eq $op) {
|
|
1398 ++$count;
|
|
1399 } else {
|
|
1400 $new_cigar .= "$count$last_op";
|
|
1401 $last_op = $op;
|
|
1402 $count = 1;
|
|
1403 }
|
|
1404 }
|
|
1405 $new_cigar .= "$count$last_op"; # appending the last operation and count
|
|
1406 $cigar = $new_cigar;
|
|
1407 # print "ignore adjusted scalar: $cigar\n";
|
|
1408 }
|
|
1409 }
|
|
1410 ### printing out the methylation state of every C in the read
|
|
1411 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
|
|
1412
|
|
1413 ++$methylation_call_strings_processed; # 1 per single-end result
|
|
1414 }
|
|
1415 }
|
|
1416 }
|
|
1417
|
|
1418 ### PROCESSING PAIRED-END RESULT FILES
|
|
1419 elsif ($paired) {
|
|
1420
|
|
1421 ### proceeding differently now for SAM format or vanilla Bismark format files
|
|
1422 if ($vanilla) { # old vanilla Bismark paired-end output format
|
|
1423 while (<IN>) {
|
|
1424 ++$line_count;
|
|
1425 warn "processed line: $line_count\n" if ($line_count%500000==0);
|
|
1426
|
|
1427 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
|
|
1428 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
|
|
1429
|
|
1430 my $index;
|
|
1431 chomp $genome_conversion;
|
|
1432
|
|
1433 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
|
|
1434 $index = 0; ## this is OT
|
|
1435 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
|
|
1436 $index = 2; ## this is CTOB!!!
|
|
1437 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
|
|
1438 $index = 1; ## this is CTOT!!!
|
|
1439 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
|
|
1440 $index = 3; ## this is OB
|
|
1441 } else {
|
|
1442 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
|
|
1443 }
|
|
1444
|
|
1445 if ($meth_call_1 and $meth_call_2) {
|
|
1446 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
|
|
1447 if ($ignore) {
|
|
1448 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
|
|
1449 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
|
|
1450
|
|
1451 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
|
|
1452 $start_read_1 += $ignore;
|
|
1453 $end_read_2 -= $ignore;
|
|
1454 }
|
|
1455 my $end_read_1;
|
|
1456 my $start_read_2;
|
|
1457
|
|
1458 if ($strand eq '+') {
|
|
1459
|
|
1460 $end_read_1 = $start_read_1+length($meth_call_1)-1;
|
|
1461 $start_read_2 = $end_read_2-length($meth_call_2)+1;
|
|
1462
|
|
1463 ## we first pass the first read which is in + orientation on the forward strand
|
|
1464 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0);
|
|
1465
|
|
1466 # we next pass the second read which is in - orientation on the reverse strand
|
|
1467 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
|
|
1468 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1);
|
|
1469 } else {
|
|
1470
|
|
1471 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
|
|
1472 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
|
|
1473
|
|
1474 ## we first pass the first read which is in - orientation on the reverse strand
|
|
1475 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0);
|
|
1476
|
|
1477 # we next pass the second read which is in + orientation on the forward strand
|
|
1478 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
|
|
1479 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2);
|
|
1480 }
|
|
1481
|
|
1482 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
|
|
1483 }
|
|
1484 }
|
|
1485 } else { # Bismark paired-end SAM output format (default)
|
|
1486 while (<IN>) {
|
|
1487 ### SAM format can either start with header lines (starting with @) or start with alignments directly
|
|
1488 if (/^\@/) { # skipping header lines (starting with @)
|
|
1489 warn "skipping SAM header line:\t$_";
|
|
1490 next;
|
|
1491 }
|
|
1492
|
|
1493 ++$line_count;
|
|
1494 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
1495
|
|
1496 # example paired-end reads in SAM format (2 consecutive lines)
|
|
1497 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
|
|
1498 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
|
|
1499
|
|
1500 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
|
|
1501
|
|
1502 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
1503 my $meth_call_1;
|
|
1504 my $first_read_conversion;
|
|
1505 my $genome_conversion;
|
|
1506
|
|
1507 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
|
|
1508 my $tag = $1;
|
|
1509 my $value = $2;
|
|
1510
|
|
1511 if ($tag eq "XM") {
|
|
1512 $meth_call_1 = $value;
|
|
1513 $meth_call_1 =~ s/\r//;
|
|
1514 } elsif ($tag eq "XR") {
|
|
1515 $first_read_conversion = $value;
|
|
1516 $first_read_conversion =~ s/\r//;
|
|
1517 } elsif ($tag eq "XG") {
|
|
1518 $genome_conversion = $value;
|
|
1519 $genome_conversion =~ s/\r//;
|
|
1520 }
|
|
1521 }
|
|
1522
|
|
1523 $_ = <IN>; # reading in the paired read
|
|
1524
|
|
1525 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
|
|
1526 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
|
|
1527 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
|
|
1528 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
|
|
1529 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
|
|
1530
|
|
1531 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
1532
|
|
1533 my $meth_call_2;
|
|
1534 my $second_read_conversion;
|
|
1535
|
|
1536 while ( /(XM|XR):Z:([^\t]+)/g ) {
|
|
1537 my $tag = $1;
|
|
1538 my $value = $2;
|
|
1539
|
|
1540 if ($tag eq "XM") {
|
|
1541 $meth_call_2 = $value;
|
|
1542 $meth_call_2 =~ s/\r//;
|
|
1543 } elsif ($tag eq "XR") {
|
|
1544 $second_read_conversion = $value;
|
|
1545 $second_read_conversion = s/\r//;
|
|
1546 }
|
|
1547 }
|
|
1548
|
|
1549 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
|
|
1550 chomp $genome_conversion; # in case it captured a new line character
|
|
1551
|
|
1552 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
|
|
1553
|
|
1554 my $index;
|
|
1555 my $strand;
|
|
1556
|
|
1557 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
|
|
1558 $index = 0; ## this is OT
|
|
1559 $strand = '+';
|
|
1560 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
|
|
1561 $index = 1; ## this is CTOT
|
|
1562 $strand = '-';
|
|
1563 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
|
|
1564 $index = 2; ## this is CTOB
|
|
1565 $strand = '+';
|
|
1566 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
|
|
1567 $index = 3; ## this is OB
|
|
1568 $strand = '-';
|
|
1569 } else {
|
|
1570 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
|
|
1571 }
|
|
1572
|
|
1573 ### reversing the methylation call of the read that was reverse-complemented
|
|
1574 if ($strand eq '+') {
|
|
1575 $meth_call_2 = reverse $meth_call_2;
|
|
1576 } else {
|
|
1577 $meth_call_1 = reverse $meth_call_1;
|
|
1578 }
|
|
1579
|
|
1580 if ($meth_call_1 and $meth_call_2) {
|
|
1581
|
|
1582 my $end_read_1;
|
|
1583
|
|
1584 ### READ 1
|
|
1585 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
|
|
1586 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
|
|
1587 shift @ops_1; # remove the empty first element
|
|
1588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
|
|
1589
|
|
1590 my @comp_cigar_1; # building an array with all CIGAR operations
|
|
1591 foreach my $index (0..$#len_1) {
|
|
1592 foreach (1..$len_1[$index]) {
|
|
1593 # print "$ops_1[$index]";
|
|
1594 push @comp_cigar_1, $ops_1[$index];
|
|
1595 }
|
|
1596 }
|
|
1597 # print "original CIGAR read 1: $cigar_1\n";
|
|
1598 # print "original CIGAR read 1: @comp_cigar_1\n";
|
|
1599
|
|
1600 ### READ 2
|
|
1601 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
|
|
1602 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
|
|
1603 shift @ops_2; # remove the empty first element
|
|
1604 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
|
|
1605 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
|
|
1606 foreach my $index (0..$#len_2) {
|
|
1607 foreach (1..$len_2[$index]) {
|
|
1608 # print "$ops_2[$index]";
|
|
1609 push @comp_cigar_2, $ops_2[$index];
|
|
1610 }
|
|
1611 }
|
|
1612 # print "original CIGAR read 2: $cigar_2\n";
|
|
1613 # print "original CIGAR read 2: @comp_cigar_2\n";
|
|
1614
|
|
1615 if ($ignore) {
|
|
1616 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
|
|
1617 ### the methylation calls have already been reversed where necessary
|
|
1618 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
|
|
1619 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
|
|
1620
|
|
1621 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
|
|
1622
|
|
1623 if ($strand eq '+') {
|
|
1624
|
|
1625 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
|
|
1626 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
|
|
1627 my $I_count_1 = 0;
|
|
1628
|
|
1629 for (1..$ignore) {
|
|
1630 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
|
|
1631 # print "$_ deleted $op\n";
|
|
1632
|
|
1633 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1634 $D_count_1++;
|
|
1635 $op = shift @comp_cigar_1;
|
|
1636 # print "$_ deleted $op\n";
|
|
1637 }
|
|
1638 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
|
|
1639 $I_count_1++;
|
|
1640 }
|
|
1641 }
|
|
1642
|
|
1643 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
|
|
1644 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
|
|
1645
|
|
1646 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
|
|
1647
|
|
1648 for (1..$ignore) {
|
|
1649 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
1650 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1651 $op = pop @comp_cigar_2;
|
|
1652 }
|
|
1653 }
|
|
1654 # the start position of reads mapping to the reverse strand is being adjusted further below
|
|
1655 } elsif ($strand eq '-') {
|
|
1656
|
|
1657 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
|
|
1658 for (1..$ignore) {
|
|
1659 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
1660 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1661 $op = pop @comp_cigar_1;
|
|
1662 }
|
|
1663 }
|
|
1664 # the start position of reads mapping to the reverse strand is being adjusted further below
|
|
1665
|
|
1666 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
|
|
1667 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
|
|
1668 my $I_count_2 = 0;
|
|
1669
|
|
1670 for (1..$ignore) {
|
|
1671 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
|
|
1672 # print "$_ deleted $op\n";
|
|
1673
|
|
1674 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1675 $D_count_2++;
|
|
1676 $op = shift @comp_cigar_2;
|
|
1677 # print "$_ deleted $op\n";
|
|
1678 }
|
|
1679 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
|
|
1680 $I_count_2++;
|
|
1681 }
|
|
1682 }
|
|
1683
|
|
1684 $start_read_2 += $ignore + $D_count_2 - $I_count_2;
|
|
1685 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
|
|
1686
|
|
1687 }
|
|
1688
|
|
1689 ### reconstituting shortened CIGAR string 1
|
|
1690 my $new_cigar_1;
|
|
1691 my $count_1 = 0;
|
|
1692 my $last_op_1;
|
|
1693 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
|
|
1694 foreach my $op (@comp_cigar_1) {
|
|
1695 unless (defined $last_op_1){
|
|
1696 $last_op_1 = $op;
|
|
1697 ++$count_1;
|
|
1698 next;
|
|
1699 }
|
|
1700 if ($last_op_1 eq $op) {
|
|
1701 ++$count_1;
|
|
1702 } else {
|
|
1703 $new_cigar_1 .= "$count_1$last_op_1";
|
|
1704 $last_op_1 = $op;
|
|
1705 $count_1 = 1;
|
|
1706 }
|
|
1707 }
|
|
1708 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
|
|
1709 $cigar_1 = $new_cigar_1;
|
|
1710 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
|
|
1711
|
|
1712 ### reconstituting shortened CIGAR string 2
|
|
1713 my $new_cigar_2;
|
|
1714 my $count_2 = 0;
|
|
1715 my $last_op_2;
|
|
1716 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
|
|
1717 foreach my $op (@comp_cigar_2) {
|
|
1718 unless (defined $last_op_2){
|
|
1719 $last_op_2 = $op;
|
|
1720 ++$count_2;
|
|
1721 next;
|
|
1722 }
|
|
1723 if ($last_op_2 eq $op) {
|
|
1724 ++$count_2;
|
|
1725 } else {
|
|
1726 $new_cigar_2 .= "$count_2$last_op_2";
|
|
1727 $last_op_2 = $op;
|
|
1728 $count_2 = 1;
|
|
1729 }
|
|
1730 }
|
|
1731 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
|
|
1732 $cigar_2 = $new_cigar_2;
|
|
1733 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n";
|
|
1734
|
|
1735 }
|
|
1736
|
|
1737 if ($strand eq '+') {
|
|
1738 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
|
|
1739 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
1740 # print "reverse: @comp_cigar_2\n";
|
|
1741
|
|
1742 my $MD_count_1 = 0;
|
|
1743 foreach (@comp_cigar_1) {
|
|
1744 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
1745 }
|
|
1746
|
|
1747 my $MD_count_2 = 0;
|
|
1748 foreach (@comp_cigar_2) {
|
|
1749 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
1750 }
|
|
1751
|
|
1752 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
|
|
1753 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
|
|
1754 } else {
|
|
1755 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
|
|
1756
|
|
1757 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
1758 # print "reverse: @comp_cigar_1\n";
|
|
1759
|
|
1760 my $MD_count_1 = 0;
|
|
1761 foreach (@comp_cigar_1) {
|
|
1762 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
1763 }
|
|
1764
|
|
1765 $end_read_1 = $start_read_1;
|
|
1766 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
|
|
1767
|
|
1768 }
|
|
1769
|
|
1770 if ($strand eq '+') {
|
|
1771 ## we first pass the first read which is in + orientation on the forward strand
|
|
1772 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1);
|
|
1773
|
|
1774 # we next pass the second read which is in - orientation on the reverse strand
|
|
1775 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
|
|
1776 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2);
|
|
1777 } else {
|
|
1778 ## we first pass the first read which is in - orientation on the reverse strand
|
|
1779 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1);
|
|
1780
|
|
1781 # we next pass the second read which is in + orientation on the forward strand
|
|
1782 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
|
|
1783 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2);
|
|
1784 }
|
|
1785
|
|
1786 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
|
|
1787 }
|
|
1788 }
|
|
1789 }
|
|
1790 } else {
|
|
1791 die "Single-end or paired-end reads not specified properly\n";
|
|
1792 }
|
|
1793
|
|
1794 print "\n\nProcessed $line_count lines from $filename in total\n";
|
|
1795 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
|
|
1796 if ($report) {
|
|
1797 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
|
|
1798 }
|
|
1799 print_splitting_report ();
|
|
1800 }
|
|
1801
|
|
1802
|
|
1803
|
|
1804 sub print_splitting_report{
|
|
1805
|
|
1806 ### Calculating methylation percentages if applicable
|
|
1807
|
|
1808 my $percent_meCpG;
|
|
1809 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
|
|
1810 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
|
|
1811 }
|
|
1812
|
|
1813 my $percent_meCHG;
|
|
1814 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
|
|
1815 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
|
|
1816 }
|
|
1817
|
|
1818 my $percent_meCHH;
|
|
1819 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
|
|
1820 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
|
|
1821 }
|
|
1822
|
|
1823 my $percent_non_CpG_methylation;
|
|
1824 if ($merge_non_CpG){
|
|
1825 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
|
|
1826 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
|
|
1827 }
|
|
1828 }
|
|
1829
|
|
1830 if ($report){
|
|
1831 ### detailed information about Cs analysed
|
|
1832 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
|
|
1833
|
|
1834 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
|
|
1835 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
|
|
1836
|
|
1837 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
|
|
1838 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
|
|
1839 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
|
|
1840
|
|
1841 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
|
|
1842 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
|
|
1843 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
|
|
1844
|
|
1845 ### calculating methylated CpG percentage if applicable
|
|
1846 if ($percent_meCpG){
|
|
1847 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
|
|
1848 }
|
|
1849 else{
|
|
1850 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
|
|
1851 }
|
|
1852
|
|
1853 ### 2-Context Output
|
|
1854 if ($merge_non_CpG){
|
|
1855 if ($percent_non_CpG_methylation){
|
|
1856 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
|
|
1857 }
|
|
1858 else{
|
|
1859 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
|
|
1860 }
|
|
1861 }
|
|
1862
|
|
1863 ### 3 Context Output
|
|
1864 else{
|
|
1865 ### calculating methylated CHG percentage if applicable
|
|
1866 if ($percent_meCHG){
|
|
1867 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
|
|
1868 }
|
|
1869 else{
|
|
1870 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
|
|
1871 }
|
|
1872
|
|
1873 ### calculating methylated CHH percentage if applicable
|
|
1874 if ($percent_meCHH){
|
|
1875 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
|
|
1876 }
|
|
1877 else{
|
|
1878 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
|
|
1879 }
|
|
1880 }
|
|
1881 }
|
|
1882
|
|
1883 ### detailed information about Cs analysed for on-screen report
|
|
1884 print "Final Cytosine Methylation Report\n",'='x33,"\n";
|
|
1885
|
|
1886 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
|
|
1887 print "Total number of C's analysed:\t$total_number_of_C\n\n";
|
|
1888
|
|
1889 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
|
|
1890 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
|
|
1891 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
|
|
1892
|
|
1893 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
|
|
1894 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
|
|
1895 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
|
|
1896
|
|
1897 ### printing methylated CpG percentage if applicable
|
|
1898 if ($percent_meCpG){
|
|
1899 print "C methylated in CpG context:\t${percent_meCpG}%\n";
|
|
1900 }
|
|
1901 else{
|
|
1902 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
|
|
1903 }
|
|
1904
|
|
1905 ### 2-Context Output
|
|
1906 if ($merge_non_CpG){
|
|
1907 if ($percent_non_CpG_methylation){
|
|
1908 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
|
|
1909 }
|
|
1910 else{
|
|
1911 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
|
|
1912 }
|
|
1913 }
|
|
1914
|
|
1915 ### 3-Context Output
|
|
1916 else{
|
|
1917 ### printing methylated CHG percentage if applicable
|
|
1918 if ($percent_meCHG){
|
|
1919 print "C methylated in CHG context:\t${percent_meCHG}%\n";
|
|
1920 }
|
|
1921 else{
|
|
1922 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
|
|
1923 }
|
|
1924
|
|
1925 ### printing methylated CHH percentage if applicable
|
|
1926 if ($percent_meCHH){
|
|
1927 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
|
|
1928 }
|
|
1929 else{
|
|
1930 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
|
|
1931 }
|
|
1932 }
|
|
1933 }
|
|
1934
|
|
1935
|
|
1936
|
|
1937
|
|
1938
|
|
1939 sub print_individual_C_methylation_states_paired_end_files{
|
|
1940
|
|
1941 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_;
|
|
1942 my @methylation_calls = split(//,$meth_call);
|
|
1943
|
|
1944 #################################################################
|
|
1945 ### . for bases not involving cytosines ###
|
|
1946 ### X for methylated C in CHG context (was protected) ###
|
|
1947 ### x for not methylated C in CHG context (was converted) ###
|
|
1948 ### H for methylated C in CHH context (was protected) ###
|
|
1949 ### h for not methylated C in CHH context (was converted) ###
|
|
1950 ### Z for methylated C in CpG context (was protected) ###
|
|
1951 ### z for not methylated C in CpG context (was converted) ###
|
|
1952 #################################################################
|
|
1953
|
|
1954 my $methyl_CHG_count = 0;
|
|
1955 my $methyl_CHH_count = 0;
|
|
1956 my $methyl_CpG_count = 0;
|
|
1957 my $unmethylated_CHG_count = 0;
|
|
1958 my $unmethylated_CHH_count = 0;
|
|
1959 my $unmethylated_CpG_count = 0;
|
|
1960
|
|
1961
|
|
1962 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
|
|
1963 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
|
|
1964 my @comp_cigar;
|
|
1965
|
|
1966 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
|
|
1967 if ($cigar =~ /^\d+M$/){
|
|
1968 }
|
|
1969 else{ # parsing CIGAR string
|
|
1970 my @len;
|
|
1971 my @ops;
|
|
1972 @len = split (/\D+/,$cigar); # storing the length per operation
|
|
1973 @ops = split (/\d+/,$cigar); # storing the operation
|
|
1974 shift @ops; # remove the empty first element
|
|
1975 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
1976
|
|
1977 foreach my $index (0..$#len){
|
|
1978 foreach (1..$len[$index]){
|
|
1979 # print "$ops[$index]";
|
|
1980 push @comp_cigar, $ops[$index];
|
|
1981 }
|
|
1982 }
|
|
1983 # warn "\nDetected CIGAR string: $cigar\n";
|
|
1984 # warn "Length of methylation call: ",length $meth_call,"\n";
|
|
1985 # warn "number of operations: ",scalar @ops,"\n";
|
|
1986 # warn "number of length digits: ",scalar @len,"\n\n";
|
|
1987 # print @comp_cigar,"\n";
|
|
1988 # print "$meth_call\n\n";
|
|
1989 # sleep (1);
|
|
1990 }
|
|
1991
|
|
1992 if ($strand eq '-') {
|
|
1993
|
|
1994 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
|
|
1995 if (@comp_cigar){
|
|
1996 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
1997 }
|
|
1998 # print "reverse CIGAR string: @comp_cigar\n";
|
|
1999
|
|
2000 ### the start position of paired-end files has already been corrected, see above
|
|
2001 }
|
|
2002
|
|
2003 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
|
|
2004
|
|
2005 if ($merge_non_CpG) {
|
|
2006
|
|
2007 if ($no_overlap) {
|
|
2008
|
|
2009 ### single-file CpG and non-CpG context output
|
|
2010 if ($full) {
|
|
2011 if ($strand eq '+') {
|
|
2012 for my $index (0..$#methylation_calls) {
|
|
2013
|
|
2014 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2015 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2016 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2017 $cigar_offset += $cigar_mod;
|
|
2018 $pos_offset += $pos_mod;
|
|
2019 }
|
|
2020
|
|
2021 ### Returning as soon as the methylation calls start overlapping
|
|
2022 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
2023 return;
|
|
2024 }
|
|
2025
|
|
2026 if ($methylation_calls[$index] eq 'X') {
|
|
2027 $counting{total_meCHG_count}++;
|
|
2028 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2029 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2030 $counting{total_unmethylated_CHG_count}++;
|
|
2031 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2032 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2033 $counting{total_meCpG_count}++;
|
|
2034 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2035 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2036 $counting{total_unmethylated_CpG_count}++;
|
|
2037 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2038 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2039 $counting{total_meCHH_count}++;
|
|
2040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2041 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2042 $counting{total_unmethylated_CHH_count}++;
|
|
2043 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2044 }
|
|
2045 elsif ($methylation_calls[$index] eq '.'){}
|
|
2046 else{
|
|
2047 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2048 }
|
|
2049 }
|
|
2050 } elsif ($strand eq '-') {
|
|
2051 for my $index (0..$#methylation_calls) {
|
|
2052
|
|
2053 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2054 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2055 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2056 $cigar_offset += $cigar_mod;
|
|
2057 $pos_offset += $pos_mod;
|
|
2058 }
|
|
2059
|
|
2060 ### Returning as soon as the methylation calls start overlapping
|
|
2061 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
2062 return;
|
|
2063 }
|
|
2064
|
|
2065 if ($methylation_calls[$index] eq 'X') {
|
|
2066 $counting{total_meCHG_count}++;
|
|
2067 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2068 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2069 $counting{total_unmethylated_CHG_count}++;
|
|
2070 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2071 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2072 $counting{total_meCpG_count}++;
|
|
2073 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2074 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2075 $counting{total_unmethylated_CpG_count}++;
|
|
2076 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2077 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2078 $counting{total_meCHH_count}++;
|
|
2079 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2080 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2081 $counting{total_unmethylated_CHH_count}++;
|
|
2082 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2083 }
|
|
2084 elsif ($methylation_calls[$index] eq '.') {}
|
|
2085 else{
|
|
2086 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2087 }
|
|
2088 }
|
|
2089 } else {
|
|
2090 die "The read orientation was neither + nor -: '$strand'\n";
|
|
2091 }
|
|
2092 }
|
|
2093
|
|
2094 ### strand-specific methylation output
|
|
2095 else {
|
|
2096 if ($strand eq '+') {
|
|
2097 for my $index (0..$#methylation_calls) {
|
|
2098
|
|
2099 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2100 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2101 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2102 $cigar_offset += $cigar_mod;
|
|
2103 $pos_offset += $pos_mod;
|
|
2104 }
|
|
2105
|
|
2106 ### Returning as soon as the methylation calls start overlapping
|
|
2107 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
2108 return;
|
|
2109 }
|
|
2110
|
|
2111 if ($methylation_calls[$index] eq 'X') {
|
|
2112 $counting{total_meCHG_count}++;
|
|
2113 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2114 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2115 $counting{total_unmethylated_CHG_count}++;
|
|
2116 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2117 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2118 $counting{total_meCpG_count}++;
|
|
2119 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2120 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2121 $counting{total_unmethylated_CpG_count}++;
|
|
2122 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2123 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2124 $counting{total_meCHH_count}++;
|
|
2125 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2126 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2127 $counting{total_unmethylated_CHH_count}++;
|
|
2128 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2129 }
|
|
2130 elsif ($methylation_calls[$index] eq '.') {}
|
|
2131 else{
|
|
2132 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2133 }
|
|
2134 }
|
|
2135 } elsif ($strand eq '-') {
|
|
2136 for my $index (0..$#methylation_calls) {
|
|
2137
|
|
2138 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2139 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2140 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2141 $cigar_offset += $cigar_mod;
|
|
2142 $pos_offset += $pos_mod;
|
|
2143 }
|
|
2144
|
|
2145 ### Returning as soon as the methylation calls start overlapping
|
|
2146 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
2147 return;
|
|
2148 }
|
|
2149
|
|
2150 if ($methylation_calls[$index] eq 'X') {
|
|
2151 $counting{total_meCHG_count}++;
|
|
2152 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2153 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2154 $counting{total_unmethylated_CHG_count}++;
|
|
2155 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2156 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2157 $counting{total_meCpG_count}++;
|
|
2158 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2159 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2160 $counting{total_unmethylated_CpG_count}++;
|
|
2161 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2162 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2163 $counting{total_meCHH_count}++;
|
|
2164 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2165 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2166 $counting{total_unmethylated_CHH_count}++;
|
|
2167 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2168 }
|
|
2169 elsif ($methylation_calls[$index] eq '.') {}
|
|
2170 else{
|
|
2171 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2172 }
|
|
2173 }
|
|
2174 } else {
|
|
2175 die "The strand orientation was neither + nor -: '$strand'/n";
|
|
2176 }
|
|
2177 }
|
|
2178 }
|
|
2179
|
|
2180 ### this is the default paired-end procedure allowing overlaps and using every single C position
|
|
2181 ### Still within the 2-CONTEXT ONLY optional section
|
|
2182 else {
|
|
2183 ### single-file CpG and non-CpG context output
|
|
2184 if ($full) {
|
|
2185 if ($strand eq '+') {
|
|
2186 for my $index (0..$#methylation_calls) {
|
|
2187
|
|
2188 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2189 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2190 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2191 $cigar_offset += $cigar_mod;
|
|
2192 $pos_offset += $pos_mod;
|
|
2193 }
|
|
2194
|
|
2195 if ($methylation_calls[$index] eq 'X') {
|
|
2196 $counting{total_meCHG_count}++;
|
|
2197 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2198 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2199 $counting{total_unmethylated_CHG_count}++;
|
|
2200 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2201 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2202 $counting{total_meCpG_count}++;
|
|
2203 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2204 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2205 $counting{total_unmethylated_CpG_count}++;
|
|
2206 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2207 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2208 $counting{total_meCHH_count}++;
|
|
2209 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2210 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2211 $counting{total_unmethylated_CHH_count}++;
|
|
2212 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2213 }
|
|
2214 elsif ($methylation_calls[$index] eq '.') {}
|
|
2215 else{
|
|
2216 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2217 }
|
|
2218 }
|
|
2219 } elsif ($strand eq '-') {
|
|
2220 for my $index (0..$#methylation_calls) {
|
|
2221
|
|
2222 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2223 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2224 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2225 $cigar_offset += $cigar_mod;
|
|
2226 $pos_offset += $pos_mod;
|
|
2227 }
|
|
2228
|
|
2229 if ($methylation_calls[$index] eq 'X') {
|
|
2230 $counting{total_meCHG_count}++;
|
|
2231 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2232 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2233 $counting{total_unmethylated_CHG_count}++;
|
|
2234 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2235 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2236 $counting{total_meCpG_count}++;
|
|
2237 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2238 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2239 $counting{total_unmethylated_CpG_count}++;
|
|
2240 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2241 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2242 $counting{total_meCHH_count}++;
|
|
2243 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2244 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2245 $counting{total_unmethylated_CHH_count}++;
|
|
2246 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2247 }
|
|
2248 elsif ($methylation_calls[$index] eq '.') {}
|
|
2249 else{
|
|
2250 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2251 }
|
|
2252 }
|
|
2253 } else {
|
|
2254 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
2255 }
|
|
2256 }
|
|
2257
|
|
2258 ### strand-specific methylation output
|
|
2259 ### still within the 2-CONTEXT optional section
|
|
2260 else {
|
|
2261 if ($strand eq '+') {
|
|
2262 for my $index (0..$#methylation_calls) {
|
|
2263
|
|
2264 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2265 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2266 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2267 $cigar_offset += $cigar_mod;
|
|
2268 $pos_offset += $pos_mod;
|
|
2269 }
|
|
2270
|
|
2271 if ($methylation_calls[$index] eq 'X') {
|
|
2272 $counting{total_meCHG_count}++;
|
|
2273 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2274 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2275 $counting{total_unmethylated_CHG_count}++;
|
|
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2277 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2278 $counting{total_meCpG_count}++;
|
|
2279 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2280 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2281 $counting{total_unmethylated_CpG_count}++;
|
|
2282 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2283 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2284 $counting{total_meCHH_count}++;
|
|
2285 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2286 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2287 $counting{total_unmethylated_CHH_count}++;
|
|
2288 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2289 }
|
|
2290 elsif ($methylation_calls[$index] eq '.') {}
|
|
2291 else{
|
|
2292 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2293 }
|
|
2294 }
|
|
2295 } elsif ($strand eq '-') {
|
|
2296 for my $index (0..$#methylation_calls) {
|
|
2297
|
|
2298 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2299 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2300 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2301 $cigar_offset += $cigar_mod;
|
|
2302 $pos_offset += $pos_mod;
|
|
2303 }
|
|
2304
|
|
2305 if ($methylation_calls[$index] eq 'X') {
|
|
2306 $counting{total_meCHG_count}++;
|
|
2307 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2308 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2309 $counting{total_unmethylated_CHG_count}++;
|
|
2310 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2311 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2312 $counting{total_meCpG_count}++;
|
|
2313 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2314 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2315 $counting{total_unmethylated_CpG_count}++;
|
|
2316 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2317 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2318 $counting{total_meCHH_count}++;
|
|
2319 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2320 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2321 $counting{total_unmethylated_CHH_count}++;
|
|
2322 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2323 }
|
|
2324 elsif ($methylation_calls[$index] eq '.') {}
|
|
2325 else{
|
|
2326 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2327 }
|
|
2328 }
|
|
2329 } else {
|
|
2330 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
2331 }
|
|
2332 }
|
|
2333 }
|
|
2334 }
|
|
2335
|
|
2336 ############################################
|
|
2337 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
|
|
2338 ############################################
|
|
2339
|
|
2340 elsif ($no_overlap) {
|
|
2341 ### single-file CpG, CHG and CHH context output
|
|
2342 if ($full) {
|
|
2343 if ($strand eq '+') {
|
|
2344 for my $index (0..$#methylation_calls) {
|
|
2345
|
|
2346 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2347 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2348 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2349 $cigar_offset += $cigar_mod;
|
|
2350 $pos_offset += $pos_mod;
|
|
2351 }
|
|
2352
|
|
2353 ### Returning as soon as the methylation calls start overlapping
|
|
2354 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
2355 return;
|
|
2356 }
|
|
2357
|
|
2358 if ($methylation_calls[$index] eq 'X') {
|
|
2359 $counting{total_meCHG_count}++;
|
|
2360 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2361 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2362 $counting{total_unmethylated_CHG_count}++;
|
|
2363 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2364 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2365 $counting{total_meCpG_count}++;
|
|
2366 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2367 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2368 $counting{total_unmethylated_CpG_count}++;
|
|
2369 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2370 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2371 $counting{total_meCHH_count}++;
|
|
2372 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2373 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2374 $counting{total_unmethylated_CHH_count}++;
|
|
2375 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2376 }
|
|
2377 elsif ($methylation_calls[$index] eq '.') {}
|
|
2378 else{
|
|
2379 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2380 }
|
|
2381 }
|
|
2382 } elsif ($strand eq '-') {
|
|
2383 for my $index (0..$#methylation_calls) {
|
|
2384
|
|
2385 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2386 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2387 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2388 $cigar_offset += $cigar_mod;
|
|
2389 $pos_offset += $pos_mod;
|
|
2390 }
|
|
2391
|
|
2392 ### Returning as soon as the methylation calls start overlapping
|
|
2393 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
2394 return;
|
|
2395 }
|
|
2396
|
|
2397 if ($methylation_calls[$index] eq 'X') {
|
|
2398 $counting{total_meCHG_count}++;
|
|
2399 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2400 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2401 $counting{total_unmethylated_CHG_count}++;
|
|
2402 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2403 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2404 $counting{total_meCpG_count}++;
|
|
2405 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2406 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2407 $counting{total_unmethylated_CpG_count}++;
|
|
2408 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2409 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2410 $counting{total_meCHH_count}++;
|
|
2411 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2412 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2413 $counting{total_unmethylated_CHH_count}++;
|
|
2414 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2415 }
|
|
2416 elsif ($methylation_calls[$index] eq '.') {}
|
|
2417 else{
|
|
2418 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2419 }
|
|
2420 }
|
|
2421 } else {
|
|
2422 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
2423 }
|
|
2424 }
|
|
2425
|
|
2426 ### strand-specific methylation output
|
|
2427 else {
|
|
2428 if ($strand eq '+') {
|
|
2429 for my $index (0..$#methylation_calls) {
|
|
2430
|
|
2431 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2432 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2433 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2434 $cigar_offset += $cigar_mod;
|
|
2435 $pos_offset += $pos_mod;
|
|
2436 }
|
|
2437
|
|
2438 ### Returning as soon as the methylation calls start overlapping
|
|
2439 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
2440 return;
|
|
2441 }
|
|
2442
|
|
2443 if ($methylation_calls[$index] eq 'X') {
|
|
2444 $counting{total_meCHG_count}++;
|
|
2445 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2446 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2447 $counting{total_unmethylated_CHG_count}++;
|
|
2448 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2449 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2450 $counting{total_meCpG_count}++;
|
|
2451 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2452 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2453 $counting{total_unmethylated_CpG_count}++;
|
|
2454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2455 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2456 $counting{total_meCHH_count}++;
|
|
2457 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2458 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2459 $counting{total_unmethylated_CHH_count}++;
|
|
2460 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2461 }
|
|
2462 elsif ($methylation_calls[$index] eq '.') {}
|
|
2463 else{
|
|
2464 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2465 }
|
|
2466 }
|
|
2467 } elsif ($strand eq '-') {
|
|
2468 for my $index (0..$#methylation_calls) {
|
|
2469
|
|
2470 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2471 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2472 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2473 $cigar_offset += $cigar_mod;
|
|
2474 $pos_offset += $pos_mod;
|
|
2475 }
|
|
2476
|
|
2477 ### Returning as soon as the methylation calls start overlapping
|
|
2478 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
2479 return;
|
|
2480 }
|
|
2481
|
|
2482 if ($methylation_calls[$index] eq 'X') {
|
|
2483 $counting{total_meCHG_count}++;
|
|
2484 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2485 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2486 $counting{total_unmethylated_CHG_count}++;
|
|
2487 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2488 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2489 $counting{total_meCpG_count}++;
|
|
2490 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2491 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2492 $counting{total_unmethylated_CpG_count}++;
|
|
2493 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2494 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2495 $counting{total_meCHH_count}++;
|
|
2496 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2497 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2498 $counting{total_unmethylated_CHH_count}++;
|
|
2499 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2500 }
|
|
2501 elsif ($methylation_calls[$index] eq '.') {}
|
|
2502 else{
|
|
2503 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2504 }
|
|
2505 }
|
|
2506 } else {
|
|
2507 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
2508 }
|
|
2509 }
|
|
2510 }
|
|
2511
|
|
2512 ### this is the default paired-end procedure allowing overlaps and using every single C position
|
|
2513 else {
|
|
2514 ### single-file CpG, CHG and CHH context output
|
|
2515 if ($full) {
|
|
2516 if ($strand eq '+') {
|
|
2517 for my $index (0..$#methylation_calls) {
|
|
2518
|
|
2519 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2520 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2521 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2522 $cigar_offset += $cigar_mod;
|
|
2523 $pos_offset += $pos_mod;
|
|
2524 }
|
|
2525
|
|
2526 if ($methylation_calls[$index] eq 'X') {
|
|
2527 $counting{total_meCHG_count}++;
|
|
2528 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2529 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2530 $counting{total_unmethylated_CHG_count}++;
|
|
2531 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2532 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2533 $counting{total_meCpG_count}++;
|
|
2534 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2535 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2536 $counting{total_unmethylated_CpG_count}++;
|
|
2537 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2538 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2539 $counting{total_meCHH_count}++;
|
|
2540 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2541 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2542 $counting{total_unmethylated_CHH_count}++;
|
|
2543 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2544 }
|
|
2545 elsif ($methylation_calls[$index] eq '.') {}
|
|
2546 else{
|
|
2547 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2548 }
|
|
2549 }
|
|
2550 } elsif ($strand eq '-') {
|
|
2551 for my $index (0..$#methylation_calls) {
|
|
2552
|
|
2553 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2554 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2555 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2556 $cigar_offset += $cigar_mod;
|
|
2557 $pos_offset += $pos_mod;
|
|
2558 }
|
|
2559
|
|
2560 if ($methylation_calls[$index] eq 'X') {
|
|
2561 $counting{total_meCHG_count}++;
|
|
2562 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2563 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2564 $counting{total_unmethylated_CHG_count}++;
|
|
2565 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2566 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2567 $counting{total_meCpG_count}++;
|
|
2568 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2569 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2570 $counting{total_unmethylated_CpG_count}++;
|
|
2571 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2572 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2573 $counting{total_meCHH_count}++;
|
|
2574 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2575 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2576 $counting{total_unmethylated_CHH_count}++;
|
|
2577 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2578 }
|
|
2579 elsif ($methylation_calls[$index] eq '.') {}
|
|
2580 else{
|
|
2581 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2582 }
|
|
2583 }
|
|
2584 } else {
|
|
2585 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
2586 }
|
|
2587 }
|
|
2588
|
|
2589 ### strand-specific methylation output
|
|
2590 else {
|
|
2591 if ($strand eq '+') {
|
|
2592 for my $index (0..$#methylation_calls) {
|
|
2593
|
|
2594 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2595 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2596 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2597 $cigar_offset += $cigar_mod;
|
|
2598 $pos_offset += $pos_mod;
|
|
2599 }
|
|
2600
|
|
2601 if ($methylation_calls[$index] eq 'X') {
|
|
2602 $counting{total_meCHG_count}++;
|
|
2603 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2604 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2605 $counting{total_unmethylated_CHG_count}++;
|
|
2606 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2607 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2608 $counting{total_meCpG_count}++;
|
|
2609 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2610 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2611 $counting{total_unmethylated_CpG_count}++;
|
|
2612 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2613 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2614 $counting{total_meCHH_count}++;
|
|
2615 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2616 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2617 $counting{total_unmethylated_CHH_count}++;
|
|
2618 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2619 }
|
|
2620 elsif ($methylation_calls[$index] eq '.') {}
|
|
2621 else{
|
|
2622 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2623 }
|
|
2624 }
|
|
2625 } elsif ($strand eq '-') {
|
|
2626 for my $index (0..$#methylation_calls) {
|
|
2627
|
|
2628 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2629 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2630 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2631 $cigar_offset += $cigar_mod;
|
|
2632 $pos_offset += $pos_mod;
|
|
2633 }
|
|
2634
|
|
2635 if ($methylation_calls[$index] eq 'X') {
|
|
2636 $counting{total_meCHG_count}++;
|
|
2637 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2638 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2639 $counting{total_unmethylated_CHG_count}++;
|
|
2640 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2641 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2642 $counting{total_meCpG_count}++;
|
|
2643 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2644 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2645 $counting{total_unmethylated_CpG_count}++;
|
|
2646 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2647 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2648 $counting{total_meCHH_count}++;
|
|
2649 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2650 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2651 $counting{total_unmethylated_CHH_count}++;
|
|
2652 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2653 }
|
|
2654 elsif ($methylation_calls[$index] eq '.') {}
|
|
2655 else{
|
|
2656 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2657 }
|
|
2658 }
|
|
2659 } else {
|
|
2660 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
2661 }
|
|
2662 }
|
|
2663 }
|
|
2664 }
|
|
2665
|
|
2666 sub check_cigar_string {
|
|
2667 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
|
|
2668 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
|
|
2669 my ($new_cigar_offset,$new_pos_offset) = (0,0);
|
|
2670
|
|
2671 if ($strand eq '+') {
|
|
2672 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
|
|
2673
|
|
2674 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
2675 # warn "position needs no adjustment\n";
|
|
2676 }
|
|
2677
|
|
2678 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
|
|
2679 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
|
|
2680 # warn "adjusted genomic position by -1 bp (insertion)\n";
|
|
2681 }
|
|
2682
|
|
2683 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
2684 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
2685 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
|
|
2686 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
|
|
2687
|
|
2688 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
|
|
2689 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
2690 # warn "position needs no adjustment\n";
|
|
2691 last;
|
|
2692 }
|
|
2693 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
|
|
2694 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
|
|
2695 # warn "adjusted genomic position by another -1 bp (insertion)\n";
|
|
2696 last;
|
|
2697 }
|
|
2698 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
2699 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
2700 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
|
|
2701 # warn "adjusted genomic position by another +1 bp (deletion)\n";
|
|
2702 }
|
|
2703 else{
|
|
2704 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
2705 }
|
|
2706 }
|
|
2707 }
|
|
2708 else{
|
|
2709 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
2710 }
|
|
2711 }
|
|
2712
|
|
2713 elsif ($strand eq '-') {
|
|
2714 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
|
|
2715
|
|
2716 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
2717 # warn "position needs no adjustment\n";
|
|
2718 }
|
|
2719
|
|
2720 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
|
|
2721 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
|
|
2722 # warn "adjusted genomic position by +1 bp (insertion)\n";
|
|
2723 }
|
|
2724
|
|
2725 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
2726 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
2727 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
|
|
2728 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
|
|
2729
|
|
2730 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
|
|
2731 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
2732 # warn "Found new 'M' operation; position needs no adjustment\n";
|
|
2733 last;
|
|
2734 }
|
|
2735 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
|
|
2736 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
|
|
2737 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
|
|
2738 last;
|
|
2739 }
|
|
2740 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
2741 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
2742 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
|
|
2743 # warn "adjusted genomic position by another -1 bp (deletion)\n";
|
|
2744 }
|
|
2745 else{
|
|
2746 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
2747 }
|
|
2748 }
|
|
2749 }
|
|
2750 else{
|
|
2751 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
2752 }
|
|
2753 }
|
|
2754 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
|
|
2755 return ($new_cigar_offset,$new_pos_offset);
|
|
2756 }
|
|
2757
|
|
2758 sub print_individual_C_methylation_states_single_end{
|
|
2759
|
|
2760 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
|
|
2761 my @methylation_calls = split(//,$meth_call);
|
|
2762
|
|
2763 #################################################################
|
|
2764 ### . for bases not involving cytosines ###
|
|
2765 ### X for methylated C in CHG context (was protected) ###
|
|
2766 ### x for not methylated C in CHG context (was converted) ###
|
|
2767 ### H for methylated C in CHH context (was protected) ###
|
|
2768 ### h for not methylated C in CHH context (was converted) ###
|
|
2769 ### Z for methylated C in CpG context (was protected) ###
|
|
2770 ### z for not methylated C in CpG context (was converted) ###
|
|
2771 #################################################################
|
|
2772
|
|
2773 my $methyl_CHG_count = 0;
|
|
2774 my $methyl_CHH_count = 0;
|
|
2775 my $methyl_CpG_count = 0;
|
|
2776 my $unmethylated_CHG_count = 0;
|
|
2777 my $unmethylated_CHH_count = 0;
|
|
2778 my $unmethylated_CpG_count = 0;
|
|
2779
|
|
2780 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
|
|
2781 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
|
|
2782
|
|
2783 my @comp_cigar;
|
|
2784
|
|
2785 if ($cigar){ # parsing CIGAR string
|
|
2786
|
|
2787 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
|
|
2788 if ($cigar =~ /^\d+M$/){
|
|
2789 # warn "See!? I told you so! $cigar\n";
|
|
2790 # sleep(1);
|
|
2791 }
|
|
2792 else{
|
|
2793
|
|
2794 my @len;
|
|
2795 my @ops;
|
|
2796
|
|
2797 @len = split (/\D+/,$cigar); # storing the length per operation
|
|
2798 @ops = split (/\d+/,$cigar); # storing the operation
|
|
2799 shift @ops; # remove the empty first element
|
|
2800 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
2801
|
|
2802 foreach my $index (0..$#len){
|
|
2803 foreach (1..$len[$index]){
|
|
2804 # print "$ops[$index]";
|
|
2805 push @comp_cigar, $ops[$index];
|
|
2806 }
|
|
2807 }
|
|
2808 }
|
|
2809 # warn "\nDetected CIGAR string: $cigar\n";
|
|
2810 # warn "Length of methylation call: ",length $meth_call,"\n";
|
|
2811 # warn "number of operations: ",scalar @ops,"\n";
|
|
2812 # warn "number of length digits: ",scalar @len,"\n\n";
|
|
2813 # print @comp_cigar,"\n";
|
|
2814 # print "$meth_call\n\n";
|
|
2815 # sleep (1);
|
|
2816 }
|
|
2817
|
|
2818 ### adjusting the start position for all reads mapping to the reverse strand
|
|
2819 if ($strand eq '-') {
|
|
2820
|
|
2821 if (@comp_cigar){ # only needed for SAM reads with InDels
|
|
2822 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
2823 # print @comp_cigar,"\n";
|
|
2824 }
|
|
2825
|
|
2826 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
|
|
2827
|
|
2828 if ($cigar){ ### SAM format
|
|
2829 if ($cigar =~ /^(\d+)M$/){ # linear match
|
|
2830 $start += $1 - 1;
|
|
2831 }
|
|
2832 else{ # InDel read
|
|
2833 my $MD_count = 0;
|
|
2834 foreach (@comp_cigar){
|
|
2835 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
2836 }
|
|
2837 $start += $MD_count - 1;
|
|
2838 }
|
|
2839 }
|
|
2840 else{ ### vanilla format
|
|
2841 $start += length($meth_call)-1;
|
|
2842 }
|
|
2843 }
|
|
2844 }
|
|
2845
|
|
2846 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
|
|
2847
|
|
2848 ### single-file CpG and other-context output
|
|
2849 if ($full and $merge_non_CpG) {
|
|
2850 if ($strand eq '+') {
|
|
2851 for my $index (0..$#methylation_calls) {
|
|
2852
|
|
2853 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels
|
|
2854 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2855 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2856 $cigar_offset += $cigar_mod;
|
|
2857 $pos_offset += $pos_mod;
|
|
2858 }
|
|
2859
|
|
2860 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2861 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2862 if ($methylation_calls[$index] eq 'X') {
|
|
2863 $counting{total_meCHG_count}++;
|
|
2864 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2865 }
|
|
2866 elsif ($methylation_calls[$index] eq 'x') {
|
|
2867 $counting{total_unmethylated_CHG_count}++;
|
|
2868 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2869 }
|
|
2870 elsif ($methylation_calls[$index] eq 'Z') {
|
|
2871 $counting{total_meCpG_count}++;
|
|
2872 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2873 }
|
|
2874 elsif ($methylation_calls[$index] eq 'z') {
|
|
2875 $counting{total_unmethylated_CpG_count}++;
|
|
2876 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2877 }
|
|
2878 elsif ($methylation_calls[$index] eq 'H') {
|
|
2879 $counting{total_meCHH_count}++;
|
|
2880 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2881 }
|
|
2882 elsif ($methylation_calls[$index] eq 'h') {
|
|
2883 $counting{total_unmethylated_CHH_count}++;
|
|
2884 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2885 }
|
|
2886 elsif ($methylation_calls[$index] eq '.') {
|
|
2887 }
|
|
2888 else{
|
|
2889 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2890 }
|
|
2891 }
|
|
2892 }
|
|
2893 elsif ($strand eq '-') {
|
|
2894
|
|
2895 for my $index (0..$#methylation_calls) {
|
|
2896 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2897 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2898
|
|
2899 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels
|
|
2900 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2901 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2902 $cigar_offset += $cigar_mod;
|
|
2903 $pos_offset += $pos_mod;
|
|
2904 }
|
|
2905
|
|
2906 if ($methylation_calls[$index] eq 'X') {
|
|
2907 $counting{total_meCHG_count}++;
|
|
2908 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2909 }
|
|
2910 elsif ($methylation_calls[$index] eq 'x') {
|
|
2911 $counting{total_unmethylated_CHG_count}++;
|
|
2912 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2913 }
|
|
2914 elsif ($methylation_calls[$index] eq 'Z') {
|
|
2915 $counting{total_meCpG_count}++;
|
|
2916 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2917 }
|
|
2918 elsif ($methylation_calls[$index] eq 'z') {
|
|
2919 $counting{total_unmethylated_CpG_count}++;
|
|
2920 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2921 }
|
|
2922 elsif ($methylation_calls[$index] eq 'H') {
|
|
2923 $counting{total_meCHH_count}++;
|
|
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2925 }
|
|
2926 elsif ($methylation_calls[$index] eq 'h') {
|
|
2927 $counting{total_unmethylated_CHH_count}++;
|
|
2928 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2929 }
|
|
2930 elsif ($methylation_calls[$index] eq '.'){
|
|
2931 }
|
|
2932 else{
|
|
2933 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2934 }
|
|
2935 }
|
|
2936 }
|
|
2937 else {
|
|
2938 die "The strand information was neither + nor -: $strand\n";
|
|
2939 }
|
|
2940 }
|
|
2941
|
|
2942 ### strand-specific methylation output
|
|
2943 elsif ($merge_non_CpG) {
|
|
2944 if ($strand eq '+') {
|
|
2945 for my $index (0..$#methylation_calls) {
|
|
2946 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2947 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2948
|
|
2949 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
|
|
2950 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2951 $cigar_offset += $cigar_mod;
|
|
2952 $pos_offset += $pos_mod;
|
|
2953 }
|
|
2954
|
|
2955 if ($methylation_calls[$index] eq 'X') {
|
|
2956 $counting{total_meCHG_count}++;
|
|
2957 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2958 }
|
|
2959 elsif ($methylation_calls[$index] eq 'x') {
|
|
2960 $counting{total_unmethylated_CHG_count}++;
|
|
2961 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2962 }
|
|
2963 elsif ($methylation_calls[$index] eq 'Z') {
|
|
2964 $counting{total_meCpG_count}++;
|
|
2965 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2966 }
|
|
2967 elsif ($methylation_calls[$index] eq 'z') {
|
|
2968 $counting{total_unmethylated_CpG_count}++;
|
|
2969 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2970 }
|
|
2971 elsif ($methylation_calls[$index] eq 'H') {
|
|
2972 $counting{total_meCHH_count}++;
|
|
2973 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2974 }
|
|
2975 elsif ($methylation_calls[$index] eq 'h') {
|
|
2976 $counting{total_unmethylated_CHH_count}++;
|
|
2977 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2978 }
|
|
2979 elsif ($methylation_calls[$index] eq '.') {
|
|
2980 }
|
|
2981 else{
|
|
2982 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2983 }
|
|
2984 }
|
|
2985 }
|
|
2986 elsif ($strand eq '-') {
|
|
2987
|
|
2988 for my $index (0..$#methylation_calls) {
|
|
2989 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2990 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2991
|
|
2992 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
|
|
2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2994 $cigar_offset += $cigar_mod;
|
|
2995 $pos_offset += $pos_mod;
|
|
2996 }
|
|
2997
|
|
2998 if ($methylation_calls[$index] eq 'X') {
|
|
2999 $counting{total_meCHG_count}++;
|
|
3000 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3001 }
|
|
3002 elsif ($methylation_calls[$index] eq 'x') {
|
|
3003 $counting{total_unmethylated_CHG_count}++;
|
|
3004 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3005 }
|
|
3006 elsif ($methylation_calls[$index] eq 'Z') {
|
|
3007 $counting{total_meCpG_count}++;
|
|
3008 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3009 }
|
|
3010 elsif ($methylation_calls[$index] eq 'z') {
|
|
3011 $counting{total_unmethylated_CpG_count}++;
|
|
3012 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3013 }
|
|
3014 elsif ($methylation_calls[$index] eq 'H') {
|
|
3015 $counting{total_meCHH_count}++;
|
|
3016 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3017 }
|
|
3018 elsif ($methylation_calls[$index] eq 'h') {
|
|
3019 $counting{total_unmethylated_CHH_count}++;
|
|
3020 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3021 }
|
|
3022 elsif ($methylation_calls[$index] eq '.') {
|
|
3023 }
|
|
3024 else{
|
|
3025 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3026 }
|
|
3027 }
|
|
3028 }
|
|
3029 else {
|
|
3030 die "The strand information was neither + nor -: $strand\n";
|
|
3031 }
|
|
3032 }
|
|
3033
|
|
3034 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
|
|
3035
|
|
3036 elsif ($full) {
|
|
3037 if ($strand eq '+') {
|
|
3038 for my $index (0..$#methylation_calls) {
|
|
3039 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
3040 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
3041
|
|
3042 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
|
|
3043 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3044 $cigar_offset += $cigar_mod;
|
|
3045 $pos_offset += $pos_mod;
|
|
3046 }
|
|
3047
|
|
3048 if ($methylation_calls[$index] eq 'X') {
|
|
3049 $counting{total_meCHG_count}++;
|
|
3050 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3051 } elsif ($methylation_calls[$index] eq 'x') {
|
|
3052 $counting{total_unmethylated_CHG_count}++;
|
|
3053 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3054 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
3055 $counting{total_meCpG_count}++;
|
|
3056 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3057 } elsif ($methylation_calls[$index] eq 'z') {
|
|
3058 $counting{total_unmethylated_CpG_count}++;
|
|
3059 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3060 } elsif ($methylation_calls[$index] eq 'H') {
|
|
3061 $counting{total_meCHH_count}++;
|
|
3062 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3063 } elsif ($methylation_calls[$index] eq 'h') {
|
|
3064 $counting{total_unmethylated_CHH_count}++;
|
|
3065 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3066 }
|
|
3067 elsif ($methylation_calls[$index] eq '.') {}
|
|
3068 else{
|
|
3069 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3070 }
|
|
3071 }
|
|
3072 }
|
|
3073 elsif ($strand eq '-') {
|
|
3074
|
|
3075 for my $index (0..$#methylation_calls) {
|
|
3076 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
3077 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
3078
|
|
3079 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
|
|
3080 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3081 $cigar_offset += $cigar_mod;
|
|
3082 $pos_offset += $pos_mod;
|
|
3083 }
|
|
3084
|
|
3085 if ($methylation_calls[$index] eq 'X') {
|
|
3086 $counting{total_meCHG_count}++;
|
|
3087 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3088 } elsif ($methylation_calls[$index] eq 'x') {
|
|
3089 $counting{total_unmethylated_CHG_count}++;
|
|
3090 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3091 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
3092 $counting{total_meCpG_count}++;
|
|
3093 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3094 } elsif ($methylation_calls[$index] eq 'z') {
|
|
3095 $counting{total_unmethylated_CpG_count}++;
|
|
3096 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3097 } elsif ($methylation_calls[$index] eq 'H') {
|
|
3098 $counting{total_meCHH_count}++;
|
|
3099 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3100 } elsif ($methylation_calls[$index] eq 'h') {
|
|
3101 $counting{total_unmethylated_CHH_count}++;
|
|
3102 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3103 }
|
|
3104 elsif ($methylation_calls[$index] eq '.') {}
|
|
3105 else{
|
|
3106 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3107 }
|
|
3108 }
|
|
3109 }
|
|
3110 else {
|
|
3111 die "The read had a strand orientation which was neither + nor -: $strand\n";
|
|
3112 }
|
|
3113 }
|
|
3114
|
|
3115 ### strand-specific methylation output
|
|
3116 else {
|
|
3117 if ($strand eq '+') {
|
|
3118 for my $index (0..$#methylation_calls) {
|
|
3119 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
3120 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
3121
|
|
3122 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
|
|
3123 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3124 $cigar_offset += $cigar_mod;
|
|
3125 $pos_offset += $pos_mod;
|
|
3126 }
|
|
3127
|
|
3128 if ($methylation_calls[$index] eq 'X') {
|
|
3129 $counting{total_meCHG_count}++;
|
|
3130 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3131 } elsif ($methylation_calls[$index] eq 'x') {
|
|
3132 $counting{total_unmethylated_CHG_count}++;
|
|
3133 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3134 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
3135 $counting{total_meCpG_count}++;
|
|
3136 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3137 } elsif ($methylation_calls[$index] eq 'z') {
|
|
3138 $counting{total_unmethylated_CpG_count}++;
|
|
3139 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3140 } elsif ($methylation_calls[$index] eq 'H') {
|
|
3141 $counting{total_meCHH_count}++;
|
|
3142 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3143 } elsif ($methylation_calls[$index] eq 'h') {
|
|
3144 $counting{total_unmethylated_CHH_count}++;
|
|
3145 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3146 }
|
|
3147 elsif ($methylation_calls[$index] eq '.') {}
|
|
3148 else{
|
|
3149 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3150 }
|
|
3151 }
|
|
3152 }
|
|
3153 elsif ($strand eq '-') {
|
|
3154
|
|
3155 for my $index (0..$#methylation_calls) {
|
|
3156 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
3157 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
3158
|
|
3159 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
|
|
3160 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3161 $cigar_offset += $cigar_mod;
|
|
3162 $pos_offset += $pos_mod;
|
|
3163 }
|
|
3164
|
|
3165 if ($methylation_calls[$index] eq 'X') {
|
|
3166 $counting{total_meCHG_count}++;
|
|
3167 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3168 } elsif ($methylation_calls[$index] eq 'x') {
|
|
3169 $counting{total_unmethylated_CHG_count}++;
|
|
3170 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3171 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
3172 $counting{total_meCpG_count}++;
|
|
3173 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3174 } elsif ($methylation_calls[$index] eq 'z') {
|
|
3175 $counting{total_unmethylated_CpG_count}++;
|
|
3176 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3177 } elsif ($methylation_calls[$index] eq 'H') {
|
|
3178 $counting{total_meCHH_count}++;
|
|
3179 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3180 } elsif ($methylation_calls[$index] eq 'h') {
|
|
3181 $counting{total_unmethylated_CHH_count}++;
|
|
3182 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
3183 }
|
|
3184 elsif ($methylation_calls[$index] eq '.') {}
|
|
3185 else{
|
|
3186 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3187 }
|
|
3188 }
|
|
3189 }
|
|
3190 else {
|
|
3191 die "The strand information was neither + nor -: $strand\n";
|
|
3192 }
|
|
3193 }
|
|
3194 }
|
|
3195
|
|
3196
|
|
3197
|
|
3198 #######################################################################################################################################
|
|
3199 ### bismark2bedGaph section - START
|
|
3200 #######################################################################################################################################
|
|
3201
|
|
3202 ### has now been moved to the external script bismark2bedGraph
|
|
3203
|
|
3204 # sub process_bedGraph_output{
|
|
3205 # warn "="x64,"\n";
|
|
3206 # warn "Methylation information will now be written into a bedGraph file\n";
|
|
3207 # warn "="x64,"\n\n";
|
|
3208 # sleep (2);
|
|
3209
|
|
3210 # ### Closing all filehandles so that the Bismark methtylation extractor output doesn't get truncated due to buffering issues
|
|
3211 # foreach my $fh (keys %fhs) {
|
|
3212 # if ($fh =~ /^[1230]$/) {
|
|
3213 # foreach my $context (keys %{$fhs{$fh}}) {
|
|
3214 # close $fhs{$fh}->{$context} or die $!;
|
|
3215 # }
|
|
3216 # } else {
|
|
3217 # close $fhs{$fh} or die $!;
|
|
3218 # }
|
|
3219 # }
|
|
3220
|
|
3221 # ### deciding which files to use for bedGraph conversion
|
|
3222 # foreach my $filename (@sorting_files){
|
|
3223 # # warn "$filename\n";
|
|
3224 # if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again
|
|
3225 # $filename =~ s/.*\///; # replacing everything up to the last slash in the filename
|
|
3226 # # warn "$filename\n";
|
|
3227 # }
|
|
3228
|
|
3229 # if ($CX_context){
|
|
3230 # push @bedfiles,$filename;
|
|
3231 # }
|
|
3232 # else{ ## CpG context only (default)
|
|
3233 # if ($filename =~ /^CpG_/){
|
|
3234 # push @bedfiles,$filename;
|
|
3235 # }
|
|
3236 # else{
|
|
3237 # # skipping CHH or CHG files
|
|
3238 # }
|
|
3239 # }
|
|
3240 # }
|
|
3241
|
|
3242 # warn "Using the following files as Input:\n";
|
|
3243 # print join ("\t",@bedfiles),"\n\n";
|
|
3244 # sleep (2);
|
|
3245
|
|
3246 # my %temp_fhs;
|
|
3247 # my @temp_files; # writing all context files (default CpG only) to these files prior to sorting
|
|
3248
|
|
3249 # ### changing to the output directory
|
|
3250 # unless ($output_dir eq ''){ # default
|
|
3251 # chdir $output_dir or die "Failed to change directory to $output_dir\n";
|
|
3252 # warn "Changed directory to $output_dir\n";
|
|
3253 # }
|
|
3254
|
|
3255 # foreach my $infile (@bedfiles) {
|
|
3256
|
|
3257 # if ($remove) {
|
|
3258 # warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
|
|
3259
|
|
3260 # if ($infile =~ /gz$/){
|
|
3261 # open (READ,"zcat $infile |") or die $!;
|
|
3262 # }
|
|
3263 # else{
|
|
3264 # open (READ,$infile) or die $!;
|
|
3265 # }
|
|
3266
|
|
3267 # my $removed_spaces_outfile = $infile;
|
|
3268 # $removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
|
|
3269
|
|
3270 # open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
|
|
3271
|
|
3272 # unless ($no_header){
|
|
3273 # $_ = <READ>; ### Bismark version header
|
|
3274 # print REM $_; ### Bismark version header
|
|
3275 # }
|
|
3276
|
|
3277 # while (<READ>) {
|
|
3278 # chomp;
|
|
3279 # my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
|
|
3280 # $id =~ s/\s+/_/g;
|
|
3281 # print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
|
|
3282 # }
|
|
3283
|
|
3284 # close READ or die $!;
|
|
3285 # close REM or die $!;
|
|
3286
|
|
3287 # ### changing the infile name to the new file without spaces
|
|
3288 # $infile = $removed_spaces_outfile;
|
|
3289 # }
|
|
3290
|
|
3291 # warn "Now writing methylation information for file $infile to individual files for each chromosome\n";
|
|
3292 # if ($infile =~ /gz$/){
|
|
3293 # open (IN,"zcat $infile |") or die $!;
|
|
3294 # }
|
|
3295 # else{
|
|
3296 # open (IN,$infile) or die $!;
|
|
3297 # }
|
|
3298
|
|
3299 # ## always ignoring the version header
|
|
3300 # unless ($no_header){
|
|
3301 # $_ = <IN>; ### Bismark version header
|
|
3302 # }
|
|
3303
|
|
3304 # while (<IN>) {
|
|
3305 # chomp;
|
|
3306 # my ($chr) = (split (/\t/))[2];
|
|
3307 # # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n";
|
|
3308 # $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names
|
|
3309 # # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n";
|
|
3310
|
|
3311 # unless (exists $temp_fhs{$chr}) {
|
|
3312 # open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!";
|
|
3313 # }
|
|
3314 # print {$temp_fhs{$chr}} "$_\n";
|
|
3315 # }
|
|
3316
|
|
3317 # warn "Finished writing out individual chromosome files for $infile\n";
|
|
3318 # }
|
|
3319 # warn "\n";
|
|
3320
|
|
3321 # @temp_files = <*.meth_extractor.temp>;
|
|
3322
|
|
3323 # warn "Collecting temporary chromosome file information...\n";
|
|
3324 # sleep (1);
|
|
3325 # warn "processing the following input file(s):\n";
|
|
3326 # warn join ("\n",@temp_files),"\n\n";
|
|
3327 # sleep (1);
|
|
3328
|
|
3329 # foreach my $in (@temp_files) {
|
|
3330 # if ($sort_size){
|
|
3331 # warn "Sorting input file $in by positions (using -S of '$sort_size')\n";
|
|
3332 # }
|
|
3333 # else{
|
|
3334 # warn "Sorting input file $in by positions (using default memory settings)\n";
|
|
3335 # }
|
|
3336 # my $sort_dir = $output_dir;
|
|
3337 # if ($sort_dir eq ''){
|
|
3338 # $sort_dir = './';
|
|
3339 # }
|
|
3340 # open my $ifh, "sort -S $sort_size -T $sort_dir -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!";
|
|
3341 # # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n";
|
|
3342
|
|
3343 # ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data {
|
|
3344
|
|
3345 # my $name;
|
|
3346 # my $meth_state;
|
|
3347 # my $chr = "";
|
|
3348 # my $pos = 0;
|
|
3349 # my $meth_state2;
|
|
3350
|
|
3351 # my $last_pos;
|
|
3352 # my $last_chr;
|
|
3353
|
|
3354 # ############################################# }
|
|
3355
|
|
3356 # while (my $line = <$ifh>) {
|
|
3357 # next if $line =~ /^Bismark/;
|
|
3358 # chomp $line;
|
|
3359
|
|
3360 # ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement {
|
|
3361
|
|
3362 # $last_chr = $chr;
|
|
3363 # $last_pos = $pos;
|
|
3364 # ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
|
|
3365
|
|
3366 # if (($last_pos ne $pos) || ($last_chr ne $chr)) {
|
|
3367 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
|
|
3368 # @methylcalls = qw (0 0 0);
|
|
3369 # }
|
|
3370
|
|
3371 # ############################################# }
|
|
3372
|
|
3373 # my $validated = validate_methylation_call($meth_state, $meth_state2);
|
|
3374 # unless($validated){
|
|
3375 # warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
|
|
3376 # next;
|
|
3377 # }
|
|
3378 # if ($meth_state eq "+") {
|
|
3379 # $methylcalls[0]++;
|
|
3380 # $methylcalls[2]++;
|
|
3381 # } else {
|
|
3382 # $methylcalls[1]++;
|
|
3383 # $methylcalls[2]++;
|
|
3384 # }
|
|
3385 # }
|
|
3386
|
|
3387 # ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) {
|
|
3388
|
|
3389 # $last_chr = $chr;
|
|
3390 # $last_pos = $pos;
|
|
3391 # if ($methylcalls[2] > 0) {
|
|
3392 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
|
|
3393 # }
|
|
3394 # ############################################# }
|
|
3395
|
|
3396 # close $ifh or die $!;
|
|
3397
|
|
3398 # @methylcalls = qw (0 0 0); # resetting @methylcalls
|
|
3399
|
|
3400 # ### deleting temporary files
|
|
3401 # my $delete = unlink $in;
|
|
3402 # if ($delete) {
|
|
3403 # warn "Successfully deleted the temporary input file $in\n\n";
|
|
3404 # }
|
|
3405 # else {
|
|
3406 # warn "The temporary inputfile $in could not be deleted $!\n\n";
|
|
3407 # }
|
|
3408 # }
|
|
3409 # }
|
|
3410
|
|
3411 # sub generate_output{
|
|
3412 # my $methcount = $methylcalls[0];
|
|
3413 # my $nonmethcount = $methylcalls[1];
|
|
3414 # my $totalcount = $methylcalls[2];
|
|
3415 # my $last_chr = shift;
|
|
3416 # my $last_pos = shift;
|
|
3417 # croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0;
|
|
3418 # croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount);
|
|
3419
|
|
3420 # ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation {
|
|
3421
|
|
3422 # my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
|
|
3423 # my $meth_percentage;
|
|
3424 # ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
|
|
3425 # # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
|
|
3426 # if (defined $meth_percentage){
|
|
3427 # if ($counts){
|
|
3428 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
|
|
3429 # }
|
|
3430 # else{
|
|
3431 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n";
|
|
3432 # }
|
|
3433 # }
|
|
3434 # ############################################# }
|
|
3435 # }
|
|
3436
|
|
3437 # sub validate_methylation_call{
|
|
3438 # my $meth_state = shift;
|
|
3439 # croak "Missing (+/-) methylation call" unless defined $meth_state;
|
|
3440 # my $meth_state2 = shift;
|
|
3441 # croak "Missing alphabetical methylation call" unless defined $meth_state2;
|
|
3442 # my $is_consistent;
|
|
3443 # ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
|
|
3444 # : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
|
|
3445 # return 1 if $is_consistent;
|
|
3446 # return 0;
|
|
3447 # }
|
|
3448
|
|
3449 # sub check_CpG_methylation_call{
|
|
3450 # my $meth1 = shift;
|
|
3451 # my $meth2 = shift;
|
|
3452 # return 1 if($meth1 eq "+" && $meth2 eq "Z");
|
|
3453 # return 1 if($meth1 eq "-" && $meth2 eq "z");
|
|
3454 # return 0;
|
|
3455 # }
|
|
3456
|
|
3457 # sub check_nonCpG_methylation_call{
|
|
3458 # my $meth1 = shift;
|
|
3459 # my $meth2 = shift;
|
|
3460 # return 1 if($meth1 eq "+" && $meth2 eq "C");
|
|
3461 # return 1 if($meth1 eq "+" && $meth2 eq "X");
|
|
3462 # return 1 if($meth1 eq "+" && $meth2 eq "H");
|
|
3463 # return 1 if($meth1 eq "-" && $meth2 eq "c");
|
|
3464 # return 1 if($meth1 eq "-" && $meth2 eq "x");
|
|
3465 # return 1 if($meth1 eq "-" && $meth2 eq "h");
|
|
3466 # return 0;
|
|
3467 # }
|
|
3468
|
|
3469 #######################################################################################################################################
|
|
3470 ### bismark2bedGaph section - END
|
|
3471 #######################################################################################################################################
|
|
3472
|
|
3473
|
|
3474
|
|
3475
|
|
3476
|
|
3477
|
|
3478 # #######################################################################################################################################
|
|
3479 # ### genome-wide cytosine methylation report - START
|
|
3480 # #######################################################################################################################################
|
|
3481
|
|
3482 ### has now been moved to the external script bedGraph2cytosine
|
|
3483
|
|
3484 # sub generate_genome_wide_cytosine_report {
|
|
3485
|
|
3486 # warn "="x78,"\n";
|
|
3487 # warn "Methylation information will now be written into a genome-wide cytosine report\n";
|
|
3488 # warn "="x78,"\n\n";
|
|
3489 # sleep (2);
|
|
3490
|
|
3491 # ### changing to the output directory again
|
|
3492 # unless ($output_dir eq ''){ # default
|
|
3493 # chdir $output_dir or die "Failed to change directory to $output_dir\n";
|
|
3494 # # warn "Changed directory to $output_dir\n";
|
|
3495 # }
|
|
3496
|
|
3497 # my $in = shift;
|
|
3498 # open (IN,$in) or die $!;
|
|
3499
|
|
3500 # my $cytosine_out = shift;
|
|
3501
|
|
3502 # if ($CX_context){
|
|
3503 # $cytosine_out =~ s/$/genome-wide_CX_report.txt/;
|
|
3504 # }
|
|
3505 # else{
|
|
3506 # $cytosine_out =~ s/$/genome_wide_CpG_report.txt/;
|
|
3507 # }
|
|
3508
|
|
3509 # ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
|
|
3510 # unless ($split_by_chromosome){ ### writing all output to a single file (default)
|
|
3511 # open (CYT,'>',$cytosine_out) or die $!;
|
|
3512 # warn "Writing genome-wide cytosine report to: $cytosine_out\n";
|
|
3513 # sleep (3);
|
|
3514 # }
|
|
3515
|
|
3516 # my $last_chr;
|
|
3517 # my %chr; # storing reads for one chromosome at a time
|
|
3518
|
|
3519 # my $count = 0;
|
|
3520 # while (<IN>){
|
|
3521 # chomp;
|
|
3522 # ++$count;
|
|
3523 # my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
|
|
3524
|
|
3525 # # defining the first chromosome
|
|
3526 # unless (defined $last_chr){
|
|
3527 # $last_chr = $chr;
|
|
3528 # # warn "Storing all covered cytosine positions for chromosome: $chr\n";
|
|
3529 # }
|
|
3530
|
|
3531 # if ($chr eq $last_chr){
|
|
3532 # $chr{$chr}->{$start}->{meth} = $meth;
|
|
3533 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
|
|
3534 # }
|
|
3535 # else{
|
|
3536 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
|
|
3537
|
|
3538 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome
|
|
3539 # my $chromosome_out = $cytosine_out;
|
|
3540 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
|
|
3541 # open (CYT,'>',$chromosome_out) or die $!;
|
|
3542 # }
|
|
3543
|
|
3544 # while ( $chromosomes{$last_chr} =~ /([CG])/g){
|
|
3545
|
|
3546 # my $tri_nt = '';
|
|
3547 # my $context = '';
|
|
3548 # my $pos = pos$chromosomes{$last_chr};
|
|
3549
|
|
3550 # my $strand;
|
|
3551 # my $meth = 0;
|
|
3552 # my $nonmeth = 0;
|
|
3553
|
|
3554 # if ($1 eq 'C'){ # C on forward strand
|
|
3555 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
|
|
3556 # $strand = '+';
|
|
3557 # }
|
|
3558 # elsif ($1 eq 'G'){ # C on reverse strand
|
|
3559 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
|
|
3560 # $tri_nt = reverse $tri_nt;
|
|
3561 # $tri_nt =~ tr/ACTG/TGAC/;
|
|
3562 # $strand = '-';
|
|
3563 # }
|
|
3564 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
|
|
3565
|
|
3566 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
|
|
3567 # $meth = $chr{$last_chr}->{$pos-1}->{meth};
|
|
3568 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
|
|
3569 # }
|
|
3570
|
|
3571 # ### determining cytosine context
|
|
3572 # if ($tri_nt =~ /^CG/){
|
|
3573 # $context = 'CG';
|
|
3574 # }
|
|
3575 # elsif ($tri_nt =~ /^C.{1}G$/){
|
|
3576 # $context = 'CHG';
|
|
3577 # }
|
|
3578 # elsif ($tri_nt =~ /^C.{2}$/){
|
|
3579 # $context = 'CHH';
|
|
3580 # }
|
|
3581 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
|
|
3582 # warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
|
|
3583 # next;
|
|
3584 # }
|
|
3585
|
|
3586 # if ($CpG_only){
|
|
3587 # if ($tri_nt =~ /^CG/){ # CpG context is the default
|
|
3588 # if ($zero){ # zero based coordinates
|
|
3589 # $pos -= 1;
|
|
3590 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3591 # }
|
|
3592 # else{ # default
|
|
3593 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3594 # }
|
|
3595 # }
|
|
3596 # }
|
|
3597 # else{ ## all cytosines, specified with --CX
|
|
3598 # if ($zero){ # zero based coordinates
|
|
3599 # $pos -= 1;
|
|
3600 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3601 # }
|
|
3602 # else{ # default
|
|
3603 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3604 # }
|
|
3605 # }
|
|
3606 # }
|
|
3607
|
|
3608 # %chr = (); # resetting the hash
|
|
3609
|
|
3610 # # new first entry
|
|
3611 # $last_chr = $chr;
|
|
3612 # $chr{$chr}->{$start}->{meth} = $meth;
|
|
3613 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
|
|
3614 # }
|
|
3615 # }
|
|
3616
|
|
3617 # # Last found chromosome
|
|
3618 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
|
|
3619
|
|
3620 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome
|
|
3621 # my $chromosome_out = $cytosine_out;
|
|
3622 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
|
|
3623 # open (CYT,'>',$chromosome_out) or die $!;
|
|
3624 # }
|
|
3625
|
|
3626 # while ( $chromosomes{$last_chr} =~ /([CG])/g){
|
|
3627
|
|
3628 # my $tri_nt;
|
|
3629 # my $context;
|
|
3630 # my $pos = pos$chromosomes{$last_chr};
|
|
3631
|
|
3632 # my $strand;
|
|
3633 # my $meth = 0;
|
|
3634 # my $nonmeth = 0;
|
|
3635
|
|
3636 # if ($1 eq 'C'){ # C on forward strand
|
|
3637 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
|
|
3638 # $strand = '+';
|
|
3639 # }
|
|
3640 # elsif ($1 eq 'G'){ # C on reverse strand
|
|
3641 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
|
|
3642 # $tri_nt = reverse $tri_nt;
|
|
3643 # $tri_nt =~ tr/ACTG/TGAC/;
|
|
3644 # $strand = '-';
|
|
3645 # }
|
|
3646
|
|
3647 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
|
|
3648 # $meth = $chr{$last_chr}->{$pos-1}->{meth};
|
|
3649 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
|
|
3650 # }
|
|
3651
|
|
3652 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
|
|
3653
|
|
3654 # ### determining cytosine context
|
|
3655 # if ($tri_nt =~ /^CG/){
|
|
3656 # $context = 'CG';
|
|
3657 # }
|
|
3658 # elsif ($tri_nt =~ /^C.{1}G$/){
|
|
3659 # $context = 'CHG';
|
|
3660 # }
|
|
3661 # elsif ($tri_nt =~ /^C.{2}$/){
|
|
3662 # $context = 'CHH';
|
|
3663 # }
|
|
3664 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
|
|
3665 # warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
|
|
3666 # next;
|
|
3667 # }
|
|
3668
|
|
3669 # if ($CpG_only){
|
|
3670 # if ($tri_nt =~ /^CG/){ # CpG context is the default
|
|
3671 # if ($zero){ # zero-based coordinates
|
|
3672 # $pos -= 1;
|
|
3673 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3674 # }
|
|
3675 # else{ # default
|
|
3676 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3677 # }
|
|
3678 # }
|
|
3679 # }
|
|
3680 # else{ ## all cytosines, specified with --CX
|
|
3681 # if ($zero){ # zero based coordinates
|
|
3682 # $pos -= 1;
|
|
3683 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3684 # }
|
|
3685 # else{ # default
|
|
3686 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
|
|
3687 # }
|
|
3688 # }
|
|
3689 # }
|
|
3690 # close CYT or die $!;
|
|
3691 # }
|
|
3692
|
|
3693
|
|
3694 # sub read_genome_into_memory{
|
|
3695
|
|
3696 # ## reading in and storing the specified genome in the %chromosomes hash
|
|
3697 # chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
|
|
3698 # warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
|
|
3699
|
|
3700 # my @chromosome_filenames = <*.fa>;
|
|
3701
|
|
3702 # ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
|
|
3703 # unless (@chromosome_filenames){
|
|
3704 # @chromosome_filenames = <*.fasta>;
|
|
3705 # }
|
|
3706 # unless (@chromosome_filenames){
|
|
3707 # die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
|
|
3708 # }
|
|
3709
|
|
3710 # foreach my $chromosome_filename (@chromosome_filenames){
|
|
3711
|
|
3712 # # skipping the tophat entire mouse genome fasta file
|
|
3713 # next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
|
|
3714
|
|
3715 # open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
|
|
3716 # ### first line needs to be a fastA header
|
|
3717 # my $first_line = <CHR_IN>;
|
|
3718 # chomp $first_line;
|
|
3719 # $first_line =~ s/\r//; # removing /r carriage returns
|
|
3720
|
|
3721 # ### Extracting chromosome name from the FastA header
|
|
3722 # my $chromosome_name = extract_chromosome_name($first_line);
|
|
3723
|
|
3724 # my $sequence;
|
|
3725 # while (<CHR_IN>){
|
|
3726 # chomp;
|
|
3727 # $_ =~ s/\r//; # removing /r carriage returns
|
|
3728
|
|
3729 # if ($_ =~ /^>/){
|
|
3730 # ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
|
|
3731 # if (exists $chromosomes{$chromosome_name}){
|
|
3732 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
|
|
3733 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
|
|
3734 # }
|
|
3735 # else {
|
|
3736 # if (length($sequence) == 0){
|
|
3737 # warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
|
|
3738 # }
|
|
3739 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
|
|
3740 # $chromosomes{$chromosome_name} = $sequence;
|
|
3741 # }
|
|
3742 # ### resetting the sequence variable
|
|
3743 # $sequence = '';
|
|
3744 # ### setting new chromosome name
|
|
3745 # $chromosome_name = extract_chromosome_name($_);
|
|
3746 # }
|
|
3747 # else{
|
|
3748 # $sequence .= uc$_;
|
|
3749 # }
|
|
3750 # }
|
|
3751
|
|
3752 # if (exists $chromosomes{$chromosome_name}){
|
|
3753 # warn "chr $chromosome_name (",length $sequence ," bp)\t";
|
|
3754 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
|
|
3755 # }
|
|
3756 # else{
|
|
3757 # if (length($sequence) == 0){
|
|
3758 # warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
|
|
3759 # }
|
|
3760 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
|
|
3761 # $chromosomes{$chromosome_name} = $sequence;
|
|
3762 # }
|
|
3763 # }
|
|
3764 # warn "\n";
|
|
3765 # chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
|
|
3766 # }
|
|
3767
|
|
3768 # sub extract_chromosome_name {
|
|
3769 # ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
|
|
3770 # my $fasta_header = shift;
|
|
3771 # if ($fasta_header =~ s/^>//){
|
|
3772 # my ($chromosome_name) = split (/\s+/,$fasta_header);
|
|
3773 # return $chromosome_name;
|
|
3774 # }
|
|
3775 # else{
|
|
3776 # die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
|
|
3777 # }
|
|
3778 # }
|
|
3779
|
|
3780 # #######################################################################################################################################
|
|
3781 # ### genome-wide cytosine methylation report - END
|
|
3782 # #######################################################################################################################################
|
|
3783
|
|
3784
|
|
3785
|
|
3786
|
|
3787 sub print_helpfile{
|
|
3788
|
|
3789 print << 'HOW_TO';
|
|
3790
|
|
3791
|
|
3792 DESCRIPTION
|
|
3793
|
|
3794 The following is a brief description of all options to control the Bismark
|
|
3795 methylation extractor. The script reads in a bisulfite read alignment results file
|
|
3796 produced by the Bismark bisulfite mapper and extracts the methylation information
|
|
3797 for individual cytosines. This information is found in the methylation call field
|
|
3798 which can contain the following characters:
|
|
3799
|
|
3800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
3801 ~~~ X for methylated C in CHG context (was protected) ~~~
|
|
3802 ~~~ x for not methylated C CHG (was converted) ~~~
|
|
3803 ~~~ H for methylated C in CHH context (was protected) ~~~
|
|
3804 ~~~ h for not methylated C in CHH context (was converted) ~~~
|
|
3805 ~~~ Z for methylated C in CpG context (was protected) ~~~
|
|
3806 ~~~ z for not methylated C in CpG context (was converted) ~~~
|
|
3807 ~~~ . for any bases not involving cytosines ~~~
|
|
3808 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
3809
|
|
3810 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
|
|
3811 context (this distinction is actually already made in Bismark itself). As the methylation
|
|
3812 information for every C analysed can produce files which easily have tens or even hundreds of
|
|
3813 millions of lines, file sizes can become very large and more difficult to handle. The C
|
|
3814 methylation info additionally splits cytosine methylation calls up into one of the four possible
|
|
3815 strands a given bisulfite read aligned against:
|
|
3816
|
|
3817 OT original top strand
|
|
3818 CTOT complementary to original top strand
|
|
3819
|
|
3820 OB original bottom strand
|
|
3821 CTOB complementary to original bottom strand
|
|
3822
|
|
3823 Thus, by default twelve individual output files are being generated per input file (unless
|
|
3824 --comprehensive is specified, see below). The output files can be imported into a genome
|
|
3825 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
|
|
3826 unless the bisulfite reads were generated preserving directionality it doesn't make any
|
|
3827 sense to look at the data in a strand-specific manner). Strand-specific output files can
|
|
3828 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
|
|
3829 will be generated. For both the strand-specific and comprehensive outputs there is also
|
|
3830 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
|
|
3831
|
|
3832
|
|
3833 The output files are in the following format (tab delimited):
|
|
3834
|
|
3835 <sequence_id> <strand> <chromosome> <position> <methylation call>
|
|
3836
|
|
3837
|
|
3838 USAGE: methylation_extractor [options] <filenames>
|
|
3839
|
|
3840
|
|
3841 ARGUMENTS:
|
|
3842
|
|
3843 <filenames> A space-separated list of Bismark result files in SAM format from
|
|
3844 which methylation information is extracted for every cytosine in
|
|
3845 the reads. For alignment files in the older custom Bismark output
|
|
3846 see option '--vanilla'.
|
|
3847
|
|
3848 OPTIONS:
|
|
3849
|
|
3850 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
|
|
3851 read data. Specifying either --single-end or --paired-end is
|
|
3852 mandatory.
|
|
3853
|
|
3854 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
|
|
3855 read data. Specifying either --paired-end or --single-end is
|
|
3856 mandatory.
|
|
3857
|
|
3858 --vanilla The Bismark result input file(s) are in the old custom Bismark format
|
|
3859 (up to version 0.5.x) and not in SAM format which is the default as
|
|
3860 of Bismark version 0.6.x or higher. Default: OFF.
|
|
3861
|
|
3862 --no_overlap For paired-end reads it is theoretically possible that read_1 and
|
|
3863 read_2 overlap. This option avoids scoring overlapping methylation
|
|
3864 calls twice (only methylation calls of read 1 are used for in the process
|
|
3865 since read 1 has historically higher quality basecalls than read 2).
|
|
3866 Whilst this option removes a bias towards more methylation calls
|
|
3867 in the center of sequenced fragments it may de facto remove a sizable
|
|
3868 proportion of the data. This option is highly recommended for paired-end
|
|
3869 data.
|
|
3870
|
|
3871 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the
|
|
3872 methylation call string. This can remove e.g. a restriction enzyme site
|
|
3873 at the start of each read.
|
|
3874
|
|
3875 --comprehensive Specifying this option will merge all four possible strand-specific
|
|
3876 methylation info into context-dependent output files. The default
|
|
3877 contexts are:
|
|
3878 - CpG context
|
|
3879 - CHG context
|
|
3880 - CHH context
|
|
3881
|
|
3882 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
|
|
3883 strand-specific output files (default) for Cs in
|
|
3884 - CpG context
|
|
3885 - non-CpG context
|
|
3886
|
|
3887 --report Prints out a short methylation summary as well as the paramaters used to run
|
|
3888 this script.
|
|
3889
|
|
3890 --no_header Suppresses the Bismark version header line in all output files for more convenient
|
|
3891 batch processing.
|
|
3892
|
|
3893 -o/--output DIR Allows specification of a different output directory (absolute or relative
|
|
3894 path). If not specified explicitely, the output will be written to the current directory.
|
|
3895
|
|
3896 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
|
|
3897 explicitly if Samtools is in the PATH already.
|
|
3898
|
|
3899 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
|
|
3900 a GZIP compressed form to save disk space. This option does not work on bedGraph and
|
|
3901 genome-wide cytosine reports as they are 'tiny' anyway.
|
|
3902
|
|
3903 --version Displays version information.
|
|
3904
|
|
3905 -h/--help Displays this help file and exits.
|
|
3906
|
|
3907
|
|
3908
|
|
3909 bedGraph specific options:
|
|
3910
|
|
3911 --bedGraph After finishing the methylation extraction, the methylation output is written into a
|
|
3912 sorted bedGraph file that reports the position of a given cytosine and its methylation
|
|
3913 state (in %, see details below). The methylation extractor output is temporarily split up into
|
|
3914 temporary files, one per chromosome (written into the current directory or folder
|
|
3915 specified with -o/--output); these temp files are then used for sorting and deleted
|
|
3916 afterwards. By default, only cytosines in CpG context will be sorted. The option
|
|
3917 '--CX_context' may be used to report all cytosines irrespective of sequence context
|
|
3918 (this will take MUCH longer!). The default folder for temporary files during the sorting
|
|
3919 process is the output directory. The bedGraph conversion step is performed by the external
|
|
3920 module 'bismark2bedGraph'; this script needs to reside in the same folder as the
|
|
3921 bismark_methylation_extractor itself.
|
|
3922
|
|
3923
|
|
3924 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
|
|
3925 before its methylation percentage is reported. Default: 1.
|
|
3926
|
|
3927 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
|
|
3928
|
|
3929
|
|
3930 --counts Adds two additional columns to the output file to enable further calculations:
|
|
3931 col 5: number of methylated calls
|
|
3932 col 6: number of unmethylated calls
|
|
3933 This option is required if '--cytosine_report' is specified (and will be set automatically if
|
|
3934 necessary).
|
|
3935
|
|
3936 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
|
|
3937 in the experiment irrespective of its sequence context. This applies to both forward and
|
|
3938 reverse strands. Please be aware that this option may generate large temporary and output files
|
|
3939 and may take a long time to sort (up to many hours). Default: OFF.
|
|
3940 (i.e. Default = CpG context only).
|
|
3941
|
|
3942 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
|
|
3943 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
|
|
3944 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
|
|
3945 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
|
|
3946 Defaults to 2G.
|
|
3947
|
|
3948
|
|
3949 Genome-wide cytosine methylation report specific options:
|
|
3950
|
|
3951 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
|
|
3952 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
|
|
3953 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all
|
|
3954 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
|
|
3955 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
|
|
3956 covered). The cytsoine report conversion step is performed by the external module
|
|
3957 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
|
|
3958 itself.
|
|
3959
|
|
3960 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
|
|
3961 its context. This applies to both forward and reverse strands. Please be aware that this will
|
|
3962 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
|
|
3963 Default: OFF (i.e. Default = CpG context only).
|
|
3964
|
|
3965 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.
|
|
3966
|
|
3967 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
|
|
3968 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
|
|
3969
|
|
3970 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
|
|
3971 will be named to include the input filename and the chromosome number.
|
|
3972
|
|
3973
|
|
3974
|
|
3975 OUTPUT:
|
|
3976
|
|
3977 The bismark_methylation_extractor output is in the form:
|
|
3978 ========================================================
|
|
3979 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
|
|
3980
|
|
3981 * Methylated cytosines receive a '+' orientation,
|
|
3982 * Unmethylated cytosines receive a '-' orientation.
|
|
3983
|
|
3984
|
|
3985
|
|
3986 The bedGraph output (optional) looks like this (tab-delimited):
|
|
3987 ===============================================================
|
|
3988 <chromosome> <start position> <end position> <methylation percentage>
|
|
3989
|
|
3990 The bedGraph output with '--counts' specified looks like this (tab-delimited):
|
|
3991
|
|
3992 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
|
|
3993
|
|
3994
|
|
3995
|
|
3996 The genome-wide cytosine methylation output file is tab-delimited in the following format:
|
|
3997 ==========================================================================================
|
|
3998 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
|
|
3999
|
|
4000
|
|
4001
|
|
4002 This script was last modified on 21 April 2013.
|
|
4003
|
|
4004 HOW_TO
|
|
4005 }
|