11
|
1 #!/usr/bin/perl
|
|
2 use warnings;
|
|
3 use strict;
|
|
4 $|++;
|
|
5 use Getopt::Long;
|
|
6 use Cwd;
|
|
7
|
|
8 my @filenames;
|
|
9 my %counting;
|
|
10 my $parent_dir = getcwd();
|
|
11
|
|
12 my %fhs;
|
|
13
|
|
14 my $version = 'v0.7.6';
|
|
15 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla) = process_commandline();
|
|
16
|
|
17 process_Bismark_results_file();
|
|
18
|
|
19 sub process_commandline{
|
|
20 my $help;
|
|
21 my $single_end;
|
|
22 my $paired_end;
|
|
23 my $ignore;
|
|
24 my $genomic_fasta;
|
|
25 my $full;
|
|
26 my $report;
|
|
27 my $extractor_version;
|
|
28 my $no_overlap;
|
|
29 my $merge_non_CpG;
|
|
30 my $vanilla;
|
|
31
|
|
32 my $command_line = GetOptions ('help|man' => \$help,
|
|
33 'p|paired-end' => \$paired_end,
|
|
34 's|single-end' => \$single_end,
|
|
35 'fasta' => \$genomic_fasta,
|
|
36 'ignore=i' => \$ignore,
|
|
37 'comprehensive' => \$full,
|
|
38 'report' => \$report,
|
|
39 'version' => \$extractor_version,
|
|
40 'no_overlap' => \$no_overlap,
|
|
41 'merge_non_CpG' => \$merge_non_CpG,
|
|
42 'vanilla' => \$vanilla,
|
|
43 );
|
|
44
|
|
45 ### EXIT ON ERROR if there were errors with any of the supplied options
|
|
46 unless ($command_line){
|
|
47 die "Please respecify command line options\n";
|
|
48 }
|
|
49
|
|
50 ### HELPFILE
|
|
51 if ($help){
|
|
52 print_helpfile();
|
|
53 exit;
|
|
54 }
|
|
55
|
|
56 if ($extractor_version){
|
|
57 print << "VERSION";
|
|
58
|
|
59
|
|
60 Bismark Methylation Extractor
|
|
61
|
|
62 Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
|
|
63 www.bioinformatics.babraham.ac.uk/projects/bismark/
|
|
64
|
|
65
|
|
66 VERSION
|
|
67 exit;
|
|
68 }
|
|
69
|
|
70
|
|
71 ### no files provided
|
|
72 unless (@ARGV){
|
|
73 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
|
|
74 }
|
|
75 @filenames = @ARGV;
|
|
76
|
|
77 warn "\n *** Bismark methylation extractor version $version ***\n\n";
|
|
78
|
|
79 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
|
|
80 unless ($ignore){
|
|
81 $ignore = 0;
|
|
82 }
|
|
83 ### PRINT A REPORT
|
|
84 unless ($report){
|
|
85 $report = 0;
|
|
86 }
|
|
87
|
|
88 ### OLD (VANILLA) OUTPUT FORMAT
|
|
89 unless ($vanilla){
|
|
90 $vanilla = 0;
|
|
91 }
|
|
92
|
|
93 if ($single_end){
|
|
94 $paired_end = 0; ### SINGLE END ALIGNMENTS
|
|
95 }
|
|
96 elsif ($paired_end){
|
|
97 $single_end = 0; ### PAIRED-END ALIGNMENTS
|
|
98 }
|
|
99 else{
|
|
100 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
|
|
101 }
|
|
102
|
|
103 ### NO OVERLAP
|
|
104 if ($no_overlap){
|
|
105 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
|
|
106 }
|
|
107 else{
|
|
108 $no_overlap = 0;
|
|
109 }
|
|
110
|
|
111 ### COMPREHENSIVE OUTPUT
|
|
112 unless ($full){
|
|
113 $full = 0;
|
|
114 }
|
|
115
|
|
116 ### MERGE NON-CpG context
|
|
117 unless ($merge_non_CpG){
|
|
118 $merge_non_CpG = 0;
|
|
119 }
|
|
120
|
|
121 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla);
|
|
122 }
|
|
123
|
|
124
|
|
125 sub process_Bismark_results_file{
|
|
126
|
|
127 if ($single){
|
|
128 if ($vanilla){
|
|
129 warn "Bismark Single-end vanilla format specified\n";
|
|
130 }
|
|
131 else{
|
|
132 warn "Bismark Single-end SAM format specified (default)\n"; # default
|
|
133 }
|
|
134 }
|
|
135 elsif ($paired){
|
|
136 if ($vanilla){
|
|
137 warn "Bismark Paired-end vanilla format specified\n";
|
|
138 }
|
|
139 else{
|
|
140 warn "Bismark Paired-end SAM format specified (default)\n"; # default
|
|
141 }
|
|
142 }
|
|
143
|
|
144 if ($ignore){
|
|
145 warn "First $ignore bases will be disregarded when processing the methylation call string\n";
|
|
146 }
|
|
147
|
|
148 if ($full){
|
|
149 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
|
|
150 }
|
|
151 if ($merge_non_CpG){
|
|
152 warn "Merge CHG and CHH context to non-CpG context specified\n";
|
|
153 }
|
|
154 warn "\n";
|
|
155
|
|
156 sleep (3);
|
|
157
|
|
158 foreach my $filename (@filenames){
|
|
159 %fhs = ();
|
|
160 %counting =(
|
|
161 total_meCHG_count => 0,
|
|
162 total_meCHH_count => 0,
|
|
163 total_meCpG_count => 0,
|
|
164 total_unmethylated_CHG_count => 0,
|
|
165 total_unmethylated_CHH_count => 0,
|
|
166 total_unmethylated_CpG_count => 0,
|
|
167 sequences_count => 0,
|
|
168 );
|
|
169 warn "\nNow reading in Bismark result file $filename\n\n";
|
|
170
|
|
171 if ($filename =~ /\.gz$/){
|
|
172 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
|
|
173 }
|
|
174 else{
|
|
175 open (IN,$filename) or die "Can't open file $filename: $!\n";
|
|
176 }
|
|
177
|
|
178 ### Vanilla and SAM output need to read different numbers of header lines
|
|
179 if ($vanilla){
|
|
180 my $bismark_version = <IN>; ## discarding the Bismark version info
|
|
181 chomp $bismark_version;
|
|
182 $bismark_version =~ s/\r//; # replaces \r line feed
|
|
183 $bismark_version =~ s/Bismark version: //;
|
|
184 if ($bismark_version =~ /^\@/){
|
|
185 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
|
|
186 sleep (2);
|
|
187 }
|
|
188
|
|
189 unless ($version eq $bismark_version){
|
|
190 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
|
|
191 }
|
|
192 }
|
|
193 else{
|
|
194 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
|
|
195 # We are reading from it further down
|
|
196 }
|
|
197
|
|
198 my $output_filename = (split (/\//,$filename))[-1];
|
|
199
|
|
200 ### OPENING OUTPUT-FILEHANDLES
|
|
201 if ($report){
|
|
202 my $report_filename = $output_filename;
|
|
203 $report_filename =~ s/$/_splitting_report.txt/;
|
|
204 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
|
|
205 }
|
|
206
|
|
207 if ($report){
|
|
208 print REPORT "$output_filename\n\n";
|
|
209 print REPORT "Parameters used to extract methylation information:\n";
|
|
210 if ($paired){
|
|
211 if ($vanilla){
|
|
212 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
|
|
213 }
|
|
214 else{
|
|
215 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
|
|
216 }
|
|
217 }
|
|
218
|
|
219 if ($single){
|
|
220 if ($vanilla){
|
|
221 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
|
|
222 }
|
|
223 else{
|
|
224 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
|
|
225 }
|
|
226 }
|
|
227
|
|
228 if ($ignore){
|
|
229 print REPORT "Ignoring first $ignore bases\n";
|
|
230 }
|
|
231
|
|
232 if ($full){
|
|
233 print REPORT "Output specified: comprehensive\n";
|
|
234 }
|
|
235 else{
|
|
236 print REPORT "Output specified: strand-specific (default)\n";
|
|
237 }
|
|
238
|
|
239 if ($no_overlap){
|
|
240 print REPORT "No overlapping methylation calls specified\n";
|
|
241 }
|
|
242 if ($genomic_fasta){
|
|
243 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
|
|
244 }
|
|
245 if ($merge_non_CpG){
|
|
246 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
|
|
247 }
|
|
248
|
|
249 print REPORT "\n";
|
|
250 }
|
|
251
|
|
252 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
|
|
253
|
|
254 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
|
|
255 if ($full and $merge_non_CpG){
|
|
256 my $cpg_output = my $other_c_output = $output_filename;
|
|
257 ### C in CpG context
|
|
258 $cpg_output =~ s/^/CpG_context_/;
|
|
259 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
|
|
260 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
|
|
261 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
|
|
262 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
|
|
263
|
|
264 ### C in any other context than CpG
|
|
265 $other_c_output =~ s/^/Non_CpG_context_/;
|
|
266 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
|
|
267 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n";
|
|
268 print "Writing result file containing methylation information for C in any other context to $other_c_output\n";
|
|
269 print {$fhs{other_context}} "Bismark methylation extractor version $version\n";
|
|
270 }
|
|
271
|
|
272 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
|
|
273 elsif ($merge_non_CpG){
|
|
274
|
|
275 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
|
|
276
|
|
277 ### For cytosines in CpG context
|
|
278 $cpg_ot =~ s/^/CpG_OT_/;
|
|
279 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
|
|
280 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
|
|
281 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
|
|
282 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
283
|
|
284 $cpg_ctot =~ s/^/CpG_CTOT_/;
|
|
285 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
|
|
286 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
|
|
287 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
|
|
288 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
289
|
|
290 $cpg_ctob =~ s/^/CpG_CTOB_/;
|
|
291 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
|
|
292 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
|
|
293 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
|
|
294 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
295
|
|
296 $cpg_ob =~ s/^/CpG_OB_/;
|
|
297 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
|
|
298 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
|
|
299 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
|
|
300 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
301
|
|
302 ### For cytosines in Non-CpG (CC, CT or CA) context
|
|
303 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
|
|
304
|
|
305 $other_c_ot =~ s/^/Non_CpG_OT_/;
|
|
306 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
|
|
307 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n";
|
|
308 print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n";
|
|
309 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
310
|
|
311 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
|
|
312 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
|
|
313 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n";
|
|
314 print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n";
|
|
315 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
316
|
|
317 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
|
|
318 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
|
|
319 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n";
|
|
320 print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n";
|
|
321 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
322
|
|
323 $other_c_ob =~ s/^/Non_CpG_OB_/;
|
|
324 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
|
|
325 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n";
|
|
326 print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n";
|
|
327 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n";
|
|
328 }
|
|
329
|
|
330 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
|
|
331
|
|
332 ### if --comprehensive was specified we are only writing one file per context
|
|
333 elsif ($full){
|
|
334 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
|
|
335 ### C in CpG context
|
|
336 $cpg_output =~ s/^/CpG_context_/;
|
|
337 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
|
|
338 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
|
|
339 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
|
|
340 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
|
|
341
|
|
342 ### C in CHG context
|
|
343 $chg_output =~ s/^/CHG_context_/;
|
|
344 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
|
|
345 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n";
|
|
346 print "Writing result file containing methylation information for C in CHG context to $chg_output\n";
|
|
347 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n";
|
|
348
|
|
349 ### C in CHH context
|
|
350 $chh_output =~ s/^/CHH_context_/;
|
|
351 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
|
|
352 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n";
|
|
353 print "Writing result file containing methylation information for C in CHH context to $chh_output\n";
|
|
354 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; }
|
|
355
|
|
356 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
|
|
357 else{
|
|
358 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
|
|
359
|
|
360 ### For cytosines in CpG context
|
|
361 $cpg_ot =~ s/^/CpG_OT_/;
|
|
362 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
|
|
363 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
|
|
364 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
|
|
365 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
366
|
|
367 $cpg_ctot =~ s/^/CpG_CTOT_/;
|
|
368 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
|
|
369 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
|
|
370 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
|
|
371 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
372
|
|
373 $cpg_ctob =~ s/^/CpG_CTOB_/;
|
|
374 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
|
|
375 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
|
|
376 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
|
|
377 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
378
|
|
379 $cpg_ob =~ s/^/CpG_OB_/;
|
|
380 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
|
|
381 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
|
|
382 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
|
|
383 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
|
|
384
|
|
385 ### For cytosines in CHG context
|
|
386 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
|
|
387
|
|
388 $chg_ot =~ s/^/CHG_OT_/;
|
|
389 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
|
|
390 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n";
|
|
391 print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n";
|
|
392 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
393
|
|
394 $chg_ctot =~ s/^/CHG_CTOT_/;
|
|
395 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
|
|
396 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n";
|
|
397 print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n";
|
|
398 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
399
|
|
400 $chg_ctob =~ s/^/CHG_CTOB_/;
|
|
401 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
|
|
402 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n";
|
|
403 print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n";
|
|
404 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
405
|
|
406 $chg_ob =~ s/^/CHG_OB_/;
|
|
407 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
|
|
408 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n";
|
|
409 print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n";
|
|
410 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n";
|
|
411
|
|
412 ### For cytosines in CHH context
|
|
413 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
|
|
414
|
|
415 $chh_ot =~ s/^/CHH_OT_/;
|
|
416 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
|
|
417 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n";
|
|
418 print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n";
|
|
419 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
420
|
|
421 $chh_ctot =~ s/^/CHH_CTOT_/;
|
|
422 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
|
|
423 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n";
|
|
424 print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n";
|
|
425 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
426
|
|
427 $chh_ctob =~ s/^/CHH_CTOB_/;
|
|
428 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
|
|
429 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n";
|
|
430 print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n";
|
|
431 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
432
|
|
433 $chh_ob =~ s/^/CHH_OB_/;
|
|
434 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
|
|
435 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n";
|
|
436 print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n";
|
|
437 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n";
|
|
438 }
|
|
439
|
|
440 my $methylation_call_strings_processed = 0;
|
|
441 my $line_count = 0;
|
|
442
|
|
443 ### proceeding differently now for single-end or paired-end Bismark files
|
|
444
|
|
445 ### PROCESSING SINGLE-END RESULT FILES
|
|
446 if ($single){
|
|
447
|
|
448 ### also proceeding differently now for SAM format or vanilla Bismark format files
|
|
449 if ($vanilla){ # old vanilla Bismark output format
|
|
450 while (<IN>){
|
|
451 ++$line_count;
|
|
452 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
453
|
|
454 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
|
|
455 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
|
|
456
|
|
457 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
|
|
458 ### last position
|
|
459 chomp $genome_conversion;
|
|
460
|
|
461 my $index;
|
|
462 if ($meth_call){
|
|
463
|
|
464 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand
|
|
465 $index = 0;
|
|
466 }
|
|
467 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand
|
|
468 $index = 1;
|
|
469 }
|
|
470 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand
|
|
471 $index = 3;
|
|
472 }
|
|
473 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand
|
|
474 $index = 2;
|
|
475 }
|
|
476 else {
|
|
477 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
|
|
478 }
|
|
479
|
|
480 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
|
|
481 if ($ignore){
|
|
482 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
|
|
483
|
|
484 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
|
|
485 if ($strand eq '+'){
|
|
486 $start += $ignore;
|
|
487 }
|
|
488 elsif ($strand eq '-'){
|
|
489 $start += length($meth_call)-1; ## $meth_call is already shortened!
|
|
490 }
|
|
491 else {
|
|
492 die "Alignment did not have proper strand information: $strand\n";
|
|
493 }
|
|
494 }
|
|
495 ### printing out the methylation state of every C in the read
|
|
496 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
|
|
497
|
|
498 ++$methylation_call_strings_processed; # 1 per single-end result
|
|
499 }
|
|
500 }
|
|
501 }
|
|
502 else{ # processing single-end SAM format (default)
|
|
503 while (<IN>){
|
|
504 ### SAM format can either start with header lines (starting with @) or start with alignments directly
|
|
505 if (/^\@/){ # skipping header lines (starting with @)
|
|
506 warn "skipping SAM header line:\t$_";
|
|
507 next;
|
|
508 }
|
|
509
|
|
510 ++$line_count;
|
|
511 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
512
|
|
513 # example read in SAM format
|
|
514 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
|
|
515 ###
|
|
516
|
|
517 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
|
|
518 # < 0.7.6 $meth_call =~ s/^XM:Z://;
|
|
519 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
|
|
520 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
|
|
521
|
|
522 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
|
|
523
|
|
524 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
525 my $meth_call; ### Thanks to Zachary Zeno for this solution
|
|
526 my $read_conversion;
|
|
527 my $genome_conversion;
|
|
528
|
|
529 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
|
|
530 my $tag = $1;
|
|
531 my $value = $2;
|
|
532
|
|
533 if ($tag eq "XM"){
|
|
534 $meth_call = $value;
|
|
535 $meth_call =~ s/\r//;
|
|
536 }
|
|
537 elsif ($tag eq "XR") {
|
|
538 $read_conversion = $value;
|
|
539 $read_conversion =~ s/\r//;
|
|
540 }
|
|
541 elsif ($tag eq "XG") {
|
|
542 $genome_conversion = $value;
|
|
543 $genome_conversion =~ s/\r//;
|
|
544 }
|
|
545 }
|
|
546
|
|
547 my $strand;
|
|
548 chomp $genome_conversion;
|
|
549 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
|
|
550
|
|
551 my $index;
|
|
552 if ($meth_call){
|
|
553 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand
|
|
554 $index = 0;
|
|
555 $strand = '+';
|
|
556 }
|
|
557 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand
|
|
558 $index = 1;
|
|
559 $strand = '-';
|
|
560 }
|
|
561 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand
|
|
562 $index = 2;
|
|
563 $strand = '+';
|
|
564 }
|
|
565 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand
|
|
566 $index = 3;
|
|
567 $strand = '-';
|
|
568 }
|
|
569 else {
|
|
570 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
|
|
571 }
|
|
572
|
|
573 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
|
|
574 if ($strand eq '-'){
|
|
575 $meth_call = reverse $meth_call;
|
|
576 }
|
|
577
|
|
578 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
|
|
579 if ($ignore){
|
|
580 # print "\n\n$meth_call\n";
|
|
581 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
|
|
582 # print "$meth_call\n";
|
|
583 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
|
|
584
|
|
585 my @len = split (/\D+/,$cigar); # storing the length per operation
|
|
586 my @ops = split (/\d+/,$cigar); # storing the operation
|
|
587 shift @ops; # remove the empty first element
|
|
588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
589
|
|
590 my @comp_cigar; # building an array with all CIGAR operations
|
|
591 foreach my $index (0..$#len){
|
|
592 foreach (1..$len[$index]){
|
|
593 # print "$ops[$index]";
|
|
594 push @comp_cigar, $ops[$index];
|
|
595 }
|
|
596 }
|
|
597 # print "original CIGAR: $cigar\n";
|
|
598 # print "original CIGAR: @comp_cigar\n";
|
|
599
|
|
600 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
|
|
601 if ($strand eq '+'){
|
|
602
|
|
603 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
|
|
604 my $I_count = 0;
|
|
605
|
|
606 for (1..$ignore){
|
|
607 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
|
|
608 # print "$_ deleted $op\n";
|
|
609
|
|
610 while ($op eq 'D'){ # repeating this for deletions (D)
|
|
611 $D_count++;
|
|
612 $op = shift @comp_cigar;
|
|
613 # print "$_ deleted $op\n";
|
|
614 }
|
|
615 if ($op eq 'I'){ # adjusting the genomic position for insertions (I)
|
|
616 $I_count++;
|
|
617 }
|
|
618 }
|
|
619 $start += $ignore + $D_count - $I_count;
|
|
620 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
|
|
621 }
|
|
622 elsif ($strand eq '-'){
|
|
623
|
|
624 for (1..$ignore){
|
|
625 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
626 while ($op eq 'D'){ # repeating this for deletions (D)
|
|
627 $op = pop @comp_cigar;
|
|
628 }
|
|
629 }
|
|
630
|
|
631 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
|
|
632 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
|
|
633 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
|
|
634 foreach (@comp_cigar){
|
|
635 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
|
|
636 }
|
|
637 $start += $MD_count - 1;
|
|
638 }
|
|
639
|
|
640 ### reconstituting shortened CIGAR string
|
|
641 my $new_cigar;
|
|
642 my $count = 0;
|
|
643 my $last_op;
|
|
644 # print "ignore adjusted: @comp_cigar\n";
|
|
645 foreach my $op (@comp_cigar){
|
|
646 unless (defined $last_op){
|
|
647 $last_op = $op;
|
|
648 ++$count;
|
|
649 next;
|
|
650 }
|
|
651 if ($last_op eq $op){
|
|
652 ++$count;
|
|
653 }
|
|
654 else{
|
|
655 $new_cigar .= "$count$last_op";
|
|
656 $last_op = $op;
|
|
657 $count = 1;
|
|
658 }
|
|
659 }
|
|
660 $new_cigar .= "$count$last_op"; # appending the last operation and count
|
|
661 $cigar = $new_cigar;
|
|
662 # print "ignore adjusted scalar: $cigar\n";
|
|
663 }
|
|
664 }
|
|
665 ### printing out the methylation state of every C in the read
|
|
666 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
|
|
667
|
|
668 ++$methylation_call_strings_processed; # 1 per single-end result
|
|
669 }
|
|
670 }
|
|
671 }
|
|
672
|
|
673 ### PROCESSING PAIRED-END RESULT FILES
|
|
674 elsif ($paired){
|
|
675
|
|
676 ### proceeding differently now for SAM format or vanilla Bismark format files
|
|
677 if ($vanilla){ # old vanilla Bismark paired-end output format
|
|
678 while (<IN>){
|
|
679 ++$line_count;
|
|
680 warn "processed line: $line_count\n" if ($line_count%500000==0);
|
|
681
|
|
682 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
|
|
683 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
|
|
684
|
|
685 my $index;
|
|
686 chomp $genome_conversion;
|
|
687
|
|
688 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){
|
|
689 $index = 0; ## this is OT
|
|
690 }
|
|
691 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){
|
|
692 $index = 2; ## this is CTOB!!!
|
|
693 }
|
|
694 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){
|
|
695 $index = 1; ## this is CTOT!!!
|
|
696 }
|
|
697 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){
|
|
698 $index = 3; ## this is OB
|
|
699 }
|
|
700 else {
|
|
701 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
|
|
702 }
|
|
703
|
|
704 if ($meth_call_1 and $meth_call_2){
|
|
705 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
|
|
706 if ($ignore){
|
|
707 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
|
|
708 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
|
|
709
|
|
710 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
|
|
711 $start_read_1 += $ignore;
|
|
712 $end_read_2 -= $ignore;
|
|
713 }
|
|
714 my $end_read_1;
|
|
715 my $start_read_2;
|
|
716
|
|
717 if ($strand eq '+'){
|
|
718
|
|
719 $end_read_1 = $start_read_1+length($meth_call_1)-1;
|
|
720 $start_read_2 = $end_read_2-length($meth_call_2)+1;
|
|
721
|
|
722 ## we first pass the first read which is in + orientation on the forward strand
|
|
723 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0);
|
|
724
|
|
725 # we next pass the second read which is in - orientation on the reverse strand
|
|
726 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
|
|
727 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1);
|
|
728 }
|
|
729
|
|
730 else{
|
|
731
|
|
732 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
|
|
733 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
|
|
734
|
|
735 ## we first pass the first read which is in - orientation on the reverse strand
|
|
736 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0);
|
|
737
|
|
738 # we next pass the second read which is in + orientation on the forward strand
|
|
739 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
|
|
740 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2);
|
|
741 }
|
|
742
|
|
743 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
|
|
744 }
|
|
745 }
|
|
746 }
|
|
747 else{ # Bismark paired-end SAM output format (default)
|
|
748 while (<IN>){
|
|
749 ### SAM format can either start with header lines (starting with @) or start with alignments directly
|
|
750 if (/^\@/){ # skipping header lines (starting with @)
|
|
751 warn "skipping SAM header line:\t$_";
|
|
752 next;
|
|
753 }
|
|
754
|
|
755 ++$line_count;
|
|
756 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
757
|
|
758 # example paired-end reads in SAM format (2 consecutive lines)
|
|
759 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
|
|
760 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
|
|
761
|
|
762 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
|
|
763
|
|
764 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
765 my $meth_call_1;
|
|
766 my $first_read_conversion;
|
|
767 my $genome_conversion;
|
|
768
|
|
769 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
|
|
770 my $tag = $1;
|
|
771 my $value = $2;
|
|
772
|
|
773 if ($tag eq "XM"){
|
|
774 $meth_call_1 = $value;
|
|
775 $meth_call_1 =~ s/\r//;
|
|
776 }
|
|
777 elsif ($tag eq "XR") {
|
|
778 $first_read_conversion = $value;
|
|
779 $first_read_conversion =~ s/\r//;
|
|
780 }
|
|
781 elsif ($tag eq "XG") {
|
|
782 $genome_conversion = $value;
|
|
783 $genome_conversion =~ s/\r//;
|
|
784 }
|
|
785 }
|
|
786
|
|
787 $_ = <IN>; # reading in the paired read
|
|
788
|
|
789 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
|
|
790 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
|
|
791 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
|
|
792 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
|
|
793 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
|
|
794
|
|
795 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
796
|
|
797 my $meth_call_2;
|
|
798 my $second_read_conversion;
|
|
799
|
|
800 while ( /(XM|XR):Z:([^\t]+)/g ) {
|
|
801 my $tag = $1;
|
|
802 my $value = $2;
|
|
803
|
|
804 if ($tag eq "XM"){
|
|
805 $meth_call_2 = $value;
|
|
806 $meth_call_2 =~ s/\r//;
|
|
807 }
|
|
808 elsif ($tag eq "XR") {
|
|
809 $second_read_conversion = $value;
|
|
810 $second_read_conversion = s/\r//;
|
|
811 }
|
|
812 }
|
|
813
|
|
814 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
|
|
815 chomp $genome_conversion; # in case it captured a new line character
|
|
816
|
|
817 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
|
|
818
|
|
819 my $index;
|
|
820 my $strand;
|
|
821
|
|
822 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){
|
|
823 $index = 0; ## this is OT
|
|
824 $strand = '+';
|
|
825 }
|
|
826 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){
|
|
827 $index = 1; ## this is CTOT
|
|
828 $strand = '-';
|
|
829 }
|
|
830 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){
|
|
831 $index = 2; ## this is CTOB
|
|
832 $strand = '+';
|
|
833 }
|
|
834 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){
|
|
835 $index = 3; ## this is OB
|
|
836 $strand = '-';
|
|
837 }
|
|
838 else {
|
|
839 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
|
|
840 }
|
|
841
|
|
842 ### reversing the methylation call of the read that was reverse-complemented
|
|
843 if ($strand eq '+'){
|
|
844 $meth_call_2 = reverse $meth_call_2;
|
|
845 }
|
|
846 else{
|
|
847 $meth_call_1 = reverse $meth_call_1;
|
|
848 }
|
|
849
|
|
850 if ($meth_call_1 and $meth_call_2){
|
|
851
|
|
852 my $end_read_1;
|
|
853
|
|
854 ### READ 1
|
|
855 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
|
|
856 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
|
|
857 shift @ops_1; # remove the empty first element
|
|
858 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
|
|
859
|
|
860 my @comp_cigar_1; # building an array with all CIGAR operations
|
|
861 foreach my $index (0..$#len_1){
|
|
862 foreach (1..$len_1[$index]){
|
|
863 # print "$ops_1[$index]";
|
|
864 push @comp_cigar_1, $ops_1[$index];
|
|
865 }
|
|
866 }
|
|
867 # print "original CIGAR read 1: $cigar_1\n";
|
|
868 # print "original CIGAR read 1: @comp_cigar_1\n";
|
|
869
|
|
870 ### READ 2
|
|
871 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
|
|
872 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
|
|
873 shift @ops_2; # remove the empty first element
|
|
874 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
|
|
875 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
|
|
876 foreach my $index (0..$#len_2){
|
|
877 foreach (1..$len_2[$index]){
|
|
878 # print "$ops_2[$index]";
|
|
879 push @comp_cigar_2, $ops_2[$index];
|
|
880 }
|
|
881 }
|
|
882 # print "original CIGAR read 2: $cigar_2\n";
|
|
883 # print "original CIGAR read 2: @comp_cigar_2\n";
|
|
884
|
|
885 if ($ignore){
|
|
886 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
|
|
887 ### the methylation calls have already been reversed where necessary
|
|
888 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
|
|
889 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
|
|
890
|
|
891 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
|
|
892
|
|
893 if ($strand eq '+'){
|
|
894
|
|
895 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
|
|
896 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
|
|
897 my $I_count_1 = 0;
|
|
898
|
|
899 for (1..$ignore){
|
|
900 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
|
|
901 # print "$_ deleted $op\n";
|
|
902
|
|
903 while ($op eq 'D'){ # repeating this for deletions (D)
|
|
904 $D_count_1++;
|
|
905 $op = shift @comp_cigar_1;
|
|
906 # print "$_ deleted $op\n";
|
|
907 }
|
|
908 if ($op eq 'I'){ # adjusting the genomic position for insertions (I)
|
|
909 $I_count_1++;
|
|
910 }
|
|
911 }
|
|
912
|
|
913 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
|
|
914 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
|
|
915
|
|
916 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
|
|
917
|
|
918 for (1..$ignore){
|
|
919 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
920 while ($op eq 'D'){ # repeating this for deletions (D)
|
|
921 $op = pop @comp_cigar_2;
|
|
922 }
|
|
923 }
|
|
924 # the start position of reads mapping to the reverse strand is being adjusted further below
|
|
925 }
|
|
926
|
|
927 elsif ($strand eq '-'){
|
|
928
|
|
929 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
|
|
930 for (1..$ignore){
|
|
931 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
932 while ($op eq 'D'){ # repeating this for deletions (D)
|
|
933 $op = pop @comp_cigar_1;
|
|
934 }
|
|
935 }
|
|
936 # the start position of reads mapping to the reverse strand is being adjusted further below
|
|
937
|
|
938 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
|
|
939 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
|
|
940 my $I_count_2 = 0;
|
|
941
|
|
942 for (1..$ignore){
|
|
943 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
|
|
944 # print "$_ deleted $op\n";
|
|
945
|
|
946 while ($op eq 'D'){ # repeating this for deletions (D)
|
|
947 $D_count_2++;
|
|
948 $op = shift @comp_cigar_2;
|
|
949 # print "$_ deleted $op\n";
|
|
950 }
|
|
951 if ($op eq 'I'){ # adjusting the genomic position for insertions (I)
|
|
952 $I_count_2++;
|
|
953 }
|
|
954 }
|
|
955
|
|
956 $start_read_2 += $ignore + $D_count_2 - $I_count_2;
|
|
957 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
|
|
958
|
|
959 }
|
|
960
|
|
961 ### reconstituting shortened CIGAR string 1
|
|
962 my $new_cigar_1;
|
|
963 my $count_1 = 0;
|
|
964 my $last_op_1;
|
|
965 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
|
|
966 foreach my $op (@comp_cigar_1){
|
|
967 unless (defined $last_op_1){
|
|
968 $last_op_1 = $op;
|
|
969 ++$count_1;
|
|
970 next;
|
|
971 }
|
|
972 if ($last_op_1 eq $op){
|
|
973 ++$count_1;
|
|
974 }
|
|
975 else{
|
|
976 $new_cigar_1 .= "$count_1$last_op_1";
|
|
977 $last_op_1 = $op;
|
|
978 $count_1 = 1;
|
|
979 }
|
|
980 }
|
|
981 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
|
|
982 $cigar_1 = $new_cigar_1;
|
|
983 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
|
|
984
|
|
985 ### reconstituting shortened CIGAR string 2
|
|
986 my $new_cigar_2;
|
|
987 my $count_2 = 0;
|
|
988 my $last_op_2;
|
|
989 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
|
|
990 foreach my $op (@comp_cigar_2){
|
|
991 unless (defined $last_op_2){
|
|
992 $last_op_2 = $op;
|
|
993 ++$count_2;
|
|
994 next;
|
|
995 }
|
|
996 if ($last_op_2 eq $op){
|
|
997 ++$count_2;
|
|
998 }
|
|
999 else{
|
|
1000 $new_cigar_2 .= "$count_2$last_op_2";
|
|
1001 $last_op_2 = $op;
|
|
1002 $count_2 = 1;
|
|
1003 }
|
|
1004 }
|
|
1005 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
|
|
1006 $cigar_2 = $new_cigar_2;
|
|
1007 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n";
|
|
1008
|
|
1009 }
|
|
1010
|
|
1011 if ($strand eq '+'){
|
|
1012 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
|
|
1013 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
1014 # print "reverse: @comp_cigar_2\n";
|
|
1015
|
|
1016 my $MD_count_1 = 0;
|
|
1017 foreach (@comp_cigar_1){
|
|
1018 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
1019 }
|
|
1020
|
|
1021 my $MD_count_2 = 0;
|
|
1022 foreach (@comp_cigar_2){
|
|
1023 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
1024 }
|
|
1025
|
|
1026 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
|
|
1027 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
|
|
1028 }
|
|
1029 else{
|
|
1030 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
|
|
1031
|
|
1032 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
1033 # print "reverse: @comp_cigar_1\n";
|
|
1034
|
|
1035 my $MD_count_1 = 0;
|
|
1036 foreach (@comp_cigar_1){
|
|
1037 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
1038 }
|
|
1039
|
|
1040 $end_read_1 = $start_read_1;
|
|
1041 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
|
|
1042
|
|
1043 }
|
|
1044
|
|
1045 if ($strand eq '+'){
|
|
1046 ## we first pass the first read which is in + orientation on the forward strand
|
|
1047 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1);
|
|
1048
|
|
1049 # we next pass the second read which is in - orientation on the reverse strand
|
|
1050 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
|
|
1051 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2);
|
|
1052 }
|
|
1053
|
|
1054 else{
|
|
1055 ## we first pass the first read which is in - orientation on the reverse strand
|
|
1056 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1);
|
|
1057
|
|
1058 # we next pass the second read which is in + orientation on the forward strand
|
|
1059 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
|
|
1060 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2);
|
|
1061 }
|
|
1062
|
|
1063 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
|
|
1064 }
|
|
1065 }
|
|
1066 }
|
|
1067 }
|
|
1068 else{
|
|
1069 die "Single-end or paired-end reads not specified properly\n";
|
|
1070 }
|
|
1071
|
|
1072 print "\n\nProcessed $line_count lines from $filename in total\n";
|
|
1073 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
|
|
1074 if ($report){
|
|
1075 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
|
|
1076 }
|
|
1077 print_splitting_report ();
|
|
1078 }
|
|
1079 }
|
|
1080
|
|
1081
|
|
1082 sub print_splitting_report{
|
|
1083
|
|
1084 ### Calculating methylation percentages if applicable
|
|
1085
|
|
1086 my $percent_meCpG;
|
|
1087 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
|
|
1088 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
|
|
1089 }
|
|
1090
|
|
1091 my $percent_meCHG;
|
|
1092 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
|
|
1093 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
|
|
1094 }
|
|
1095
|
|
1096 my $percent_meCHH;
|
|
1097 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
|
|
1098 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
|
|
1099 }
|
|
1100
|
|
1101 my $percent_non_CpG_methylation;
|
|
1102 if ($merge_non_CpG){
|
|
1103 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
|
|
1104 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
|
|
1105 }
|
|
1106 }
|
|
1107
|
|
1108 if ($report){
|
|
1109 ### detailed information about Cs analysed
|
|
1110 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
|
|
1111
|
|
1112 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
|
|
1113 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
|
|
1114
|
|
1115 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
|
|
1116 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
|
|
1117 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
|
|
1118
|
|
1119 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
|
|
1120 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
|
|
1121 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
|
|
1122
|
|
1123 ### calculating methylated CpG percentage if applicable
|
|
1124 if ($percent_meCpG){
|
|
1125 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
|
|
1126 }
|
|
1127 else{
|
|
1128 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
|
|
1129 }
|
|
1130
|
|
1131 ### 2-Context Output
|
|
1132 if ($merge_non_CpG){
|
|
1133 if ($percent_non_CpG_methylation){
|
|
1134 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
|
|
1135 }
|
|
1136 else{
|
|
1137 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
|
|
1138 }
|
|
1139 }
|
|
1140
|
|
1141 ### 3 Context Output
|
|
1142 else{
|
|
1143 ### calculating methylated CHG percentage if applicable
|
|
1144 if ($percent_meCHG){
|
|
1145 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
|
|
1146 }
|
|
1147 else{
|
|
1148 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
|
|
1149 }
|
|
1150
|
|
1151 ### calculating methylated CHH percentage if applicable
|
|
1152 if ($percent_meCHH){
|
|
1153 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
|
|
1154 }
|
|
1155 else{
|
|
1156 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
|
|
1157 }
|
|
1158 }
|
|
1159 }
|
|
1160
|
|
1161 ### detailed information about Cs analysed for on-screen report
|
|
1162 print "Final Cytosine Methylation Report\n",'='x33,"\n";
|
|
1163
|
|
1164 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
|
|
1165 print "Total number of C's analysed:\t$total_number_of_C\n\n";
|
|
1166
|
|
1167 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
|
|
1168 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
|
|
1169 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
|
|
1170
|
|
1171 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
|
|
1172 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
|
|
1173 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
|
|
1174
|
|
1175 ### printing methylated CpG percentage if applicable
|
|
1176 if ($percent_meCpG){
|
|
1177 print "C methylated in CpG context:\t${percent_meCpG}%\n";
|
|
1178 }
|
|
1179 else{
|
|
1180 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
|
|
1181 }
|
|
1182
|
|
1183 ### 2-Context Output
|
|
1184 if ($merge_non_CpG){
|
|
1185 if ($percent_non_CpG_methylation){
|
|
1186 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
|
|
1187 }
|
|
1188 else{
|
|
1189 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
|
|
1190 }
|
|
1191 }
|
|
1192
|
|
1193 ### 3-Context Output
|
|
1194 else{
|
|
1195 ### printing methylated CHG percentage if applicable
|
|
1196 if ($percent_meCHG){
|
|
1197 print "C methylated in CHG context:\t${percent_meCHG}%\n";
|
|
1198 }
|
|
1199 else{
|
|
1200 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
|
|
1201 }
|
|
1202
|
|
1203 ### printing methylated CHH percentage if applicable
|
|
1204 if ($percent_meCHH){
|
|
1205 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
|
|
1206 }
|
|
1207 else{
|
|
1208 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
|
|
1209 }
|
|
1210 }
|
|
1211 }
|
|
1212
|
|
1213
|
|
1214
|
|
1215
|
|
1216
|
|
1217 sub print_individual_C_methylation_states_paired_end_files{
|
|
1218
|
|
1219 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_;
|
|
1220 my @methylation_calls = split(//,$meth_call);
|
|
1221
|
|
1222 #################################################################
|
|
1223 ### . for bases not involving cytosines ###
|
|
1224 ### X for methylated C in CHG context (was protected) ###
|
|
1225 ### x for not methylated C in CHG context (was converted) ###
|
|
1226 ### H for methylated C in CHH context (was protected) ###
|
|
1227 ### h for not methylated C in CHH context (was converted) ###
|
|
1228 ### Z for methylated C in CpG context (was protected) ###
|
|
1229 ### z for not methylated C in CpG context (was converted) ###
|
|
1230 #################################################################
|
|
1231
|
|
1232 my $methyl_CHG_count = 0;
|
|
1233 my $methyl_CHH_count = 0;
|
|
1234 my $methyl_CpG_count = 0;
|
|
1235 my $unmethylated_CHG_count = 0;
|
|
1236 my $unmethylated_CHH_count = 0;
|
|
1237 my $unmethylated_CpG_count = 0;
|
|
1238
|
|
1239 my @len;
|
|
1240 my @ops;
|
|
1241 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
|
|
1242 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
|
|
1243 my @comp_cigar;
|
|
1244
|
|
1245 if ($cigar){ # parsing CIGAR string
|
|
1246 @len = split (/\D+/,$cigar); # storing the length per operation
|
|
1247 @ops = split (/\d+/,$cigar); # storing the operation
|
|
1248 shift @ops; # remove the empty first element
|
|
1249 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
1250
|
|
1251 foreach my $index (0..$#len){
|
|
1252 foreach (1..$len[$index]){
|
|
1253 # print "$ops[$index]";
|
|
1254 push @comp_cigar, $ops[$index];
|
|
1255 }
|
|
1256 }
|
|
1257 # warn "\nDetected CIGAR string: $cigar\n";
|
|
1258 # warn "Length of methylation call: ",length $meth_call,"\n";
|
|
1259 # warn "number of operations: ",scalar @ops,"\n";
|
|
1260 # warn "number of length digits: ",scalar @len,"\n\n";
|
|
1261 # print @comp_cigar,"\n";
|
|
1262 # print "$meth_call\n\n";
|
|
1263 # sleep (1);
|
|
1264 }
|
|
1265
|
|
1266
|
|
1267 if ($strand eq '-') {
|
|
1268
|
|
1269 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
|
|
1270 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
1271 # print "reverse CIGAR string: @comp_cigar\n";
|
|
1272
|
|
1273 ### the start position of paired-end files has already been corrected, see above
|
|
1274 }
|
|
1275
|
|
1276 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
|
|
1277
|
|
1278 if ($merge_non_CpG) {
|
|
1279
|
|
1280 if ($no_overlap) {
|
|
1281
|
|
1282 ### single-file CpG and non-CpG context output
|
|
1283 if ($full) {
|
|
1284 if ($strand eq '+') {
|
|
1285 for my $index (0..$#methylation_calls) {
|
|
1286
|
|
1287 if ($cigar){ # only needed for SAM files
|
|
1288 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1289 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1290 $cigar_offset += $cigar_mod;
|
|
1291 $pos_offset += $pos_mod;
|
|
1292 }
|
|
1293
|
|
1294 ### Returning as soon as the methylation calls start overlapping
|
|
1295 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
1296 return;
|
|
1297 }
|
|
1298
|
|
1299 if ($methylation_calls[$index] eq 'X') {
|
|
1300 $counting{total_meCHG_count}++;
|
|
1301 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1302 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1303 $counting{total_unmethylated_CHG_count}++;
|
|
1304 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1305 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1306 $counting{total_meCpG_count}++;
|
|
1307 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1308 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1309 $counting{total_unmethylated_CpG_count}++;
|
|
1310 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1311 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1312 $counting{total_meCHH_count}++;
|
|
1313 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1314 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1315 $counting{total_unmethylated_CHH_count}++;
|
|
1316 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1317 }
|
|
1318 elsif ($methylation_calls[$index] eq '.'){}
|
|
1319 else{
|
|
1320 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1321 }
|
|
1322 }
|
|
1323 } elsif ($strand eq '-') {
|
|
1324 for my $index (0..$#methylation_calls) {
|
|
1325
|
|
1326 if ($cigar){ # only needed for SAM files
|
|
1327 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1328 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1329 $cigar_offset += $cigar_mod;
|
|
1330 $pos_offset += $pos_mod;
|
|
1331 }
|
|
1332
|
|
1333 ### Returning as soon as the methylation calls start overlapping
|
|
1334 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
1335 return;
|
|
1336 }
|
|
1337
|
|
1338 if ($methylation_calls[$index] eq 'X') {
|
|
1339 $counting{total_meCHG_count}++;
|
|
1340 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1341 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1342 $counting{total_unmethylated_CHG_count}++;
|
|
1343 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1344 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1345 $counting{total_meCpG_count}++;
|
|
1346 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1347 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1348 $counting{total_unmethylated_CpG_count}++;
|
|
1349 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1350 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1351 $counting{total_meCHH_count}++;
|
|
1352 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1353 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1354 $counting{total_unmethylated_CHH_count}++;
|
|
1355 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1356 }
|
|
1357 elsif ($methylation_calls[$index] eq '.') {}
|
|
1358 else{
|
|
1359 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1360 }
|
|
1361 }
|
|
1362 } else {
|
|
1363 die "The read orientation was neither + nor -: '$strand'\n";
|
|
1364 }
|
|
1365 }
|
|
1366
|
|
1367 ### strand-specific methylation output
|
|
1368 else {
|
|
1369 if ($strand eq '+') {
|
|
1370 for my $index (0..$#methylation_calls) {
|
|
1371
|
|
1372 if ($cigar){ # only needed for SAM files
|
|
1373 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1374 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1375 $cigar_offset += $cigar_mod;
|
|
1376 $pos_offset += $pos_mod;
|
|
1377 }
|
|
1378
|
|
1379 ### Returning as soon as the methylation calls start overlapping
|
|
1380 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
1381 return;
|
|
1382 }
|
|
1383
|
|
1384 if ($methylation_calls[$index] eq 'X') {
|
|
1385 $counting{total_meCHG_count}++;
|
|
1386 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1387 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1388 $counting{total_unmethylated_CHG_count}++;
|
|
1389 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1390 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1391 $counting{total_meCpG_count}++;
|
|
1392 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1393 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1394 $counting{total_unmethylated_CpG_count}++;
|
|
1395 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1396 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1397 $counting{total_meCHH_count}++;
|
|
1398 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1399 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1400 $counting{total_unmethylated_CHH_count}++;
|
|
1401 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1402 }
|
|
1403 elsif ($methylation_calls[$index] eq '.') {}
|
|
1404 else{
|
|
1405 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1406 }
|
|
1407 }
|
|
1408 } elsif ($strand eq '-') {
|
|
1409 for my $index (0..$#methylation_calls) {
|
|
1410
|
|
1411 if ($cigar){ # only needed for SAM files
|
|
1412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1413 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1414 $cigar_offset += $cigar_mod;
|
|
1415 $pos_offset += $pos_mod;
|
|
1416 }
|
|
1417
|
|
1418 ### Returning as soon as the methylation calls start overlapping
|
|
1419 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
1420 return;
|
|
1421 }
|
|
1422
|
|
1423 if ($methylation_calls[$index] eq 'X') {
|
|
1424 $counting{total_meCHG_count}++;
|
|
1425 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1426 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1427 $counting{total_unmethylated_CHG_count}++;
|
|
1428 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1429 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1430 $counting{total_meCpG_count}++;
|
|
1431 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1432 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1433 $counting{total_unmethylated_CpG_count}++;
|
|
1434 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1435 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1436 $counting{total_meCHH_count}++;
|
|
1437 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1438 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1439 $counting{total_unmethylated_CHH_count}++;
|
|
1440 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1441 }
|
|
1442 elsif ($methylation_calls[$index] eq '.') {}
|
|
1443 else{
|
|
1444 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1445 }
|
|
1446 }
|
|
1447 } else {
|
|
1448 die "The strand orientation was neither + nor -: '$strand'/n";
|
|
1449 }
|
|
1450 }
|
|
1451 }
|
|
1452
|
|
1453 ### this is the default paired-end procedure allowing overlaps and using every single C position
|
|
1454 ### Still within the 2-CONTEXT ONLY optional section
|
|
1455 else {
|
|
1456 ### single-file CpG and non-CpG context output
|
|
1457 if ($full) {
|
|
1458 if ($strand eq '+') {
|
|
1459 for my $index (0..$#methylation_calls) {
|
|
1460
|
|
1461 if ($cigar){ # only needed for SAM files
|
|
1462 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1463 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1464 $cigar_offset += $cigar_mod;
|
|
1465 $pos_offset += $pos_mod;
|
|
1466 }
|
|
1467
|
|
1468 if ($methylation_calls[$index] eq 'X') {
|
|
1469 $counting{total_meCHG_count}++;
|
|
1470 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1471 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1472 $counting{total_unmethylated_CHG_count}++;
|
|
1473 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1474 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1475 $counting{total_meCpG_count}++;
|
|
1476 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1477 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1478 $counting{total_unmethylated_CpG_count}++;
|
|
1479 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1480 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1481 $counting{total_meCHH_count}++;
|
|
1482 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1483 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1484 $counting{total_unmethylated_CHH_count}++;
|
|
1485 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1486 }
|
|
1487 elsif ($methylation_calls[$index] eq '.') {}
|
|
1488 else{
|
|
1489 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1490 }
|
|
1491 }
|
|
1492 } elsif ($strand eq '-') {
|
|
1493 for my $index (0..$#methylation_calls) {
|
|
1494
|
|
1495 if ($cigar){ # only needed for SAM files
|
|
1496 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1497 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1498 $cigar_offset += $cigar_mod;
|
|
1499 $pos_offset += $pos_mod;
|
|
1500 }
|
|
1501
|
|
1502 if ($methylation_calls[$index] eq 'X') {
|
|
1503 $counting{total_meCHG_count}++;
|
|
1504 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1505 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1506 $counting{total_unmethylated_CHG_count}++;
|
|
1507 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1508 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1509 $counting{total_meCpG_count}++;
|
|
1510 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1511 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1512 $counting{total_unmethylated_CpG_count}++;
|
|
1513 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1514 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1515 $counting{total_meCHH_count}++;
|
|
1516 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1517 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1518 $counting{total_unmethylated_CHH_count}++;
|
|
1519 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1520 }
|
|
1521 elsif ($methylation_calls[$index] eq '.') {}
|
|
1522 else{
|
|
1523 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1524 }
|
|
1525 }
|
|
1526 } else {
|
|
1527 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
1528 }
|
|
1529 }
|
|
1530
|
|
1531 ### strand-specific methylation output
|
|
1532 ### still within the 2-CONTEXT optional section
|
|
1533 else {
|
|
1534 if ($strand eq '+') {
|
|
1535 for my $index (0..$#methylation_calls) {
|
|
1536
|
|
1537 if ($cigar){ # only needed for SAM files
|
|
1538 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1539 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1540 $cigar_offset += $cigar_mod;
|
|
1541 $pos_offset += $pos_mod;
|
|
1542 }
|
|
1543
|
|
1544 if ($methylation_calls[$index] eq 'X') {
|
|
1545 $counting{total_meCHG_count}++;
|
|
1546 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1547 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1548 $counting{total_unmethylated_CHG_count}++;
|
|
1549 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1550 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1551 $counting{total_meCpG_count}++;
|
|
1552 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1553 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1554 $counting{total_unmethylated_CpG_count}++;
|
|
1555 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1556 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1557 $counting{total_meCHH_count}++;
|
|
1558 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1559 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1560 $counting{total_unmethylated_CHH_count}++;
|
|
1561 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1562 }
|
|
1563 elsif ($methylation_calls[$index] eq '.') {}
|
|
1564 else{
|
|
1565 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1566 }
|
|
1567 }
|
|
1568 } elsif ($strand eq '-') {
|
|
1569 for my $index (0..$#methylation_calls) {
|
|
1570
|
|
1571 if ($cigar){ # only needed for SAM files
|
|
1572 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1574 $cigar_offset += $cigar_mod;
|
|
1575 $pos_offset += $pos_mod;
|
|
1576 }
|
|
1577
|
|
1578 if ($methylation_calls[$index] eq 'X') {
|
|
1579 $counting{total_meCHG_count}++;
|
|
1580 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1581 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1582 $counting{total_unmethylated_CHG_count}++;
|
|
1583 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1584 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1585 $counting{total_meCpG_count}++;
|
|
1586 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1587 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1588 $counting{total_unmethylated_CpG_count}++;
|
|
1589 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1590 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1591 $counting{total_meCHH_count}++;
|
|
1592 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1593 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1594 $counting{total_unmethylated_CHH_count}++;
|
|
1595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1596 }
|
|
1597 elsif ($methylation_calls[$index] eq '.') {}
|
|
1598 else{
|
|
1599 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1600 }
|
|
1601 }
|
|
1602 } else {
|
|
1603 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
1604 }
|
|
1605 }
|
|
1606 }
|
|
1607 }
|
|
1608
|
|
1609 ############################################
|
|
1610 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
|
|
1611 ############################################
|
|
1612
|
|
1613 elsif ($no_overlap) {
|
|
1614 ### single-file CpG, CHG and CHH context output
|
|
1615 if ($full) {
|
|
1616 if ($strand eq '+') {
|
|
1617 for my $index (0..$#methylation_calls) {
|
|
1618
|
|
1619 if ($cigar){ # only needed for SAM files
|
|
1620 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1621 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1622 $cigar_offset += $cigar_mod;
|
|
1623 $pos_offset += $pos_mod;
|
|
1624 }
|
|
1625
|
|
1626 ### Returning as soon as the methylation calls start overlapping
|
|
1627 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
1628 return;
|
|
1629 }
|
|
1630
|
|
1631 if ($methylation_calls[$index] eq 'X') {
|
|
1632 $counting{total_meCHG_count}++;
|
|
1633 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1634 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1635 $counting{total_unmethylated_CHG_count}++;
|
|
1636 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1637 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1638 $counting{total_meCpG_count}++;
|
|
1639 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1640 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1641 $counting{total_unmethylated_CpG_count}++;
|
|
1642 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1643 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1644 $counting{total_meCHH_count}++;
|
|
1645 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1646 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1647 $counting{total_unmethylated_CHH_count}++;
|
|
1648 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1649 }
|
|
1650 elsif ($methylation_calls[$index] eq '.') {}
|
|
1651 else{
|
|
1652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1653 }
|
|
1654 }
|
|
1655 } elsif ($strand eq '-') {
|
|
1656 for my $index (0..$#methylation_calls) {
|
|
1657
|
|
1658 if ($cigar){ # only needed for SAM files
|
|
1659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1661 $cigar_offset += $cigar_mod;
|
|
1662 $pos_offset += $pos_mod;
|
|
1663 }
|
|
1664
|
|
1665 ### Returning as soon as the methylation calls start overlapping
|
|
1666 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
1667 return;
|
|
1668 }
|
|
1669
|
|
1670 if ($methylation_calls[$index] eq 'X') {
|
|
1671 $counting{total_meCHG_count}++;
|
|
1672 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1673 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1674 $counting{total_unmethylated_CHG_count}++;
|
|
1675 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1676 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1677 $counting{total_meCpG_count}++;
|
|
1678 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1679 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1680 $counting{total_unmethylated_CpG_count}++;
|
|
1681 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1682 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1683 $counting{total_meCHH_count}++;
|
|
1684 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1685 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1686 $counting{total_unmethylated_CHH_count}++;
|
|
1687 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1688 }
|
|
1689 elsif ($methylation_calls[$index] eq '.') {}
|
|
1690 else{
|
|
1691 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1692 }
|
|
1693 }
|
|
1694 } else {
|
|
1695 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
1696 }
|
|
1697 }
|
|
1698
|
|
1699 ### strand-specific methylation output
|
|
1700 else {
|
|
1701 if ($strand eq '+') {
|
|
1702 for my $index (0..$#methylation_calls) {
|
|
1703
|
|
1704 if ($cigar){ # only needed for SAM files
|
|
1705 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1706 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1707 $cigar_offset += $cigar_mod;
|
|
1708 $pos_offset += $pos_mod;
|
|
1709 }
|
|
1710
|
|
1711 ### Returning as soon as the methylation calls start overlapping
|
|
1712 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
1713 return;
|
|
1714 }
|
|
1715
|
|
1716 if ($methylation_calls[$index] eq 'X') {
|
|
1717 $counting{total_meCHG_count}++;
|
|
1718 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1719 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1720 $counting{total_unmethylated_CHG_count}++;
|
|
1721 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1722 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1723 $counting{total_meCpG_count}++;
|
|
1724 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1725 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1726 $counting{total_unmethylated_CpG_count}++;
|
|
1727 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1728 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1729 $counting{total_meCHH_count}++;
|
|
1730 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1731 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1732 $counting{total_unmethylated_CHH_count}++;
|
|
1733 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1734 }
|
|
1735 elsif ($methylation_calls[$index] eq '.') {}
|
|
1736 else{
|
|
1737 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1738 }
|
|
1739 }
|
|
1740 } elsif ($strand eq '-') {
|
|
1741 for my $index (0..$#methylation_calls) {
|
|
1742
|
|
1743 if ($cigar){ # only needed for SAM files
|
|
1744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1745 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1746 $cigar_offset += $cigar_mod;
|
|
1747 $pos_offset += $pos_mod;
|
|
1748 }
|
|
1749
|
|
1750 ### Returning as soon as the methylation calls start overlapping
|
|
1751 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
1752 return;
|
|
1753 }
|
|
1754
|
|
1755 if ($methylation_calls[$index] eq 'X') {
|
|
1756 $counting{total_meCHG_count}++;
|
|
1757 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1758 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1759 $counting{total_unmethylated_CHG_count}++;
|
|
1760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1761 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1762 $counting{total_meCpG_count}++;
|
|
1763 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1764 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1765 $counting{total_unmethylated_CpG_count}++;
|
|
1766 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1767 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1768 $counting{total_meCHH_count}++;
|
|
1769 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1770 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1771 $counting{total_unmethylated_CHH_count}++;
|
|
1772 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1773 }
|
|
1774 elsif ($methylation_calls[$index] eq '.') {}
|
|
1775 else{
|
|
1776 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1777 }
|
|
1778 }
|
|
1779 } else {
|
|
1780 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
1781 }
|
|
1782 }
|
|
1783 }
|
|
1784
|
|
1785 ### this is the default paired-end procedure allowing overlaps and using every single C position
|
|
1786 else {
|
|
1787 ### single-file CpG, CHG and CHH context output
|
|
1788 if ($full) {
|
|
1789 if ($strand eq '+') {
|
|
1790 for my $index (0..$#methylation_calls) {
|
|
1791
|
|
1792 if ($cigar){ # only needed for SAM files
|
|
1793 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1794 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1795 $cigar_offset += $cigar_mod;
|
|
1796 $pos_offset += $pos_mod;
|
|
1797 }
|
|
1798
|
|
1799 if ($methylation_calls[$index] eq 'X') {
|
|
1800 $counting{total_meCHG_count}++;
|
|
1801 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1802 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1803 $counting{total_unmethylated_CHG_count}++;
|
|
1804 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1805 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1806 $counting{total_meCpG_count}++;
|
|
1807 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1808 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1809 $counting{total_unmethylated_CpG_count}++;
|
|
1810 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1811 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1812 $counting{total_meCHH_count}++;
|
|
1813 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1814 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1815 $counting{total_unmethylated_CHH_count}++;
|
|
1816 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1817 }
|
|
1818 elsif ($methylation_calls[$index] eq '.') {}
|
|
1819 else{
|
|
1820 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1821 }
|
|
1822 }
|
|
1823 } elsif ($strand eq '-') {
|
|
1824 for my $index (0..$#methylation_calls) {
|
|
1825
|
|
1826 if ($cigar){ # only needed for SAM files
|
|
1827 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1828 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1829 $cigar_offset += $cigar_mod;
|
|
1830 $pos_offset += $pos_mod;
|
|
1831 }
|
|
1832
|
|
1833 if ($methylation_calls[$index] eq 'X') {
|
|
1834 $counting{total_meCHG_count}++;
|
|
1835 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1836 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1837 $counting{total_unmethylated_CHG_count}++;
|
|
1838 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1839 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1840 $counting{total_meCpG_count}++;
|
|
1841 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1842 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1843 $counting{total_unmethylated_CpG_count}++;
|
|
1844 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1845 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1846 $counting{total_meCHH_count}++;
|
|
1847 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1848 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1849 $counting{total_unmethylated_CHH_count}++;
|
|
1850 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1851 }
|
|
1852 elsif ($methylation_calls[$index] eq '.') {}
|
|
1853 else{
|
|
1854 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1855 }
|
|
1856 }
|
|
1857 } else {
|
|
1858 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
1859 }
|
|
1860 }
|
|
1861
|
|
1862 ### strand-specific methylation output
|
|
1863 else {
|
|
1864 if ($strand eq '+') {
|
|
1865 for my $index (0..$#methylation_calls) {
|
|
1866
|
|
1867 if ($cigar){ # only needed for SAM files
|
|
1868 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1869 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
1870 $cigar_offset += $cigar_mod;
|
|
1871 $pos_offset += $pos_mod;
|
|
1872 }
|
|
1873
|
|
1874 if ($methylation_calls[$index] eq 'X') {
|
|
1875 $counting{total_meCHG_count}++;
|
|
1876 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1877 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1878 $counting{total_unmethylated_CHG_count}++;
|
|
1879 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1880 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1881 $counting{total_meCpG_count}++;
|
|
1882 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1883 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1884 $counting{total_unmethylated_CpG_count}++;
|
|
1885 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1886 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1887 $counting{total_meCHH_count}++;
|
|
1888 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1889 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1890 $counting{total_unmethylated_CHH_count}++;
|
|
1891 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1892 }
|
|
1893 elsif ($methylation_calls[$index] eq '.') {}
|
|
1894 else{
|
|
1895 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1896 }
|
|
1897 }
|
|
1898 } elsif ($strand eq '-') {
|
|
1899 for my $index (0..$#methylation_calls) {
|
|
1900
|
|
1901 if ($cigar){ # only needed for SAM files
|
|
1902 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
1903 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
1904 $cigar_offset += $cigar_mod;
|
|
1905 $pos_offset += $pos_mod;
|
|
1906 }
|
|
1907
|
|
1908 if ($methylation_calls[$index] eq 'X') {
|
|
1909 $counting{total_meCHG_count}++;
|
|
1910 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1911 } elsif ($methylation_calls[$index] eq 'x') {
|
|
1912 $counting{total_unmethylated_CHG_count}++;
|
|
1913 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1914 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
1915 $counting{total_meCpG_count}++;
|
|
1916 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1917 } elsif ($methylation_calls[$index] eq 'z') {
|
|
1918 $counting{total_unmethylated_CpG_count}++;
|
|
1919 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1920 } elsif ($methylation_calls[$index] eq 'H') {
|
|
1921 $counting{total_meCHH_count}++;
|
|
1922 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1923 } elsif ($methylation_calls[$index] eq 'h') {
|
|
1924 $counting{total_unmethylated_CHH_count}++;
|
|
1925 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
1926 }
|
|
1927 elsif ($methylation_calls[$index] eq '.') {}
|
|
1928 else{
|
|
1929 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
1930 }
|
|
1931 }
|
|
1932 } else {
|
|
1933 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
1934 }
|
|
1935 }
|
|
1936 }
|
|
1937 }
|
|
1938
|
|
1939 sub check_cigar_string {
|
|
1940 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
|
|
1941 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
|
|
1942 my ($new_cigar_offset,$new_pos_offset) = (0,0);
|
|
1943
|
|
1944 if ($strand eq '+') {
|
|
1945 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
|
|
1946
|
|
1947 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
1948 # warn "position needs no adjustment\n";
|
|
1949 }
|
|
1950
|
|
1951 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
|
|
1952 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
|
|
1953 # warn "adjusted genomic position by -1 bp (insertion)\n";
|
|
1954 }
|
|
1955
|
|
1956 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
1957 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
1958 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
|
|
1959 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
|
|
1960
|
|
1961 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
|
|
1962 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
1963 # warn "position needs no adjustment\n";
|
|
1964 last;
|
|
1965 }
|
|
1966 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
|
|
1967 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
|
|
1968 # warn "adjusted genomic position by another -1 bp (insertion)\n";
|
|
1969 last;
|
|
1970 }
|
|
1971 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
1972 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
1973 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
|
|
1974 # warn "adjusted genomic position by another +1 bp (deletion)\n";
|
|
1975 }
|
|
1976 else{
|
|
1977 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
1978 }
|
|
1979 }
|
|
1980 }
|
|
1981 else{
|
|
1982 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
1983 }
|
|
1984 }
|
|
1985
|
|
1986 elsif ($strand eq '-') {
|
|
1987 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
|
|
1988
|
|
1989 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
1990 # warn "position needs no adjustment\n";
|
|
1991 }
|
|
1992
|
|
1993 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
|
|
1994 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
|
|
1995 # warn "adjusted genomic position by +1 bp (insertion)\n";
|
|
1996 }
|
|
1997
|
|
1998 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
1999 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
2000 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
|
|
2001 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
|
|
2002
|
|
2003 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
|
|
2004 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
|
|
2005 # warn "Found new 'M' operation; position needs no adjustment\n";
|
|
2006 last;
|
|
2007 }
|
|
2008 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
|
|
2009 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
|
|
2010 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
|
|
2011 last;
|
|
2012 }
|
|
2013 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
|
|
2014 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
|
|
2015 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
|
|
2016 # warn "adjusted genomic position by another -1 bp (deletion)\n";
|
|
2017 }
|
|
2018 else{
|
|
2019 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
2020 }
|
|
2021 }
|
|
2022 }
|
|
2023 else{
|
|
2024 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
|
|
2025 }
|
|
2026 }
|
|
2027 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
|
|
2028 return ($new_cigar_offset,$new_pos_offset);
|
|
2029 }
|
|
2030
|
|
2031 sub print_individual_C_methylation_states_single_end{
|
|
2032
|
|
2033 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
|
|
2034 my @methylation_calls = split(//,$meth_call);
|
|
2035
|
|
2036 #################################################################
|
|
2037 ### . for bases not involving cytosines ###
|
|
2038 ### X for methylated C in CHG context (was protected) ###
|
|
2039 ### x for not methylated C in CHG context (was converted) ###
|
|
2040 ### H for methylated C in CHH context (was protected) ###
|
|
2041 ### h for not methylated C in CHH context (was converted) ###
|
|
2042 ### Z for methylated C in CpG context (was protected) ###
|
|
2043 ### z for not methylated C in CpG context (was converted) ###
|
|
2044 #################################################################
|
|
2045
|
|
2046 my $methyl_CHG_count = 0;
|
|
2047 my $methyl_CHH_count = 0;
|
|
2048 my $methyl_CpG_count = 0;
|
|
2049 my $unmethylated_CHG_count = 0;
|
|
2050 my $unmethylated_CHH_count = 0;
|
|
2051 my $unmethylated_CpG_count = 0;
|
|
2052
|
|
2053 my @len;
|
|
2054 my @ops;
|
|
2055 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
|
|
2056 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
|
|
2057
|
|
2058 my @comp_cigar;
|
|
2059
|
|
2060 if ($cigar){ # parsing CIGAR string
|
|
2061 @len = split (/\D+/,$cigar); # storing the length per operation
|
|
2062 @ops = split (/\d+/,$cigar); # storing the operation
|
|
2063 shift @ops; # remove the empty first element
|
|
2064 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
2065
|
|
2066 foreach my $index (0..$#len){
|
|
2067 foreach (1..$len[$index]){
|
|
2068 # print "$ops[$index]";
|
|
2069 push @comp_cigar, $ops[$index];
|
|
2070 }
|
|
2071 }
|
|
2072 # warn "\nDetected CIGAR string: $cigar\n";
|
|
2073 # warn "Length of methylation call: ",length $meth_call,"\n";
|
|
2074 # warn "number of operations: ",scalar @ops,"\n";
|
|
2075 # warn "number of length digits: ",scalar @len,"\n\n";
|
|
2076 # print @comp_cigar,"\n";
|
|
2077 # print "$meth_call\n\n";
|
|
2078 # sleep (1);
|
|
2079 }
|
|
2080
|
|
2081 ### adjusting the start position for all reads mapping to the reverse strand
|
|
2082 if ($strand eq '-') {
|
|
2083
|
|
2084 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
2085 # print @comp_cigar,"\n";
|
|
2086
|
|
2087 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
|
|
2088
|
|
2089 if ($cigar){ ### SAM format
|
|
2090 my $MD_count = 0;
|
|
2091 foreach (@comp_cigar){
|
|
2092 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
2093 }
|
|
2094 $start += $MD_count - 1;
|
|
2095 }
|
|
2096 else{ ### vanilla format
|
|
2097 $start += length($meth_call)-1;
|
|
2098 }
|
|
2099 }
|
|
2100 }
|
|
2101
|
|
2102 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
|
|
2103
|
|
2104 ### single-file CpG and other-context output
|
|
2105 if ($full and $merge_non_CpG) {
|
|
2106 if ($strand eq '+') {
|
|
2107 for my $index (0..$#methylation_calls) {
|
|
2108
|
|
2109 if ($cigar){ # only needed for SAM files
|
|
2110 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2111 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2112 $cigar_offset += $cigar_mod;
|
|
2113 $pos_offset += $pos_mod;
|
|
2114 }
|
|
2115
|
|
2116 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2117 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2118 if ($methylation_calls[$index] eq 'X') {
|
|
2119 $counting{total_meCHG_count}++;
|
|
2120 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2121 }
|
|
2122 elsif ($methylation_calls[$index] eq 'x') {
|
|
2123 $counting{total_unmethylated_CHG_count}++;
|
|
2124 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2125 }
|
|
2126 elsif ($methylation_calls[$index] eq 'Z') {
|
|
2127 $counting{total_meCpG_count}++;
|
|
2128 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2129 }
|
|
2130 elsif ($methylation_calls[$index] eq 'z') {
|
|
2131 $counting{total_unmethylated_CpG_count}++;
|
|
2132 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2133 }
|
|
2134 elsif ($methylation_calls[$index] eq 'H') {
|
|
2135 $counting{total_meCHH_count}++;
|
|
2136 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2137 }
|
|
2138 elsif ($methylation_calls[$index] eq 'h') {
|
|
2139 $counting{total_unmethylated_CHH_count}++;
|
|
2140 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2141 }
|
|
2142 elsif ($methylation_calls[$index] eq '.') {
|
|
2143 }
|
|
2144 else{
|
|
2145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2146 }
|
|
2147 }
|
|
2148 }
|
|
2149 elsif ($strand eq '-') {
|
|
2150
|
|
2151 for my $index (0..$#methylation_calls) {
|
|
2152 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2153 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2154
|
|
2155 if ($cigar){ # only needed for SAM files
|
|
2156 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2157 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2158 $cigar_offset += $cigar_mod;
|
|
2159 $pos_offset += $pos_mod;
|
|
2160 }
|
|
2161
|
|
2162 if ($methylation_calls[$index] eq 'X') {
|
|
2163 $counting{total_meCHG_count}++;
|
|
2164 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2165 }
|
|
2166 elsif ($methylation_calls[$index] eq 'x') {
|
|
2167 $counting{total_unmethylated_CHG_count}++;
|
|
2168 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2169 }
|
|
2170 elsif ($methylation_calls[$index] eq 'Z') {
|
|
2171 $counting{total_meCpG_count}++;
|
|
2172 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2173 }
|
|
2174 elsif ($methylation_calls[$index] eq 'z') {
|
|
2175 $counting{total_unmethylated_CpG_count}++;
|
|
2176 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2177 }
|
|
2178 elsif ($methylation_calls[$index] eq 'H') {
|
|
2179 $counting{total_meCHH_count}++;
|
|
2180 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2181 }
|
|
2182 elsif ($methylation_calls[$index] eq 'h') {
|
|
2183 $counting{total_unmethylated_CHH_count}++;
|
|
2184 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2185 }
|
|
2186 elsif ($methylation_calls[$index] eq '.'){
|
|
2187 }
|
|
2188 else{
|
|
2189 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2190 }
|
|
2191 }
|
|
2192 }
|
|
2193 else {
|
|
2194 die "The strand information was neither + nor -: $strand\n";
|
|
2195 }
|
|
2196 }
|
|
2197
|
|
2198 ### strand-specific methylation output
|
|
2199 elsif ($merge_non_CpG) {
|
|
2200 if ($strand eq '+') {
|
|
2201 for my $index (0..$#methylation_calls) {
|
|
2202 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2203 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2204
|
|
2205 if ($cigar){ # only needed for SAM files
|
|
2206 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2207 $cigar_offset += $cigar_mod;
|
|
2208 $pos_offset += $pos_mod;
|
|
2209 }
|
|
2210
|
|
2211 if ($methylation_calls[$index] eq 'X') {
|
|
2212 $counting{total_meCHG_count}++;
|
|
2213 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2214 }
|
|
2215 elsif ($methylation_calls[$index] eq 'x') {
|
|
2216 $counting{total_unmethylated_CHG_count}++;
|
|
2217 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2218 }
|
|
2219 elsif ($methylation_calls[$index] eq 'Z') {
|
|
2220 $counting{total_meCpG_count}++;
|
|
2221 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2222 }
|
|
2223 elsif ($methylation_calls[$index] eq 'z') {
|
|
2224 $counting{total_unmethylated_CpG_count}++;
|
|
2225 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2226 }
|
|
2227 elsif ($methylation_calls[$index] eq 'H') {
|
|
2228 $counting{total_meCHH_count}++;
|
|
2229 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2230 }
|
|
2231 elsif ($methylation_calls[$index] eq 'h') {
|
|
2232 $counting{total_unmethylated_CHH_count}++;
|
|
2233 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2234 }
|
|
2235 elsif ($methylation_calls[$index] eq '.') {
|
|
2236 }
|
|
2237 else{
|
|
2238 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2239 }
|
|
2240 }
|
|
2241 }
|
|
2242 elsif ($strand eq '-') {
|
|
2243
|
|
2244 for my $index (0..$#methylation_calls) {
|
|
2245 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2246 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2247
|
|
2248 if ($cigar){ # only needed for SAM files
|
|
2249 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2250 $cigar_offset += $cigar_mod;
|
|
2251 $pos_offset += $pos_mod;
|
|
2252 }
|
|
2253
|
|
2254 if ($methylation_calls[$index] eq 'X') {
|
|
2255 $counting{total_meCHG_count}++;
|
|
2256 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2257 }
|
|
2258 elsif ($methylation_calls[$index] eq 'x') {
|
|
2259 $counting{total_unmethylated_CHG_count}++;
|
|
2260 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2261 }
|
|
2262 elsif ($methylation_calls[$index] eq 'Z') {
|
|
2263 $counting{total_meCpG_count}++;
|
|
2264 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2265 }
|
|
2266 elsif ($methylation_calls[$index] eq 'z') {
|
|
2267 $counting{total_unmethylated_CpG_count}++;
|
|
2268 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2269 }
|
|
2270 elsif ($methylation_calls[$index] eq 'H') {
|
|
2271 $counting{total_meCHH_count}++;
|
|
2272 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2273 }
|
|
2274 elsif ($methylation_calls[$index] eq 'h') {
|
|
2275 $counting{total_unmethylated_CHH_count}++;
|
|
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2277 }
|
|
2278 elsif ($methylation_calls[$index] eq '.') {
|
|
2279 }
|
|
2280 else{
|
|
2281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2282 }
|
|
2283 }
|
|
2284 }
|
|
2285 else {
|
|
2286 die "The strand information was neither + nor -: $strand\n";
|
|
2287 }
|
|
2288 }
|
|
2289
|
|
2290 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
|
|
2291
|
|
2292 elsif ($full) {
|
|
2293 if ($strand eq '+') {
|
|
2294 for my $index (0..$#methylation_calls) {
|
|
2295 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2296 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2297
|
|
2298 if ($cigar){ # only needed for SAM files
|
|
2299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2300 $cigar_offset += $cigar_mod;
|
|
2301 $pos_offset += $pos_mod;
|
|
2302 }
|
|
2303
|
|
2304 if ($methylation_calls[$index] eq 'X') {
|
|
2305 $counting{total_meCHG_count}++;
|
|
2306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2307 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2308 $counting{total_unmethylated_CHG_count}++;
|
|
2309 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2310 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2311 $counting{total_meCpG_count}++;
|
|
2312 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2313 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2314 $counting{total_unmethylated_CpG_count}++;
|
|
2315 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2316 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2317 $counting{total_meCHH_count}++;
|
|
2318 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2319 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2320 $counting{total_unmethylated_CHH_count}++;
|
|
2321 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2322 }
|
|
2323 elsif ($methylation_calls[$index] eq '.') {}
|
|
2324 else{
|
|
2325 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2326 }
|
|
2327 }
|
|
2328 }
|
|
2329 elsif ($strand eq '-') {
|
|
2330
|
|
2331 for my $index (0..$#methylation_calls) {
|
|
2332 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2333 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2334
|
|
2335 if ($cigar){ # only needed for SAM files
|
|
2336 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2337 $cigar_offset += $cigar_mod;
|
|
2338 $pos_offset += $pos_mod;
|
|
2339 }
|
|
2340
|
|
2341 if ($methylation_calls[$index] eq 'X') {
|
|
2342 $counting{total_meCHG_count}++;
|
|
2343 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2344 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2345 $counting{total_unmethylated_CHG_count}++;
|
|
2346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2347 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2348 $counting{total_meCpG_count}++;
|
|
2349 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2350 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2351 $counting{total_unmethylated_CpG_count}++;
|
|
2352 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2353 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2354 $counting{total_meCHH_count}++;
|
|
2355 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2356 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2357 $counting{total_unmethylated_CHH_count}++;
|
|
2358 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2359 }
|
|
2360 elsif ($methylation_calls[$index] eq '.') {}
|
|
2361 else{
|
|
2362 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2363 }
|
|
2364 }
|
|
2365 }
|
|
2366 else {
|
|
2367 die "The read had a strand orientation which was neither + nor -: $strand\n";
|
|
2368 }
|
|
2369 }
|
|
2370
|
|
2371 ### strand-specific methylation output
|
|
2372 else {
|
|
2373 if ($strand eq '+') {
|
|
2374 for my $index (0..$#methylation_calls) {
|
|
2375 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2376 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2377
|
|
2378 if ($cigar){ # only needed for SAM files
|
|
2379 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2380 $cigar_offset += $cigar_mod;
|
|
2381 $pos_offset += $pos_mod;
|
|
2382 }
|
|
2383
|
|
2384 if ($methylation_calls[$index] eq 'X') {
|
|
2385 $counting{total_meCHG_count}++;
|
|
2386 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2387 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2388 $counting{total_unmethylated_CHG_count}++;
|
|
2389 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2390 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2391 $counting{total_meCpG_count}++;
|
|
2392 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2393 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2394 $counting{total_unmethylated_CpG_count}++;
|
|
2395 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2396 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2397 $counting{total_meCHH_count}++;
|
|
2398 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2399 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2400 $counting{total_unmethylated_CHH_count}++;
|
|
2401 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2402 }
|
|
2403 elsif ($methylation_calls[$index] eq '.') {}
|
|
2404 else{
|
|
2405 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2406 }
|
|
2407 }
|
|
2408 }
|
|
2409 elsif ($strand eq '-') {
|
|
2410
|
|
2411 for my $index (0..$#methylation_calls) {
|
|
2412 ### methylated Cs (any context) will receive a forward (+) orientation
|
|
2413 ### not methylated Cs (any context) will receive a reverse (-) orientation
|
|
2414
|
|
2415 if ($cigar){ # only needed for SAM files
|
|
2416 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2417 $cigar_offset += $cigar_mod;
|
|
2418 $pos_offset += $pos_mod;
|
|
2419 }
|
|
2420
|
|
2421 if ($methylation_calls[$index] eq 'X') {
|
|
2422 $counting{total_meCHG_count}++;
|
|
2423 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2424 } elsif ($methylation_calls[$index] eq 'x') {
|
|
2425 $counting{total_unmethylated_CHG_count}++;
|
|
2426 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2427 } elsif ($methylation_calls[$index] eq 'Z') {
|
|
2428 $counting{total_meCpG_count}++;
|
|
2429 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2430 } elsif ($methylation_calls[$index] eq 'z') {
|
|
2431 $counting{total_unmethylated_CpG_count}++;
|
|
2432 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2433 } elsif ($methylation_calls[$index] eq 'H') {
|
|
2434 $counting{total_meCHH_count}++;
|
|
2435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2436 } elsif ($methylation_calls[$index] eq 'h') {
|
|
2437 $counting{total_unmethylated_CHH_count}++;
|
|
2438 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
|
|
2439 }
|
|
2440 elsif ($methylation_calls[$index] eq '.') {}
|
|
2441 else{
|
|
2442 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2443 }
|
|
2444 }
|
|
2445 }
|
|
2446 else {
|
|
2447 die "The strand information was neither + nor -: $strand\n";
|
|
2448 }
|
|
2449 }
|
|
2450 }
|
|
2451
|
|
2452 sub print_helpfile{
|
|
2453
|
|
2454 print << 'HOW_TO';
|
|
2455
|
|
2456
|
|
2457 DESCRIPTION
|
|
2458
|
|
2459 The following is a brief description of all options to control the Bismark
|
|
2460 methylation extractor. The script reads in a bisulfite read alignment results file
|
|
2461 produced by the Bismark bisulfite mapper and extracts the methylation information
|
|
2462 for individual cytosines. This information is found in the methylation call field
|
|
2463 which can contain the following characters:
|
|
2464
|
|
2465 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
2466 ~~~ X for methylated C in CHG context (was protected) ~~~
|
|
2467 ~~~ x for not methylated C CHG (was converted) ~~~
|
|
2468 ~~~ H for methylated C in CHH context (was protected) ~~~
|
|
2469 ~~~ h for not methylated C in CHH context (was converted) ~~~
|
|
2470 ~~~ Z for methylated C in CpG context (was protected) ~~~
|
|
2471 ~~~ z for not methylated C in CpG context (was converted) ~~~
|
|
2472 ~~~ . for any bases not involving cytosines ~~~
|
|
2473 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
2474
|
|
2475 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
|
|
2476 context (this distinction is actually already made in Bismark itself). As the methylation
|
|
2477 information for every C analysed can produce files which easily have tens or even hundreds of
|
|
2478 millions of lines, file sizes can become very large and more difficult to handle. The C
|
|
2479 methylation info additionally splits cytosine methylation calls up into one of the four possible
|
|
2480 strands a given bisulfite read aligned against:
|
|
2481
|
|
2482 OT original top strand
|
|
2483 CTOT complementary to original top strand
|
|
2484
|
|
2485 OB original bottom strand
|
|
2486 CTOB complementary to original bottom strand
|
|
2487
|
|
2488 Thus, by default twelve individual output files are being generated per input file (unless
|
|
2489 --comprehensive is specified, see below). The output files can be imported into a genome
|
|
2490 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
|
|
2491 unless the bisulfite reads were generated preserving directionality it doesn't make any
|
|
2492 sense to look at the data in a strand-specific manner). Strand-specific output files can
|
|
2493 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
|
|
2494 will be generated. For both the strand-specific and comprehensive outputs there is also
|
|
2495 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
|
|
2496
|
|
2497
|
|
2498 The output files are in the following format (tab delimited):
|
|
2499
|
|
2500 <sequence_id> <strand> <chromosome> <position> <methylation call>
|
|
2501
|
|
2502
|
|
2503 USAGE: methylation_extractor [options] <filenames>
|
|
2504
|
|
2505
|
|
2506 ARGUMENTS:
|
|
2507
|
|
2508 <filenames> A space-separated list of Bismark result files in SAM format from
|
|
2509 which methylation information is extracted for every cytosine in
|
|
2510 the reads. For alignment files in the older custom Bismark output
|
|
2511 see option '--vanilla'.
|
|
2512
|
|
2513 OPTIONS:
|
|
2514
|
|
2515 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
|
|
2516 read data. Specifying either --single-end or --paired-end is
|
|
2517 mandatory.
|
|
2518
|
|
2519 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
|
|
2520 read data. Specifying either --paired-end or --single-end is
|
|
2521 mandatory.
|
|
2522
|
|
2523 --vanilla The Bismark result input file(s) are in the old custom Bismark format
|
|
2524 (up to version 0.5.x) and not in SAM format which is the default as
|
|
2525 of Bismark version 0.6.x or higher. Default: OFF.
|
|
2526
|
|
2527 --no_overlap For paired-end reads it is theoretically possible that read_1 and
|
|
2528 read_2 overlap. This option avoids scoring overlapping methylation
|
|
2529 calls twice. Whilst this removes a bias towards more methylation calls
|
|
2530 towards the center of sequenced fragments it can de facto remove
|
|
2531 a good proportion of the data.
|
|
2532
|
|
2533 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the
|
|
2534 methylation call string. This can remove e.g. a restriction enzyme site
|
|
2535 at the start of each read.
|
|
2536
|
|
2537 --comprehensive Specifying this option will merge all four possible strand-specific
|
|
2538 methylation info into context-dependent output files. The default
|
|
2539 contexts are:
|
|
2540 - CpG context
|
|
2541 - CHG context
|
|
2542 - CHH context
|
|
2543
|
|
2544 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
|
|
2545 strand-specific output files (default) for Cs in
|
|
2546 - CpG context
|
|
2547 - non-CpG context
|
|
2548
|
|
2549 --report Prints out a short methylation summary as well as the paramaters used to run
|
|
2550 this script.
|
|
2551
|
|
2552 --version Displays version information.
|
|
2553
|
|
2554 -h/--help Displays this help file and exits.
|
|
2555
|
|
2556
|
|
2557 OUTPUT:
|
|
2558
|
|
2559 The output is in the form:
|
|
2560
|
|
2561 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
|
|
2562
|
|
2563 * Methylated cytosines receive a '+' orientation,
|
|
2564 * Unmethylated cytosines receive a '-' orientation.
|
|
2565
|
|
2566
|
|
2567 This script was last modified on 31 July 2012.
|
|
2568
|
|
2569 HOW_TO
|
|
2570 }
|