Mercurial > repos > fcaramia > methylation_analysis_bismark
comparison methylation_analysis_bismark/methylation_analysis/methylation_extractor @ 11:6479112a673b draft
Uploaded
author | fcaramia |
---|---|
date | Wed, 12 Dec 2012 19:46:06 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
10:2432df265dad | 11:6479112a673b |
---|---|
1 #!/usr/bin/perl | |
2 use warnings; | |
3 use strict; | |
4 $|++; | |
5 use Getopt::Long; | |
6 use Cwd; | |
7 | |
8 my @filenames; | |
9 my %counting; | |
10 my $parent_dir = getcwd(); | |
11 | |
12 my %fhs; | |
13 | |
14 my $version = 'v0.7.6'; | |
15 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla) = process_commandline(); | |
16 | |
17 process_Bismark_results_file(); | |
18 | |
19 sub process_commandline{ | |
20 my $help; | |
21 my $single_end; | |
22 my $paired_end; | |
23 my $ignore; | |
24 my $genomic_fasta; | |
25 my $full; | |
26 my $report; | |
27 my $extractor_version; | |
28 my $no_overlap; | |
29 my $merge_non_CpG; | |
30 my $vanilla; | |
31 | |
32 my $command_line = GetOptions ('help|man' => \$help, | |
33 'p|paired-end' => \$paired_end, | |
34 's|single-end' => \$single_end, | |
35 'fasta' => \$genomic_fasta, | |
36 'ignore=i' => \$ignore, | |
37 'comprehensive' => \$full, | |
38 'report' => \$report, | |
39 'version' => \$extractor_version, | |
40 'no_overlap' => \$no_overlap, | |
41 'merge_non_CpG' => \$merge_non_CpG, | |
42 'vanilla' => \$vanilla, | |
43 ); | |
44 | |
45 ### EXIT ON ERROR if there were errors with any of the supplied options | |
46 unless ($command_line){ | |
47 die "Please respecify command line options\n"; | |
48 } | |
49 | |
50 ### HELPFILE | |
51 if ($help){ | |
52 print_helpfile(); | |
53 exit; | |
54 } | |
55 | |
56 if ($extractor_version){ | |
57 print << "VERSION"; | |
58 | |
59 | |
60 Bismark Methylation Extractor | |
61 | |
62 Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics | |
63 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
64 | |
65 | |
66 VERSION | |
67 exit; | |
68 } | |
69 | |
70 | |
71 ### no files provided | |
72 unless (@ARGV){ | |
73 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; | |
74 } | |
75 @filenames = @ARGV; | |
76 | |
77 warn "\n *** Bismark methylation extractor version $version ***\n\n"; | |
78 | |
79 ### IGNORING <INT> bases at the start of the read when processing the methylation call string | |
80 unless ($ignore){ | |
81 $ignore = 0; | |
82 } | |
83 ### PRINT A REPORT | |
84 unless ($report){ | |
85 $report = 0; | |
86 } | |
87 | |
88 ### OLD (VANILLA) OUTPUT FORMAT | |
89 unless ($vanilla){ | |
90 $vanilla = 0; | |
91 } | |
92 | |
93 if ($single_end){ | |
94 $paired_end = 0; ### SINGLE END ALIGNMENTS | |
95 } | |
96 elsif ($paired_end){ | |
97 $single_end = 0; ### PAIRED-END ALIGNMENTS | |
98 } | |
99 else{ | |
100 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; | |
101 } | |
102 | |
103 ### NO OVERLAP | |
104 if ($no_overlap){ | |
105 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); | |
106 } | |
107 else{ | |
108 $no_overlap = 0; | |
109 } | |
110 | |
111 ### COMPREHENSIVE OUTPUT | |
112 unless ($full){ | |
113 $full = 0; | |
114 } | |
115 | |
116 ### MERGE NON-CpG context | |
117 unless ($merge_non_CpG){ | |
118 $merge_non_CpG = 0; | |
119 } | |
120 | |
121 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla); | |
122 } | |
123 | |
124 | |
125 sub process_Bismark_results_file{ | |
126 | |
127 if ($single){ | |
128 if ($vanilla){ | |
129 warn "Bismark Single-end vanilla format specified\n"; | |
130 } | |
131 else{ | |
132 warn "Bismark Single-end SAM format specified (default)\n"; # default | |
133 } | |
134 } | |
135 elsif ($paired){ | |
136 if ($vanilla){ | |
137 warn "Bismark Paired-end vanilla format specified\n"; | |
138 } | |
139 else{ | |
140 warn "Bismark Paired-end SAM format specified (default)\n"; # default | |
141 } | |
142 } | |
143 | |
144 if ($ignore){ | |
145 warn "First $ignore bases will be disregarded when processing the methylation call string\n"; | |
146 } | |
147 | |
148 if ($full){ | |
149 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; | |
150 } | |
151 if ($merge_non_CpG){ | |
152 warn "Merge CHG and CHH context to non-CpG context specified\n"; | |
153 } | |
154 warn "\n"; | |
155 | |
156 sleep (3); | |
157 | |
158 foreach my $filename (@filenames){ | |
159 %fhs = (); | |
160 %counting =( | |
161 total_meCHG_count => 0, | |
162 total_meCHH_count => 0, | |
163 total_meCpG_count => 0, | |
164 total_unmethylated_CHG_count => 0, | |
165 total_unmethylated_CHH_count => 0, | |
166 total_unmethylated_CpG_count => 0, | |
167 sequences_count => 0, | |
168 ); | |
169 warn "\nNow reading in Bismark result file $filename\n\n"; | |
170 | |
171 if ($filename =~ /\.gz$/){ | |
172 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
173 } | |
174 else{ | |
175 open (IN,$filename) or die "Can't open file $filename: $!\n"; | |
176 } | |
177 | |
178 ### Vanilla and SAM output need to read different numbers of header lines | |
179 if ($vanilla){ | |
180 my $bismark_version = <IN>; ## discarding the Bismark version info | |
181 chomp $bismark_version; | |
182 $bismark_version =~ s/\r//; # replaces \r line feed | |
183 $bismark_version =~ s/Bismark version: //; | |
184 if ($bismark_version =~ /^\@/){ | |
185 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; | |
186 sleep (2); | |
187 } | |
188 | |
189 unless ($version eq $bismark_version){ | |
190 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; | |
191 } | |
192 } | |
193 else{ | |
194 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. | |
195 # We are reading from it further down | |
196 } | |
197 | |
198 my $output_filename = (split (/\//,$filename))[-1]; | |
199 | |
200 ### OPENING OUTPUT-FILEHANDLES | |
201 if ($report){ | |
202 my $report_filename = $output_filename; | |
203 $report_filename =~ s/$/_splitting_report.txt/; | |
204 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; | |
205 } | |
206 | |
207 if ($report){ | |
208 print REPORT "$output_filename\n\n"; | |
209 print REPORT "Parameters used to extract methylation information:\n"; | |
210 if ($paired){ | |
211 if ($vanilla){ | |
212 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; | |
213 } | |
214 else{ | |
215 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default | |
216 } | |
217 } | |
218 | |
219 if ($single){ | |
220 if ($vanilla){ | |
221 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; | |
222 } | |
223 else{ | |
224 print REPORT "Bismark result file: single-end (SAM format)\n"; # default | |
225 } | |
226 } | |
227 | |
228 if ($ignore){ | |
229 print REPORT "Ignoring first $ignore bases\n"; | |
230 } | |
231 | |
232 if ($full){ | |
233 print REPORT "Output specified: comprehensive\n"; | |
234 } | |
235 else{ | |
236 print REPORT "Output specified: strand-specific (default)\n"; | |
237 } | |
238 | |
239 if ($no_overlap){ | |
240 print REPORT "No overlapping methylation calls specified\n"; | |
241 } | |
242 if ($genomic_fasta){ | |
243 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; | |
244 } | |
245 if ($merge_non_CpG){ | |
246 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; | |
247 } | |
248 | |
249 print REPORT "\n"; | |
250 } | |
251 | |
252 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL | |
253 | |
254 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file | |
255 if ($full and $merge_non_CpG){ | |
256 my $cpg_output = my $other_c_output = $output_filename; | |
257 ### C in CpG context | |
258 $cpg_output =~ s/^/CpG_context_/; | |
259 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
260 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | |
261 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | |
262 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | |
263 | |
264 ### C in any other context than CpG | |
265 $other_c_output =~ s/^/Non_CpG_context_/; | |
266 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); | |
267 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n"; | |
268 print "Writing result file containing methylation information for C in any other context to $other_c_output\n"; | |
269 print {$fhs{other_context}} "Bismark methylation extractor version $version\n"; | |
270 } | |
271 | |
272 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found | |
273 elsif ($merge_non_CpG){ | |
274 | |
275 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
276 | |
277 ### For cytosines in CpG context | |
278 $cpg_ot =~ s/^/CpG_OT_/; | |
279 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
280 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | |
281 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | |
282 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | |
283 | |
284 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
285 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
286 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | |
287 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | |
288 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | |
289 | |
290 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
291 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
292 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | |
293 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | |
294 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | |
295 | |
296 $cpg_ob =~ s/^/CpG_OB_/; | |
297 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
298 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | |
299 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | |
300 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | |
301 | |
302 ### For cytosines in Non-CpG (CC, CT or CA) context | |
303 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; | |
304 | |
305 $other_c_ot =~ s/^/Non_CpG_OT_/; | |
306 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); | |
307 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n"; | |
308 print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n"; | |
309 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n"; | |
310 | |
311 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; | |
312 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); | |
313 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n"; | |
314 print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n"; | |
315 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n"; | |
316 | |
317 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; | |
318 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); | |
319 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n"; | |
320 print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n"; | |
321 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n"; | |
322 | |
323 $other_c_ob =~ s/^/Non_CpG_OB_/; | |
324 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); | |
325 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n"; | |
326 print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n"; | |
327 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n"; | |
328 } | |
329 | |
330 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) | |
331 | |
332 ### if --comprehensive was specified we are only writing one file per context | |
333 elsif ($full){ | |
334 my $cpg_output = my $chg_output = my $chh_output = $output_filename; | |
335 ### C in CpG context | |
336 $cpg_output =~ s/^/CpG_context_/; | |
337 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
338 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | |
339 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | |
340 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | |
341 | |
342 ### C in CHG context | |
343 $chg_output =~ s/^/CHG_context_/; | |
344 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); | |
345 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n"; | |
346 print "Writing result file containing methylation information for C in CHG context to $chg_output\n"; | |
347 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n"; | |
348 | |
349 ### C in CHH context | |
350 $chh_output =~ s/^/CHH_context_/; | |
351 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); | |
352 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n"; | |
353 print "Writing result file containing methylation information for C in CHH context to $chh_output\n"; | |
354 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; } | |
355 | |
356 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found | |
357 else{ | |
358 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
359 | |
360 ### For cytosines in CpG context | |
361 $cpg_ot =~ s/^/CpG_OT_/; | |
362 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
363 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | |
364 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | |
365 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | |
366 | |
367 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
368 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
369 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | |
370 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | |
371 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | |
372 | |
373 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
374 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
375 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | |
376 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | |
377 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | |
378 | |
379 $cpg_ob =~ s/^/CpG_OB_/; | |
380 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
381 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | |
382 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | |
383 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | |
384 | |
385 ### For cytosines in CHG context | |
386 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; | |
387 | |
388 $chg_ot =~ s/^/CHG_OT_/; | |
389 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); | |
390 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n"; | |
391 print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n"; | |
392 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n"; | |
393 | |
394 $chg_ctot =~ s/^/CHG_CTOT_/; | |
395 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); | |
396 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n"; | |
397 print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n"; | |
398 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n"; | |
399 | |
400 $chg_ctob =~ s/^/CHG_CTOB_/; | |
401 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); | |
402 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n"; | |
403 print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n"; | |
404 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n"; | |
405 | |
406 $chg_ob =~ s/^/CHG_OB_/; | |
407 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); | |
408 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n"; | |
409 print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n"; | |
410 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n"; | |
411 | |
412 ### For cytosines in CHH context | |
413 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; | |
414 | |
415 $chh_ot =~ s/^/CHH_OT_/; | |
416 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); | |
417 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n"; | |
418 print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n"; | |
419 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n"; | |
420 | |
421 $chh_ctot =~ s/^/CHH_CTOT_/; | |
422 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); | |
423 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n"; | |
424 print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n"; | |
425 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n"; | |
426 | |
427 $chh_ctob =~ s/^/CHH_CTOB_/; | |
428 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); | |
429 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n"; | |
430 print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n"; | |
431 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n"; | |
432 | |
433 $chh_ob =~ s/^/CHH_OB_/; | |
434 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); | |
435 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n"; | |
436 print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n"; | |
437 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n"; | |
438 } | |
439 | |
440 my $methylation_call_strings_processed = 0; | |
441 my $line_count = 0; | |
442 | |
443 ### proceeding differently now for single-end or paired-end Bismark files | |
444 | |
445 ### PROCESSING SINGLE-END RESULT FILES | |
446 if ($single){ | |
447 | |
448 ### also proceeding differently now for SAM format or vanilla Bismark format files | |
449 if ($vanilla){ # old vanilla Bismark output format | |
450 while (<IN>){ | |
451 ++$line_count; | |
452 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
453 | |
454 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
455 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; | |
456 | |
457 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or | |
458 ### last position | |
459 chomp $genome_conversion; | |
460 | |
461 my $index; | |
462 if ($meth_call){ | |
463 | |
464 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand | |
465 $index = 0; | |
466 } | |
467 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand | |
468 $index = 1; | |
469 } | |
470 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand | |
471 $index = 3; | |
472 } | |
473 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand | |
474 $index = 2; | |
475 } | |
476 else { | |
477 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
478 } | |
479 | |
480 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
481 if ($ignore){ | |
482 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
483 | |
484 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
485 if ($strand eq '+'){ | |
486 $start += $ignore; | |
487 } | |
488 elsif ($strand eq '-'){ | |
489 $start += length($meth_call)-1; ## $meth_call is already shortened! | |
490 } | |
491 else { | |
492 die "Alignment did not have proper strand information: $strand\n"; | |
493 } | |
494 } | |
495 ### printing out the methylation state of every C in the read | |
496 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); | |
497 | |
498 ++$methylation_call_strings_processed; # 1 per single-end result | |
499 } | |
500 } | |
501 } | |
502 else{ # processing single-end SAM format (default) | |
503 while (<IN>){ | |
504 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
505 if (/^\@/){ # skipping header lines (starting with @) | |
506 warn "skipping SAM header line:\t$_"; | |
507 next; | |
508 } | |
509 | |
510 ++$line_count; | |
511 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
512 | |
513 # example read in SAM format | |
514 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
515 ### | |
516 | |
517 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
518 # < 0.7.6 $meth_call =~ s/^XM:Z://; | |
519 # < 0.7.6 $read_conversion =~ s/^XR:Z://; | |
520 # < 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
521 | |
522 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; | |
523 | |
524 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
525 my $meth_call; ### Thanks to Zachary Zeno for this solution | |
526 my $read_conversion; | |
527 my $genome_conversion; | |
528 | |
529 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
530 my $tag = $1; | |
531 my $value = $2; | |
532 | |
533 if ($tag eq "XM"){ | |
534 $meth_call = $value; | |
535 $meth_call =~ s/\r//; | |
536 } | |
537 elsif ($tag eq "XR") { | |
538 $read_conversion = $value; | |
539 $read_conversion =~ s/\r//; | |
540 } | |
541 elsif ($tag eq "XG") { | |
542 $genome_conversion = $value; | |
543 $genome_conversion =~ s/\r//; | |
544 } | |
545 } | |
546 | |
547 my $strand; | |
548 chomp $genome_conversion; | |
549 # print "$meth_call\n$read_conversion\n$genome_conversion\n"; | |
550 | |
551 my $index; | |
552 if ($meth_call){ | |
553 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand | |
554 $index = 0; | |
555 $strand = '+'; | |
556 } | |
557 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand | |
558 $index = 1; | |
559 $strand = '-'; | |
560 } | |
561 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand | |
562 $index = 2; | |
563 $strand = '+'; | |
564 } | |
565 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand | |
566 $index = 3; | |
567 $strand = '-'; | |
568 } | |
569 else { | |
570 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
571 } | |
572 | |
573 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output | |
574 if ($strand eq '-'){ | |
575 $meth_call = reverse $meth_call; | |
576 } | |
577 | |
578 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
579 if ($ignore){ | |
580 # print "\n\n$meth_call\n"; | |
581 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
582 # print "$meth_call\n"; | |
583 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
584 | |
585 my @len = split (/\D+/,$cigar); # storing the length per operation | |
586 my @ops = split (/\d+/,$cigar); # storing the operation | |
587 shift @ops; # remove the empty first element | |
588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
589 | |
590 my @comp_cigar; # building an array with all CIGAR operations | |
591 foreach my $index (0..$#len){ | |
592 foreach (1..$len[$index]){ | |
593 # print "$ops[$index]"; | |
594 push @comp_cigar, $ops[$index]; | |
595 } | |
596 } | |
597 # print "original CIGAR: $cigar\n"; | |
598 # print "original CIGAR: @comp_cigar\n"; | |
599 | |
600 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
601 if ($strand eq '+'){ | |
602 | |
603 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions | |
604 my $I_count = 0; | |
605 | |
606 for (1..$ignore){ | |
607 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start | |
608 # print "$_ deleted $op\n"; | |
609 | |
610 while ($op eq 'D'){ # repeating this for deletions (D) | |
611 $D_count++; | |
612 $op = shift @comp_cigar; | |
613 # print "$_ deleted $op\n"; | |
614 } | |
615 if ($op eq 'I'){ # adjusting the genomic position for insertions (I) | |
616 $I_count++; | |
617 } | |
618 } | |
619 $start += $ignore + $D_count - $I_count; | |
620 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; | |
621 } | |
622 elsif ($strand eq '-'){ | |
623 | |
624 for (1..$ignore){ | |
625 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
626 while ($op eq 'D'){ # repeating this for deletions (D) | |
627 $op = pop @comp_cigar; | |
628 } | |
629 } | |
630 | |
631 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR | |
632 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence | |
633 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position | |
634 foreach (@comp_cigar){ | |
635 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); | |
636 } | |
637 $start += $MD_count - 1; | |
638 } | |
639 | |
640 ### reconstituting shortened CIGAR string | |
641 my $new_cigar; | |
642 my $count = 0; | |
643 my $last_op; | |
644 # print "ignore adjusted: @comp_cigar\n"; | |
645 foreach my $op (@comp_cigar){ | |
646 unless (defined $last_op){ | |
647 $last_op = $op; | |
648 ++$count; | |
649 next; | |
650 } | |
651 if ($last_op eq $op){ | |
652 ++$count; | |
653 } | |
654 else{ | |
655 $new_cigar .= "$count$last_op"; | |
656 $last_op = $op; | |
657 $count = 1; | |
658 } | |
659 } | |
660 $new_cigar .= "$count$last_op"; # appending the last operation and count | |
661 $cigar = $new_cigar; | |
662 # print "ignore adjusted scalar: $cigar\n"; | |
663 } | |
664 } | |
665 ### printing out the methylation state of every C in the read | |
666 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); | |
667 | |
668 ++$methylation_call_strings_processed; # 1 per single-end result | |
669 } | |
670 } | |
671 } | |
672 | |
673 ### PROCESSING PAIRED-END RESULT FILES | |
674 elsif ($paired){ | |
675 | |
676 ### proceeding differently now for SAM format or vanilla Bismark format files | |
677 if ($vanilla){ # old vanilla Bismark paired-end output format | |
678 while (<IN>){ | |
679 ++$line_count; | |
680 warn "processed line: $line_count\n" if ($line_count%500000==0); | |
681 | |
682 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
683 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; | |
684 | |
685 my $index; | |
686 chomp $genome_conversion; | |
687 | |
688 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ | |
689 $index = 0; ## this is OT | |
690 } | |
691 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ | |
692 $index = 2; ## this is CTOB!!! | |
693 } | |
694 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ | |
695 $index = 1; ## this is CTOT!!! | |
696 } | |
697 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ | |
698 $index = 3; ## this is OB | |
699 } | |
700 else { | |
701 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
702 } | |
703 | |
704 if ($meth_call_1 and $meth_call_2){ | |
705 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
706 if ($ignore){ | |
707 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
708 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | |
709 | |
710 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified | |
711 $start_read_1 += $ignore; | |
712 $end_read_2 -= $ignore; | |
713 } | |
714 my $end_read_1; | |
715 my $start_read_2; | |
716 | |
717 if ($strand eq '+'){ | |
718 | |
719 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
720 $start_read_2 = $end_read_2-length($meth_call_2)+1; | |
721 | |
722 ## we first pass the first read which is in + orientation on the forward strand | |
723 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0); | |
724 | |
725 # we next pass the second read which is in - orientation on the reverse strand | |
726 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
727 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1); | |
728 } | |
729 | |
730 else{ | |
731 | |
732 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! | |
733 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! | |
734 | |
735 ## we first pass the first read which is in - orientation on the reverse strand | |
736 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0); | |
737 | |
738 # we next pass the second read which is in + orientation on the forward strand | |
739 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
740 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2); | |
741 } | |
742 | |
743 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
744 } | |
745 } | |
746 } | |
747 else{ # Bismark paired-end SAM output format (default) | |
748 while (<IN>){ | |
749 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
750 if (/^\@/){ # skipping header lines (starting with @) | |
751 warn "skipping SAM header line:\t$_"; | |
752 next; | |
753 } | |
754 | |
755 ++$line_count; | |
756 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
757 | |
758 # example paired-end reads in SAM format (2 consecutive lines) | |
759 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
760 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT | |
761 | |
762 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
763 | |
764 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
765 my $meth_call_1; | |
766 my $first_read_conversion; | |
767 my $genome_conversion; | |
768 | |
769 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
770 my $tag = $1; | |
771 my $value = $2; | |
772 | |
773 if ($tag eq "XM"){ | |
774 $meth_call_1 = $value; | |
775 $meth_call_1 =~ s/\r//; | |
776 } | |
777 elsif ($tag eq "XR") { | |
778 $first_read_conversion = $value; | |
779 $first_read_conversion =~ s/\r//; | |
780 } | |
781 elsif ($tag eq "XG") { | |
782 $genome_conversion = $value; | |
783 $genome_conversion =~ s/\r//; | |
784 } | |
785 } | |
786 | |
787 $_ = <IN>; # reading in the paired read | |
788 | |
789 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; | |
790 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; | |
791 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; | |
792 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; | |
793 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; | |
794 | |
795 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
796 | |
797 my $meth_call_2; | |
798 my $second_read_conversion; | |
799 | |
800 while ( /(XM|XR):Z:([^\t]+)/g ) { | |
801 my $tag = $1; | |
802 my $value = $2; | |
803 | |
804 if ($tag eq "XM"){ | |
805 $meth_call_2 = $value; | |
806 $meth_call_2 =~ s/\r//; | |
807 } | |
808 elsif ($tag eq "XR") { | |
809 $second_read_conversion = $value; | |
810 $second_read_conversion = s/\r//; | |
811 } | |
812 } | |
813 | |
814 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
815 chomp $genome_conversion; # in case it captured a new line character | |
816 | |
817 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; | |
818 | |
819 my $index; | |
820 my $strand; | |
821 | |
822 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ | |
823 $index = 0; ## this is OT | |
824 $strand = '+'; | |
825 } | |
826 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ | |
827 $index = 1; ## this is CTOT | |
828 $strand = '-'; | |
829 } | |
830 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ | |
831 $index = 2; ## this is CTOB | |
832 $strand = '+'; | |
833 } | |
834 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ | |
835 $index = 3; ## this is OB | |
836 $strand = '-'; | |
837 } | |
838 else { | |
839 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
840 } | |
841 | |
842 ### reversing the methylation call of the read that was reverse-complemented | |
843 if ($strand eq '+'){ | |
844 $meth_call_2 = reverse $meth_call_2; | |
845 } | |
846 else{ | |
847 $meth_call_1 = reverse $meth_call_1; | |
848 } | |
849 | |
850 if ($meth_call_1 and $meth_call_2){ | |
851 | |
852 my $end_read_1; | |
853 | |
854 ### READ 1 | |
855 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation | |
856 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation | |
857 shift @ops_1; # remove the empty first element | |
858 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); | |
859 | |
860 my @comp_cigar_1; # building an array with all CIGAR operations | |
861 foreach my $index (0..$#len_1){ | |
862 foreach (1..$len_1[$index]){ | |
863 # print "$ops_1[$index]"; | |
864 push @comp_cigar_1, $ops_1[$index]; | |
865 } | |
866 } | |
867 # print "original CIGAR read 1: $cigar_1\n"; | |
868 # print "original CIGAR read 1: @comp_cigar_1\n"; | |
869 | |
870 ### READ 2 | |
871 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation | |
872 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation | |
873 shift @ops_2; # remove the empty first element | |
874 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); | |
875 my @comp_cigar_2; # building an array with all CIGAR operations for read 2 | |
876 foreach my $index (0..$#len_2){ | |
877 foreach (1..$len_2[$index]){ | |
878 # print "$ops_2[$index]"; | |
879 push @comp_cigar_2, $ops_2[$index]; | |
880 } | |
881 } | |
882 # print "original CIGAR read 2: $cigar_2\n"; | |
883 # print "original CIGAR read 2: @comp_cigar_2\n"; | |
884 | |
885 if ($ignore){ | |
886 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
887 ### the methylation calls have already been reversed where necessary | |
888 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
889 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | |
890 | |
891 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
892 | |
893 if ($strand eq '+'){ | |
894 | |
895 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start | |
896 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions | |
897 my $I_count_1 = 0; | |
898 | |
899 for (1..$ignore){ | |
900 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start | |
901 # print "$_ deleted $op\n"; | |
902 | |
903 while ($op eq 'D'){ # repeating this for deletions (D) | |
904 $D_count_1++; | |
905 $op = shift @comp_cigar_1; | |
906 # print "$_ deleted $op\n"; | |
907 } | |
908 if ($op eq 'I'){ # adjusting the genomic position for insertions (I) | |
909 $I_count_1++; | |
910 } | |
911 } | |
912 | |
913 $start_read_1 += $ignore + $D_count_1 - $I_count_1; | |
914 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; | |
915 | |
916 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back | |
917 | |
918 for (1..$ignore){ | |
919 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
920 while ($op eq 'D'){ # repeating this for deletions (D) | |
921 $op = pop @comp_cigar_2; | |
922 } | |
923 } | |
924 # the start position of reads mapping to the reverse strand is being adjusted further below | |
925 } | |
926 | |
927 elsif ($strand eq '-'){ | |
928 | |
929 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back | |
930 for (1..$ignore){ | |
931 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
932 while ($op eq 'D'){ # repeating this for deletions (D) | |
933 $op = pop @comp_cigar_1; | |
934 } | |
935 } | |
936 # the start position of reads mapping to the reverse strand is being adjusted further below | |
937 | |
938 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start | |
939 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions | |
940 my $I_count_2 = 0; | |
941 | |
942 for (1..$ignore){ | |
943 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start | |
944 # print "$_ deleted $op\n"; | |
945 | |
946 while ($op eq 'D'){ # repeating this for deletions (D) | |
947 $D_count_2++; | |
948 $op = shift @comp_cigar_2; | |
949 # print "$_ deleted $op\n"; | |
950 } | |
951 if ($op eq 'I'){ # adjusting the genomic position for insertions (I) | |
952 $I_count_2++; | |
953 } | |
954 } | |
955 | |
956 $start_read_2 += $ignore + $D_count_2 - $I_count_2; | |
957 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; | |
958 | |
959 } | |
960 | |
961 ### reconstituting shortened CIGAR string 1 | |
962 my $new_cigar_1; | |
963 my $count_1 = 0; | |
964 my $last_op_1; | |
965 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; | |
966 foreach my $op (@comp_cigar_1){ | |
967 unless (defined $last_op_1){ | |
968 $last_op_1 = $op; | |
969 ++$count_1; | |
970 next; | |
971 } | |
972 if ($last_op_1 eq $op){ | |
973 ++$count_1; | |
974 } | |
975 else{ | |
976 $new_cigar_1 .= "$count_1$last_op_1"; | |
977 $last_op_1 = $op; | |
978 $count_1 = 1; | |
979 } | |
980 } | |
981 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count | |
982 $cigar_1 = $new_cigar_1; | |
983 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; | |
984 | |
985 ### reconstituting shortened CIGAR string 2 | |
986 my $new_cigar_2; | |
987 my $count_2 = 0; | |
988 my $last_op_2; | |
989 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; | |
990 foreach my $op (@comp_cigar_2){ | |
991 unless (defined $last_op_2){ | |
992 $last_op_2 = $op; | |
993 ++$count_2; | |
994 next; | |
995 } | |
996 if ($last_op_2 eq $op){ | |
997 ++$count_2; | |
998 } | |
999 else{ | |
1000 $new_cigar_2 .= "$count_2$last_op_2"; | |
1001 $last_op_2 = $op; | |
1002 $count_2 = 1; | |
1003 } | |
1004 } | |
1005 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count | |
1006 $cigar_2 = $new_cigar_2; | |
1007 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n"; | |
1008 | |
1009 } | |
1010 | |
1011 if ($strand eq '+'){ | |
1012 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 | |
1013 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1014 # print "reverse: @comp_cigar_2\n"; | |
1015 | |
1016 my $MD_count_1 = 0; | |
1017 foreach (@comp_cigar_1){ | |
1018 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1019 } | |
1020 | |
1021 my $MD_count_2 = 0; | |
1022 foreach (@comp_cigar_2){ | |
1023 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1024 } | |
1025 | |
1026 $end_read_1 = $start_read_1 + $MD_count_1 - 1; | |
1027 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand | |
1028 } | |
1029 else{ | |
1030 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 | |
1031 | |
1032 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1033 # print "reverse: @comp_cigar_1\n"; | |
1034 | |
1035 my $MD_count_1 = 0; | |
1036 foreach (@comp_cigar_1){ | |
1037 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
1038 } | |
1039 | |
1040 $end_read_1 = $start_read_1; | |
1041 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand | |
1042 | |
1043 } | |
1044 | |
1045 if ($strand eq '+'){ | |
1046 ## we first pass the first read which is in + orientation on the forward strand | |
1047 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1); | |
1048 | |
1049 # we next pass the second read which is in - orientation on the reverse strand | |
1050 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
1051 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2); | |
1052 } | |
1053 | |
1054 else{ | |
1055 ## we first pass the first read which is in - orientation on the reverse strand | |
1056 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1); | |
1057 | |
1058 # we next pass the second read which is in + orientation on the forward strand | |
1059 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
1060 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2); | |
1061 } | |
1062 | |
1063 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
1064 } | |
1065 } | |
1066 } | |
1067 } | |
1068 else{ | |
1069 die "Single-end or paired-end reads not specified properly\n"; | |
1070 } | |
1071 | |
1072 print "\n\nProcessed $line_count lines from $filename in total\n"; | |
1073 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
1074 if ($report){ | |
1075 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
1076 } | |
1077 print_splitting_report (); | |
1078 } | |
1079 } | |
1080 | |
1081 | |
1082 sub print_splitting_report{ | |
1083 | |
1084 ### Calculating methylation percentages if applicable | |
1085 | |
1086 my $percent_meCpG; | |
1087 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | |
1088 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | |
1089 } | |
1090 | |
1091 my $percent_meCHG; | |
1092 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
1093 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | |
1094 } | |
1095 | |
1096 my $percent_meCHH; | |
1097 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ | |
1098 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); | |
1099 } | |
1100 | |
1101 my $percent_non_CpG_methylation; | |
1102 if ($merge_non_CpG){ | |
1103 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
1104 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); | |
1105 } | |
1106 } | |
1107 | |
1108 if ($report){ | |
1109 ### detailed information about Cs analysed | |
1110 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
1111 | |
1112 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
1113 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
1114 | |
1115 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
1116 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
1117 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
1118 | |
1119 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
1120 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
1121 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
1122 | |
1123 ### calculating methylated CpG percentage if applicable | |
1124 if ($percent_meCpG){ | |
1125 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
1126 } | |
1127 else{ | |
1128 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
1129 } | |
1130 | |
1131 ### 2-Context Output | |
1132 if ($merge_non_CpG){ | |
1133 if ($percent_non_CpG_methylation){ | |
1134 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
1135 } | |
1136 else{ | |
1137 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
1138 } | |
1139 } | |
1140 | |
1141 ### 3 Context Output | |
1142 else{ | |
1143 ### calculating methylated CHG percentage if applicable | |
1144 if ($percent_meCHG){ | |
1145 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
1146 } | |
1147 else{ | |
1148 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
1149 } | |
1150 | |
1151 ### calculating methylated CHH percentage if applicable | |
1152 if ($percent_meCHH){ | |
1153 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
1154 } | |
1155 else{ | |
1156 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
1157 } | |
1158 } | |
1159 } | |
1160 | |
1161 ### detailed information about Cs analysed for on-screen report | |
1162 print "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
1163 | |
1164 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
1165 print "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
1166 | |
1167 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
1168 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
1169 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
1170 | |
1171 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
1172 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
1173 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
1174 | |
1175 ### printing methylated CpG percentage if applicable | |
1176 if ($percent_meCpG){ | |
1177 print "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
1178 } | |
1179 else{ | |
1180 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
1181 } | |
1182 | |
1183 ### 2-Context Output | |
1184 if ($merge_non_CpG){ | |
1185 if ($percent_non_CpG_methylation){ | |
1186 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
1187 } | |
1188 else{ | |
1189 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
1190 } | |
1191 } | |
1192 | |
1193 ### 3-Context Output | |
1194 else{ | |
1195 ### printing methylated CHG percentage if applicable | |
1196 if ($percent_meCHG){ | |
1197 print "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
1198 } | |
1199 else{ | |
1200 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
1201 } | |
1202 | |
1203 ### printing methylated CHH percentage if applicable | |
1204 if ($percent_meCHH){ | |
1205 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
1206 } | |
1207 else{ | |
1208 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
1209 } | |
1210 } | |
1211 } | |
1212 | |
1213 | |
1214 | |
1215 | |
1216 | |
1217 sub print_individual_C_methylation_states_paired_end_files{ | |
1218 | |
1219 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_; | |
1220 my @methylation_calls = split(//,$meth_call); | |
1221 | |
1222 ################################################################# | |
1223 ### . for bases not involving cytosines ### | |
1224 ### X for methylated C in CHG context (was protected) ### | |
1225 ### x for not methylated C in CHG context (was converted) ### | |
1226 ### H for methylated C in CHH context (was protected) ### | |
1227 ### h for not methylated C in CHH context (was converted) ### | |
1228 ### Z for methylated C in CpG context (was protected) ### | |
1229 ### z for not methylated C in CpG context (was converted) ### | |
1230 ################################################################# | |
1231 | |
1232 my $methyl_CHG_count = 0; | |
1233 my $methyl_CHH_count = 0; | |
1234 my $methyl_CpG_count = 0; | |
1235 my $unmethylated_CHG_count = 0; | |
1236 my $unmethylated_CHH_count = 0; | |
1237 my $unmethylated_CpG_count = 0; | |
1238 | |
1239 my @len; | |
1240 my @ops; | |
1241 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
1242 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
1243 my @comp_cigar; | |
1244 | |
1245 if ($cigar){ # parsing CIGAR string | |
1246 @len = split (/\D+/,$cigar); # storing the length per operation | |
1247 @ops = split (/\d+/,$cigar); # storing the operation | |
1248 shift @ops; # remove the empty first element | |
1249 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
1250 | |
1251 foreach my $index (0..$#len){ | |
1252 foreach (1..$len[$index]){ | |
1253 # print "$ops[$index]"; | |
1254 push @comp_cigar, $ops[$index]; | |
1255 } | |
1256 } | |
1257 # warn "\nDetected CIGAR string: $cigar\n"; | |
1258 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
1259 # warn "number of operations: ",scalar @ops,"\n"; | |
1260 # warn "number of length digits: ",scalar @len,"\n\n"; | |
1261 # print @comp_cigar,"\n"; | |
1262 # print "$meth_call\n\n"; | |
1263 # sleep (1); | |
1264 } | |
1265 | |
1266 | |
1267 if ($strand eq '-') { | |
1268 | |
1269 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above | |
1270 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1271 # print "reverse CIGAR string: @comp_cigar\n"; | |
1272 | |
1273 ### the start position of paired-end files has already been corrected, see above | |
1274 } | |
1275 | |
1276 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified | |
1277 | |
1278 if ($merge_non_CpG) { | |
1279 | |
1280 if ($no_overlap) { | |
1281 | |
1282 ### single-file CpG and non-CpG context output | |
1283 if ($full) { | |
1284 if ($strand eq '+') { | |
1285 for my $index (0..$#methylation_calls) { | |
1286 | |
1287 if ($cigar){ # only needed for SAM files | |
1288 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1289 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1290 $cigar_offset += $cigar_mod; | |
1291 $pos_offset += $pos_mod; | |
1292 } | |
1293 | |
1294 ### Returning as soon as the methylation calls start overlapping | |
1295 if ($start+$index+$pos_offset >= $end_read_1) { | |
1296 return; | |
1297 } | |
1298 | |
1299 if ($methylation_calls[$index] eq 'X') { | |
1300 $counting{total_meCHG_count}++; | |
1301 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1302 } elsif ($methylation_calls[$index] eq 'x') { | |
1303 $counting{total_unmethylated_CHG_count}++; | |
1304 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1305 } elsif ($methylation_calls[$index] eq 'Z') { | |
1306 $counting{total_meCpG_count}++; | |
1307 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1308 } elsif ($methylation_calls[$index] eq 'z') { | |
1309 $counting{total_unmethylated_CpG_count}++; | |
1310 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1311 } elsif ($methylation_calls[$index] eq 'H') { | |
1312 $counting{total_meCHH_count}++; | |
1313 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1314 } elsif ($methylation_calls[$index] eq 'h') { | |
1315 $counting{total_unmethylated_CHH_count}++; | |
1316 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1317 } | |
1318 elsif ($methylation_calls[$index] eq '.'){} | |
1319 else{ | |
1320 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1321 } | |
1322 } | |
1323 } elsif ($strand eq '-') { | |
1324 for my $index (0..$#methylation_calls) { | |
1325 | |
1326 if ($cigar){ # only needed for SAM files | |
1327 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1328 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1329 $cigar_offset += $cigar_mod; | |
1330 $pos_offset += $pos_mod; | |
1331 } | |
1332 | |
1333 ### Returning as soon as the methylation calls start overlapping | |
1334 if ($start-$index+$pos_offset <= $end_read_1) { | |
1335 return; | |
1336 } | |
1337 | |
1338 if ($methylation_calls[$index] eq 'X') { | |
1339 $counting{total_meCHG_count}++; | |
1340 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1341 } elsif ($methylation_calls[$index] eq 'x') { | |
1342 $counting{total_unmethylated_CHG_count}++; | |
1343 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1344 } elsif ($methylation_calls[$index] eq 'Z') { | |
1345 $counting{total_meCpG_count}++; | |
1346 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1347 } elsif ($methylation_calls[$index] eq 'z') { | |
1348 $counting{total_unmethylated_CpG_count}++; | |
1349 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1350 } elsif ($methylation_calls[$index] eq 'H') { | |
1351 $counting{total_meCHH_count}++; | |
1352 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1353 } elsif ($methylation_calls[$index] eq 'h') { | |
1354 $counting{total_unmethylated_CHH_count}++; | |
1355 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1356 } | |
1357 elsif ($methylation_calls[$index] eq '.') {} | |
1358 else{ | |
1359 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1360 } | |
1361 } | |
1362 } else { | |
1363 die "The read orientation was neither + nor -: '$strand'\n"; | |
1364 } | |
1365 } | |
1366 | |
1367 ### strand-specific methylation output | |
1368 else { | |
1369 if ($strand eq '+') { | |
1370 for my $index (0..$#methylation_calls) { | |
1371 | |
1372 if ($cigar){ # only needed for SAM files | |
1373 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1374 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1375 $cigar_offset += $cigar_mod; | |
1376 $pos_offset += $pos_mod; | |
1377 } | |
1378 | |
1379 ### Returning as soon as the methylation calls start overlapping | |
1380 if ($start+$index+$pos_offset >= $end_read_1) { | |
1381 return; | |
1382 } | |
1383 | |
1384 if ($methylation_calls[$index] eq 'X') { | |
1385 $counting{total_meCHG_count}++; | |
1386 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1387 } elsif ($methylation_calls[$index] eq 'x') { | |
1388 $counting{total_unmethylated_CHG_count}++; | |
1389 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1390 } elsif ($methylation_calls[$index] eq 'Z') { | |
1391 $counting{total_meCpG_count}++; | |
1392 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1393 } elsif ($methylation_calls[$index] eq 'z') { | |
1394 $counting{total_unmethylated_CpG_count}++; | |
1395 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1396 } elsif ($methylation_calls[$index] eq 'H') { | |
1397 $counting{total_meCHH_count}++; | |
1398 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1399 } elsif ($methylation_calls[$index] eq 'h') { | |
1400 $counting{total_unmethylated_CHH_count}++; | |
1401 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1402 } | |
1403 elsif ($methylation_calls[$index] eq '.') {} | |
1404 else{ | |
1405 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1406 } | |
1407 } | |
1408 } elsif ($strand eq '-') { | |
1409 for my $index (0..$#methylation_calls) { | |
1410 | |
1411 if ($cigar){ # only needed for SAM files | |
1412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1413 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1414 $cigar_offset += $cigar_mod; | |
1415 $pos_offset += $pos_mod; | |
1416 } | |
1417 | |
1418 ### Returning as soon as the methylation calls start overlapping | |
1419 if ($start-$index+$pos_offset <= $end_read_1) { | |
1420 return; | |
1421 } | |
1422 | |
1423 if ($methylation_calls[$index] eq 'X') { | |
1424 $counting{total_meCHG_count}++; | |
1425 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1426 } elsif ($methylation_calls[$index] eq 'x') { | |
1427 $counting{total_unmethylated_CHG_count}++; | |
1428 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1429 } elsif ($methylation_calls[$index] eq 'Z') { | |
1430 $counting{total_meCpG_count}++; | |
1431 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1432 } elsif ($methylation_calls[$index] eq 'z') { | |
1433 $counting{total_unmethylated_CpG_count}++; | |
1434 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1435 } elsif ($methylation_calls[$index] eq 'H') { | |
1436 $counting{total_meCHH_count}++; | |
1437 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1438 } elsif ($methylation_calls[$index] eq 'h') { | |
1439 $counting{total_unmethylated_CHH_count}++; | |
1440 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1441 } | |
1442 elsif ($methylation_calls[$index] eq '.') {} | |
1443 else{ | |
1444 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1445 } | |
1446 } | |
1447 } else { | |
1448 die "The strand orientation was neither + nor -: '$strand'/n"; | |
1449 } | |
1450 } | |
1451 } | |
1452 | |
1453 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
1454 ### Still within the 2-CONTEXT ONLY optional section | |
1455 else { | |
1456 ### single-file CpG and non-CpG context output | |
1457 if ($full) { | |
1458 if ($strand eq '+') { | |
1459 for my $index (0..$#methylation_calls) { | |
1460 | |
1461 if ($cigar){ # only needed for SAM files | |
1462 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1463 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1464 $cigar_offset += $cigar_mod; | |
1465 $pos_offset += $pos_mod; | |
1466 } | |
1467 | |
1468 if ($methylation_calls[$index] eq 'X') { | |
1469 $counting{total_meCHG_count}++; | |
1470 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1471 } elsif ($methylation_calls[$index] eq 'x') { | |
1472 $counting{total_unmethylated_CHG_count}++; | |
1473 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1474 } elsif ($methylation_calls[$index] eq 'Z') { | |
1475 $counting{total_meCpG_count}++; | |
1476 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1477 } elsif ($methylation_calls[$index] eq 'z') { | |
1478 $counting{total_unmethylated_CpG_count}++; | |
1479 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1480 } elsif ($methylation_calls[$index] eq 'H') { | |
1481 $counting{total_meCHH_count}++; | |
1482 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1483 } elsif ($methylation_calls[$index] eq 'h') { | |
1484 $counting{total_unmethylated_CHH_count}++; | |
1485 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1486 } | |
1487 elsif ($methylation_calls[$index] eq '.') {} | |
1488 else{ | |
1489 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1490 } | |
1491 } | |
1492 } elsif ($strand eq '-') { | |
1493 for my $index (0..$#methylation_calls) { | |
1494 | |
1495 if ($cigar){ # only needed for SAM files | |
1496 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1497 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1498 $cigar_offset += $cigar_mod; | |
1499 $pos_offset += $pos_mod; | |
1500 } | |
1501 | |
1502 if ($methylation_calls[$index] eq 'X') { | |
1503 $counting{total_meCHG_count}++; | |
1504 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1505 } elsif ($methylation_calls[$index] eq 'x') { | |
1506 $counting{total_unmethylated_CHG_count}++; | |
1507 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1508 } elsif ($methylation_calls[$index] eq 'Z') { | |
1509 $counting{total_meCpG_count}++; | |
1510 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1511 } elsif ($methylation_calls[$index] eq 'z') { | |
1512 $counting{total_unmethylated_CpG_count}++; | |
1513 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1514 } elsif ($methylation_calls[$index] eq 'H') { | |
1515 $counting{total_meCHH_count}++; | |
1516 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1517 } elsif ($methylation_calls[$index] eq 'h') { | |
1518 $counting{total_unmethylated_CHH_count}++; | |
1519 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1520 } | |
1521 elsif ($methylation_calls[$index] eq '.') {} | |
1522 else{ | |
1523 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1524 } | |
1525 } | |
1526 } else { | |
1527 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1528 } | |
1529 } | |
1530 | |
1531 ### strand-specific methylation output | |
1532 ### still within the 2-CONTEXT optional section | |
1533 else { | |
1534 if ($strand eq '+') { | |
1535 for my $index (0..$#methylation_calls) { | |
1536 | |
1537 if ($cigar){ # only needed for SAM files | |
1538 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1539 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1540 $cigar_offset += $cigar_mod; | |
1541 $pos_offset += $pos_mod; | |
1542 } | |
1543 | |
1544 if ($methylation_calls[$index] eq 'X') { | |
1545 $counting{total_meCHG_count}++; | |
1546 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1547 } elsif ($methylation_calls[$index] eq 'x') { | |
1548 $counting{total_unmethylated_CHG_count}++; | |
1549 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1550 } elsif ($methylation_calls[$index] eq 'Z') { | |
1551 $counting{total_meCpG_count}++; | |
1552 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1553 } elsif ($methylation_calls[$index] eq 'z') { | |
1554 $counting{total_unmethylated_CpG_count}++; | |
1555 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1556 } elsif ($methylation_calls[$index] eq 'H') { | |
1557 $counting{total_meCHH_count}++; | |
1558 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1559 } elsif ($methylation_calls[$index] eq 'h') { | |
1560 $counting{total_unmethylated_CHH_count}++; | |
1561 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1562 } | |
1563 elsif ($methylation_calls[$index] eq '.') {} | |
1564 else{ | |
1565 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1566 } | |
1567 } | |
1568 } elsif ($strand eq '-') { | |
1569 for my $index (0..$#methylation_calls) { | |
1570 | |
1571 if ($cigar){ # only needed for SAM files | |
1572 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1574 $cigar_offset += $cigar_mod; | |
1575 $pos_offset += $pos_mod; | |
1576 } | |
1577 | |
1578 if ($methylation_calls[$index] eq 'X') { | |
1579 $counting{total_meCHG_count}++; | |
1580 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1581 } elsif ($methylation_calls[$index] eq 'x') { | |
1582 $counting{total_unmethylated_CHG_count}++; | |
1583 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1584 } elsif ($methylation_calls[$index] eq 'Z') { | |
1585 $counting{total_meCpG_count}++; | |
1586 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1587 } elsif ($methylation_calls[$index] eq 'z') { | |
1588 $counting{total_unmethylated_CpG_count}++; | |
1589 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1590 } elsif ($methylation_calls[$index] eq 'H') { | |
1591 $counting{total_meCHH_count}++; | |
1592 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1593 } elsif ($methylation_calls[$index] eq 'h') { | |
1594 $counting{total_unmethylated_CHH_count}++; | |
1595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1596 } | |
1597 elsif ($methylation_calls[$index] eq '.') {} | |
1598 else{ | |
1599 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1600 } | |
1601 } | |
1602 } else { | |
1603 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1604 } | |
1605 } | |
1606 } | |
1607 } | |
1608 | |
1609 ############################################ | |
1610 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### | |
1611 ############################################ | |
1612 | |
1613 elsif ($no_overlap) { | |
1614 ### single-file CpG, CHG and CHH context output | |
1615 if ($full) { | |
1616 if ($strand eq '+') { | |
1617 for my $index (0..$#methylation_calls) { | |
1618 | |
1619 if ($cigar){ # only needed for SAM files | |
1620 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1621 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1622 $cigar_offset += $cigar_mod; | |
1623 $pos_offset += $pos_mod; | |
1624 } | |
1625 | |
1626 ### Returning as soon as the methylation calls start overlapping | |
1627 if ($start+$index+$pos_offset >= $end_read_1) { | |
1628 return; | |
1629 } | |
1630 | |
1631 if ($methylation_calls[$index] eq 'X') { | |
1632 $counting{total_meCHG_count}++; | |
1633 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1634 } elsif ($methylation_calls[$index] eq 'x') { | |
1635 $counting{total_unmethylated_CHG_count}++; | |
1636 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1637 } elsif ($methylation_calls[$index] eq 'Z') { | |
1638 $counting{total_meCpG_count}++; | |
1639 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1640 } elsif ($methylation_calls[$index] eq 'z') { | |
1641 $counting{total_unmethylated_CpG_count}++; | |
1642 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1643 } elsif ($methylation_calls[$index] eq 'H') { | |
1644 $counting{total_meCHH_count}++; | |
1645 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1646 } elsif ($methylation_calls[$index] eq 'h') { | |
1647 $counting{total_unmethylated_CHH_count}++; | |
1648 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1649 } | |
1650 elsif ($methylation_calls[$index] eq '.') {} | |
1651 else{ | |
1652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1653 } | |
1654 } | |
1655 } elsif ($strand eq '-') { | |
1656 for my $index (0..$#methylation_calls) { | |
1657 | |
1658 if ($cigar){ # only needed for SAM files | |
1659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1661 $cigar_offset += $cigar_mod; | |
1662 $pos_offset += $pos_mod; | |
1663 } | |
1664 | |
1665 ### Returning as soon as the methylation calls start overlapping | |
1666 if ($start-$index+$pos_offset <= $end_read_1) { | |
1667 return; | |
1668 } | |
1669 | |
1670 if ($methylation_calls[$index] eq 'X') { | |
1671 $counting{total_meCHG_count}++; | |
1672 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1673 } elsif ($methylation_calls[$index] eq 'x') { | |
1674 $counting{total_unmethylated_CHG_count}++; | |
1675 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1676 } elsif ($methylation_calls[$index] eq 'Z') { | |
1677 $counting{total_meCpG_count}++; | |
1678 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1679 } elsif ($methylation_calls[$index] eq 'z') { | |
1680 $counting{total_unmethylated_CpG_count}++; | |
1681 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1682 } elsif ($methylation_calls[$index] eq 'H') { | |
1683 $counting{total_meCHH_count}++; | |
1684 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1685 } elsif ($methylation_calls[$index] eq 'h') { | |
1686 $counting{total_unmethylated_CHH_count}++; | |
1687 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1688 } | |
1689 elsif ($methylation_calls[$index] eq '.') {} | |
1690 else{ | |
1691 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1692 } | |
1693 } | |
1694 } else { | |
1695 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1696 } | |
1697 } | |
1698 | |
1699 ### strand-specific methylation output | |
1700 else { | |
1701 if ($strand eq '+') { | |
1702 for my $index (0..$#methylation_calls) { | |
1703 | |
1704 if ($cigar){ # only needed for SAM files | |
1705 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1706 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1707 $cigar_offset += $cigar_mod; | |
1708 $pos_offset += $pos_mod; | |
1709 } | |
1710 | |
1711 ### Returning as soon as the methylation calls start overlapping | |
1712 if ($start+$index+$pos_offset >= $end_read_1) { | |
1713 return; | |
1714 } | |
1715 | |
1716 if ($methylation_calls[$index] eq 'X') { | |
1717 $counting{total_meCHG_count}++; | |
1718 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1719 } elsif ($methylation_calls[$index] eq 'x') { | |
1720 $counting{total_unmethylated_CHG_count}++; | |
1721 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1722 } elsif ($methylation_calls[$index] eq 'Z') { | |
1723 $counting{total_meCpG_count}++; | |
1724 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1725 } elsif ($methylation_calls[$index] eq 'z') { | |
1726 $counting{total_unmethylated_CpG_count}++; | |
1727 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1728 } elsif ($methylation_calls[$index] eq 'H') { | |
1729 $counting{total_meCHH_count}++; | |
1730 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1731 } elsif ($methylation_calls[$index] eq 'h') { | |
1732 $counting{total_unmethylated_CHH_count}++; | |
1733 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1734 } | |
1735 elsif ($methylation_calls[$index] eq '.') {} | |
1736 else{ | |
1737 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1738 } | |
1739 } | |
1740 } elsif ($strand eq '-') { | |
1741 for my $index (0..$#methylation_calls) { | |
1742 | |
1743 if ($cigar){ # only needed for SAM files | |
1744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1745 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1746 $cigar_offset += $cigar_mod; | |
1747 $pos_offset += $pos_mod; | |
1748 } | |
1749 | |
1750 ### Returning as soon as the methylation calls start overlapping | |
1751 if ($start-$index+$pos_offset <= $end_read_1) { | |
1752 return; | |
1753 } | |
1754 | |
1755 if ($methylation_calls[$index] eq 'X') { | |
1756 $counting{total_meCHG_count}++; | |
1757 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1758 } elsif ($methylation_calls[$index] eq 'x') { | |
1759 $counting{total_unmethylated_CHG_count}++; | |
1760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1761 } elsif ($methylation_calls[$index] eq 'Z') { | |
1762 $counting{total_meCpG_count}++; | |
1763 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1764 } elsif ($methylation_calls[$index] eq 'z') { | |
1765 $counting{total_unmethylated_CpG_count}++; | |
1766 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1767 } elsif ($methylation_calls[$index] eq 'H') { | |
1768 $counting{total_meCHH_count}++; | |
1769 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1770 } elsif ($methylation_calls[$index] eq 'h') { | |
1771 $counting{total_unmethylated_CHH_count}++; | |
1772 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1773 } | |
1774 elsif ($methylation_calls[$index] eq '.') {} | |
1775 else{ | |
1776 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1777 } | |
1778 } | |
1779 } else { | |
1780 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1781 } | |
1782 } | |
1783 } | |
1784 | |
1785 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
1786 else { | |
1787 ### single-file CpG, CHG and CHH context output | |
1788 if ($full) { | |
1789 if ($strand eq '+') { | |
1790 for my $index (0..$#methylation_calls) { | |
1791 | |
1792 if ($cigar){ # only needed for SAM files | |
1793 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1794 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1795 $cigar_offset += $cigar_mod; | |
1796 $pos_offset += $pos_mod; | |
1797 } | |
1798 | |
1799 if ($methylation_calls[$index] eq 'X') { | |
1800 $counting{total_meCHG_count}++; | |
1801 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1802 } elsif ($methylation_calls[$index] eq 'x') { | |
1803 $counting{total_unmethylated_CHG_count}++; | |
1804 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1805 } elsif ($methylation_calls[$index] eq 'Z') { | |
1806 $counting{total_meCpG_count}++; | |
1807 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1808 } elsif ($methylation_calls[$index] eq 'z') { | |
1809 $counting{total_unmethylated_CpG_count}++; | |
1810 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1811 } elsif ($methylation_calls[$index] eq 'H') { | |
1812 $counting{total_meCHH_count}++; | |
1813 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1814 } elsif ($methylation_calls[$index] eq 'h') { | |
1815 $counting{total_unmethylated_CHH_count}++; | |
1816 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1817 } | |
1818 elsif ($methylation_calls[$index] eq '.') {} | |
1819 else{ | |
1820 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1821 } | |
1822 } | |
1823 } elsif ($strand eq '-') { | |
1824 for my $index (0..$#methylation_calls) { | |
1825 | |
1826 if ($cigar){ # only needed for SAM files | |
1827 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1828 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1829 $cigar_offset += $cigar_mod; | |
1830 $pos_offset += $pos_mod; | |
1831 } | |
1832 | |
1833 if ($methylation_calls[$index] eq 'X') { | |
1834 $counting{total_meCHG_count}++; | |
1835 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1836 } elsif ($methylation_calls[$index] eq 'x') { | |
1837 $counting{total_unmethylated_CHG_count}++; | |
1838 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1839 } elsif ($methylation_calls[$index] eq 'Z') { | |
1840 $counting{total_meCpG_count}++; | |
1841 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1842 } elsif ($methylation_calls[$index] eq 'z') { | |
1843 $counting{total_unmethylated_CpG_count}++; | |
1844 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1845 } elsif ($methylation_calls[$index] eq 'H') { | |
1846 $counting{total_meCHH_count}++; | |
1847 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1848 } elsif ($methylation_calls[$index] eq 'h') { | |
1849 $counting{total_unmethylated_CHH_count}++; | |
1850 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1851 } | |
1852 elsif ($methylation_calls[$index] eq '.') {} | |
1853 else{ | |
1854 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1855 } | |
1856 } | |
1857 } else { | |
1858 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1859 } | |
1860 } | |
1861 | |
1862 ### strand-specific methylation output | |
1863 else { | |
1864 if ($strand eq '+') { | |
1865 for my $index (0..$#methylation_calls) { | |
1866 | |
1867 if ($cigar){ # only needed for SAM files | |
1868 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1869 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
1870 $cigar_offset += $cigar_mod; | |
1871 $pos_offset += $pos_mod; | |
1872 } | |
1873 | |
1874 if ($methylation_calls[$index] eq 'X') { | |
1875 $counting{total_meCHG_count}++; | |
1876 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1877 } elsif ($methylation_calls[$index] eq 'x') { | |
1878 $counting{total_unmethylated_CHG_count}++; | |
1879 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1880 } elsif ($methylation_calls[$index] eq 'Z') { | |
1881 $counting{total_meCpG_count}++; | |
1882 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1883 } elsif ($methylation_calls[$index] eq 'z') { | |
1884 $counting{total_unmethylated_CpG_count}++; | |
1885 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1886 } elsif ($methylation_calls[$index] eq 'H') { | |
1887 $counting{total_meCHH_count}++; | |
1888 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1889 } elsif ($methylation_calls[$index] eq 'h') { | |
1890 $counting{total_unmethylated_CHH_count}++; | |
1891 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1892 } | |
1893 elsif ($methylation_calls[$index] eq '.') {} | |
1894 else{ | |
1895 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1896 } | |
1897 } | |
1898 } elsif ($strand eq '-') { | |
1899 for my $index (0..$#methylation_calls) { | |
1900 | |
1901 if ($cigar){ # only needed for SAM files | |
1902 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
1903 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
1904 $cigar_offset += $cigar_mod; | |
1905 $pos_offset += $pos_mod; | |
1906 } | |
1907 | |
1908 if ($methylation_calls[$index] eq 'X') { | |
1909 $counting{total_meCHG_count}++; | |
1910 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1911 } elsif ($methylation_calls[$index] eq 'x') { | |
1912 $counting{total_unmethylated_CHG_count}++; | |
1913 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1914 } elsif ($methylation_calls[$index] eq 'Z') { | |
1915 $counting{total_meCpG_count}++; | |
1916 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1917 } elsif ($methylation_calls[$index] eq 'z') { | |
1918 $counting{total_unmethylated_CpG_count}++; | |
1919 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1920 } elsif ($methylation_calls[$index] eq 'H') { | |
1921 $counting{total_meCHH_count}++; | |
1922 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1923 } elsif ($methylation_calls[$index] eq 'h') { | |
1924 $counting{total_unmethylated_CHH_count}++; | |
1925 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
1926 } | |
1927 elsif ($methylation_calls[$index] eq '.') {} | |
1928 else{ | |
1929 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
1930 } | |
1931 } | |
1932 } else { | |
1933 die "The strand orientation as neither + nor -: '$strand'\n"; | |
1934 } | |
1935 } | |
1936 } | |
1937 } | |
1938 | |
1939 sub check_cigar_string { | |
1940 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; | |
1941 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; | |
1942 my ($new_cigar_offset,$new_pos_offset) = (0,0); | |
1943 | |
1944 if ($strand eq '+') { | |
1945 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
1946 | |
1947 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
1948 # warn "position needs no adjustment\n"; | |
1949 } | |
1950 | |
1951 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
1952 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
1953 # warn "adjusted genomic position by -1 bp (insertion)\n"; | |
1954 } | |
1955 | |
1956 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
1957 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
1958 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
1959 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
1960 | |
1961 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
1962 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
1963 # warn "position needs no adjustment\n"; | |
1964 last; | |
1965 } | |
1966 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
1967 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
1968 # warn "adjusted genomic position by another -1 bp (insertion)\n"; | |
1969 last; | |
1970 } | |
1971 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
1972 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
1973 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
1974 # warn "adjusted genomic position by another +1 bp (deletion)\n"; | |
1975 } | |
1976 else{ | |
1977 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
1978 } | |
1979 } | |
1980 } | |
1981 else{ | |
1982 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
1983 } | |
1984 } | |
1985 | |
1986 elsif ($strand eq '-') { | |
1987 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
1988 | |
1989 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
1990 # warn "position needs no adjustment\n"; | |
1991 } | |
1992 | |
1993 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
1994 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position | |
1995 # warn "adjusted genomic position by +1 bp (insertion)\n"; | |
1996 } | |
1997 | |
1998 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
1999 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2000 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
2001 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
2002 | |
2003 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
2004 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
2005 # warn "Found new 'M' operation; position needs no adjustment\n"; | |
2006 last; | |
2007 } | |
2008 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
2009 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position | |
2010 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; | |
2011 last; | |
2012 } | |
2013 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
2014 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
2015 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
2016 # warn "adjusted genomic position by another -1 bp (deletion)\n"; | |
2017 } | |
2018 else{ | |
2019 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2020 } | |
2021 } | |
2022 } | |
2023 else{ | |
2024 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
2025 } | |
2026 } | |
2027 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; | |
2028 return ($new_cigar_offset,$new_pos_offset); | |
2029 } | |
2030 | |
2031 sub print_individual_C_methylation_states_single_end{ | |
2032 | |
2033 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; | |
2034 my @methylation_calls = split(//,$meth_call); | |
2035 | |
2036 ################################################################# | |
2037 ### . for bases not involving cytosines ### | |
2038 ### X for methylated C in CHG context (was protected) ### | |
2039 ### x for not methylated C in CHG context (was converted) ### | |
2040 ### H for methylated C in CHH context (was protected) ### | |
2041 ### h for not methylated C in CHH context (was converted) ### | |
2042 ### Z for methylated C in CpG context (was protected) ### | |
2043 ### z for not methylated C in CpG context (was converted) ### | |
2044 ################################################################# | |
2045 | |
2046 my $methyl_CHG_count = 0; | |
2047 my $methyl_CHH_count = 0; | |
2048 my $methyl_CpG_count = 0; | |
2049 my $unmethylated_CHG_count = 0; | |
2050 my $unmethylated_CHH_count = 0; | |
2051 my $unmethylated_CpG_count = 0; | |
2052 | |
2053 my @len; | |
2054 my @ops; | |
2055 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
2056 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
2057 | |
2058 my @comp_cigar; | |
2059 | |
2060 if ($cigar){ # parsing CIGAR string | |
2061 @len = split (/\D+/,$cigar); # storing the length per operation | |
2062 @ops = split (/\d+/,$cigar); # storing the operation | |
2063 shift @ops; # remove the empty first element | |
2064 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
2065 | |
2066 foreach my $index (0..$#len){ | |
2067 foreach (1..$len[$index]){ | |
2068 # print "$ops[$index]"; | |
2069 push @comp_cigar, $ops[$index]; | |
2070 } | |
2071 } | |
2072 # warn "\nDetected CIGAR string: $cigar\n"; | |
2073 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
2074 # warn "number of operations: ",scalar @ops,"\n"; | |
2075 # warn "number of length digits: ",scalar @len,"\n\n"; | |
2076 # print @comp_cigar,"\n"; | |
2077 # print "$meth_call\n\n"; | |
2078 # sleep (1); | |
2079 } | |
2080 | |
2081 ### adjusting the start position for all reads mapping to the reverse strand | |
2082 if ($strand eq '-') { | |
2083 | |
2084 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
2085 # print @comp_cigar,"\n"; | |
2086 | |
2087 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | |
2088 | |
2089 if ($cigar){ ### SAM format | |
2090 my $MD_count = 0; | |
2091 foreach (@comp_cigar){ | |
2092 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
2093 } | |
2094 $start += $MD_count - 1; | |
2095 } | |
2096 else{ ### vanilla format | |
2097 $start += length($meth_call)-1; | |
2098 } | |
2099 } | |
2100 } | |
2101 | |
2102 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) | |
2103 | |
2104 ### single-file CpG and other-context output | |
2105 if ($full and $merge_non_CpG) { | |
2106 if ($strand eq '+') { | |
2107 for my $index (0..$#methylation_calls) { | |
2108 | |
2109 if ($cigar){ # only needed for SAM files | |
2110 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2111 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2112 $cigar_offset += $cigar_mod; | |
2113 $pos_offset += $pos_mod; | |
2114 } | |
2115 | |
2116 ### methylated Cs (any context) will receive a forward (+) orientation | |
2117 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2118 if ($methylation_calls[$index] eq 'X') { | |
2119 $counting{total_meCHG_count}++; | |
2120 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2121 } | |
2122 elsif ($methylation_calls[$index] eq 'x') { | |
2123 $counting{total_unmethylated_CHG_count}++; | |
2124 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2125 } | |
2126 elsif ($methylation_calls[$index] eq 'Z') { | |
2127 $counting{total_meCpG_count}++; | |
2128 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2129 } | |
2130 elsif ($methylation_calls[$index] eq 'z') { | |
2131 $counting{total_unmethylated_CpG_count}++; | |
2132 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2133 } | |
2134 elsif ($methylation_calls[$index] eq 'H') { | |
2135 $counting{total_meCHH_count}++; | |
2136 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2137 } | |
2138 elsif ($methylation_calls[$index] eq 'h') { | |
2139 $counting{total_unmethylated_CHH_count}++; | |
2140 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2141 } | |
2142 elsif ($methylation_calls[$index] eq '.') { | |
2143 } | |
2144 else{ | |
2145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2146 } | |
2147 } | |
2148 } | |
2149 elsif ($strand eq '-') { | |
2150 | |
2151 for my $index (0..$#methylation_calls) { | |
2152 ### methylated Cs (any context) will receive a forward (+) orientation | |
2153 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2154 | |
2155 if ($cigar){ # only needed for SAM files | |
2156 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2157 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2158 $cigar_offset += $cigar_mod; | |
2159 $pos_offset += $pos_mod; | |
2160 } | |
2161 | |
2162 if ($methylation_calls[$index] eq 'X') { | |
2163 $counting{total_meCHG_count}++; | |
2164 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2165 } | |
2166 elsif ($methylation_calls[$index] eq 'x') { | |
2167 $counting{total_unmethylated_CHG_count}++; | |
2168 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2169 } | |
2170 elsif ($methylation_calls[$index] eq 'Z') { | |
2171 $counting{total_meCpG_count}++; | |
2172 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2173 } | |
2174 elsif ($methylation_calls[$index] eq 'z') { | |
2175 $counting{total_unmethylated_CpG_count}++; | |
2176 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2177 } | |
2178 elsif ($methylation_calls[$index] eq 'H') { | |
2179 $counting{total_meCHH_count}++; | |
2180 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2181 } | |
2182 elsif ($methylation_calls[$index] eq 'h') { | |
2183 $counting{total_unmethylated_CHH_count}++; | |
2184 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2185 } | |
2186 elsif ($methylation_calls[$index] eq '.'){ | |
2187 } | |
2188 else{ | |
2189 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2190 } | |
2191 } | |
2192 } | |
2193 else { | |
2194 die "The strand information was neither + nor -: $strand\n"; | |
2195 } | |
2196 } | |
2197 | |
2198 ### strand-specific methylation output | |
2199 elsif ($merge_non_CpG) { | |
2200 if ($strand eq '+') { | |
2201 for my $index (0..$#methylation_calls) { | |
2202 ### methylated Cs (any context) will receive a forward (+) orientation | |
2203 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2204 | |
2205 if ($cigar){ # only needed for SAM files | |
2206 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2207 $cigar_offset += $cigar_mod; | |
2208 $pos_offset += $pos_mod; | |
2209 } | |
2210 | |
2211 if ($methylation_calls[$index] eq 'X') { | |
2212 $counting{total_meCHG_count}++; | |
2213 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2214 } | |
2215 elsif ($methylation_calls[$index] eq 'x') { | |
2216 $counting{total_unmethylated_CHG_count}++; | |
2217 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2218 } | |
2219 elsif ($methylation_calls[$index] eq 'Z') { | |
2220 $counting{total_meCpG_count}++; | |
2221 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2222 } | |
2223 elsif ($methylation_calls[$index] eq 'z') { | |
2224 $counting{total_unmethylated_CpG_count}++; | |
2225 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2226 } | |
2227 elsif ($methylation_calls[$index] eq 'H') { | |
2228 $counting{total_meCHH_count}++; | |
2229 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2230 } | |
2231 elsif ($methylation_calls[$index] eq 'h') { | |
2232 $counting{total_unmethylated_CHH_count}++; | |
2233 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2234 } | |
2235 elsif ($methylation_calls[$index] eq '.') { | |
2236 } | |
2237 else{ | |
2238 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2239 } | |
2240 } | |
2241 } | |
2242 elsif ($strand eq '-') { | |
2243 | |
2244 for my $index (0..$#methylation_calls) { | |
2245 ### methylated Cs (any context) will receive a forward (+) orientation | |
2246 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2247 | |
2248 if ($cigar){ # only needed for SAM files | |
2249 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2250 $cigar_offset += $cigar_mod; | |
2251 $pos_offset += $pos_mod; | |
2252 } | |
2253 | |
2254 if ($methylation_calls[$index] eq 'X') { | |
2255 $counting{total_meCHG_count}++; | |
2256 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2257 } | |
2258 elsif ($methylation_calls[$index] eq 'x') { | |
2259 $counting{total_unmethylated_CHG_count}++; | |
2260 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2261 } | |
2262 elsif ($methylation_calls[$index] eq 'Z') { | |
2263 $counting{total_meCpG_count}++; | |
2264 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2265 } | |
2266 elsif ($methylation_calls[$index] eq 'z') { | |
2267 $counting{total_unmethylated_CpG_count}++; | |
2268 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2269 } | |
2270 elsif ($methylation_calls[$index] eq 'H') { | |
2271 $counting{total_meCHH_count}++; | |
2272 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2273 } | |
2274 elsif ($methylation_calls[$index] eq 'h') { | |
2275 $counting{total_unmethylated_CHH_count}++; | |
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2277 } | |
2278 elsif ($methylation_calls[$index] eq '.') { | |
2279 } | |
2280 else{ | |
2281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2282 } | |
2283 } | |
2284 } | |
2285 else { | |
2286 die "The strand information was neither + nor -: $strand\n"; | |
2287 } | |
2288 } | |
2289 | |
2290 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION | |
2291 | |
2292 elsif ($full) { | |
2293 if ($strand eq '+') { | |
2294 for my $index (0..$#methylation_calls) { | |
2295 ### methylated Cs (any context) will receive a forward (+) orientation | |
2296 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2297 | |
2298 if ($cigar){ # only needed for SAM files | |
2299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2300 $cigar_offset += $cigar_mod; | |
2301 $pos_offset += $pos_mod; | |
2302 } | |
2303 | |
2304 if ($methylation_calls[$index] eq 'X') { | |
2305 $counting{total_meCHG_count}++; | |
2306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2307 } elsif ($methylation_calls[$index] eq 'x') { | |
2308 $counting{total_unmethylated_CHG_count}++; | |
2309 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2310 } elsif ($methylation_calls[$index] eq 'Z') { | |
2311 $counting{total_meCpG_count}++; | |
2312 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2313 } elsif ($methylation_calls[$index] eq 'z') { | |
2314 $counting{total_unmethylated_CpG_count}++; | |
2315 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2316 } elsif ($methylation_calls[$index] eq 'H') { | |
2317 $counting{total_meCHH_count}++; | |
2318 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2319 } elsif ($methylation_calls[$index] eq 'h') { | |
2320 $counting{total_unmethylated_CHH_count}++; | |
2321 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2322 } | |
2323 elsif ($methylation_calls[$index] eq '.') {} | |
2324 else{ | |
2325 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2326 } | |
2327 } | |
2328 } | |
2329 elsif ($strand eq '-') { | |
2330 | |
2331 for my $index (0..$#methylation_calls) { | |
2332 ### methylated Cs (any context) will receive a forward (+) orientation | |
2333 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2334 | |
2335 if ($cigar){ # only needed for SAM files | |
2336 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2337 $cigar_offset += $cigar_mod; | |
2338 $pos_offset += $pos_mod; | |
2339 } | |
2340 | |
2341 if ($methylation_calls[$index] eq 'X') { | |
2342 $counting{total_meCHG_count}++; | |
2343 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2344 } elsif ($methylation_calls[$index] eq 'x') { | |
2345 $counting{total_unmethylated_CHG_count}++; | |
2346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2347 } elsif ($methylation_calls[$index] eq 'Z') { | |
2348 $counting{total_meCpG_count}++; | |
2349 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2350 } elsif ($methylation_calls[$index] eq 'z') { | |
2351 $counting{total_unmethylated_CpG_count}++; | |
2352 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2353 } elsif ($methylation_calls[$index] eq 'H') { | |
2354 $counting{total_meCHH_count}++; | |
2355 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2356 } elsif ($methylation_calls[$index] eq 'h') { | |
2357 $counting{total_unmethylated_CHH_count}++; | |
2358 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2359 } | |
2360 elsif ($methylation_calls[$index] eq '.') {} | |
2361 else{ | |
2362 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2363 } | |
2364 } | |
2365 } | |
2366 else { | |
2367 die "The read had a strand orientation which was neither + nor -: $strand\n"; | |
2368 } | |
2369 } | |
2370 | |
2371 ### strand-specific methylation output | |
2372 else { | |
2373 if ($strand eq '+') { | |
2374 for my $index (0..$#methylation_calls) { | |
2375 ### methylated Cs (any context) will receive a forward (+) orientation | |
2376 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2377 | |
2378 if ($cigar){ # only needed for SAM files | |
2379 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2380 $cigar_offset += $cigar_mod; | |
2381 $pos_offset += $pos_mod; | |
2382 } | |
2383 | |
2384 if ($methylation_calls[$index] eq 'X') { | |
2385 $counting{total_meCHG_count}++; | |
2386 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2387 } elsif ($methylation_calls[$index] eq 'x') { | |
2388 $counting{total_unmethylated_CHG_count}++; | |
2389 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2390 } elsif ($methylation_calls[$index] eq 'Z') { | |
2391 $counting{total_meCpG_count}++; | |
2392 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2393 } elsif ($methylation_calls[$index] eq 'z') { | |
2394 $counting{total_unmethylated_CpG_count}++; | |
2395 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2396 } elsif ($methylation_calls[$index] eq 'H') { | |
2397 $counting{total_meCHH_count}++; | |
2398 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2399 } elsif ($methylation_calls[$index] eq 'h') { | |
2400 $counting{total_unmethylated_CHH_count}++; | |
2401 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2402 } | |
2403 elsif ($methylation_calls[$index] eq '.') {} | |
2404 else{ | |
2405 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2406 } | |
2407 } | |
2408 } | |
2409 elsif ($strand eq '-') { | |
2410 | |
2411 for my $index (0..$#methylation_calls) { | |
2412 ### methylated Cs (any context) will receive a forward (+) orientation | |
2413 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
2414 | |
2415 if ($cigar){ # only needed for SAM files | |
2416 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2417 $cigar_offset += $cigar_mod; | |
2418 $pos_offset += $pos_mod; | |
2419 } | |
2420 | |
2421 if ($methylation_calls[$index] eq 'X') { | |
2422 $counting{total_meCHG_count}++; | |
2423 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2424 } elsif ($methylation_calls[$index] eq 'x') { | |
2425 $counting{total_unmethylated_CHG_count}++; | |
2426 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2427 } elsif ($methylation_calls[$index] eq 'Z') { | |
2428 $counting{total_meCpG_count}++; | |
2429 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2430 } elsif ($methylation_calls[$index] eq 'z') { | |
2431 $counting{total_unmethylated_CpG_count}++; | |
2432 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2433 } elsif ($methylation_calls[$index] eq 'H') { | |
2434 $counting{total_meCHH_count}++; | |
2435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2436 } elsif ($methylation_calls[$index] eq 'h') { | |
2437 $counting{total_unmethylated_CHH_count}++; | |
2438 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2439 } | |
2440 elsif ($methylation_calls[$index] eq '.') {} | |
2441 else{ | |
2442 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2443 } | |
2444 } | |
2445 } | |
2446 else { | |
2447 die "The strand information was neither + nor -: $strand\n"; | |
2448 } | |
2449 } | |
2450 } | |
2451 | |
2452 sub print_helpfile{ | |
2453 | |
2454 print << 'HOW_TO'; | |
2455 | |
2456 | |
2457 DESCRIPTION | |
2458 | |
2459 The following is a brief description of all options to control the Bismark | |
2460 methylation extractor. The script reads in a bisulfite read alignment results file | |
2461 produced by the Bismark bisulfite mapper and extracts the methylation information | |
2462 for individual cytosines. This information is found in the methylation call field | |
2463 which can contain the following characters: | |
2464 | |
2465 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
2466 ~~~ X for methylated C in CHG context (was protected) ~~~ | |
2467 ~~~ x for not methylated C CHG (was converted) ~~~ | |
2468 ~~~ H for methylated C in CHH context (was protected) ~~~ | |
2469 ~~~ h for not methylated C in CHH context (was converted) ~~~ | |
2470 ~~~ Z for methylated C in CpG context (was protected) ~~~ | |
2471 ~~~ z for not methylated C in CpG context (was converted) ~~~ | |
2472 ~~~ . for any bases not involving cytosines ~~~ | |
2473 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
2474 | |
2475 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH | |
2476 context (this distinction is actually already made in Bismark itself). As the methylation | |
2477 information for every C analysed can produce files which easily have tens or even hundreds of | |
2478 millions of lines, file sizes can become very large and more difficult to handle. The C | |
2479 methylation info additionally splits cytosine methylation calls up into one of the four possible | |
2480 strands a given bisulfite read aligned against: | |
2481 | |
2482 OT original top strand | |
2483 CTOT complementary to original top strand | |
2484 | |
2485 OB original bottom strand | |
2486 CTOB complementary to original bottom strand | |
2487 | |
2488 Thus, by default twelve individual output files are being generated per input file (unless | |
2489 --comprehensive is specified, see below). The output files can be imported into a genome | |
2490 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact | |
2491 unless the bisulfite reads were generated preserving directionality it doesn't make any | |
2492 sense to look at the data in a strand-specific manner). Strand-specific output files can | |
2493 optionally be skipped, in which case only three output files for CpG, CHG or CHH context | |
2494 will be generated. For both the strand-specific and comprehensive outputs there is also | |
2495 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. | |
2496 | |
2497 | |
2498 The output files are in the following format (tab delimited): | |
2499 | |
2500 <sequence_id> <strand> <chromosome> <position> <methylation call> | |
2501 | |
2502 | |
2503 USAGE: methylation_extractor [options] <filenames> | |
2504 | |
2505 | |
2506 ARGUMENTS: | |
2507 | |
2508 <filenames> A space-separated list of Bismark result files in SAM format from | |
2509 which methylation information is extracted for every cytosine in | |
2510 the reads. For alignment files in the older custom Bismark output | |
2511 see option '--vanilla'. | |
2512 | |
2513 OPTIONS: | |
2514 | |
2515 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end | |
2516 read data. Specifying either --single-end or --paired-end is | |
2517 mandatory. | |
2518 | |
2519 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end | |
2520 read data. Specifying either --paired-end or --single-end is | |
2521 mandatory. | |
2522 | |
2523 --vanilla The Bismark result input file(s) are in the old custom Bismark format | |
2524 (up to version 0.5.x) and not in SAM format which is the default as | |
2525 of Bismark version 0.6.x or higher. Default: OFF. | |
2526 | |
2527 --no_overlap For paired-end reads it is theoretically possible that read_1 and | |
2528 read_2 overlap. This option avoids scoring overlapping methylation | |
2529 calls twice. Whilst this removes a bias towards more methylation calls | |
2530 towards the center of sequenced fragments it can de facto remove | |
2531 a good proportion of the data. | |
2532 | |
2533 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the | |
2534 methylation call string. This can remove e.g. a restriction enzyme site | |
2535 at the start of each read. | |
2536 | |
2537 --comprehensive Specifying this option will merge all four possible strand-specific | |
2538 methylation info into context-dependent output files. The default | |
2539 contexts are: | |
2540 - CpG context | |
2541 - CHG context | |
2542 - CHH context | |
2543 | |
2544 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight | |
2545 strand-specific output files (default) for Cs in | |
2546 - CpG context | |
2547 - non-CpG context | |
2548 | |
2549 --report Prints out a short methylation summary as well as the paramaters used to run | |
2550 this script. | |
2551 | |
2552 --version Displays version information. | |
2553 | |
2554 -h/--help Displays this help file and exits. | |
2555 | |
2556 | |
2557 OUTPUT: | |
2558 | |
2559 The output is in the form: | |
2560 | |
2561 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call> | |
2562 | |
2563 * Methylated cytosines receive a '+' orientation, | |
2564 * Unmethylated cytosines receive a '-' orientation. | |
2565 | |
2566 | |
2567 This script was last modified on 31 July 2012. | |
2568 | |
2569 HOW_TO | |
2570 } |