Mercurial > repos > bgruening > bismark
comparison bismark_methyl_extractor/coverage2cytosine @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/env perl | |
2 use warnings; | |
3 use strict; | |
4 $|++; | |
5 use Getopt::Long; | |
6 use Cwd; | |
7 use Carp; | |
8 | |
9 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) | |
10 | |
11 ## This program is free software: you can redistribute it and/or modify | |
12 ## it under the terms of the GNU General Public License as published by | |
13 ## the Free Software Foundation, either version 3 of the License, or | |
14 ## (at your option) any later version. | |
15 | |
16 ## This program is distributed in the hope that it will be useful, | |
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 ## GNU General Public License for more details. | |
20 | |
21 ## You should have received a copy of the GNU General Public License | |
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
23 | |
24 my %chromosomes; # storing sequence information of all chromosomes/scaffolds | |
25 my %processed; # keeping a record of which chromosomes have been processed | |
26 my $coverage2cytosine_version = 'v0.16.3'; | |
27 | |
28 my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra) = process_commandline(); | |
29 | |
30 warn "Summary of parameters for genome-wide cytosine report:\n"; | |
31 warn '='x78,"\n"; | |
32 warn "Coverage infile:\t\t\t$coverage_infile\n"; | |
33 warn "Output directory:\t\t\t>$output_dir<\n"; | |
34 warn "Parent directory:\t\t\t>$parent_dir<\n"; | |
35 warn "Genome directory:\t\t\t>$genome_folder<\n"; | |
36 | |
37 if ($CX_context){ | |
38 warn "CX context:\t\t\t\tyes\n"; | |
39 } | |
40 else{ | |
41 warn "CX context:\t\t\t\tno (CpG context only, default)\n"; | |
42 } | |
43 if ($merge_CpGs){ | |
44 warn "Pooling CpG top/bottom strand evidence:\tyes\n"; | |
45 } | |
46 if($gc_context){ | |
47 warn "Optional GC context track:\t\tyes\n"; | |
48 } | |
49 if ($tetra){ | |
50 warn "Tetra/Penta nucleotide context:\t\tyes\n"; | |
51 } | |
52 | |
53 if ($zero){ | |
54 warn "Genome coordinates used:\t\t0-based (user specified)\n"; | |
55 } | |
56 else{ | |
57 warn "Genome coordinates used:\t\t1-based (default)\n"; | |
58 } | |
59 | |
60 if ($gzip){ | |
61 warn "GZIP compression:\t\t\tyes\n"; | |
62 } | |
63 else{ | |
64 warn "GZIP compression:\t\t\tno\n"; | |
65 } | |
66 | |
67 if ($split_by_chromosome){ | |
68 warn "Split by chromosome:\t\t\tyes\n\n\n"; | |
69 } | |
70 else{ | |
71 warn "Split by chromosome:\t\t\tno\n\n\n"; | |
72 } | |
73 sleep (3); | |
74 | |
75 read_genome_into_memory(); | |
76 warn "Stored sequence information of ",scalar keys %chromosomes," chromosomes/scaffolds in total\n\n"; | |
77 | |
78 generate_genome_wide_cytosine_report($coverage_infile); | |
79 | |
80 ### 11 December 2014 | |
81 | |
82 # The following optional section re-reads the genome-wide report and merges methylation evidence of both top and bottom strand | |
83 # into a single CpG dinucleotide entity. This significantly simplifies downstream processing, e.g. by the bsseq R-/Bioconductor package | |
84 # which recommends this merging process to increase coverage per CpG and reduce the memory burden for its processing | |
85 | |
86 if ($merge_CpGs){ | |
87 # we only allow this operation if the report is limited to CpG context, and for a single report for the entire genome for the time being | |
88 combine_CpGs_to_single_CG_entity($cytosine_out); | |
89 } | |
90 | |
91 ### 18 August 2015 | |
92 | |
93 # The following section reprocessed the genome to generate cytosine methylation output in GC context (e.g. when a GpC methylase had been deployed | |
94 if ($gc_context){ | |
95 generate_GC_context_report($coverage_infile); | |
96 } | |
97 | |
98 | |
99 sub combine_CpGs_to_single_CG_entity{ | |
100 my $CpG_report_file = shift; | |
101 warn "Now merging top and bottom strand CpGs into a single CG dinucleotide entity\n"; | |
102 | |
103 open (IN,$CpG_report_file) or die "Failed to open file $CpG_report_file: $!\n\n"; | |
104 my $pooled_CG = $CpG_report_file; | |
105 $pooled_CG =~ s/$/.merged_CpG_evidence.cov/; | |
106 open (OUT,'>',$pooled_CG) or die "Failed to write to file '$pooled_CG': $!\n\n"; | |
107 warn ">>> Writing a new coverage file with top and bottom strand CpG methylation evidence merged to $pooled_CG <<<\n\n"; | |
108 sleep(1); | |
109 | |
110 while (1){ | |
111 my $line1 = <IN>; | |
112 my $line2 = <IN>; | |
113 last unless ($line1 and $line2); | |
114 | |
115 chomp $line1; | |
116 chomp $line2; | |
117 | |
118 my ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); | |
119 my ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); | |
120 # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; | |
121 # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; sleep(1); | |
122 | |
123 if ($pos1 < 2){ | |
124 $line1 = $line2; | |
125 $line2 = <IN>; | |
126 chomp $line2; | |
127 ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); | |
128 ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); | |
129 } | |
130 | |
131 # a few sanity checks | |
132 die "The context of the first line was not CG:\t$line1" unless ($context1 eq 'CG'); | |
133 die "The context of the second line was not CG:\t$line2" unless ($context2 eq 'CG'); | |
134 | |
135 unless ($strand1 eq '+' and $strand2 eq'-'){ | |
136 die "The strand of line 1 and line 2 were not + and -:\nline1:\t$line1\nline2:\t$line2\n"; | |
137 } | |
138 unless ($pos2 eq ($pos1 + 1)){ | |
139 die "The reported position were not spaced 1bp apart:\nline1:\t$line1\nline2:\t$line2\n"; | |
140 } | |
141 unless($chr1 eq $chr2){ | |
142 die "The chromsome information for line 1 and 2 did not match:\nline1:\t$line1\nline2:\t$line2\n"; | |
143 } | |
144 | |
145 # This looks alright from what I can tell, lets pool the 2 strands | |
146 | |
147 my $pooled_m = $m1 + $m2; | |
148 my $pooled_u = $u1 + $u2; | |
149 | |
150 # since this is a new coverage file we only write it out if the coverage is at least 1 | |
151 next unless ($pooled_m + $pooled_u) > 0; | |
152 | |
153 my $pooled_percentage = sprintf ("%.6f",$pooled_m/ ($pooled_m + $pooled_u) *100); | |
154 # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; | |
155 # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; | |
156 # print join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; | |
157 # sleep(1); | |
158 print OUT join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; | |
159 } | |
160 | |
161 warn "CpG merging complete. Good luck finding DMRs with bsseq!\n\n"; | |
162 | |
163 } | |
164 | |
165 | |
166 sub process_commandline{ | |
167 my $help; | |
168 my $output_dir; | |
169 my $genome_folder; | |
170 my $zero; | |
171 my $CpG_only; | |
172 my $CX_context; | |
173 my $gzip; | |
174 | |
175 my $split_by_chromosome; | |
176 my $cytosine_out; | |
177 my $parent_dir; | |
178 my $version; | |
179 my $merge_CpGs; | |
180 my $gc_context; | |
181 my $tetra; | |
182 | |
183 my $command_line = GetOptions ('help|man' => \$help, | |
184 'dir=s' => \$output_dir, | |
185 'g|genome_folder=s' => \$genome_folder, | |
186 "zero_based" => \$zero, | |
187 "CX|CX_context" => \$CX_context, | |
188 "split_by_chromosome" => \$split_by_chromosome, | |
189 'o|output=s' => \$cytosine_out, | |
190 'parent_dir=s' => \$parent_dir, | |
191 'version' => \$version, | |
192 'merge_CpGs' => \$merge_CpGs, | |
193 'GC|GC_context' => \$gc_context, | |
194 'ff|qq' => \$tetra, | |
195 'gzip' => \$gzip, | |
196 ); | |
197 | |
198 ### EXIT ON ERROR if there were errors with any of the supplied options | |
199 unless ($command_line){ | |
200 die "Please respecify command line options\n"; | |
201 } | |
202 | |
203 ### HELPFILE | |
204 if ($help){ | |
205 print_helpfile(); | |
206 exit; | |
207 } | |
208 | |
209 if ($version){ | |
210 print << "VERSION"; | |
211 | |
212 | |
213 Bismark Methylation Extractor Module - | |
214 coverage2cytosine | |
215 | |
216 Bismark Extractor Version: $coverage2cytosine_version | |
217 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics | |
218 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
219 | |
220 | |
221 VERSION | |
222 exit; | |
223 } | |
224 | |
225 ### no files provided | |
226 unless (@ARGV){ | |
227 warn "You need to provide a Bismark coverage file (with counts methylated/unmethylated cytosines) to create an individual C methylation output. Please respecify!\n"; | |
228 sleep(2); | |
229 | |
230 print_helpfile(); | |
231 exit; | |
232 } | |
233 | |
234 my $coverage_infile = shift @ARGV; | |
235 | |
236 unless ($parent_dir){ | |
237 $parent_dir = getcwd(); | |
238 } | |
239 unless ($parent_dir =~ /\/$/){ | |
240 $parent_dir =~ s/$/\//; | |
241 } | |
242 | |
243 unless (defined $cytosine_out){ | |
244 die "Please provide the name of the output file using the option -o/--output filename\n"; | |
245 } | |
246 | |
247 ### OUTPUT DIR PATH | |
248 if (defined $output_dir){ | |
249 unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it | |
250 unless ($output_dir =~ /\/$/){ | |
251 $output_dir =~ s/$/\//; | |
252 } | |
253 } | |
254 } | |
255 else{ | |
256 $output_dir = ''; | |
257 } | |
258 | |
259 unless ($CX_context){ | |
260 $CX_context = 0; | |
261 $CpG_only = 1; | |
262 } | |
263 | |
264 ### GENOME folder | |
265 if ($genome_folder){ | |
266 unless ($genome_folder =~/\/$/){ | |
267 $genome_folder =~ s/$/\//; | |
268 } | |
269 } | |
270 else{ | |
271 die "Please specify a genome folder to proceed (full path only)\n"; | |
272 } | |
273 | |
274 if ($merge_CpGs){ | |
275 if ($CX_context){ | |
276 die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if CpG-context is selected only (lose the option --CX)\n"; | |
277 } | |
278 if ($split_by_chromosome){ | |
279 die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if a single CpG report is written out (lose the option --split_by_chromosome)\n"; | |
280 } | |
281 } | |
282 | |
283 return ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra); | |
284 } | |
285 | |
286 | |
287 | |
288 sub generate_genome_wide_cytosine_report { | |
289 | |
290 warn "="x78,"\n"; | |
291 warn "Methylation information will now be written into a genome-wide cytosine report\n"; | |
292 warn "="x78,"\n\n"; | |
293 sleep (2); | |
294 | |
295 my $number_processed = 0; | |
296 | |
297 ### changing to the output directory again | |
298 unless ($output_dir eq ''){ # default | |
299 chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
300 # warn "Changed directory to $output_dir\n"; | |
301 } | |
302 | |
303 my $in = shift; | |
304 # infiles handed over by the methylation extractor will be just the filename on their own. The directory should have been handed over with --dir | |
305 if ($in =~ /gz$/){ | |
306 open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; # changed from gunzip -c to gunzip -c 08 04 16 | |
307 } | |
308 else{ | |
309 open (IN,"$in") or die "Failed to read from file $in: $!\n"; | |
310 } | |
311 | |
312 | |
313 ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands | |
314 unless ($split_by_chromosome){ ### writing all output to a single file (default) | |
315 if ($gzip){ | |
316 unless ($cytosine_out =~ /\.gz$/){ | |
317 $cytosine_out .= '.gz'; | |
318 } | |
319 open (CYT,"| gzip -c - > $cytosine_out") or die "Failed to write to file $cytosine_out: $!\n"; | |
320 } | |
321 else{ | |
322 open (CYT,'>',$cytosine_out) or die "Failed to write to file $cytosine_out: $!\n"; | |
323 } | |
324 warn ">>> Writing genome-wide cytosine report to: $cytosine_out <<<\n\n"; | |
325 sleep (1); | |
326 } | |
327 | |
328 my $last_chr; | |
329 my %chr; # storing reads for one chromosome at a time | |
330 | |
331 my $count = 0; | |
332 while (<IN>){ | |
333 chomp; | |
334 ++$count; | |
335 my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); | |
336 | |
337 # defining the first chromosome | |
338 unless (defined $last_chr){ | |
339 $last_chr = $chr; | |
340 ++$number_processed; | |
341 # warn "Storing all covered cytosine positions for chromosome: $chr\n"; | |
342 } | |
343 | |
344 ### As of version 0.9.1 the start positions are 1-based! | |
345 if ($chr eq $last_chr){ | |
346 $chr{$chr}->{$start}->{meth} = $meth; | |
347 $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
348 } | |
349 else{ | |
350 warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
351 ++$number_processed; | |
352 | |
353 if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
354 my $chromosome_out = $cytosine_out; | |
355 if ($chromosome_out =~ /\.txt$/){ | |
356 $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; | |
357 } | |
358 else{ | |
359 $chromosome_out =~ s/$/.chr${last_chr}.txt/; | |
360 } | |
361 open (CYT,'>',$chromosome_out) or die $!; | |
362 # warn "Writing output for $last_chr to $chromosome_out\n"; | |
363 } | |
364 | |
365 my $tri_nt; | |
366 my $tetra_nt; | |
367 my $penta_nt; | |
368 my $context; | |
369 | |
370 $processed{$last_chr} = 1; | |
371 while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
372 | |
373 $tri_nt = ''; | |
374 $context = ''; | |
375 | |
376 if ($tetra){ | |
377 $tetra_nt = ''; # clearing | |
378 $penta_nt = ''; | |
379 } | |
380 | |
381 my $pos = pos$chromosomes{$last_chr}; | |
382 | |
383 my $strand; | |
384 my $meth = 0; | |
385 my $nonmeth = 0; | |
386 | |
387 if ($1 eq 'C'){ # C on forward strand | |
388 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
389 if ($tetra){ | |
390 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ | |
391 $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); | |
392 } | |
393 else{ | |
394 $tetra_nt = ''; | |
395 } | |
396 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ | |
397 $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); | |
398 } | |
399 else{ | |
400 $penta_nt = ''; | |
401 } | |
402 } | |
403 $strand = '+'; | |
404 } | |
405 elsif ($1 eq 'G'){ # C on reverse strand | |
406 | |
407 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
408 $tri_nt = reverse $tri_nt; | |
409 $tri_nt =~ tr/ACTG/TGAC/; | |
410 | |
411 if ($tetra){ | |
412 if ( $pos - 4 >= 0 ){ | |
413 $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); | |
414 $tetra_nt = reverse $tetra_nt; | |
415 $tetra_nt =~ tr/ACTG/TGAC/; | |
416 } | |
417 else{ | |
418 $tetra_nt = ''; | |
419 } | |
420 if ( $pos - 5 >= 0 ){ | |
421 $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); | |
422 $penta_nt = reverse $penta_nt; | |
423 $penta_nt =~ tr/ACTG/TGAC/; | |
424 } | |
425 else{ | |
426 $penta_nt = ''; | |
427 } | |
428 } | |
429 $strand = '-'; | |
430 } | |
431 | |
432 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
433 | |
434 # if (length$penta_nt < 5){ | |
435 # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); | |
436 # } | |
437 | |
438 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! (as of v0.9.1) | |
439 $meth = $chr{$last_chr}->{$pos}->{meth}; | |
440 $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; | |
441 } | |
442 | |
443 | |
444 ### determining cytosine context | |
445 if ($tri_nt =~ /^CG/){ | |
446 $context = 'CG'; | |
447 } | |
448 elsif ($tri_nt =~ /^C.{1}G$/){ | |
449 $context = 'CHG'; | |
450 } | |
451 elsif ($tri_nt =~ /^C.{2}$/){ | |
452 $context = 'CHH'; | |
453 } | |
454 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
455 warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
456 next; | |
457 } | |
458 | |
459 if ($CpG_only){ | |
460 if ($tri_nt =~ /^CG/){ # CpG context is the default | |
461 if ($zero){ # zero based coordinates | |
462 $pos -= 1; | |
463 if ($tetra){ | |
464 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
465 } | |
466 else{ | |
467 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
468 } | |
469 } | |
470 else{ # default | |
471 if ($tetra){ | |
472 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
473 } | |
474 else{ | |
475 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
476 } | |
477 } | |
478 } | |
479 } | |
480 else{ ## all cytosines, specified with --CX | |
481 if ($zero){ # zero based coordinates | |
482 $pos -= 1; | |
483 if ($tetra){ | |
484 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
485 } | |
486 else{ | |
487 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n" | |
488 } | |
489 } | |
490 else{ # default | |
491 if ($tetra){ | |
492 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
493 } | |
494 else{ | |
495 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
496 } | |
497 } | |
498 } | |
499 } | |
500 | |
501 %chr = (); # resetting the hash | |
502 | |
503 # new first entry | |
504 $last_chr = $chr; | |
505 $chr{$chr}->{$start}->{meth} = $meth; | |
506 $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
507 } | |
508 } | |
509 | |
510 # Last found chromosome | |
511 # If there never was a last chromosome then something must have gone wrong with reading the data in | |
512 unless (defined $last_chr){ | |
513 die "No last chromosome was defined, something must have gone wrong while reading the data in (e.g. specified wrong file path for a gzipped coverage file?). Please check your command!\n\n"; | |
514 } | |
515 | |
516 warn "Writing cytosine report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
517 $processed{$last_chr} = 1; | |
518 | |
519 if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
520 my $chromosome_out = $cytosine_out; | |
521 if ($chromosome_out =~ /\.txt$/){ # files passed on by the methylation extractor end in _report.txt | |
522 $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; | |
523 } | |
524 else{ # user specified output file name | |
525 $chromosome_out =~ s/$/.chr${last_chr}.txt/; | |
526 } | |
527 open (CYT,'>',$chromosome_out) or die $!; | |
528 # warn "Writing output for $last_chr to $chromosome_out\n"; | |
529 } | |
530 | |
531 my $tri_nt; | |
532 my $tetra_nt; | |
533 my $penta_nt; | |
534 my $context; | |
535 | |
536 while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
537 | |
538 $tri_nt = ''; | |
539 $context = ''; | |
540 | |
541 if ($tetra){ | |
542 $tetra_nt = ''; | |
543 $penta_nt = ''; | |
544 } | |
545 | |
546 my $pos = pos$chromosomes{$last_chr}; | |
547 | |
548 my $strand; | |
549 my $meth = 0; | |
550 my $nonmeth = 0; | |
551 | |
552 if ($1 eq 'C'){ # C on forward strand | |
553 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
554 $strand = '+'; | |
555 | |
556 if ($tetra){ | |
557 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ | |
558 $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); | |
559 } | |
560 else{ | |
561 $tetra_nt = ''; | |
562 } | |
563 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ | |
564 $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); | |
565 } | |
566 else{ | |
567 $penta_nt = ''; | |
568 } | |
569 } | |
570 | |
571 } | |
572 elsif ($1 eq 'G'){ # C on reverse strand | |
573 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
574 $tri_nt = reverse $tri_nt; | |
575 $tri_nt =~ tr/ACTG/TGAC/; | |
576 $strand = '-'; | |
577 | |
578 if ($tetra){ | |
579 if ( $pos - 4 >= 0 ){ | |
580 $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); | |
581 $tetra_nt = reverse $tetra_nt; | |
582 $tetra_nt =~ tr/ACTG/TGAC/; | |
583 } | |
584 else{ | |
585 $tetra_nt = ''; | |
586 } | |
587 | |
588 if ( $pos - 5 >= 0 ){ | |
589 $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); | |
590 $penta_nt = reverse $penta_nt; | |
591 $penta_nt =~ tr/ACTG/TGAC/; | |
592 } | |
593 else{ | |
594 $penta_nt = ''; | |
595 } | |
596 } | |
597 } | |
598 | |
599 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! as of v0.9.1 | |
600 $meth = $chr{$last_chr}->{$pos}->{meth}; | |
601 $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; | |
602 } | |
603 | |
604 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
605 # if (length$penta_nt < 5){ | |
606 # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); | |
607 # } | |
608 | |
609 ### determining cytosine context | |
610 if ($tri_nt =~ /^CG/){ | |
611 $context = 'CG'; | |
612 } | |
613 elsif ($tri_nt =~ /^C.{1}G$/){ | |
614 $context = 'CHG'; | |
615 } | |
616 elsif ($tri_nt =~ /^C.{2}$/){ | |
617 $context = 'CHH'; | |
618 } | |
619 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
620 warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
621 next; | |
622 } | |
623 | |
624 if ($CpG_only){ | |
625 if ($tri_nt =~ /^CG/){ # CpG context is the default | |
626 if ($zero){ # zero-based coordinates | |
627 $pos -= 1; | |
628 if ($tetra){ | |
629 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
630 } | |
631 else{ | |
632 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
633 } | |
634 } | |
635 else{ # default | |
636 if ($tetra){ | |
637 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
638 } | |
639 else{ | |
640 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
641 } | |
642 } | |
643 } | |
644 } | |
645 else{ ## all cytosines, specified with --CX | |
646 if ($zero){ # zero based coordinates | |
647 $pos -= 1; | |
648 if ($tetra){ | |
649 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
650 } | |
651 else{ | |
652 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
653 } | |
654 } | |
655 else{ # default | |
656 if ($tetra){ | |
657 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
658 } | |
659 else{ | |
660 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
661 } | |
662 } | |
663 } | |
664 } | |
665 | |
666 warn "Finished writing out cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; | |
667 | |
668 ### Now processing chromosomes that were not covered in the coverage file | |
669 warn "Now processing chromosomes that were not covered by any methylation calls in the coverage file...\n"; | |
670 my $unprocessed = 0; | |
671 | |
672 foreach my $chr (sort keys %processed) { | |
673 unless ( $processed{$chr} ) { | |
674 ++$unprocessed; | |
675 ++$number_processed; | |
676 process_unprocessed_chromosomes($chr); | |
677 } | |
678 } | |
679 | |
680 if ($unprocessed == 0) { | |
681 warn "All chromosomes in the genome were covered by at least some reads. coverage2cytosine processing complete.\n\n"; | |
682 } | |
683 else{ | |
684 warn "Finished writing out cytosine report (processed $number_processed chromosomes/scaffolds in total). coverage2cytosine processing complete.\n\n"; | |
685 } | |
686 | |
687 close CYT or warn $!; | |
688 | |
689 } | |
690 | |
691 | |
692 #### GC CONTEXT - optional | |
693 #### | |
694 | |
695 sub generate_GC_context_report { | |
696 | |
697 warn "="x82,"\n"; | |
698 warn "Methylation information for GC context will now be written to a GpC-context report\n"; | |
699 warn "="x82,"\n\n"; | |
700 sleep (2); | |
701 | |
702 my $number_processed = 0; | |
703 | |
704 ### changing to the output directory again | |
705 unless ($output_dir eq ''){ # default | |
706 chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
707 # warn "Changed directory to $output_dir\n"; | |
708 } | |
709 | |
710 my $in = shift; | |
711 if ($in =~ /gz$/){ | |
712 open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; | |
713 } | |
714 else{ | |
715 open (IN,"$in") or die "Failed to read from file $in: $!\n"; | |
716 } | |
717 | |
718 my $gc_out = $cytosine_out.'.GpC_report.txt'; | |
719 my $gc_cov = $cytosine_out.'.GpC.cov'; | |
720 | |
721 ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands | |
722 open (GC,'>',$gc_out) or die "Failed to write to GpC file $gc_out: !\n\n"; | |
723 warn ">>> Writing genome-wide GpC cytosine report to: $gc_out <<<\n"; | |
724 open (GCCOV,'>',$gc_cov) or die "Failed to write to GpC coverage file $gc_cov: !\n\n"; | |
725 warn ">>> Writing genome-wide GpC coverage file to: $gc_cov <<<\n\n"; | |
726 | |
727 my $last_chr; | |
728 my %chr; # storing reads for one chromosome at a time | |
729 | |
730 my $count = 0; | |
731 while (<IN>){ | |
732 chomp; | |
733 ++$count; | |
734 my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); | |
735 | |
736 # defining the first chromosome | |
737 unless (defined $last_chr){ | |
738 $last_chr = $chr; | |
739 ++$number_processed; | |
740 # warn "Storing all covered cytosine positions for chromosome: $chr\n"; | |
741 } | |
742 | |
743 ### start positions are 1-based | |
744 if ($chr eq $last_chr){ | |
745 $chr{$chr}->{$start}->{meth} = $meth; | |
746 $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
747 } | |
748 else{ | |
749 warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
750 ++$number_processed; | |
751 $processed{$last_chr} = 1; | |
752 while ( $chromosomes{$last_chr} =~ /(GC)/g){ | |
753 | |
754 # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time | |
755 my $context_top = ''; | |
756 my $context_bottom = ''; | |
757 my $pos = pos$chromosomes{$last_chr}; | |
758 | |
759 my $meth_top = 0; | |
760 my $meth_bottom = 0; | |
761 my $nonmeth_top = 0; | |
762 my $nonmeth_bottom = 0; | |
763 | |
764 #warn "$1\n"; sleep(1); | |
765 # C on forward strand | |
766 my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
767 my $strand_top = '+'; | |
768 | |
769 # C on reverse strand | |
770 my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! | |
771 $tri_nt_bottom = reverse $tri_nt_bottom; | |
772 $tri_nt_bottom =~ tr/ACTG/TGAC/; | |
773 my $strand_bottom = '-'; | |
774 | |
775 next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand | |
776 next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the reverse strand | |
777 | |
778 # top strand | |
779 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! | |
780 $meth_top = $chr{$last_chr}->{$pos}->{meth}; | |
781 $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; | |
782 } | |
783 # bottom strand | |
784 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs | |
785 $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; | |
786 $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
787 } | |
788 | |
789 ### determining cytosine context top strand | |
790 if ($tri_nt_top =~ /^CG/){ | |
791 $context_top = 'CG'; | |
792 } | |
793 elsif ($tri_nt_top =~ /^C.{1}G$/){ | |
794 $context_top = 'CHG'; | |
795 } | |
796 elsif ($tri_nt_top =~ /^C.{2}$/){ | |
797 $context_top = 'CHH'; | |
798 } | |
799 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
800 warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; | |
801 next; | |
802 } | |
803 | |
804 ### determining cytosine context bottom strand | |
805 if ($tri_nt_bottom =~ /^CG/){ | |
806 $context_bottom = 'CG'; | |
807 } | |
808 elsif ($tri_nt_bottom =~ /^C.{1}G$/){ | |
809 $context_bottom = 'CHG'; | |
810 } | |
811 elsif ($tri_nt_bottom =~ /^C.{2}$/){ | |
812 $context_bottom = 'CHH'; | |
813 } | |
814 else{ # if the context can't be determined the positions will not be printed | |
815 warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; | |
816 next; | |
817 } | |
818 | |
819 # if Cs were not covered at all they are not written out | |
820 unless ($meth_bottom + $nonmeth_bottom == 0){ | |
821 my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); | |
822 print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; | |
823 print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; | |
824 } | |
825 unless ($meth_top + $nonmeth_top == 0){ | |
826 my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); | |
827 print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; | |
828 print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; | |
829 } | |
830 | |
831 } | |
832 | |
833 %chr = (); # resetting the hash | |
834 | |
835 # new first entry | |
836 $last_chr = $chr; | |
837 $chr{$chr}->{$start}->{meth} = $meth; | |
838 $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
839 } | |
840 } | |
841 | |
842 # Last found chromosome | |
843 warn "Writing cytosine GpC report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
844 $processed{$last_chr} = 1; | |
845 while ( $chromosomes{$last_chr} =~ /(GC)/g){ | |
846 | |
847 # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time | |
848 my $context_top = ''; | |
849 my $context_bottom = ''; | |
850 my $pos = pos$chromosomes{$last_chr}; | |
851 | |
852 my $meth_top = 0; | |
853 my $meth_bottom = 0; | |
854 my $nonmeth_top = 0; | |
855 my $nonmeth_bottom = 0; | |
856 | |
857 # C on forward strand | |
858 my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
859 my $strand_top = '+'; | |
860 | |
861 # C on reverse strand | |
862 my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! | |
863 $tri_nt_bottom = reverse $tri_nt_bottom; | |
864 $tri_nt_bottom =~ tr/ACTG/TGAC/; | |
865 my $strand_bottom = '-'; | |
866 | |
867 next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand | |
868 next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the bottom strand | |
869 | |
870 ### determining cytosine context top strand | |
871 if ($tri_nt_top =~ /^CG/){ | |
872 $context_top = 'CG'; | |
873 } | |
874 elsif ($tri_nt_top =~ /^C.{1}G$/){ | |
875 $context_top = 'CHG'; | |
876 } | |
877 elsif ($tri_nt_top =~ /^C.{2}$/){ | |
878 $context_top = 'CHH'; | |
879 } | |
880 else{ # if the context can't be determined the positions will not be printed | |
881 warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; | |
882 next; | |
883 } | |
884 | |
885 ### determining cytosine context bottom strand | |
886 if ($tri_nt_bottom =~ /^CG/){ | |
887 $context_bottom = 'CG'; | |
888 } | |
889 elsif ($tri_nt_bottom =~ /^C.{1}G$/){ | |
890 $context_bottom = 'CHG'; | |
891 } | |
892 elsif ($tri_nt_bottom =~ /^C.{2}$/){ | |
893 $context_bottom = 'CHH'; | |
894 } | |
895 else{ # if the context can't be determined the positions will not be printed | |
896 warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; | |
897 next; | |
898 } | |
899 | |
900 # top strand | |
901 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! | |
902 $meth_top = $chr{$last_chr}->{$pos}->{meth}; | |
903 $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; | |
904 } | |
905 # bottom strand | |
906 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs | |
907 $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; | |
908 $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
909 } | |
910 | |
911 # if Cs were not covered at all they are not written out | |
912 unless ($meth_bottom + $nonmeth_bottom == 0){ | |
913 my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); | |
914 print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; | |
915 print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; | |
916 } | |
917 unless ($meth_top + $nonmeth_top == 0){ | |
918 my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); | |
919 print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; | |
920 print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; | |
921 } | |
922 } | |
923 | |
924 warn "Finished writing out GpC cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; | |
925 | |
926 close GC or warn $!; | |
927 | |
928 } | |
929 | |
930 #### | |
931 #### | |
932 | |
933 | |
934 sub process_unprocessed_chromosomes{ | |
935 | |
936 my $chr = shift; | |
937 | |
938 warn "Writing cytosine report for not covered chromosome $chr\n"; | |
939 $processed{$chr} = 1; | |
940 | |
941 if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
942 my $chromosome_out = $cytosine_out; | |
943 if ($chromosome_out =~ /txt$/){ # files passed on by the methylation extractor end in _report.txt | |
944 $chromosome_out =~ s/txt$/chr${chr}.txt/; | |
945 } | |
946 else{ # user specified output file name | |
947 $chromosome_out =~ s/$/.chr${chr}.txt/; | |
948 } | |
949 | |
950 open (CYT,'>',$chromosome_out) or die $!; | |
951 # warn "Writing output for $last_chr to $chromosome_out\n"; | |
952 } | |
953 | |
954 my $tri_nt; | |
955 my $tetra_nt; | |
956 my $penta_nt; | |
957 my $context; | |
958 | |
959 while ( $chromosomes{$chr} =~ /([CG])/g){ | |
960 | |
961 $tri_nt = ''; | |
962 $context = ''; | |
963 if ($tetra){ | |
964 $tetra_nt = ''; # clearing | |
965 $penta_nt = ''; | |
966 } | |
967 | |
968 my $pos = pos$chromosomes{$chr}; | |
969 | |
970 my $strand; | |
971 my $meth = 0; | |
972 my $nonmeth = 0; | |
973 | |
974 if ($1 eq 'C'){ # C on forward strand | |
975 $tri_nt = substr ($chromosomes{$chr},($pos-1),3); # positions are 0-based! | |
976 $strand = '+'; | |
977 if ($tetra){ | |
978 if ( length($chromosomes{$chr}) >= ($pos - 1 + 4) ){ | |
979 $tetra_nt = substr ($chromosomes{$chr},($pos-1),4); | |
980 } | |
981 else{ | |
982 $tetra_nt = ''; | |
983 } | |
984 if ( length($chromosomes{$chr}) >= ($pos - 1 + 5) ){ | |
985 $penta_nt = substr ($chromosomes{$chr},($pos-1),5); | |
986 } | |
987 else{ | |
988 $penta_nt = ''; | |
989 } | |
990 } | |
991 } | |
992 elsif ($1 eq 'G'){ # C on reverse strand | |
993 $tri_nt = substr ($chromosomes{$chr},($pos-3),3); # positions are 0-based! | |
994 $tri_nt = reverse $tri_nt; | |
995 $tri_nt =~ tr/ACTG/TGAC/; | |
996 $strand = '-'; | |
997 | |
998 if ($tetra){ | |
999 if ( $pos - 4 >= 0 ){ | |
1000 $tetra_nt = substr ($chromosomes{$chr},($pos-4),4); | |
1001 $tetra_nt = reverse $tetra_nt; | |
1002 $tetra_nt =~ tr/ACTG/TGAC/; | |
1003 } | |
1004 else{ | |
1005 $tetra_nt = ''; | |
1006 } | |
1007 if ( $pos - 5 >= 0 ){ | |
1008 $penta_nt = substr ($chromosomes{$chr},($pos-5),5); | |
1009 $penta_nt = reverse $penta_nt; | |
1010 $penta_nt =~ tr/ACTG/TGAC/; | |
1011 } | |
1012 else{ | |
1013 $penta_nt = ''; | |
1014 } | |
1015 } | |
1016 $strand = '-'; | |
1017 } | |
1018 | |
1019 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
1020 | |
1021 ### determining cytosine context | |
1022 if ($tri_nt =~ /^CG/){ | |
1023 $context = 'CG'; | |
1024 } | |
1025 elsif ($tri_nt =~ /^C.{1}G$/){ | |
1026 $context = 'CHG'; | |
1027 } | |
1028 elsif ($tri_nt =~ /^C.{2}$/){ | |
1029 $context = 'CHH'; | |
1030 } | |
1031 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
1032 warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
1033 next; | |
1034 } | |
1035 | |
1036 if ($CpG_only){ | |
1037 if ($tri_nt =~ /^CG/){ # CpG context is the default | |
1038 if ($zero){ # zero-based coordinates | |
1039 $pos -= 1; | |
1040 if ($tetra){ | |
1041 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
1042 } | |
1043 else{ | |
1044 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
1045 } | |
1046 } | |
1047 else{ # default | |
1048 if ($tetra){ | |
1049 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
1050 } | |
1051 else{ | |
1052 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
1053 } | |
1054 } | |
1055 } | |
1056 } | |
1057 else{ ## all cytosines, specified with --CX | |
1058 if ($zero){ # zero based coordinates | |
1059 $pos -= 1; | |
1060 if ($tetra){ | |
1061 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
1062 } | |
1063 else{ | |
1064 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
1065 } | |
1066 } | |
1067 else{ # default | |
1068 if ($tetra){ | |
1069 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; | |
1070 } | |
1071 else{ | |
1072 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
1073 } | |
1074 } | |
1075 } | |
1076 } | |
1077 | |
1078 # close CYT or warn $!; | |
1079 | |
1080 } | |
1081 | |
1082 | |
1083 sub read_genome_into_memory{ | |
1084 | |
1085 ## reading in and storing the specified genome in the %chromosomes hash | |
1086 chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; | |
1087 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; | |
1088 | |
1089 my @chromosome_filenames = <*.fa>; | |
1090 | |
1091 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta | |
1092 unless (@chromosome_filenames){ | |
1093 @chromosome_filenames = <*.fasta>; | |
1094 } | |
1095 unless (@chromosome_filenames){ | |
1096 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; | |
1097 } | |
1098 | |
1099 foreach my $chromosome_filename (@chromosome_filenames){ | |
1100 | |
1101 # skipping the tophat entire mouse genome fasta file | |
1102 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); | |
1103 | |
1104 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; | |
1105 ### first line needs to be a fastA header | |
1106 my $first_line = <CHR_IN>; | |
1107 chomp $first_line; | |
1108 $first_line =~ s/\r//; # removing /r carriage returns | |
1109 | |
1110 ### Extracting chromosome name from the FastA header | |
1111 my $chromosome_name = extract_chromosome_name($first_line); | |
1112 | |
1113 my $sequence; | |
1114 while (<CHR_IN>){ | |
1115 chomp; | |
1116 $_ =~ s/\r//; # removing /r carriage returns | |
1117 | |
1118 if ($_ =~ /^>/){ | |
1119 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) | |
1120 if (exists $chromosomes{$chromosome_name}){ | |
1121 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
1122 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; | |
1123 } | |
1124 else { | |
1125 if (length($sequence) == 0){ | |
1126 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; | |
1127 } | |
1128 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
1129 $chromosomes{$chromosome_name} = $sequence; | |
1130 $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed | |
1131 } | |
1132 ### resetting the sequence variable | |
1133 $sequence = ''; | |
1134 ### setting new chromosome name | |
1135 $chromosome_name = extract_chromosome_name($_); | |
1136 } | |
1137 else{ | |
1138 $sequence .= uc$_; | |
1139 } | |
1140 } | |
1141 | |
1142 if (exists $chromosomes{$chromosome_name}){ | |
1143 warn "chr $chromosome_name (",length $sequence ," bp)\t"; | |
1144 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; | |
1145 } | |
1146 else{ | |
1147 if (length($sequence) == 0){ | |
1148 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; | |
1149 } | |
1150 warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
1151 $chromosomes{$chromosome_name} = $sequence; | |
1152 $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed | |
1153 } | |
1154 } | |
1155 warn "\n"; | |
1156 chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; | |
1157 } | |
1158 | |
1159 sub extract_chromosome_name { | |
1160 ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well | |
1161 my $fasta_header = shift; | |
1162 if ($fasta_header =~ s/^>//){ | |
1163 my ($chromosome_name) = split (/\s+/,$fasta_header); | |
1164 return $chromosome_name; | |
1165 } | |
1166 else{ | |
1167 die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; | |
1168 } | |
1169 } | |
1170 | |
1171 | |
1172 sub print_helpfile{ | |
1173 | |
1174 warn <<EOF | |
1175 | |
1176 SYNOPSIS: | |
1177 | |
1178 This script generates a cytosine methylation report for a genome of interest and a sorted methylation input file produced | |
1179 by the script "bismark2bedGraph". By default, the output uses 1-based chromosome coordinates and reports CpG positions only | |
1180 (for both strands individually and not merged in any way). Coordinates may be changed to 0-based using the option '--zero_based'. | |
1181 | |
1182 The input file needs to have been generated with the script bismark2bedGraph (the file is called *.cov) or otherwise be | |
1183 sorted by position and exactly in the following format: | |
1184 | |
1185 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count unmethylated> | |
1186 | |
1187 The coordinates of the input file are expected to be 1-based throughout (do not use files ending in .zero.cov!). | |
1188 | |
1189 | |
1190 USAGE: coverage2cytosine [options] --genome_folder <path> -o <output> [input] | |
1191 | |
1192 | |
1193 -o/--output <filename> Name of the output file, mandatory. | |
1194 | |
1195 --dir Output directory. Output is written to the current directory if not specified explicitly. | |
1196 | |
1197 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted | |
1198 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. | |
1199 | |
1200 -CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of | |
1201 its context. This applies to both forward and reverse strands. Please be aware that this will | |
1202 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. | |
1203 Default: OFF (i.e. Default = CpG context only). | |
1204 | |
1205 --merge_CpG Using this option will post-process the genome-wide report to write out an additional coverage | |
1206 file (see above for the coverage file format) which has the top and bottom strand methylation | |
1207 evidence pooled into a single CpG dinucleotide entity. This may be the desirable input format | |
1208 for some downstream processing tools such as the R-package bsseq (by K.D. Hansen). An example would be: | |
1209 | |
1210 genome-wide CpG report (old) | |
1211 gi|9626372|ref|NC_001422.1| 157 + 313 156 CG | |
1212 gi|9626372|ref|NC_001422.1| 158 - 335 156 CG | |
1213 merged CpG evidence coverage file (new) | |
1214 gi|9626372|ref|NC_001422.1| 157 158 67.500000 648 312 | |
1215 | |
1216 This option is currently experimental, and only works if CpG context only and a single genome-wide report | |
1217 were specified (i.e. it doesn't work with the options --CX or --split_by_chromosome). | |
1218 | |
1219 --gc/--gc_context In addition to normal processing this option will reprocess the genome to find methylation in | |
1220 GpC context. This might be useful for specialist applications where GpC methylases had been | |
1221 deployed. The output format is exactly the same as for the normal cytosine report, and only | |
1222 positions covered by at least one read are reported (output file ends in .GpC_report.txt). In addition | |
1223 this will write out a Bismark coverage file (ending in GpC.cov). | |
1224 | |
1225 --ff In addition to the standard output selecting --ff will also extract a four and five (tetra/penta) | |
1226 nucleotide context for the cytosines in question. Too short sequences (e.g. at the edges of the | |
1227 chromosome) will be left blank; sequences containing Ns are ignored. | |
1228 | |
1229 --zero_based Uses 0-based coordinates instead of 1-based coordinates throughout. Default: OFF. | |
1230 | |
1231 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files | |
1232 will be named to include the input filename and the chromosome number. | |
1233 | |
1234 --gzip Output file will be GZIP compressed (ending in .gz). Only works for standard CpG- and CX-output. | |
1235 | |
1236 --help Displays this help message and exits | |
1237 | |
1238 | |
1239 | |
1240 OUTPUT FORMAT: | |
1241 | |
1242 The genome-wide cytosine methylation output file is tab-delimited in the following format (1-based coords): | |
1243 =========================================================================================================== | |
1244 | |
1245 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> | |
1246 | |
1247 | |
1248 Script last modified: 04 April 2016 | |
1249 | |
1250 EOF | |
1251 ; | |
1252 exit 1; | |
1253 } | |
1254 |